adding scripts for plots (only local)
This commit is contained in:
parent
9d5ff154a0
commit
6f7a1e511e
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
|
@ -0,0 +1,120 @@
|
|||
import os
|
||||
from os.path import join
|
||||
import pandas as pd
|
||||
import quapy as qp
|
||||
import numpy as np
|
||||
import matplotlib.pyplot as plt
|
||||
import seaborn as sns
|
||||
|
||||
# Work from the util_scripts folder: every path defined below is relative to it.
os.chdir('/home/moreo/QuaPy/LeQua2024/util_scripts')
print(os.getcwd())

# Sample size stored in QuaPy's environment
# (presumably read by the RAE smoothing -- confirm against the QuaPy docs).
qp.environ['SAMPLE_SIZE'] = 250

# Input locations for task T4:
#   - true prevalence values of the test samples
#   - domain prevalence values of the same test samples
#   - folder with one CSV of predictions per participant/baseline
true_prevs_path = '../TruePrevalences/T4.test_prevalences/T4/public/test_prevalences.txt'
domain_prevs_path = '../T4_domain_prevalence/test_domain_prevalences.txt'
folder = '../Results_CODALAB_2024/extracted/TASK_4'
|
||||
|
||||
def load_result_file(path):
    """
    Loads a prevalence file stored as CSV.

    The first column of the file is used as the index of the table; all the
    remaining columns are returned as the matrix of values.

    :param path: path (or file-like object) accepted by `pandas.read_csv`
    :return: a tuple `(ids, prevs)` in which `ids` is a np.ndarray with the
        index values and `prevs` is a 2-D np.ndarray with the remaining columns
    """
    df = pd.read_csv(path, index_col=0)
    # named `ids` (not `id`) so that the builtin `id` is not shadowed
    ids = df.index.to_numpy()
    prevs = df.to_numpy()
    return ids, prevs
|
||||
|
||||
# Result files to include in the plot. The commented-out entries are the
# remaining submissions/baselines, kept here so they can be toggled on.
method_files = [
    #'ACC.csv',
    #'CC.csv',
    #'DistMatching-y.csv',
    #'KDEy.csv',
    #'PACC.csv',
    'PCC.csv',
    #'SLD.csv',
    #'TeamCUFE.csv',
    #'TeamGMNet.csv',
    'tobiaslotz.csv',
]

# Display names for the methods whose file name is not the name shown in the plot.
method_names_nice = {
    'DistMatching-y': 'DM',
    'TeamGMNet': 'UniOviedo(Team1)',
    'tobiaslotz': 'Lamarr',
}

# Order in which the methods are drawn (passed as `hue_order` to the boxplot),
# expressed with display names.
# Alternative ordering for the full comparison. It used to be assigned here and
# then immediately overwritten, so it is kept only as a commented-out option:
# desired_order = ['Lamarr', 'SLD', 'DM', 'KDEy', 'UniOviedo(Team1)']
desired_order = ['PCC', 'Lamarr']
|
||||
|
||||
# Load the reference values: the true (sentiment) prevalence and the domain
# prevalence of the test samples.
true_id, true_prevs = load_result_file(true_prevs_path)
dom_id, dom_prevs = load_result_file(domain_prevs_path)

# Both files must list the same ids, in the same order.
assert (true_id == dom_id).all(), 'unmatched files'
|
||||
|
||||
# define the loss for evaluation
error_name = 'RAE'   # one of the keys of _error_functions below
error_log = False    # if True, the logarithm of the error is used instead

# Supported error measures, indexed by name.
_error_functions = {
    'RAE': qp.error.rae,
    'AE': qp.error.ae,
}
if error_name not in _error_functions:
    raise ValueError(
        f'unknown error_name {error_name!r}; valid names are {sorted(_error_functions)}'
    )
err_function_ = _error_functions[error_name]

if error_log:
    # the name is also used as the y-axis label and in the output file name
    error_name = f'log({error_name})'

    def err_function(x, y):
        """Logarithm of the selected error measure."""
        return np.log(err_function_(x, y))
else:
    err_function = err_function_
|
||||
|
||||
# load the participant and baseline results
# `errors` maps the display name of each method to the errors returned by
# err_function(true_prevs, method_prevs)
errors = {}
for method_file in method_files:
    # `method_ids` (not `id`): a module-level `id` would shadow the builtin
    # for the rest of the script
    method_ids, method_prevs = load_result_file(join(folder, method_file))
    print(method_file)
    # predictions must refer to the same ids, in the same order, as the true values
    assert (true_id == method_ids).all(), f'unmatched files for {method_file}'

    method_name = method_file.replace('.csv', '')
    method_name = method_names_nice.get(method_name, method_name)
    errors[method_name] = err_function(true_prevs, method_prevs)
|
||||
|
||||
# First column of the domain prevalence matrix.
dom_A_prevs = dom_prevs[:, 0]

# n_bins equal-width bins spanning the observed range of dom_A_prevs.
n_bins = 5
bins = np.linspace(dom_A_prevs.min(), dom_A_prevs.max(), n_bins + 1)

# Wide table: one row per test sample, one column per method with its error.
df = pd.DataFrame({'dom_A_prevs': dom_A_prevs, **errors})

# Assign every dom_A_prevs value to the index of the bin it falls in.
df['bin'] = pd.cut(df['dom_A_prevs'], bins=bins, labels=False, include_lowest=True)

# Long format: one row per (sample, method) pair.
df_long = df.melt(
    id_vars=['dom_A_prevs', 'bin'],
    value_vars=list(errors),
    var_name='Método',
    value_name='Error',
)

# Human-readable bin labels for the x axis; only the last bin is closed on the right.
bin_labels = []
for i, (lo, hi) in enumerate(zip(bins[:-1], bins[1:])):
    closing = ']' if i == n_bins - 1 else ')'
    bin_labels.append(f"[{lo:.3f}-{hi:.3f}{closing}")
df_long['bin_label'] = df_long['bin'].map(dict(enumerate(bin_labels)))
|
||||
|
||||
# Draw the boxplots with Seaborn: one group per bin, one box per method.
plt.figure(figsize=(14, 8))
boxplot_kwargs = dict(
    x='bin',
    y='Error',
    hue='Método',
    data=df_long,
    palette='Set2',
    showfliers=False,
    hue_order=desired_order,
)
sns.boxplot(**boxplot_kwargs)

# Label the x ticks with the range covered by each bin.
plt.xticks(ticks=range(n_bins), labels=bin_labels, rotation=0)
plt.xlabel("Prevalence of Books")
plt.ylabel(error_name)
#plt.title("Per-method error boxplots within bins of dom_A_prevs")

# The legend is anchored to the right of the axes, outside the plotting area.
plt.legend(loc='upper left', bbox_to_anchor=(1, 1))
plt.tight_layout()
plt.grid(True, which='both', linestyle='--', linewidth=0.5)

#plt.show()
output_path = f'./t4_{error_name}_pcc.png'
plt.savefig(output_path)
|
|
@ -0,0 +1,168 @@
|
|||
import os
|
||||
from os.path import join
|
||||
import pandas as pd
|
||||
|
||||
from quapy.data.base import LabelledCollection
|
||||
import sys
|
||||
# Make the script's own folder, its parent and its grandparent importable,
# appended in the order grandparent, parent, own folder.
for _relative_dir in ('../../', '../', './'):
    sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), _relative_dir)))
|
||||
#from LeQua2024.scripts import constants
|
||||
#from LeQua2024._lequa2024 import fetch_lequa2024
|
||||
import quapy as qp
|
||||
import numpy as np
|
||||
import matplotlib.pyplot as plt
|
||||
import seaborn as sns
|
||||
from pathlib import Path
|
||||
import glob
|
||||
|
||||
|
||||
# Work from the LeQua2024 folder: the relative paths below start from it.
os.chdir('/home/moreo/QuaPy/LeQua2024')
print(os.getcwd())

# Sample size stored in QuaPy's environment
# (presumably read by the RAE smoothing -- confirm against the QuaPy docs).
qp.environ['SAMPLE_SIZE'] = 250

# Task number; it selects the true prevalences, the results folder, the
# training data and the name of the output image.
TASK = 1

true_prevs_path = f'./TruePrevalences/T{TASK}.test_prevalences/T{TASK}/public/test_prevalences.txt'
folder = f'./Results_CODALAB_2024/extracted/TASK_{TASK}'
|
||||
|
||||
def load_result_file(path):
    """
    Loads a prevalence file stored as CSV.

    The first column of the file is used as the index of the table; all the
    remaining columns are returned as the matrix of values.

    :param path: path (or file-like object) accepted by `pandas.read_csv`
    :return: a tuple `(ids, prevs)` in which `ids` is a np.ndarray with the
        index values and `prevs` is a 2-D np.ndarray with the remaining columns
    """
    df = pd.read_csv(path, index_col=0)
    # named `ids` (not `id`) so that the builtin `id` is not shadowed
    ids = df.index.to_numpy()
    prevs = df.to_numpy()
    return ids, prevs
|
||||
|
||||
|
||||
# Every CSV in the results folder is a candidate method. glob returns paths in
# arbitrary order, so they are sorted to keep the order of the methods (and
# hence the resulting plot) reproducible across runs and machines.
method_files = sorted(glob.glob(f"{folder}/*.csv"))

# Display names for the methods whose file name is not the name shown in the plot.
method_names_nice = {
    'DistMatching-y': 'DM',
    'TeamGMNet': 'UniOviedo(Team1)',
    'tobiaslotz': 'Lamarr',
}

# Methods left out of the plot, identified by file name without the .csv
# extension (i.e., before the display-name mapping is applied).
exclude_methods = [
    'TeamCUFE',
    'hustav',
    'PCC',
    'CC',
]
|
||||
|
||||
|
||||
# desired_order=[
|
||||
# 'Lamarr',
|
||||
# 'SLD',
|
||||
# 'DM',
|
||||
# 'KDEy',
|
||||
# 'UniOviedo(Team1)'
|
||||
# ]
|
||||
# desired_order=[
|
||||
# 'PCC', 'Lamarr'
|
||||
# ]
|
||||
|
||||
# load the true prevalence values of the test samples
true_id, true_prevs = load_result_file(true_prevs_path)

# define the loss for evaluation
error_name = 'RAE'   # one of the keys of _error_functions below
error_log = False    # if True, the logarithm of the error is used instead

# Supported error measures, indexed by name.
_error_functions = {
    'RAE': qp.error.rae,
    'AE': qp.error.ae,
}
if error_name not in _error_functions:
    raise ValueError(
        f'unknown error_name {error_name!r}; valid names are {sorted(_error_functions)}'
    )
err_function_ = _error_functions[error_name]

if error_log:
    # the name is also used in the output file name
    error_name = f'log({error_name})'

    def err_function(x, y):
        """Logarithm of the selected error measure."""
        return np.log(err_function_(x, y))
else:
    err_function = err_function_
|
||||
|
||||
|
||||
def load_vector_documents(path, n_features=256):
    """
    Loads vectorized documents. In case the sample is unlabelled,
    the labels returned are None

    :param path: path to the data sample containing the vectorized documents (a CSV file whose first
        line is a header)
    :param n_features: number of feature columns of each document (default 256). A file with exactly
        `n_features + 1` columns is taken to be labelled, with the label in the first column; any other
        file is taken to be unlabelled
    :return: a tuple with the documents (np.ndarray of shape `(n,n_features)`) and the labels (a np.ndarray of
        shape `(n,)` if the sample is labelled, or None if the sample is unlabelled), with `n` the number of
        instances in the sample (250 for T1 and T4, 1000 for T2, and 200 for T3)
    """
    D = pd.read_csv(path).to_numpy(dtype=float)
    labelled = D.shape[1] == n_features + 1
    if labelled:
        # the column slice is already 1-D, so no flattening is required
        X, y = D[:, 1:], D[:, 0].astype(int)
    else:
        X, y = D, None
    return X, y
|
||||
|
||||
#train_prevalence = fetch_lequa2024(task=f'T{TASK}', data_home='./data')
train = LabelledCollection.load(f'/home/moreo/QuaPy/LeQua2024/data/lequa2024/T{TASK}/public/training_data.txt', loader_func=load_vector_documents)
train_prev = train.prevalence()
#train_prev = np.tile(train_prev, (len(true_id),1))

from quapy.plot import error_by_drift

# load the participant and baseline results
method_names, estim_prevs = [], []
for method_file in method_files:
    method_name = Path(method_file).name.replace('.csv', '')
    if method_name in exclude_methods:
        continue
    # `method_ids` (not `id`): a module-level `id` would shadow the builtin
    method_ids, method_prevs = load_result_file(join(folder, method_name + '.csv'))
    # predictions must refer to the same ids, in the same order, as the true values
    assert (true_id == method_ids).all(), f'unmatched files for {method_file}'
    method_names.append(method_names_nice.get(method_name, method_name))
    estim_prevs.append(method_prevs)

# error_by_drift receives one entry per method in each of its list arguments;
# all methods share the same true prevalences and the same training prevalence.
# Separate names are used so that `true_prevs` keeps holding the original array.
true_prevs_per_method = [true_prevs] * len(method_names)
tr_prevs = [train_prev] * len(method_names)

# NOTE(review): the error plotted is hard-coded to 'mrae', whereas the output
# file name is built from `error_name`; confirm that this is intended.
error_by_drift(
    method_names,
    true_prevs_per_method,
    estim_prevs,
    tr_prevs,
    error_name='mrae',
    show_std=True,
    show_density=True,
    vlines=True,
    savepath=f'./util_scripts/t{TASK}_{error_name}_pcc.png',
)

# The script ends here; nothing below this call is executed.
sys.exit()
|
||||
|
||||
# NOTE(review): this section is never executed, because the script calls
# sys.exit() right before it. It also reads `errors`, which is not defined
# anywhere in this script, so it would fail if it were reached.

# Shift between the training prevalence and the test prevalences, in terms of AE.
shift = qp.error.ae(train_prev, true_prevs)

# n_bins equal-width bins spanning the observed range of the shift.
n_bins = 5
bins = np.linspace(shift.min(), shift.max(), n_bins + 1)

# Wide table: one row per test sample, one column per method with its error.
df = pd.DataFrame({'dom_A_prevs': shift, **errors})

# Assign every value to the index of the bin it falls in.
df['bin'] = pd.cut(df['dom_A_prevs'], bins=bins, labels=False, include_lowest=True)

# Long format: one row per (sample, method) pair.
df_long = df.melt(
    id_vars=['dom_A_prevs', 'bin'],
    value_vars=list(errors),
    var_name='Método',
    value_name='Error',
)

# Human-readable bin labels for the x axis; only the last bin is closed on the right.
bin_labels = []
for i, (lo, hi) in enumerate(zip(bins[:-1], bins[1:])):
    closing = ']' if i == n_bins - 1 else ')'
    bin_labels.append(f"[{lo:.3f}-{hi:.3f}{closing}")
df_long['bin_label'] = df_long['bin'].map(dict(enumerate(bin_labels)))

# Draw the boxplots with Seaborn: one group per bin, one box per method.
plt.figure(figsize=(14, 8))
boxplot_kwargs = dict(
    x='bin',
    y='Error',
    hue='Método',
    data=df_long,
    palette='Set2',
    showfliers=False,
)
sns.boxplot(**boxplot_kwargs)

# Label the x ticks with the range covered by each bin.
plt.xticks(ticks=range(n_bins), labels=bin_labels, rotation=0)
plt.xlabel("Amount of PPS between the training prevalence and the test prevalences, in terms of AE ")
plt.ylabel(error_name)
#plt.title("Per-method error boxplots within bins of dom_A_prevs")

# The legend is anchored to the right of the axes, outside the plotting area.
plt.legend(loc='upper left', bbox_to_anchor=(1, 1))
plt.tight_layout()
plt.grid(True, which='both', linestyle='--', linewidth=0.5)

#plt.show()
output_path = f'./util_scripts/t{TASK}_{error_name}_pcc.png'
plt.savefig(output_path)
|
Loading…
Reference in New Issue