QuaPy/LeQua2024/util_scripts/covariate_shift_plot.py

121 lines
3.4 KiB
Python

import os
from os.path import join
import pandas as pd
import quapy as qp
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Run from the directory that contains this script so that the relative paths
# below resolve on any machine/checkout, instead of relying on one user's
# absolute home-directory path.
os.chdir(os.path.dirname(os.path.abspath(__file__)))
print(os.getcwd())

# Sample size registered in QuaPy's global environment.
qp.environ['SAMPLE_SIZE'] = 250

# Input locations, relative to the script directory.
true_prevs_path = '../TruePrevalences/T4.test_prevalences/T4/public/test_prevalences.txt'
domain_prevs_path = '../T4_domain_prevalence/test_domain_prevalences.txt'
folder = '../Results_CODALAB_2024/extracted/TASK_4'
def load_result_file(path):
    """Load a prevalence file and split it into row ids and values.

    The file is parsed with ``pandas.read_csv`` using its first column as the
    index.

    Args:
        path: Location of the CSV-formatted file (anything accepted by
            ``pandas.read_csv``).

    Returns:
        A ``(ids, prevs)`` tuple where ``ids`` is a 1-D array holding the
        index values (first column of the file) and ``prevs`` is a 2-D array
        holding the remaining columns, one row per index entry.
    """
    df = pd.read_csv(path, index_col=0)
    # Deliberately not called `id`, which would shadow the builtin.
    sample_ids = df.index.to_numpy()
    prevs = df.to_numpy()
    return sample_ids, prevs
# Result files (looked up inside `folder`) to include in the plot. Comment
# entries in or out to choose which systems are compared.
method_files = [
    # 'ACC.csv',
    # 'CC.csv',
    # 'DistMatching-y.csv',
    # 'KDEy.csv',
    # 'PACC.csv',
    'PCC.csv',
    # 'SLD.csv',
    # 'TeamCUFE.csv',
    # 'TeamGMNet.csv',
    'tobiaslotz.csv',
]

# Display names for methods whose file stem should not be shown verbatim;
# stems that are missing here are displayed unchanged.
method_names_nice = {
    'DistMatching-y': 'DM',
    'TeamGMNet': 'UniOviedo(Team1)',
    'tobiaslotz': 'Lamarr',
}

# Order (by display name) in which the methods are passed to the boxplot as
# `hue_order`. Keep it in sync with the entries enabled in `method_files`.
#
# Alternative ordering for the larger participant comparison. It used to be
# assigned here and immediately overwritten, so it is kept only as a toggle:
# desired_order = ['Lamarr', 'SLD', 'DM', 'KDEy', 'UniOviedo(Team1)']
desired_order = ['PCC', 'Lamarr']
# load the true values (sentiment prevalence, domain prevalence)
true_id, true_prevs = load_result_file(true_prevs_path)
dom_id, dom_prevs = load_result_file(domain_prevs_path)

# Both files must list the same ids in the same order, because rows are
# paired by position later on. np.array_equal also copes with arrays of
# different length, and an explicit raise is not stripped under `python -O`.
if not np.array_equal(true_id, dom_id):
    raise ValueError('unmatched files')
# define the loss for evaluation
error_name = 'RAE'  # key into the table below: 'RAE' or 'AE'
error_log = False   # True -> evaluate np.log of the selected error

# Supported error measures, looked up by name.
_ERROR_FUNCTIONS = {
    'RAE': qp.error.rae,
    'AE': qp.error.ae,
}
if error_name not in _ERROR_FUNCTIONS:
    raise ValueError(
        f'unknown error measure {error_name!r}; '
        f'expected one of {sorted(_ERROR_FUNCTIONS)}'
    )
err_function_ = _ERROR_FUNCTIONS[error_name]

if error_log:
    # The name is reused further down for the y-axis label and the output
    # file name, so it is updated to reflect the transformation.
    error_name = f'log({error_name})'

    def err_function(x, y):
        """Return the natural log of the selected error between x and y."""
        return np.log(err_function_(x, y))
else:
    err_function = err_function_
# load the participant and baseline results
# `errors` maps each method's display name to the value returned by
# `err_function` for that method's predictions.
errors = {}
for method_file in method_files:
    # Not named `id`: at module level that would replace the builtin for the
    # remainder of the script.
    sample_ids, method_prevs = load_result_file(join(folder, method_file))
    print(method_file)

    # Predictions are compared with the ground truth row by row, so the ids
    # must match exactly (same length, same order).
    if not np.array_equal(true_id, sample_ids):
        raise ValueError(f'unmatched files for {method_file}')

    # Drop only the trailing extension; str.replace would also remove a
    # '.csv' occurring elsewhere in the file name.
    method_name = os.path.splitext(method_file)[0]
    method_name = method_names_nice.get(method_name, method_name)
    errors[method_name] = err_function(true_prevs, method_prevs)
# Values of the first column of the domain-prevalence table.
dom_A_prevs = dom_prevs[:, 0]

# Equal-width bin edges spanning the observed range of those values.
n_bins = 5
bins = np.linspace(dom_A_prevs.min(), dom_A_prevs.max(), n_bins + 1)

# Wide table: one row per sample, with the domain prevalence and one error
# column per method.
df = pd.DataFrame({'dom_A_prevs': dom_A_prevs})
df = df.assign(**errors)

# Integer index of the bin each sample falls into; include_lowest closes the
# first interval on the left so the minimum value is not left unassigned.
df['bin'] = pd.cut(df['dom_A_prevs'], bins=bins, labels=False, include_lowest=True)

# Long format: one row per (sample, method) pair.
df_long = pd.melt(
    df,
    id_vars=['dom_A_prevs', 'bin'],
    value_vars=list(errors),
    var_name='Método',
    value_name='Error',
)

# Human-readable interval labels for the x axis; only the last interval is
# closed on the right.
bin_labels = []
for idx, (lower, upper) in enumerate(zip(bins[:-1], bins[1:])):
    closing = ']' if idx == n_bins - 1 else ')'
    bin_labels.append(f"[{lower:.3f}-{upper:.3f}{closing}")

df_long['bin_label'] = df_long['bin'].map(dict(enumerate(bin_labels)))
# Grouped boxplot: one group per prevalence bin, one box per method.
plt.figure(figsize=(14, 8))
sns.boxplot(
    # Plot the labelled bins with an explicit order so that every bin keeps
    # its own slot even when no sample falls into it. Plotting the raw bin
    # index and relabelling ticks 0..n_bins-1 afterwards would shift the
    # labels whenever a bin is empty. The tick labels come from the
    # categories themselves, so no separate xticks call is needed.
    x='bin_label',
    y='Error',
    hue='Método',
    data=df_long,
    palette='Set2',
    showfliers=False,
    order=bin_labels,
    hue_order=desired_order,
)
plt.xlabel("Prevalence of Books")
plt.ylabel(error_name)
# plt.title("Boxplots of errors per method within bins of dom_A_prevs")
# Legend is anchored just outside the upper-right corner of the axes.
plt.legend(loc='upper left', bbox_to_anchor=(1, 1))
plt.tight_layout()
plt.grid(True, which='both', linestyle='--', linewidth=0.5)
# plt.show()
plt.savefig(f'./t4_{error_name}_pcc.png')