"""Boxplots of per-sample quantification error versus binned domain prevalence.

The script loads the true class prevalences, the domain prevalences and the
prevalence estimates produced by each selected method, computes the
per-sample error of every method, groups the test samples into equal-width
bins of the first domain's prevalence, and saves a grouped boxplot
(one box per bin and method) as a PNG file.
"""

import os
from os.path import join

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import quapy as qp
import seaborn as sns

# Every relative path below is resolved from this directory.
os.chdir('/home/moreo/QuaPy/LeQua2024/util_scripts')
print(os.getcwd())

# NOTE(review): presumably read by quapy's error functions — confirm.
qp.environ['SAMPLE_SIZE'] = 250

true_prevs_path = '../TruePrevalences/T4.test_prevalences/T4/public/test_prevalences.txt'
domain_prevs_path = '../T4_domain_prevalence/test_domain_prevalences.txt'
folder = '../Results_CODALAB_2024/extracted/TASK_4'


def load_result_file(path):
    """Read a prevalence file whose first column is the sample index.

    Args:
        path: location of the CSV file to read.

    Returns:
        A pair ``(ids, prevs)``: the index values as a NumPy array, and the
        remaining columns as a 2-D array with one row per sample.
    """
    table = pd.read_csv(path, index_col=0)
    return table.index.to_numpy(), table.values


# Result files to include in the plot; comment entries in or out to select.
method_files = [
    #'ACC.csv',
    #'CC.csv',
    #'DistMatching-y.csv',
    #'KDEy.csv',
    #'PACC.csv',
    'PCC.csv',
    #'SLD.csv',
    #'TeamCUFE.csv',
    #'TeamGMNet.csv',
    'tobiaslotz.csv'
]

# Display names for the methods whose file name is not the desired label.
method_names_nice = {
    'DistMatching-y': 'DM',
    'TeamGMNet': 'UniOviedo(Team1)',
    'tobiaslotz': 'Lamarr'
}

# Ordering for a larger selection of methods; it is overridden right below by
# the ordering that matches the files currently enabled in `method_files`.
desired_order = [
    'Lamarr',
    'SLD',
    'DM',
    'KDEy',
    'UniOviedo(Team1)'
]
desired_order = [
    'PCC',
    'Lamarr'
]

# load the true values (sentiment prevalence, domain prevalence)
true_id, true_prevs = load_result_file(true_prevs_path)
dom_id, dom_prevs = load_result_file(domain_prevs_path)
# array_equal also copes with id arrays of different lengths, where an
# elementwise `==` would fail with an unrelated error.
if not np.array_equal(true_id, dom_id):
    raise ValueError('unmatched files')

# define the loss for evaluation
error_name = 'RAE'
error_log = False

_error_functions = {
    'RAE': qp.error.rae,
    'AE': qp.error.ae,
}
if error_name not in _error_functions:
    raise ValueError(f'unsupported error measure: {error_name!r}')
err_function_ = _error_functions[error_name]

if error_log:
    error_name = f'log({error_name})'

    def err_function(x, y):
        """Return the natural logarithm of the selected error measure."""
        return np.log(err_function_(x, y))
else:
    err_function = err_function_

# load the participant and baseline results
errors = {}
for method_file in method_files:
    method_name = method_file.replace('.csv', '')
    method_ids, method_prevs = load_result_file(join(folder, method_file))
    print(method_file)
    if not np.array_equal(true_id, method_ids):
        raise ValueError(f'unmatched files for {method_file}')
    method_error = err_function(true_prevs, method_prevs)
    method_name = method_names_nice.get(method_name, method_name)
    errors[method_name] = method_error

# Prevalence of the first domain (the x-axis label below calls it "Books"),
# split into equal-width bins spanning its observed range.
dom_A_prevs = dom_prevs[:, 0]
n_bins = 5
bins = np.linspace(dom_A_prevs.min(), dom_A_prevs.max(), n_bins + 1)

# Build a DataFrame holding the domain prevalence and one error column per method
df = pd.DataFrame({'dom_A_prevs': dom_A_prevs})
for method, err in errors.items():
    df[method] = err

# Assign every dom_A_prevs value to a bin (integer code 0..n_bins-1)
df['bin'] = pd.cut(df['dom_A_prevs'], bins=bins, labels=False, include_lowest=True)

# Convert the DataFrame to long format
df_long = df.melt(
    id_vars=['dom_A_prevs', 'bin'],
    value_vars=list(errors),
    var_name='Método',
    value_name='Error',
)

# Create the bin labels for the X axis. pd.cut uses right-closed intervals by
# default and include_lowest=True closes the first one on the left as well, so
# the labels are "[a-b]" for the first bin and "(a-b]" for the others.
bin_labels = [
    f"{'[' if i == 0 else '('}{low:.3f}-{high:.3f}]"
    for i, (low, high) in enumerate(zip(bins[:-1], bins[1:]))
]
df_long['bin_label'] = df_long['bin'].map(dict(enumerate(bin_labels)))

# Create the boxplot with Seaborn. The explicit `order` reserves a slot for
# every bin, so an empty bin cannot shift the boxes away from their tick labels.
plt.figure(figsize=(14, 8))
sns.boxplot(
    x='bin',
    y='Error',
    hue='Método',
    data=df_long,
    palette='Set2',
    showfliers=False,
    order=list(range(n_bins)),
    hue_order=desired_order,
)

# Label the X axis ticks with the bin ranges
plt.xticks(ticks=range(n_bins), labels=bin_labels, rotation=0)
plt.xlabel("Prevalence of Books")
plt.ylabel(error_name)
#plt.title("Boxplots de Errores por Método dentro de Bins de dom_A_prevs")
plt.legend(loc='upper left', bbox_to_anchor=(1, 1))
plt.tight_layout()
plt.grid(True, which='both', linestyle='--', linewidth=0.5)
#plt.show()
plt.savefig(f'./t4_{error_name}_pcc.png')