From 5bbaf42a0ede62cb5ab0e4a05843b5c43067038e Mon Sep 17 00:00:00 2001 From: Alejandro Moreo Date: Wed, 13 Nov 2024 18:45:26 +0100 Subject: [PATCH] improving plots for overview paper --- LeQua2024/util_scripts/prior_shift_plot.py | 101 ++++++--------------- 1 file changed, 27 insertions(+), 74 deletions(-) diff --git a/LeQua2024/util_scripts/prior_shift_plot.py b/LeQua2024/util_scripts/prior_shift_plot.py index d3e3c8c..0f06f57 100644 --- a/LeQua2024/util_scripts/prior_shift_plot.py +++ b/LeQua2024/util_scripts/prior_shift_plot.py @@ -2,31 +2,31 @@ import os from os.path import join import pandas as pd +from scripts.data import load_vector_documents from quapy.data.base import LabelledCollection import sys -sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '../../'))) -sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '../'))) -sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), './'))) +# sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '../../'))) +# sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '../'))) +# sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), './'))) #from LeQua2024.scripts import constants #from LeQua2024._lequa2024 import fetch_lequa2024 import quapy as qp import numpy as np import matplotlib.pyplot as plt -import seaborn as sns +# import seaborn as sns from pathlib import Path import glob +from scripts.constants import SAMPLE_SIZE -os.chdir('/home/moreo/QuaPy/LeQua2024') -print(os.getcwd()) +# os.chdir('/home/moreo/QuaPy/LeQua2024') +# print(os.getcwd()) +TASK=2 +qp.environ['SAMPLE_SIZE']=SAMPLE_SIZE[f'T{TASK}'] -qp.environ['SAMPLE_SIZE']=250 - -TASK=1 - -true_prevs_path = f'./TruePrevalences/T{TASK}.test_prevalences/T{TASK}/public/test_prevalences.txt' -folder = F'./Results_CODALAB_2024/extracted/TASK_{TASK}' +true_prevs_path = f'../TruePrevalences/T{TASK}.test_prevalences/T{TASK}/public/test_prevalences.txt' +folder = F'../Results_CODALAB_2024/extracted/TASK_{TASK}' def load_result_file(path): df = pd.read_csv(path, index_col=0) @@ -85,30 +85,12 @@ else: err_function = err_function_ -def load_vector_documents(path): - """ - Loads vectorized documents. In case the sample is unlabelled, - the labels returned are None - - :param path: path to the data sample containing the raw documents - :return: a tuple with the documents (np.ndarray of shape `(n,256)`) and the labels (a np.ndarray of shape `(n,)` if - the sample is labelled, or None if the sample is unlabelled), with `n` the number of instances in the sample - (250 for T1 and T4, 1000 for T2, and 200 for T3) - """ - D = pd.read_csv(path).to_numpy(dtype=float) - labelled = D.shape[1] == 257 - if labelled: - X, y = D[:,1:], D[:,0].astype(int).flatten() - else: - X, y = D, None - return X, y - #train_prevalence = fetch_lequa2024(task=f'T{TASK}', data_home='./data') -train = LabelledCollection.load(f'/home/moreo/QuaPy/LeQua2024/data/lequa2024/T{TASK}/public/training_data.txt', loader_func=load_vector_documents) +train = LabelledCollection.load(f'../data/lequa2024/T{TASK}/public/training_data.txt', loader_func=load_vector_documents) train_prev = train.prevalence() #train_prev = np.tile(train_prev, (len(true_id),1)) -from quapy.plot import error_by_drift +from quapy.plot import error_by_drift, binary_diagonal # load the participant and baseline results method_names, estim_prevs = [], [] @@ -123,46 +105,17 @@ for method_file in method_files: estim_prevs.append(method_prevs) true_prevs = [true_prevs]*len(method_names) +savepath = f'./t{TASK}_diagonal.png' +if TASK==1: + binary_diagonal(method_names, true_prevs, estim_prevs, pos_class=1, title=None, show_std=True, legend=True, + train_prev=train.prevalence(), savepath=savepath, method_order=None) + + tr_prevs =[train.prevalence()]*len(method_names) -error_by_drift(method_names, - true_prevs, - estim_prevs, - tr_prevs, - error_name='mrae', show_std=True, - show_density=True, vlines=True, savepath=f'./util_scripts/t{TASK}_{error_name}_pcc.png') -sys.exit() - -shift=qp.error.ae(train_prev, true_prevs) - -n_bins = 5 -bins = np.linspace(shift.min(), shift.max(), n_bins + 1) - -# Crear un DataFrame para los datos -df = pd.DataFrame({'dom_A_prevs': shift}) -for method, err in errors.items(): - df[method] = err - -# Asignar cada valor de dom_A_prevs a un bin -df['bin'] = pd.cut(df['dom_A_prevs'], bins=bins, labels=False, include_lowest=True) - -# Convertir el DataFrame a formato largo -df_long = df.melt(id_vars=['dom_A_prevs', 'bin'], value_vars=errors.keys(), var_name='Método', value_name='Error') - -# Crear etiquetas de los bins para el eje X -bin_labels = [f"[{bins[i]:.3f}-{bins[i + 1]:.3f}" + (']' if i == n_bins-1 else ')') for i in range(n_bins)] -df_long['bin_label'] = df_long['bin'].map(dict(enumerate(bin_labels))) - -# Crear el gráfico de boxplot en Seaborn -plt.figure(figsize=(14, 8)) -sns.boxplot(x='bin', y='Error', hue='Método', data=df_long, palette='Set2', showfliers=False) - -# Configurar etiquetas del eje X con los rangos de los bins -plt.xticks(ticks=range(n_bins), labels=bin_labels, rotation=0) -plt.xlabel("Amount of PPS between the training prevalence and the test prevalences, in terms of AE ") -plt.ylabel(error_name) -#plt.title("Boxplots de Errores por Método dentro de Bins de dom_A_prevs") -plt.legend(loc='upper left', bbox_to_anchor=(1, 1)) -plt.tight_layout() -plt.grid(True, which='both', linestyle='--', linewidth=0.5) -#plt.show() -plt.savefig(f'./util_scripts/t{TASK}_{error_name}_pcc.png') +savepath = f'./t{TASK}_{error_name}_pps.png' +error_by_drift(method_names, + true_prevs, + estim_prevs, + tr_prevs, title=None, + error_name='rae', show_std=True, n_bins=1000, + show_density=True, vlines=[tr_prevs[0][1]], savepath=savepath)