From 10595246a9262217481bd65ec31d9c2acc1381d3 Mon Sep 17 00:00:00 2001 From: Alejandro Moreo Date: Thu, 17 Oct 2024 12:28:24 +0200 Subject: [PATCH] cleaning last experiments for report --- KDEy/{experiments.py => _experiments_depr.py} | 0 ...ation_evaluation_debug.py => gen_plots.py} | 16 +- KDEy/gen_tables.py | 163 ++++++++++++++++++ KDEy/kdey_devel.py | 51 +++--- KDEy/quantification_evaluation.py | 10 +- result_table | 2 +- 6 files changed, 203 insertions(+), 39 deletions(-) rename KDEy/{experiments.py => _experiments_depr.py} (100%) rename KDEy/{quantification_evaluation_debug.py => gen_plots.py} (95%) create mode 100644 KDEy/gen_tables.py diff --git a/KDEy/experiments.py b/KDEy/_experiments_depr.py similarity index 100% rename from KDEy/experiments.py rename to KDEy/_experiments_depr.py diff --git a/KDEy/quantification_evaluation_debug.py b/KDEy/gen_plots.py similarity index 95% rename from KDEy/quantification_evaluation_debug.py rename to KDEy/gen_plots.py index 1f4f238..8e371f3 100644 --- a/KDEy/quantification_evaluation_debug.py +++ b/KDEy/gen_plots.py @@ -74,12 +74,11 @@ def plot(xaxis, metrics_measurements, metrics_names, suffix): plt.close() -def plot_stack(xaxis, metrics_measurements, metrics_names, suffix): +def plot_stack(xaxis, metrics_measurements, metrics_names, figname): - # Crear la figura y los ejes (4 bloques verticales) - fig, axs = plt.subplots(4, 1, figsize=(8, 12)) + n_measures = len(metrics_measurements)//2 - x = xaxis + fig, axs = plt.subplots(n_measures, 1, figsize=(8, 3*n_measures)) indexes = np.arange(len(metrics_measurements)) axs_idx = 0 @@ -105,6 +104,9 @@ def plot_stack(xaxis, metrics_measurements, metrics_names, suffix): # axs[axs_idx].set_title(f'{metric_te_name} and {metric_tr_name}') axs[axs_idx].legend(loc='lower right') + axs[axs_idx].set_xscale('log') + if axs_idx==0: + axs[axs_idx].set_title(dataset) if axs_idx < len(indexes)//2 -1: axs[axs_idx].set_xticks([]) @@ -120,7 +122,7 @@ def plot_stack(xaxis, metrics_measurements, metrics_names, suffix): # plt.show() os.makedirs('./plots/likelihood/', exist_ok=True) - plt.savefig(f'./plots/likelihood/{dataset}-fig{suffix}.png') + plt.savefig(f'./plots/likelihood/{figname}.png') plt.close() @@ -199,7 +201,7 @@ qp.environ['SAMPLE_SIZE'] = SAMPLE_SIZE show_ae = True show_rae = True show_mse = False -show_kld = True +show_kld = False normalize = True epsilon = 1e-10 @@ -259,7 +261,7 @@ for i, dataset in enumerate(tqdm(DATASETS, desc='processing datasets', total=len # measurement_names.append('NLL(te)') # measurement_names.append('NLL(tr)') # plot(xaxis, measurements, measurement_names, suffix='AVEtr') - plot_stack(xaxis, measurements, measurement_names, suffix='AVEtr') + plot_stack(xaxis, measurements, measurement_names, figname=f'{i}.png') diff --git a/KDEy/gen_tables.py b/KDEy/gen_tables.py new file mode 100644 index 0000000..269ef70 --- /dev/null +++ b/KDEy/gen_tables.py @@ -0,0 +1,163 @@ +import pickle +import os +from time import time +from collections import defaultdict + +import numpy as np +from sklearn.linear_model import LogisticRegression + +import quapy as qp +from KDEy.kdey_devel import KDEyMLauto, KDEyMLauto2 +from quapy.method.aggregative import PACC, EMQ, KDEyML +from quapy.model_selection import GridSearchQ +from quapy.protocol import UPP +from pathlib import Path + +from result_table.src.table import Table + +SEED = 1 + + +def newLR(): + return LogisticRegression(max_iter=3000) + + +# typical hyperparameters explored for Logistic Regression +logreg_grid = { + 'C': np.logspace(-4,4,9), + 'class_weight': [None, 'balanced'] +} + + +def wrap_hyper(classifier_hyper_grid: dict): + return {'classifier__' + k: v for k, v in classifier_hyper_grid.items()} + + +METHODS = [ + # ('PACC', PACC(newLR()), wrap_hyper(logreg_grid)), + # ('EMQ', EMQ(newLR()), wrap_hyper(logreg_grid)),/ + ('KDEy', KDEyML(newLR()), {**wrap_hyper(logreg_grid), **{'bandwidth': np.logspace(-4, np.log10(0.2), 20)}}), + # ('KDEy-MLred', KDEyMLred(newLR()), {**wrap_hyper(logreg_grid), **{'bandwidth': np.logspace(-4, np.log10(0.2), 20)}}), + ('KDEy-scott', KDEyML(newLR(), bandwidth='scott'), wrap_hyper(logreg_grid)), + ('KDEy-silver', KDEyML(newLR(), bandwidth='silverman'), wrap_hyper(logreg_grid)), + ('KDEy-NLL', KDEyMLauto2(newLR(), bandwidth='auto', target='likelihood', search='grid'), wrap_hyper(logreg_grid)), + ('KDEy-NLL+', KDEyMLauto2(newLR(), bandwidth='auto', target='likelihood', search='optim'), wrap_hyper(logreg_grid)), + ('KDEy-AE', KDEyMLauto2(newLR(), bandwidth='auto', target='mae', search='grid'), wrap_hyper(logreg_grid)), + ('KDEy-AE+', KDEyMLauto2(newLR(), bandwidth='auto', target='mae', search='optim'), wrap_hyper(logreg_grid)), + ('KDEy-RAE', KDEyMLauto2(newLR(), bandwidth='auto', target='mrae', search='grid'), wrap_hyper(logreg_grid)), + ('KDEy-RAE+', KDEyMLauto2(newLR(), bandwidth='auto', target='mrae', search='optim'), wrap_hyper(logreg_grid)), +] + + +""" +TKDEyML era primero bandwidth (init 0.05) y luego prevalence (init uniform) +TKDEyML2 era primero prevalence (init uniform) y luego bandwidth (init 0.05) +TKDEyML3 era primero prevalence (init uniform) y luego bandwidth (init 0.1) +TKDEyML4 es como ML2 pero max 5 iteraciones por optimización +""" +TRANSDUCTIVE_METHODS = [ + #('TKDEy-ML', KDEyMLauto(newLR()), None), + # ('TKDEy-both', KDEyMLauto(newLR(), optim='both'), None), + # ('TKDEy-bothfine', KDEyMLauto(newLR(), optim='both_fine'), None), + # ('TKDEy-two', KDEyMLauto(newLR(), optim='two_steps'), None), + # ('TKDEy-MLike', KDEyMLauto(newLR(), optim='max_likelihood'), None), + # ('TKDEy-MLike2', KDEyMLauto(newLR(), optim='max_likelihood2'), None), + #('TKDEy-ML3', KDEyMLauto(newLR()), None), + #('TKDEy-ML4', KDEyMLauto(newLR()), None), +] + +def show_results(result_path, tables, tables_path='./tables/main.pdf'): + import pandas as pd + df = pd.read_csv(result_path + '.csv', sep='\t') + pd.set_option('display.max_columns', None) + pd.set_option('display.max_rows', None) + pd.set_option('display.width', 1000) # Ajustar el ancho máximo + pv = df.pivot_table(index='Dataset', columns="Method", values=["MAE"], margins=True) + print(pv) + pv = df.pivot_table(index='Dataset', columns="Method", values=["MRAE"], margins=True) + print(pv) + pv = df.pivot_table(index='Dataset', columns="Method", values=["KLD"], margins=True) + print(pv) + pv = df.pivot_table(index='Dataset', columns="Method", values=["TR-TIME"], margins=True) + print(pv) + pv = df.pivot_table(index='Dataset', columns="Method", values=["TE-TIME"], margins=True) + print(pv) + + os.makedirs(Path(tables_path).parent, exist_ok=True) + tables= [table for table in tables.values()] + + method_replace = { + 'KDEy': 'KDEy(orig)', + 'KDEy-scott': 'Scott', + 'KDEy-silver': 'Silver', + 'KDEy-NLL': 'NLL(grid)', + 'KDEy-NLL+': 'NLL(search)', + 'KDEy-AE': 'AE(grid)', + 'KDEy-AE+': 'AE(search)', + 'KDEy-RAE': 'RAE(grid)', + 'KDEy-RAE+': 'RAE(search)', + } + + Table.LatexPDF(tables_path, tables, method_replace=method_replace, verbose=True, clean=False) + + +def collect_results(method_name, tables): + + print('Init method', method_name) + + with open(global_result_path + '.csv', 'at') as csv: + for dataset in qp.datasets.UCI_MULTICLASS_DATASETS: + print('init', dataset) + + # run_experiment(global_result_path, method_name, quantifier, param_grid, dataset) + local_result_path = os.path.join(Path(global_result_path).parent, method_name + '_' + dataset + '.dataframe') + + if os.path.exists(local_result_path): + print(f'result file {local_result_path} already exist; skipping') + report = qp.util.load_report(local_result_path) + for metric, table in tables.items(): + add_column = metric in ['tr_time', 'te_time'] + if not add_column: + add_column = (metric=='mrae' and '-AE' not in method_name) or (metric=='mae' and '-RAE' not in method_name) + if add_column: + tables[metric].add(benchmark=dataset, method=method_name, v=report[metric]) + # tables['mrae'].add(benchmark=dataset, method=method_name, v=report['mrae']) + + else: + continue + + means = report.mean(numeric_only=True) + csv.write(f'{method_name}\t{dataset}\t{means["mae"]:.5f}\t{means["mrae"]:.5f}\t{means["kld"]:.5f}\t{means["tr_time"]:.3f}\t{means["te_time"]:.3f}\n') + csv.flush() + + + +if __name__ == '__main__': + + qp.environ['SAMPLE_SIZE'] = 500 + qp.environ['N_JOBS'] = -1 + n_bags_val = 100 + n_bags_test = 500 + result_dir = f'results_quantification/ucimulti' + + os.makedirs(result_dir, exist_ok=True) + + tables = { + 'mae': Table('inductive-mae'), + 'mrae': Table('inductive-mrae'), + 'tr_time': Table('inductive-tr-time'), + # 'te_time': Table('inductive-te-time'), + } + + tables['tr_time'].format.show_std = False + # tables['te_time'].format.show_std = False + + + global_result_path = f'{result_dir}/allmethods' + with open(global_result_path + '.csv', 'wt') as csv: + csv.write(f'Method\tDataset\tMAE\tMRAE\tKLD\tTR-TIME\tTE-TIME\n') + + for method_name, _, _ in METHODS + TRANSDUCTIVE_METHODS: + collect_results(method_name, tables) + + show_results(global_result_path, tables) diff --git a/KDEy/kdey_devel.py b/KDEy/kdey_devel.py index c7fc237..379d56b 100644 --- a/KDEy/kdey_devel.py +++ b/KDEy/kdey_devel.py @@ -40,7 +40,7 @@ class KDEyMLauto(KDEyML): current_bandwidth = 0.05 if self.optim == 'both_fine': current_bandwidth = np.full(fill_value=current_bandwidth, shape=(n_classes,)) - current_prevalence = np.full(fill_value=1 / n_classes, shape=(n_classes,)) + current_prevalence = F.uniform_prevalence(n_classes=n_classes) if self.optim == 'max_likelihood': current_prevalence, current_bandwidth = self.optim_minimize_like(tr_posteriors, tr_y, te_posteriors, classes, grid=True) @@ -107,9 +107,9 @@ class KDEyMLauto(KDEyML): # bounds = [(0.00001, 0.2)] # r = optimize.minimize(neg_loglikelihood_bandwidth, x0=[current_bandwidth], method='SLSQP', bounds=bounds) - r = optimize.minimize_scalar(neg_loglikelihood_bandwidth, bounds=(0.00001, 0.2)) + r = optimize.minimize_scalar(neg_loglikelihood_bandwidth, bounds=(0.0001, 0.2), options={'xatol': 0.005}) # print(f'iterations-bandwidth={r.nit}') - assert r.success, f'Process did not converge! {r.message}' + # assert r.success, f'Process did not converge! {r.message}' return r.x def optim_minimize_both(self, current_bandwidth, current_prev, tr_posteriors, tr_y, te_posteriors, classes): @@ -128,7 +128,7 @@ class KDEyMLauto(KDEyML): prevalence_bandwidth = np.append(current_prev, current_bandwidth) r = optimize.minimize(neg_loglikelihood_bandwidth, x0=prevalence_bandwidth, method='SLSQP', bounds=bounds, constraints=constraints) print(f'iterations-both={r.nit}') - assert r.success, 'Process did not converge!' + # assert r.success, 'Process did not converge!' prev_band = r.x current_prevalence = prev_band[:-1] current_bandwidth = prev_band[-1] @@ -145,12 +145,12 @@ class KDEyMLauto(KDEyML): test_loglikelihood = np.log(test_mixture_likelihood + epsilon) return -np.sum(test_loglikelihood) - bounds = [(0, 1) for _ in range(n_classes)] + [(0.00001, 1) for _ in range(n_classes)] + bounds = [(0, 1) for _ in range(n_classes)] + [(0.0001, 0.2) for _ in range(n_classes)] constraints = ({'type': 'eq', 'fun': lambda x: 1 - sum(x[:n_classes])}) prevalence_bandwidth = np.concatenate((current_prev, current_bandwidth)) r = optimize.minimize(neg_loglikelihood_bandwidth, x0=prevalence_bandwidth, method='SLSQP', bounds=bounds, constraints=constraints) print(f'iterations-both-fine={r.nit}') - assert r.success, 'Process did not converge!' + # assert r.success, 'Process did not converge!' prev_band = r.x current_prevalence = prev_band[:n_classes] current_bandwidth = prev_band[n_classes:] @@ -198,7 +198,7 @@ class KDEyMLauto(KDEyML): best_like = None best_prev = None init_prev = np.full(fill_value=1 / n_classes, shape=(n_classes,)) - for bandwidth in np.logspace(-4, 0.5, 50): + for bandwidth in np.logspace(-4, np.log10(0.2), 50): mix_densities = self.get_mixture_components(tr_posteriors, tr_y, classes, bandwidth) test_densities = [self.pdf(kde_i, te_posteriors) for kde_i in mix_densities] @@ -239,7 +239,7 @@ class KDEyMLauto(KDEyML): r = optimize.minimize(neglikelihood_band, x0=[0.001], method='SLSQP', bounds=bounds) best_band = r.x[0] - assert r.success, 'Process did not converge!' + # assert r.success, 'Process did not converge!' print(f'solved in nit={r.nit}') return best_band @@ -333,11 +333,10 @@ class KDEyMLauto2(KDEyML): return loss_accum if self.search == 'optim': - r = optimize.minimize_scalar(eval_bandwidth, bounds=(0.001, 0.2), options={'xatol': 0.005}) + r = optimize.minimize_scalar(eval_bandwidth, bounds=(0.0001, 0.2), options={'xatol': 0.005}) best_band = r.x best_loss_value = r.fun nit = r.nit - # assert r.success, 'Process did not converge!' elif self.search=='grid': nit=20 @@ -348,20 +347,20 @@ class KDEyMLauto2(KDEyML): self.bandwidth_ = best_band -class KDEyMLred(KDEyML): - def __init__(self, classifier: BaseEstimator=None, val_split=5, bandwidth=0.1, random_state=None, reduction=100, max_reduced=500): - self.classifier = qp._get_classifier(classifier) - self.val_split = val_split - self.bandwidth = KDEBase._check_bandwidth(bandwidth) - self.reduction = reduction - self.max_reduced = max_reduced - self.random_state = random_state - - def aggregation_fit(self, classif_predictions: LabelledCollection, data: LabelledCollection): - n_classes = classif_predictions.n_classes - tr_length = min(self.reduction * n_classes, self.max_reduced) - if len(classif_predictions) > tr_length: - classif_predictions = classif_predictions.sampling(tr_length) - self.mix_densities = self.get_mixture_components(*classif_predictions.Xy, data.classes_, self.bandwidth) - return self +# class KDEyMLred(KDEyML): +# def __init__(self, classifier: BaseEstimator=None, val_split=5, bandwidth=0.1, random_state=None, reduction=100, max_reduced=500): +# self.classifier = qp._get_classifier(classifier) +# self.val_split = val_split +# self.bandwidth = KDEBase._check_bandwidth(bandwidth) +# self.reduction = reduction +# self.max_reduced = max_reduced +# self.random_state = random_state +# +# def aggregation_fit(self, classif_predictions: LabelledCollection, data: LabelledCollection): +# n_classes = classif_predictions.n_classes +# tr_length = min(self.reduction * n_classes, self.max_reduced) +# if len(classif_predictions) > tr_length: +# classif_predictions = classif_predictions.sampling(tr_length) +# self.mix_densities = self.get_mixture_components(*classif_predictions.Xy, data.classes_, self.bandwidth) +# return self diff --git a/KDEy/quantification_evaluation.py b/KDEy/quantification_evaluation.py index 7591783..f9245f1 100644 --- a/KDEy/quantification_evaluation.py +++ b/KDEy/quantification_evaluation.py @@ -7,7 +7,7 @@ import numpy as np from sklearn.linear_model import LogisticRegression import quapy as qp -from KDEy.kdey_devel import KDEyMLauto, KDEyMLauto2, KDEyMLred +from KDEy.kdey_devel import KDEyMLauto, KDEyMLauto2 from quapy.method.aggregative import PACC, EMQ, KDEyML from quapy.model_selection import GridSearchQ from quapy.protocol import UPP @@ -32,7 +32,7 @@ def wrap_hyper(classifier_hyper_grid: dict): METHODS = [ - ('PACC', PACC(newLR()), wrap_hyper(logreg_grid)), + # ('PACC', PACC(newLR()), wrap_hyper(logreg_grid)), ('EMQ', EMQ(newLR()), wrap_hyper(logreg_grid)), ('KDEy', KDEyML(newLR()), {**wrap_hyper(logreg_grid), **{'bandwidth': np.logspace(-4, np.log10(0.2), 20)}}), # ('KDEy-MLred', KDEyMLred(newLR()), {**wrap_hyper(logreg_grid), **{'bandwidth': np.logspace(-4, np.log10(0.2), 20)}}), @@ -55,9 +55,9 @@ TKDEyML4 es como ML2 pero max 5 iteraciones por optimización """ TRANSDUCTIVE_METHODS = [ #('TKDEy-ML', KDEyMLauto(newLR()), None), - # ('TKDEy-MLboth', KDEyMLauto(newLR(), optim='both'), None), - # ('TKDEy-MLbothfine', KDEyMLauto(newLR(), optim='both_fine'), None), - # ('TKDEy-ML2', KDEyMLauto(newLR(), optim='two_steps'), None), + ('TKDEy-both', KDEyMLauto(newLR(), optim='both'), None), + ('TKDEy-bothfine', KDEyMLauto(newLR(), optim='both_fine'), None), + ('TKDEy-two', KDEyMLauto(newLR(), optim='two_steps'), None), # ('TKDEy-MLike', KDEyMLauto(newLR(), optim='max_likelihood'), None), # ('TKDEy-MLike2', KDEyMLauto(newLR(), optim='max_likelihood2'), None), #('TKDEy-ML3', KDEyMLauto(newLR()), None), diff --git a/result_table b/result_table index c223c9f..52547b2 160000 --- a/result_table +++ b/result_table @@ -1 +1 @@ -Subproject commit c223c9f1fe3c9708e8c5a5c56e438cdaaa857be4 +Subproject commit 52547b253e906b8ae8d5ae3df77dafe72fac6902