import pickle from collections import defaultdict import matplotlib.pyplot as plt import seaborn as sns from joblib import Parallel, delayed from tqdm import tqdm import pandas as pd from glob import glob from pathlib import Path import quapy as qp from BayesianKDEy.commons import RESULT_DIR from BayesianKDEy.datasets import LeQuaHandler, UCIMulticlassHandler, VisualDataHandler, CIFAR100Handler from comparison_group import SelectGreaterThan, SelectByName, SelectSmallerThan from format import FormatModifierSelectColor from quapy.error import dist_aitchison from quapy.method.confidence import ConfidenceIntervals, ConfidenceIntervalsILR, ConfidenceIntervalsCLR from quapy.method.confidence import ConfidenceEllipseSimplex, ConfidenceEllipseCLR, ConfidenceEllipseILR, ConfidenceIntervals, ConfidenceRegionABC import quapy.functional as F from result_path.src.table import LatexTable import numpy as np import pandas as pd from itertools import chain pd.set_option('display.max_columns', None) pd.set_option('display.width', 2000) pd.set_option('display.max_rows', None) pd.set_option("display.expand_frame_repr", False) pd.set_option("display.precision", 4) pd.set_option("display.float_format", "{:.4f}".format) # methods = None # show all methods methods = ['BoCC', 'BaACC!', 'BaEMQ!', 'BaKDE-Gau-T!', 'BaKDE-Ait-T!', 'BaKDE-Ait-T!2' #'BootstrapACC', #'BootstrapHDy', #'BootstrapKDEy', #'BootstrapEMQ' ] def region_score(true_prev, region: ConfidenceRegionABC): amp = region.montecarlo_proportion(50_000) if true_prev in region: cost = 0 else: scale_cost = 1/region.alpha cost = scale_cost * dist_aitchison(true_prev, region.closest_point_in_region(true_prev)) return amp + cost def compute_coverage_amplitude(region_constructor, **kwargs): all_samples = results['samples'] all_true_prevs = results['true-prevs'] def process_one(samples, true_prevs): region = region_constructor(samples, **kwargs) if isinstance(region, ConfidenceIntervals) or isinstance(region, ConfidenceIntervalsCLR) or isinstance(region, ConfidenceIntervalsILR): winkler = region.mean_winkler_score(true_prevs) # winkler_e = region.mean_winkler_score(true_prevs, add_ae=True) cov_soft = region.coverage_soft(true_prevs) else: winkler = None # winkler_e = None cov_soft = None return region.coverage(true_prevs), region.montecarlo_proportion(), winkler, cov_soft out = Parallel(n_jobs=3)( delayed(process_one)(samples, true_prevs) for samples, true_prevs in tqdm( zip(all_samples, all_true_prevs), total=len(all_samples), desc='constructing ellipses' ) ) # unzip results coverage, amplitude, winkler, cov_soft = zip(*out) return list(coverage), list(amplitude), list(winkler), list(cov_soft) def update_pickle(report, pickle_path, updated_dict:dict): for k,v in updated_dict.items(): report[k]=v pickle.dump(report, open(pickle_path, 'wb'), protocol=pickle.HIGHEST_PROTOCOL) def update_pickle_with_region(report, file, conf_name, conf_region_class, **kwargs): if f'coverage-{conf_name}' not in report: covs, amps, winkler, cov_soft = compute_coverage_amplitude(conf_region_class, **kwargs) update_fields = { f'coverage-{conf_name}': covs, f'amplitude-{conf_name}': amps, f'winkler-{conf_name}': winkler, f'coverage-soft-{conf_name}': cov_soft } update_pickle(report, file, update_fields) def pareto_front(df, x_col, y_col, maximize_y=True, minimize_x=True): """ Returns a boolean mask indicating whether each row is Pareto-optimal. """ X = df[x_col].values Y = df[y_col].values is_pareto = np.ones(len(df), dtype=bool) for i in range(len(df)): if not is_pareto[i]: continue for j in range(len(df)): if i == j: continue better_or_equal_x = X[j] <= X[i] if minimize_x else X[j] >= X[i] better_or_equal_y = Y[j] >= Y[i] if maximize_y else Y[j] <= Y[i] strictly_better = ( (X[j] < X[i] if minimize_x else X[j] > X[i]) or (Y[j] > Y[i] if maximize_y else Y[j] < Y[i]) ) if better_or_equal_x and better_or_equal_y and strictly_better: is_pareto[i] = False break return is_pareto def plot_coverage_vs_amplitude( df, coverage_col, amplitude_col="a-CI", method_col="method", dataset_col=None, error_col=None, error_threshold=None, nominal_coverage=0.95, title=None, ): df_plot = df.copy() # Optional error filtering if error_col is not None and error_threshold is not None: df_plot = df_plot[df_plot[error_col] <= error_threshold] # Compute Pareto front pareto_mask = pareto_front( df_plot, x_col=amplitude_col, y_col=coverage_col, maximize_y=True, minimize_x=True ) plt.figure(figsize=(7, 6)) # Base scatter sns.scatterplot( data=df_plot, x=amplitude_col, y=coverage_col, hue=method_col, # style=dataset_col, alpha=0.6, s=60, legend=True ) # Highlight Pareto front plt.scatter( df_plot.loc[pareto_mask, amplitude_col], df_plot.loc[pareto_mask, coverage_col], facecolors='none', edgecolors='black', s=120, linewidths=1.5, label="Pareto front" ) # Nominal coverage line plt.axhline( nominal_coverage, linestyle="--", color="gray", linewidth=1, label="Nominal coverage" ) plt.xlabel("Amplitude (fraction of simplex)") plt.ylabel("Coverage") plt.ylim(0, 1.05) if title is not None: plt.title(title) plt.legend(bbox_to_anchor=(1.05, 1), loc="upper left") plt.tight_layout() plt.show() def nicer_method(name:str): replacements = { # 'Bayesian': 'Ba', 'Bootstrap': 'Bo', '-numpyro': '', 'emcee': 'emc', '-T*': '*', '-T!': '', '!': '', '-Ait': r'$^{(\mathrm{Ait})}$', '-Gau': r'$^{(\mathrm{Gau})}$' } for k, v in replacements.items(): name = name.replace(k,v) return name def nicer_data(name:str): replacements = { 'cifar': 'CIFAR', '-l': '', 'mnist': 'MNIST', 'fashionmnist': 'fashionMNIST', 'svhn': 'SVHN', '100coarse': '100(20)', } for k, v in replacements.items(): name = name.replace(k, v) return name base_dir = RESULT_DIR table = defaultdict(list) n_classes = {} tr_size = {} tr_prev = {} dataset_class = [UCIMulticlassHandler, CIFAR100Handler, VisualDataHandler, LeQuaHandler] dataset_order = [] for handler in dataset_class: for dataset in handler.iter(): dataset_order.append(dataset.name) train = dataset.get_training() n_classes[dataset.name] = train.n_classes tr_size[dataset.name] = len(train) tr_prev[dataset.name] = F.strprev(train.prevalence()) problem_type = 'multiclass' path = f'./{base_dir}/{problem_type}/*.pkl' for file in tqdm(glob(path), desc='processing results', total=len(glob(path))): file = Path(file) dataset, method = file.name.replace('.pkl', '').split('__') if (method not in methods) or (dataset not in dataset_order): continue report = pickle.load(open(file, 'rb')) results = report['results'] n_samples = len(results['ae']) table['method'].extend([nicer_method(method)] * n_samples) table['dataset'].extend([nicer_data(dataset)] * n_samples) table['ae'].extend(results['ae']) table['rae'].extend(results['rae']) # table['c-CI'].extend(results['coverage']) # table['a-CI'].extend(results['amplitude']) # update_pickle_with_region(report, file, conf_name='CI-ILR', conf_region_class=ConfidenceIntervalsILR, bonferroni_correction=True) # update_pickle_with_region(report, file, conf_name='CI-CLR', conf_region_class=ConfidenceIntervalsCLR, bonferroni_correction=True) update_pickle_with_region(report, file, conf_name='CI', conf_region_class=ConfidenceIntervals, bonferroni_correction=True) update_pickle_with_region(report, file, conf_name='CInb', conf_region_class=ConfidenceIntervals, bonferroni_correction=False) # no Bonferroni-correction # update_pickle_with_region(report, file, conf_name='CE', conf_region_class=ConfidenceEllipseSimplex) # update_pickle_with_region(report, file, conf_name='CLR', conf_region_class=ConfidenceEllipseCLR) # update_pickle_with_region(report, file, conf_name='ILR', conf_region_class=ConfidenceEllipseILR) conf_bonferroni = 'CI' conf_name='CInb' table['c-CI'].extend(report[f'coverage-{conf_bonferroni}']) # the true coverage is better measured with Bonferroni-correction table['w-CI'].extend(report[f'winkler-{conf_name}']) table['cs-CI'].extend(report[f'coverage-soft-{conf_name}']) table['a-CI'].extend(report[f'amplitude-{conf_name}']) # table['aitch'].extend(qp.error.dist_aitchison(results['true-prevs'], results['point-estim'])) # not in this paper... table['SRE'].extend(qp.error.sre(results['true-prevs'], results['point-estim'], report['train-prev'], eps=0.001)) # remove datasets with more than max_classes classes # max_classes = 25 # min_train = 500 # ignore_datasets = ['poker_hand', 'hcv'] # for data_name, n in n_classes.items(): # if n > max_classes: # df = df[df["dataset"] != data_name] # for data_name, n in tr_size.items(): # if n < min_train: # df = df[df["dataset"] != data_name] # for data_name, n in tr_size.items(): # if data_name in ignore_datasets: # df = df[df["dataset"] != data_name] df = pd.DataFrame(table) df['a-CI'] *= 100 df['c-CI'] *= 100 df['cs-CI'] *= 100 for region in ['CI']: #, 'CLR', 'ILR', 'CI']: if problem_type == 'binary' and region=='ILR': continue for column in [f'a-{region}', 'ae', 'SRE', f'c-{region}', f'cs-{region}']: # f'w-{region}' pv = pd.pivot_table( df, index='dataset', columns='method', values=column, margins=True ) pv['n_classes'] = pv.index.map(n_classes).astype('Int64') pv['tr_size'] = pv.index.map(tr_size).astype('Int64') #pv['tr-prev'] = pv.index.map(tr_prev) pv = pv.drop(columns=[col for col in pv.columns if col == "All" or col[-1]=='All']) print(f'{problem_type=} {column=}') print(pv) print('-'*80) latex = LatexTable.from_dataframe(df, method='method', benchmark='dataset', value=column, name=column) latex.format.configuration.show_std = False #latex.reorder_methods([nicer_method(m) for m in methods]) latex.reorder_benchmarks([nicer_data(d) for d in dataset_order]) if column in ['ae', 'SRE']: latex.format.configuration.lower_is_better = True latex.format.configuration.stat_test = 'wilcoxon' #latex.format.configuration.stat_test = None # latex.format.configuration.show_std = True if column in [f'c-{region}', f'cs-{region}']: latex.format.configuration.lower_is_better = False latex.format.configuration.stat_test = None latex.format.configuration.with_color = False latex.format.configuration.best_in_bold = False latex.format.configuration.with_rank = False latex.format.configuration.mean_prec = 0 latex.add_format_modifier( format_modifier=FormatModifierSelectColor( comparison=SelectGreaterThan(reference_selector=89, input_selector=SelectByName()) ) ) if column in [f'a-{region}']: latex.format.configuration.lower_is_better = True latex.format.configuration.stat_test = None latex.format.configuration.with_color = False latex.format.configuration.best_in_bold = False latex.format.configuration.mean_prec = 2 latex.add_format_modifier( format_modifier=FormatModifierSelectColor( comparison=SelectSmallerThan(reference_selector=11, input_selector=SelectByName()) ) ) # latex.add_format_modifier( # format_modifier=FormatModifierSelectColor( # comparison=SelectSmallerThan(reference_selector=0.01, input_selector=SelectByName()), # intensity=50 # ) # ) latex.format.configuration.resizebox=.5 latex.latexPDF(pdf_path=f'./tables/{latex.name}.pdf') df = df[df['method']!='BaACC'] df = df[df['method']!='BaACC*'] df = df[df['method']!='BaACC+'] df = df[df['method']!='BaKDE-Ait*'] df = df[df['method']!='BaKDE-Gau*'] df = df[df['method']!='BayEMQ*'] grouped = df.groupby(["method", "dataset"]) agg = grouped.agg( ae_mean=("ae", "mean"), ae_std=("ae", "std"), sre_mean=("SRE", "mean"), sre_std=("SRE", "std"), coverage_mean=("c-CI", "mean"), coverage_std=("c-CI", "std"), coverage_soft_mean=("cs-CI", "mean"), amplitude_mean=("a-CI", "mean"), amplitude_std=("a-CI", "std"), ).reset_index() #plot_coverage_vs_amplitude( # agg, # coverage_col="coverage_soft_mean", # amplitude_col="amplitude_mean", # method_col="method", # dataset_col="dataset", # nominal_coverage=0.95, # title="Marginal coverage vs amplitude" #) #print('RESTITUIR EL WILCOXON')