# QuaPy/BayesianKDEy/generate_results.py
# (the "408 lines / 14 KiB / Python" lines were file-viewer residue from a
#  copy-paste; commented out so the module parses)
import pickle
from collections import defaultdict
import matplotlib.pyplot as plt
import seaborn as sns
from joblib import Parallel, delayed
from tqdm import tqdm
import pandas as pd
from glob import glob
from pathlib import Path
import quapy as qp
from BayesianKDEy.commons import RESULT_DIR
from BayesianKDEy.datasets import LeQuaHandler, UCIMulticlassHandler, VisualDataHandler, CIFAR100Handler
from comparison_group import SelectGreaterThan, SelectByName, SelectSmallerThan
from format import FormatModifierSelectColor
from quapy.error import dist_aitchison
from quapy.method.confidence import ConfidenceIntervals, ConfidenceIntervalsILR, ConfidenceIntervalsCLR
from quapy.method.confidence import ConfidenceEllipseSimplex, ConfidenceEllipseCLR, ConfidenceEllipseILR, ConfidenceIntervals, ConfidenceRegionABC
import quapy.functional as F
from result_path.src.table import LatexTable
import numpy as np
import pandas as pd
from itertools import chain
# pandas console display: wide, untruncated output for the pivot tables printed below
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 2000)
pd.set_option('display.max_rows', None)
pd.set_option("display.expand_frame_repr", False)
pd.set_option("display.precision", 4)
pd.set_option("display.float_format", "{:.4f}".format)

# methods = None # show all methods
# Whitelist of method identifiers to load from the result pickles; result files
# whose method name is not listed here are skipped in the main loop below.
# NOTE(review): the skip test uses `method not in methods`, so setting this to
# None (as the comment above suggests) would raise TypeError — confirm intent.
methods = ['BoCC',
           'BaACC!',
           'BaEMQ!',
           'BaKDE-Gau-T!',
           'BaKDE-Ait-T!',
           'BaKDE-Ait-T!2'
           #'BootstrapACC',
           #'BootstrapHDy',
           #'BootstrapKDEy',
           #'BootstrapEMQ'
           ]
def region_score(true_prev, region: ConfidenceRegionABC):
    """
    Scores a confidence region against the true prevalence vector: the score is
    the region's (Monte Carlo) proportion of the simplex, plus a miss penalty
    when the true prevalence lies outside the region.

    :param true_prev: true prevalence vector
    :param region: confidence region object implementing `__contains__`,
        `montecarlo_proportion`, `closest_point_in_region`, and `alpha`
    :return: amplitude (+ scaled Aitchison distance to the region when missed)
    """
    amplitude = region.montecarlo_proportion(50_000)
    if true_prev in region:
        return amplitude
    # miss: penalize by the Aitchison distance to the closest point of the
    # region, scaled by 1/alpha (stricter nominal levels cost more per miss)
    penalty = dist_aitchison(true_prev, region.closest_point_in_region(true_prev)) / region.alpha
    return amplitude + penalty
def compute_coverage_amplitude(region_constructor, **kwargs):
    """
    Builds one confidence region per test sample and evaluates it.

    NOTE: reads the module-level ``results`` dict (assigned inside the main
    processing loop below), using ``results['samples']`` and
    ``results['true-prevs']``; it must therefore be called after that
    assignment.

    :param region_constructor: callable that builds a confidence-region object
        from an array of posterior samples
    :param kwargs: forwarded verbatim to `region_constructor`
    :return: four parallel lists (coverage, amplitude, winkler, cov_soft), one
        entry per test sample; `winkler` and `cov_soft` entries are None for
        region types other than the interval-based ones
    """
    all_samples = results['samples']
    all_true_prevs = results['true-prevs']

    def process_one(samples, true_prevs):
        region = region_constructor(samples, **kwargs)
        # Winkler score and soft coverage are only defined for the
        # interval-based regions (plain/CLR/ILR); others get None placeholders
        if isinstance(region, (ConfidenceIntervals, ConfidenceIntervalsCLR, ConfidenceIntervalsILR)):
            winkler = region.mean_winkler_score(true_prevs)
            cov_soft = region.coverage_soft(true_prevs)
        else:
            winkler = None
            cov_soft = None
        return region.coverage(true_prevs), region.montecarlo_proportion(), winkler, cov_soft

    out = Parallel(n_jobs=3)(
        delayed(process_one)(samples, true_prevs)
        for samples, true_prevs in tqdm(
            zip(all_samples, all_true_prevs),
            total=len(all_samples),
            desc='constructing ellipses'
        )
    )
    # unzip the per-sample tuples into four parallel lists
    coverage, amplitude, winkler, cov_soft = zip(*out)
    return list(coverage), list(amplitude), list(winkler), list(cov_soft)
def update_pickle(report, pickle_path, updated_dict: dict):
    """
    Merges `updated_dict` into `report` (in place) and re-serializes the whole
    report to `pickle_path`.

    :param report: dict with the accumulated results for one experiment
    :param pickle_path: destination path of the pickle file
    :param updated_dict: new key/value pairs to add (existing keys overwritten)
    """
    report.update(updated_dict)
    # use a context manager so the file handle is always closed
    # (the original `pickle.dump(report, open(path, 'wb'))` leaked it)
    with open(pickle_path, 'wb') as f:
        pickle.dump(report, f, protocol=pickle.HIGHEST_PROTOCOL)
def update_pickle_with_region(report, file, conf_name, conf_region_class, **kwargs):
    """
    Computes coverage/amplitude/winkler/soft-coverage for one confidence-region
    type, stores them in `report` under region-specific keys, and persists the
    report; does nothing if the region was already computed.

    :param report: results dict loaded from the pickle file
    :param file: path of the pickle file to update
    :param conf_name: short tag identifying the region type in the keys
    :param conf_region_class: confidence-region constructor
    :param kwargs: extra arguments forwarded to the region constructor
    """
    if f'coverage-{conf_name}' in report:
        return  # already computed for this region type; skip the (costly) work
    covs, amps, winkler, cov_soft = compute_coverage_amplitude(conf_region_class, **kwargs)
    update_pickle(report, file, {
        f'coverage-{conf_name}': covs,
        f'amplitude-{conf_name}': amps,
        f'winkler-{conf_name}': winkler,
        f'coverage-soft-{conf_name}': cov_soft,
    })
def pareto_front(df, x_col, y_col, maximize_y=True, minimize_x=True):
    """
    Returns a boolean mask indicating whether each row is Pareto-optimal.

    A row i is non-optimal when some other row j is at least as good on both
    objectives and strictly better on at least one.
    """
    xs = df[x_col].values
    ys = df[y_col].values
    n = len(df)

    def dominates(j, i):
        # j is weakly better on both axes...
        x_ok = xs[j] <= xs[i] if minimize_x else xs[j] >= xs[i]
        y_ok = ys[j] >= ys[i] if maximize_y else ys[j] <= ys[i]
        # ...and strictly better on at least one
        strict = (
            (xs[j] < xs[i] if minimize_x else xs[j] > xs[i])
            or (ys[j] > ys[i] if maximize_y else ys[j] < ys[i])
        )
        return x_ok and y_ok and strict

    mask = np.ones(n, dtype=bool)
    for i in range(n):
        if mask[i] and any(dominates(j, i) for j in range(n) if j != i):
            mask[i] = False
    return mask
def plot_coverage_vs_amplitude(
        df,
        coverage_col,
        amplitude_col="a-CI",
        method_col="method",
        dataset_col=None,
        error_col=None,
        error_threshold=None,
        nominal_coverage=0.95,
        title=None,
):
    """
    Scatter plot of coverage vs. amplitude, one point per row of `df`, with the
    Pareto-optimal points circled and the nominal coverage drawn as a dashed
    horizontal line. Optionally filters out rows whose error exceeds a
    threshold before plotting. Shows the figure on screen.
    """
    data = df.copy()
    # optional filter: keep only points with acceptable point-estimation error
    if error_col is not None and error_threshold is not None:
        data = data[data[error_col] <= error_threshold]
    on_front = pareto_front(
        data, x_col=amplitude_col, y_col=coverage_col, maximize_y=True, minimize_x=True
    )
    plt.figure(figsize=(7, 6))
    # base scatter, colored by method
    sns.scatterplot(
        data=data, x=amplitude_col, y=coverage_col, hue=method_col,
        # style=dataset_col,
        alpha=0.6, s=60, legend=True,
    )
    # circle the Pareto-optimal points on top of the base scatter
    plt.scatter(
        data.loc[on_front, amplitude_col],
        data.loc[on_front, coverage_col],
        facecolors='none', edgecolors='black', s=120, linewidths=1.5,
        label="Pareto front",
    )
    plt.axhline(nominal_coverage, linestyle="--", color="gray", linewidth=1,
                label="Nominal coverage")
    plt.xlabel("Amplitude (fraction of simplex)")
    plt.ylabel("Coverage")
    plt.ylim(0, 1.05)
    if title is not None:
        plt.title(title)
    plt.legend(bbox_to_anchor=(1.05, 1), loc="upper left")
    plt.tight_layout()
    plt.show()
def nicer_method(name: str):
    """Rewrites a raw method identifier into the display name used in tables/plots."""
    # applied sequentially: the composite markers ('-T*', '-T!') must be
    # rewritten before the bare '!' is stripped
    rules = (
        # ('Bayesian', 'Ba'),
        ('Bootstrap', 'Bo'),
        ('-numpyro', ''),
        ('emcee', 'emc'),
        ('-T*', '*'),
        ('-T!', ''),
        ('!', ''),
        ('-Ait', r'$^{(\mathrm{Ait})}$'),
        ('-Gau', r'$^{(\mathrm{Gau})}$'),
    )
    for old, new in rules:
        name = name.replace(old, new)
    return name
def nicer_data(name: str):
    """Rewrites a raw dataset identifier into the display name used in tables/plots."""
    # applied sequentially; note 'mnist' fires before 'fashionmnist', so the
    # latter rule is effectively a no-op kept for documentation purposes
    rules = (
        ('cifar', 'CIFAR'),
        ('-l', ''),
        ('mnist', 'MNIST'),
        ('fashionmnist', 'fashionMNIST'),
        ('svhn', 'SVHN'),
        ('100coarse', '100(20)'),
    )
    for old, new in rules:
        name = name.replace(old, new)
    return name
# Collect per-dataset metadata (class count, training-set size, training
# prevalence) from the four dataset handlers; `dataset_order` also fixes the
# benchmark (row) order used later when rendering the LaTeX tables.
base_dir = RESULT_DIR
table = defaultdict(list)   # column-name -> list of cell values, one per test sample
n_classes = {}              # dataset name -> number of classes
tr_size = {}                # dataset name -> number of training instances
tr_prev = {}                # dataset name -> training prevalence as a string
dataset_class = [UCIMulticlassHandler, CIFAR100Handler, VisualDataHandler, LeQuaHandler]
dataset_order = []
for handler in dataset_class:
    for dataset in handler.iter():
        dataset_order.append(dataset.name)
        train = dataset.get_training()
        n_classes[dataset.name] = train.n_classes
        tr_size[dataset.name] = len(train)
        tr_prev[dataset.name] = F.strprev(train.prevalence())
problem_type = 'multiclass'
path = f'./{base_dir}/{problem_type}/*.pkl'
# Main loop: one pickle per (dataset, method) pair, named '<dataset>__<method>.pkl'.
# For each selected pair, loads the report, lazily adds the confidence-region
# evaluations to the pickle, and accumulates per-sample rows into `table`.
for file in tqdm(glob(path), desc='processing results', total=len(glob(path))):
    file = Path(file)
    dataset, method = file.name.replace('.pkl', '').split('__')
    if (method not in methods) or (dataset not in dataset_order):
        continue
    # NOTE(review): this handle is never closed explicitly — consider `with open(...)`
    report = pickle.load(open(file, 'rb'))
    # `results` is read as a module-level global by compute_coverage_amplitude(),
    # so it must be assigned before the update_pickle_with_region() calls below
    results = report['results']
    n_samples = len(results['ae'])
    table['method'].extend([nicer_method(method)] * n_samples)
    table['dataset'].extend([nicer_data(dataset)] * n_samples)
    table['ae'].extend(results['ae'])
    table['rae'].extend(results['rae'])
    # table['c-CI'].extend(results['coverage'])
    # table['a-CI'].extend(results['amplitude'])
    # update_pickle_with_region(report, file, conf_name='CI-ILR', conf_region_class=ConfidenceIntervalsILR, bonferroni_correction=True)
    # update_pickle_with_region(report, file, conf_name='CI-CLR', conf_region_class=ConfidenceIntervalsCLR, bonferroni_correction=True)
    # these only recompute (and rewrite the pickle) when the keys are missing
    update_pickle_with_region(report, file, conf_name='CI', conf_region_class=ConfidenceIntervals, bonferroni_correction=True)
    update_pickle_with_region(report, file, conf_name='CInb', conf_region_class=ConfidenceIntervals, bonferroni_correction=False) # no Bonferroni-correction
    # update_pickle_with_region(report, file, conf_name='CE', conf_region_class=ConfidenceEllipseSimplex)
    # update_pickle_with_region(report, file, conf_name='CLR', conf_region_class=ConfidenceEllipseCLR)
    # update_pickle_with_region(report, file, conf_name='ILR', conf_region_class=ConfidenceEllipseILR)
    conf_bonferroni = 'CI'
    conf_name = 'CInb'
    table['c-CI'].extend(report[f'coverage-{conf_bonferroni}']) # the true coverage is better measured with Bonferroni-correction
    table['w-CI'].extend(report[f'winkler-{conf_name}'])
    table['cs-CI'].extend(report[f'coverage-soft-{conf_name}'])
    table['a-CI'].extend(report[f'amplitude-{conf_name}'])
    # table['aitch'].extend(qp.error.dist_aitchison(results['true-prevs'], results['point-estim'])) # not in this paper...
    table['SRE'].extend(qp.error.sre(results['true-prevs'], results['point-estim'], report['train-prev'], eps=0.001))
# remove datasets with more than max_classes classes
# max_classes = 25
# min_train = 500
# ignore_datasets = ['poker_hand', 'hcv']
# for data_name, n in n_classes.items():
# if n > max_classes:
# df = df[df["dataset"] != data_name]
# for data_name, n in tr_size.items():
# if n < min_train:
# df = df[df["dataset"] != data_name]
# for data_name, n in tr_size.items():
# if data_name in ignore_datasets:
# df = df[df["dataset"] != data_name]
df = pd.DataFrame(table)
# express amplitude and (soft) coverage as percentages for display
df['a-CI'] *= 100
df['c-CI'] *= 100
df['cs-CI'] *= 100
# For each region type and metric column: print a dataset x method pivot to the
# console and export a formatted LaTeX table to ./tables/<column>.pdf
for region in ['CI']: #, 'CLR', 'ILR', 'CI']:
    if problem_type == 'binary' and region=='ILR':
        continue
    for column in [f'a-{region}', 'ae', 'SRE', f'c-{region}', f'cs-{region}']: # f'w-{region}'
        pv = pd.pivot_table(
            df, index='dataset', columns='method', values=column, margins=True
        )
        # attach per-dataset metadata columns collected above
        pv['n_classes'] = pv.index.map(n_classes).astype('Int64')
        pv['tr_size'] = pv.index.map(tr_size).astype('Int64')
        #pv['tr-prev'] = pv.index.map(tr_prev)
        # drop the 'All' margin column(s) added by margins=True
        pv = pv.drop(columns=[col for col in pv.columns if col == "All" or col[-1]=='All'])
        print(f'{problem_type=} {column=}')
        print(pv)
        print('-'*80)
        latex = LatexTable.from_dataframe(df, method='method', benchmark='dataset', value=column, name=column)
        latex.format.configuration.show_std = False
        #latex.reorder_methods([nicer_method(m) for m in methods])
        latex.reorder_benchmarks([nicer_data(d) for d in dataset_order])
        if column in ['ae', 'SRE']:
            # error metrics: lower is better, with significance testing
            latex.format.configuration.lower_is_better = True
            latex.format.configuration.stat_test = 'wilcoxon'
            #latex.format.configuration.stat_test = None
            # latex.format.configuration.show_std = True
        if column in [f'c-{region}', f'cs-{region}']:
            # coverage columns: no ranking/coloring; highlight cells above 89 (%)
            latex.format.configuration.lower_is_better = False
            latex.format.configuration.stat_test = None
            latex.format.configuration.with_color = False
            latex.format.configuration.best_in_bold = False
            latex.format.configuration.with_rank = False
            latex.format.configuration.mean_prec = 0
            latex.add_format_modifier(
                format_modifier=FormatModifierSelectColor(
                    comparison=SelectGreaterThan(reference_selector=89, input_selector=SelectByName())
                )
            )
        if column in [f'a-{region}']:
            # amplitude columns: highlight cells below 11 (%)
            latex.format.configuration.lower_is_better = True
            latex.format.configuration.stat_test = None
            latex.format.configuration.with_color = False
            latex.format.configuration.best_in_bold = False
            latex.format.configuration.mean_prec = 2
            latex.add_format_modifier(
                format_modifier=FormatModifierSelectColor(
                    comparison=SelectSmallerThan(reference_selector=11, input_selector=SelectByName())
                )
            )
            # latex.add_format_modifier(
            #     format_modifier=FormatModifierSelectColor(
            #         comparison=SelectSmallerThan(reference_selector=0.01, input_selector=SelectByName()),
            #         intensity=50
            #     )
            # )
        latex.format.configuration.resizebox=.5
        latex.latexPDF(pdf_path=f'./tables/{latex.name}.pdf')
# drop the method variants that should not enter the aggregated comparison
for _excluded in ('BaACC', 'BaACC*', 'BaACC+', 'BaKDE-Ait*', 'BaKDE-Gau*', 'BayEMQ*'):
    df = df[df['method'] != _excluded]
# per-(method, dataset) summary: mean/std of errors, coverage and amplitude
grouped = df.groupby(["method", "dataset"])
agg = grouped.agg(
    ae_mean=("ae", "mean"),
    ae_std=("ae", "std"),
    sre_mean=("SRE", "mean"),
    sre_std=("SRE", "std"),
    coverage_mean=("c-CI", "mean"),
    coverage_std=("c-CI", "std"),
    coverage_soft_mean=("cs-CI", "mean"),
    amplitude_mean=("a-CI", "mean"),
    amplitude_std=("a-CI", "std"),
).reset_index()
#plot_coverage_vs_amplitude(
# agg,
# coverage_col="coverage_soft_mean",
# amplitude_col="amplitude_mean",
# method_col="method",
# dataset_col="dataset",
# nominal_coverage=0.95,
# title="Marginal coverage vs amplitude"
#)
#print('RESTITUIR EL WILCOXON')