from collections import defaultdict
import pandas as pd
import model_selection  # NOTE(review): shadowed by `from full_experiments import model_selection` below — confirm which is intended
import quapy as qp
from BayesianKDEy._bayeisan_kdey import BayesianKDEy
from BayesianKDEy.temperature_calibration import temp_calibration
from commons import *
from data import Dataset
from protocol import DirichletProtocol
from quapy.method.confidence import BayesianCC
from quapy.method.aggregative import ACC, AggregativeQuantifier
from sklearn.linear_model import LogisticRegression as LR
from copy import deepcopy as cp
from tqdm import tqdm
from full_experiments import model_selection
from itertools import chain


def select_imbalanced_datasets(top_m=10):
    """Return the names of the `top_m` most imbalanced multiclass datasets.

    Imbalance is measured as the normalized entropy of each dataset's
    training prevalence (lower entropy = more imbalanced); the list is
    sorted ascending so the first `top_m` entries are the most imbalanced.
    """
    datasets_prevs = []
    for data_name in multiclass['datasets']:
        data_prev = multiclass['fetch_fn'](data_name).training.prevalence()
        balance = normalized_entropy(data_prev)
        datasets_prevs.append((data_name, balance))
    # sort by normalized entropy, ascending: most imbalanced first
    datasets_prevs.sort(key=lambda x: x[1])
    data_selected = [data_name for data_name, balance in datasets_prevs[:top_m]]
    return data_selected


def methods():
    """Yield (name, surrogate point quantifier, hyperparameter grid, Bayesian constructor) tuples.

    The surrogate quantifier is used for model selection; the constructor
    receives the selected hyperparameters and builds the Bayesian method
    actually evaluated. `temperature=None` means the temperature is tuned
    later via `temp_calibration` (see `experiment`).
    """
    acc_hyper = {}
    kdey_hyper = {'bandwidth': [0.001, 0.005, 0.01, 0.05, 0.1, 0.2]}  # NOTE(review): unused — kept for reference
    kdey_hyper_clr = {'bandwidth': [0.05, 0.1, 0.5, 1., 2., 5.]}
    yield 'BayesianACC', ACC(LR()), acc_hyper, lambda hyper: BayesianCC(LR(), mcmc_seed=0, prior='uniform')
    yield 'BaKDE-Ait', KDEyCLR(LR()), kdey_hyper_clr, lambda hyper: BayesianKDEy(kernel='aitchison', mcmc_seed=0, engine='numpyro', temperature=None, prior='uniform', **hyper)
    yield 'BaKDE-Ait-T2', KDEyCLR(LR()), kdey_hyper_clr, lambda hyper: BayesianKDEy(kernel='aitchison', mcmc_seed=0, engine='numpyro', temperature=2., prior='uniform', **hyper)
    yield 'BaKDE-Ait-T1', KDEyCLR(LR()), kdey_hyper_clr, lambda hyper: BayesianKDEy(kernel='aitchison', mcmc_seed=0, engine='numpyro', temperature=1., prior='uniform', **hyper)


def run_test(test, alpha_test, alpha_train, concentration, prior_type, bay_quant, train_prev, dataset_name, method_name, results):
    """Evaluate `bay_quant` on Dirichlet-shifted samples of `test`, appending per-sample rows to `results`.

    :param test: held-out test collection, resampled via `DirichletProtocol`
    :param alpha_test: Dirichlet concentration vector governing the test-sample prevalences
    :param alpha_train: Dirichlet prior vector used by the quantifier (logged per row)
    :param concentration: scalar concentration multiplier (logged per row)
    :param prior_type: 'informative' or 'wrong' (logged per row)
    :param bay_quant: fitted Bayesian quantifier exposing `predict_conf`
    :param train_prev: training prevalence, used for shift / SRE computations
    :param results: defaultdict(list) accumulating one value per column per sample
    """
    test_generator = DirichletProtocol(test, alpha=alpha_test, repeats=100, random_state=0)
    for i, (sample_X, true_prev) in tqdm(enumerate(test_generator()), total=test_generator.total(),
                                         desc=f'{method_name} {prior_type} alpha with {concentration=}'):
        estim_prev, region = bay_quant.predict_conf(sample_X)
        results['dataset'].append(dataset_name)
        results['method_name'].append(method_name)
        results['prior-type'].append(prior_type)
        results['train-prev'].append(train_prev)
        results['concentration'].append(concentration)
        results['train-alpha'].append(alpha_train)
        results['test-alpha'].append(alpha_test)
        results['true-prevs'].append(true_prev)
        results['point-estim'].append(estim_prev)
        # prior shift of this sample w.r.t. the training distribution
        results['shift'].append(qp.error.ae(true_prev, train_prev))
        results['ae'].append(qp.error.ae(prevs_true=true_prev, prevs_hat=estim_prev))
        results['sre'].append(qp.error.sre(prevs_true=true_prev, prevs_hat=estim_prev, prevs_train=train_prev))
        results['rae'].append(qp.error.rae(prevs_true=true_prev, prevs_hat=estim_prev))
        results['coverage'].append(region.coverage(true_prev))
        results['amplitude'].append(region.montecarlo_proportion(n_trials=50_000))
        results['samples'].append(region.samples)


def experiment(dataset: Dataset, dataset_name: str, point_quantifier: AggregativeQuantifier,
               grid: dict, bay_constructor, method_name: str, hyper_choice_path: Path):
    """Run the full prior-effect experiment for one (dataset, method) pair.

    Performs (cached) model selection on the surrogate `point_quantifier`,
    builds the Bayesian quantifier, calibrates its temperature if requested
    (temperature=None), fits it, and evaluates it under informative and
    antagonistic ("wrong") Dirichlet priors at increasing concentrations.

    :return: dict of column-name -> list of per-sample values
    """
    with qp.util.temp_seed(0):
        training, test = dataset.train_test
        # model selection (cached on disk at hyper_choice_path)
        best_hyperparams = qp.util.pickled_resource(
            hyper_choice_path, model_selection, training, cp(point_quantifier), grid
        )
        bay_quant = bay_constructor(best_hyperparams)
        if hasattr(bay_quant, 'temperature') and bay_quant.temperature is None:
            # BUGFIX: was `data.training.split_stratified(...)`, which silently read the
            # module-level global `data` instead of this function's argument; it only
            # worked because __main__ bound a global of the same name to the same object.
            train, val = training.split_stratified(train_prop=0.6, random_state=0)
            temperature = temp_calibration(bay_quant, train, val,
                                           temp_grid=[.5, 1., 1.5, 2., 5., 10., 100.], n_jobs=-1)
            bay_quant.temperature = temperature
        bay_quant.fit(*training.Xy)

        # test
        train_prev = training.prevalence()
        results = defaultdict(list)
        for concentration in [50, 500, 5_000]:
            alpha_train = train_prev * concentration
            bay_quant.prior = alpha_train

            # informative prior: test samples drawn around the training prevalence
            alpha_test_informative = alpha_train
            prior_type = 'informative'
            run_test(test, alpha_test_informative, alpha_train, concentration, prior_type,
                     bay_quant, train_prev, dataset_name, method_name, results)

            # wrong (antagonistic) prior: test samples drawn away from the training prevalence
            alpha_test_wrong = antagonistic_prevalence(train_prev, strength=0.25) * concentration
            prior_type = 'wrong'
            run_test(test, alpha_test_wrong, alpha_train, concentration, prior_type,
                     bay_quant, train_prev, dataset_name, method_name, results)

    return results


def concat_reports(reports):
    """Concatenate a list of column-dict reports into a single DataFrame."""
    final_report = {
        k: list(chain.from_iterable(report[k] for report in reports))
        for k in reports[0]
    }
    df = pd.DataFrame(final_report)
    return df


def error_vs_concentration_plot(df, err='ae', save_path=None):
    """Plot mean `err` vs prior concentration, one panel per prior type."""
    import seaborn as sns
    import matplotlib.pyplot as plt

    sns.set_theme(style="whitegrid", context="paper")
    fig, axes = plt.subplots(1, 2, figsize=(10, 4), sharey=True)
    for ax, prior in zip(axes, ['informative', 'wrong']):
        sub = df[df['prior-type'] == prior]
        sns.lineplot(
            data=sub,
            x='concentration',
            y=err,
            hue='method_name',
            marker='o',
            errorbar='se',  # or 'sd'
            ax=ax
        )
        ax.set_xscale('log')
        ax.set_title(f'Prior: {prior}')
        ax.set_xlabel('Concentration')
        ax.set_ylabel('M' + err.upper())
    plt.tight_layout()
    if save_path is None:
        plt.show()
    else:
        os.makedirs(Path(save_path).parent, exist_ok=True)
        plt.savefig(save_path)


def coverage_vs_concentration_plot(df, save_path=None):
    """Plot mean credible-region coverage vs prior concentration, one panel per prior type."""
    import seaborn as sns
    import matplotlib.pyplot as plt

    fig, axes = plt.subplots(1, 2, figsize=(10, 4), sharey=True)
    for ax, prior in zip(axes, ['informative', 'wrong']):
        sub = df[df['prior-type'] == prior]
        sns.lineplot(
            data=sub,
            x='concentration',
            y='coverage',
            hue='method_name',
            marker='o',
            errorbar='se',
            ax=ax
        )
        ax.set_xscale('log')
        ax.set_ylim(0, 1.05)
        ax.set_title(f'Prior: {prior}')
        ax.set_xlabel('Concentration')
        ax.set_ylabel('Coverage')
    plt.tight_layout()
    if save_path is None:
        plt.show()
    else:
        os.makedirs(Path(save_path).parent, exist_ok=True)
        plt.savefig(save_path)


def amplitude_vs_concentration_plot(df, save_path=None):
    """Plot mean credible-region amplitude vs prior concentration, one panel per prior type."""
    import seaborn as sns
    import matplotlib.pyplot as plt

    fig, axes = plt.subplots(1, 2, figsize=(10, 4), sharey=True)
    for ax, prior in zip(axes, ['informative', 'wrong']):
        sub = df[df['prior-type'] == prior]
        sns.lineplot(
            data=sub,
            x='concentration',
            y='amplitude',
            hue='method_name',
            marker='o',
            errorbar='se',
            ax=ax
        )
        ax.set_xscale('log')
        ax.set_title(f'Prior: {prior}')
        ax.set_xlabel('Concentration')
        ax.set_ylabel('Amplitude')
    plt.tight_layout()
    if save_path is None:
        plt.show()
    else:
        os.makedirs(Path(save_path).parent, exist_ok=True)
        plt.savefig(save_path)


def coverage_vs_amplitude_plot(df, save_path=None):
    """Scatter mean coverage against mean amplitude per (method, concentration), one panel per prior type."""
    import seaborn as sns
    import matplotlib.pyplot as plt

    agg = (
        df
        .groupby(['prior-type', 'method_name', 'concentration'])
        .agg(
            coverage=('coverage', 'mean'),
            amplitude=('amplitude', 'mean')
        )
        .reset_index()
    )
    fig, axes = plt.subplots(1, 2, figsize=(10, 4), sharey=True)
    for ax, prior in zip(axes, ['informative', 'wrong']):
        sub = agg[agg['prior-type'] == prior]
        sns.scatterplot(
            data=sub,
            x='amplitude',
            y='coverage',
            hue='method_name',
            style='concentration',
            s=80,
            ax=ax
        )
        ax.set_ylim(0, 1.05)
        ax.set_title(f'Prior: {prior}')
        ax.set_xlabel('Amplitude')
        ax.set_ylabel('Coverage')
        # nominal 95% coverage reference line
        ax.axhline(0.95, linestyle='--', color='gray', alpha=0.7)
    plt.tight_layout()
    if save_path is None:
        plt.show()
    else:
        os.makedirs(Path(save_path).parent, exist_ok=True)
        plt.savefig(save_path)


if __name__ == '__main__':
    result_dir = Path('./results/prior_effect')

    selected = select_imbalanced_datasets(10)
    print(f'selected datasets={selected}')

    qp.environ['SAMPLE_SIZE'] = multiclass['sample_size']
    reports = []
    for data_name in selected:
        data = multiclass['fetch_fn'](data_name)
        for method_name, surrogate_quant, hyper_params, bay_constructor in methods():
            result_path = experiment_path(result_dir, data_name, method_name)
            # hyperparameters are cached per surrogate class so methods sharing a
            # surrogate (e.g. the BaKDE variants) reuse the same model selection
            hyper_path = experiment_path(result_dir / 'hyperparams', data_name, surrogate_quant.__class__.__name__)
            print(f'Launching {method_name} in dataset {data_name}')
            report = qp.util.pickled_resource(
                result_path, experiment,
                data, data_name, surrogate_quant, hyper_params, bay_constructor, method_name, hyper_path
            )
            reports.append(report)

    # concat all reports as a dataframe
    df = concat_reports(reports)
    # for data_name in selected:
    #     print(data_name)
    #     df_ = df[df['dataset']==data_name]
    df_ = df
    error_vs_concentration_plot(df_, save_path='./plots/prior_effect/error_vs_concentration.pdf')
    coverage_vs_concentration_plot(df_, save_path='./plots/prior_effect/coverage_vs_concentration.pdf')
    amplitude_vs_concentration_plot(df_, save_path='./plots/prior_effect/amplitude_vs_concentration.pdf')
    coverage_vs_amplitude_plot(df_, save_path='./plots/prior_effect/coverage_vs_amplitude.pdf')