from pathlib import Path
from copy import deepcopy as cp
from collections import defaultdict
from time import time

import numpy as np
from sklearn.linear_model import LogisticRegression as LR
from tqdm import tqdm

import quapy as qp
from BayesianKDEy._bayeisan_kdey import BayesianKDEy
from BayesianKDEy._bayesian_mapls import BayesianMAPLS
from BayesianKDEy.commons import multiclass, experiment_path, KDEyCLR
from BayesianKDEy.temperature_calibration import temp_calibration
from quapy.data import Dataset, LabelledCollection
from quapy.method.aggregative import DistributionMatchingY as DMy, AggregativeQuantifier, EMQ, CC, KDEyML, ACC
from quapy.method.confidence import BayesianCC, AggregativeBootstrap
from quapy.model_selection import GridSearchQ
from quapy.protocol import UPP
# from BayesianKDEy.plot_simplex import plot_prev_points, plot_prev_points_matplot


def methods():
    """
    Yields tuples (name, quantifier, hyperparams, bayesian/bootstrap_constructor, method_scope), where:
     - name: a str identifying the method (e.g., 'BayesianKDEy')
     - quantifier: the base (surrogate) quantifier used for model selection (e.g., KDEyML())
     - hyperparams: the hyperparameter grid for the quantifier (e.g., {'bandwidth': [0.001, 0.005, 0.01, 0.05, 0.1, 0.2]})
     - bayesian/bootstrap_constructor: a function that instantiates the bayesian or bootstrap method given the
       optimized hyperparameters of the quantifier
     - method_scope: one of 'multiclass', 'only_binary', 'only_multiclass', restricting the datasets the method is run on
    """
    acc_hyper = {}
    emq_hyper = {'calib': ['nbvs', 'bcts', 'ts', 'vs']}
    hdy_hyper = {'nbins': [3, 4, 5, 8, 16, 32]}
    kdey_hyper = {'bandwidth': [0.001, 0.005, 0.01, 0.05, 0.1, 0.2]}
    kdey_hyper_clr = {'bandwidth': [0.05, 0.1, 0.5, 1., 2., 5.]}

    multiclass_method = 'multiclass'
    only_binary = 'only_binary'
    only_multiclass = 'only_multiclass'

    yield 'BootstrapACC', ACC(LR()), acc_hyper, lambda hyper: AggregativeBootstrap(ACC(LR()), n_test_samples=1000, random_state=0), multiclass_method
    yield 'BayesianACC', ACC(LR()), acc_hyper, lambda hyper: BayesianCC(LR(), mcmc_seed=0), multiclass_method
    yield 'BootstrapEMQ', EMQ(LR(), on_calib_error='backup', val_split=5), emq_hyper, lambda hyper: AggregativeBootstrap(EMQ(LR(), on_calib_error='backup', calib=hyper['calib'], val_split=5), n_test_samples=1000, random_state=0), multiclass_method
    yield 'BootstrapHDy', DMy(LR()), hdy_hyper, lambda hyper: AggregativeBootstrap(DMy(LR(), **hyper), n_test_samples=1000, random_state=0), multiclass_method
    # yield 'BayesianHDy', DMy(LR()), hdy_hyper, lambda hyper: PQ(LR(), stan_seed=0, **hyper), only_binary
    # yield 'BootstrapKDEy', KDEyML(LR()), kdey_hyper, lambda hyper: AggregativeBootstrap(KDEyML(LR(), **hyper), n_test_samples=1000, random_state=0, verbose=True), multiclass_method
    # yield 'BayesianKDEy', KDEyML(LR()), kdey_hyper, lambda hyper: BayesianKDEy(mcmc_seed=0, **hyper), multiclass_method
    # yield 'BayesianKDEy*', KDEyCLR(LR()), kdey_hyper_clr, lambda hyper: BayesianKDEy(kernel='aitchison', mcmc_seed=0, **hyper), multiclass_method
    # yield 'BayKDEy*CLR', KDEyCLR(LR()), kdey_hyper_clr, lambda hyper: BayesianKDEy(kernel='aitchison', mcmc_seed=0, explore='clr', step_size=.15, **hyper), multiclass_method
    # yield 'BayKDEy*CLR2', KDEyCLR(LR()), kdey_hyper_clr, lambda hyper: BayesianKDEy(kernel='aitchison', mcmc_seed=0, explore='clr', step_size=.05, **hyper), multiclass_method
    # yield 'BayKDEy*ILR', KDEyCLR(LR()), kdey_hyper_clr, lambda hyper: BayesianKDEy(kernel='aitchison', mcmc_seed=0, explore='ilr', step_size=.15, **hyper), only_multiclass
    # yield 'BayKDEy*ILR2', KDEyILR(LR()), kdey_hyper_clr, lambda hyper: BayesianKDEy(kernel='ilr', mcmc_seed=0, explore='ilr', step_size=.1, **hyper), only_multiclass
    # yield f'BaKDE-emcee', KDEyML(LR()), kdey_hyper, lambda hyper: BayesianKDEy(mcmc_seed=0, num_warmup=100, num_samples=100, step_size=.1, engine='emcee', **hyper), multiclass_method
    # yield f'BaKDE-numpyro', KDEyML(LR()), kdey_hyper, lambda hyper: BayesianKDEy(mcmc_seed=0, engine='numpyro', **hyper), multiclass_method
    # yield f'BaKDE-numpyro-T2', KDEyML(LR()), kdey_hyper, lambda hyper: BayesianKDEy(mcmc_seed=0, engine='numpyro', temperature=2., **hyper), multiclass_method
    # yield f'BaKDE-numpyro-T*', KDEyML(LR()), kdey_hyper, lambda hyper: BayesianKDEy(mcmc_seed=0, engine='numpyro', temperature=None, **hyper), multiclass_method
    # yield f'BaKDE-Ait-numpyro', KDEyCLR(LR()), kdey_hyper_clr, lambda hyper: BayesianKDEy(kernel='aitchison', mcmc_seed=0, engine='numpyro', **hyper), multiclass_method
    # yield f'BaKDE-Ait-numpyro-T*', KDEyCLR(LR()), kdey_hyper_clr, lambda hyper: BayesianKDEy(kernel='aitchison', mcmc_seed=0, engine='numpyro', temperature=None, **hyper), multiclass_method
    yield f'BaKDE-Ait-numpyro-T*-U', KDEyCLR(LR()), kdey_hyper_clr, lambda hyper: BayesianKDEy(kernel='aitchison', mcmc_seed=0, engine='numpyro', temperature=None, prior='uniform', **hyper), multiclass_method
    # yield f'BaKDE-Ait-numpyro-T*ILR', KDEyCLR(LR()), kdey_hyper_clr, lambda hyper: BayesianKDEy(kernel='aitchison', mcmc_seed=0, engine='numpyro', temperature=None, region='ellipse-ilr', **hyper), multiclass_method
    # yield f'BaKDE-numpyro-T10', KDEyML(LR()), kdey_hyper, lambda hyper: BayesianKDEy(mcmc_seed=0, engine='numpyro', temperature=10., **hyper), multiclass_method
    # yield f'BaKDE-numpyro*CLR', KDEyCLR(LR()), kdey_hyper_clr, lambda hyper: BayesianKDEy(kernel='aitchison', mcmc_seed=0, engine='numpyro', **hyper), multiclass_method
    # yield f'BaKDE-numpyro*ILR', KDEyILR(LR()), kdey_hyper_clr, lambda hyper: BayesianKDEy(kernel='ilr', mcmc_seed=0, engine='numpyro', **hyper), multiclass_method
    # yield 'BayEMQ', CC(LR()), acc_hyper, lambda hyper: BayesianMAPLS(LR(), prior='uniform', exact_train_prev=True), multiclass_method
    # yield 'BayEMQ*', CC(LR()), acc_hyper, lambda hyper: BayesianMAPLS(LR(), prior='map', exact_train_prev=True), multiclass_method
    # yield 'BayEMQ*2', CC(LR()), acc_hyper, lambda hyper: BayesianMAPLS(LR(), prior='map2', exact_train_prev=True), multiclass_method
    # yield 'BayEMQ*2T*', CC(LR()), acc_hyper, lambda hyper: BayesianMAPLS(LR(), prior='map2', temperature=None, exact_train_prev=True), multiclass_method
    # yield 'BayEMQ*2T01', CC(LR()), acc_hyper, lambda hyper: BayesianMAPLS(LR(), prior='map2', temperature=0.1, exact_train_prev=True), multiclass_method
    # yield 'BayEMQ*2T10000', CC(LR()), acc_hyper, lambda hyper: BayesianMAPLS(LR(), prior='map2', temperature=10000, exact_train_prev=True), multiclass_method
    # yield 'BayEMQ*2T100000', CC(LR()), acc_hyper, lambda hyper: BayesianMAPLS(LR(), prior='map2', temperature=100000, exact_train_prev=True), multiclass_method
    # yield 'BayEMQ-U-Temp1-2', CC(LR()), acc_hyper, lambda hyper: BayesianMAPLS(LR(), prior='uniform', temperature=1, exact_train_prev=True), multiclass_method
    yield 'BayEMQ-U-Temp*', CC(LR()), acc_hyper, lambda hyper: BayesianMAPLS(LR(), prior='uniform', temperature=None, exact_train_prev=True), multiclass_method
    # yield 'BayEMQ*Temp1', CC(LR()), acc_hyper, lambda hyper: BayesianMAPLS(LR(), prior='map', temperature=1, exact_train_prev=True), multiclass_method
    # yield 'BayEMQ*Temp10', CC(LR()), acc_hyper, lambda hyper: BayesianMAPLS(LR(), prior='map', temperature=10, exact_train_prev=True), multiclass_method
    # yield 'BayEMQ*Temp100', CC(LR()), acc_hyper, lambda hyper: BayesianMAPLS(LR(), prior='map', temperature=100, exact_train_prev=True), multiclass_method
    # yield 'BayEMQ*Temp1000', CC(LR()), acc_hyper, lambda hyper: BayesianMAPLS(LR(), prior='map', temperature=1000, exact_train_prev=True), multiclass_method
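
# Illustrative sketch (commented out, not part of the experiment): how one entry yielded by methods() is
# meant to be consumed. `training` and `sample_X` are placeholder names; in this script the hyperparameters
# passed to the constructor come from model_selection() below (the first entry, BootstrapACC, has an empty
# grid, so {} is a valid choice for it).
#
#   name, surrogate, grid, constructor, scope = next(methods())
#   withconf = constructor({})                      # instantiates the bootstrap/Bayesian wrapper
#   withconf.fit(*training.Xy)                      # training: a quapy LabelledCollection
#   point_estimate, region = withconf.predict_conf(sample_X)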

def model_selection(train: LabelledCollection, point_quantifier: AggregativeQuantifier, grid: dict):
    with qp.util.temp_seed(0):
        print(f'performing model selection for {point_quantifier.__class__.__name__} with grid {grid}')

        # model selection on a held-out validation split of the training set
        if len(grid) > 0:
            train, val = train.split_stratified(train_prop=0.6, random_state=0)
            mod_sel = GridSearchQ(
                model=point_quantifier,
                param_grid=grid,
                protocol=qp.protocol.UPP(val, repeats=250, random_state=0),
                refit=False,
                n_jobs=-1,
                verbose=True
            ).fit(*train.Xy)
            best_params = mod_sel.best_params_
        else:
            best_params = {}

        return best_params


def experiment(dataset: Dataset,
               point_quantifier: AggregativeQuantifier,
               method_name: str,
               grid: dict,
               withconf_constructor,
               hyper_choice_path: Path):

    with qp.util.temp_seed(0):
        training, test = dataset.train_test

        # model selection (cached on disk, so each surrogate quantifier is only optimized once per dataset)
        best_hyperparams = qp.util.pickled_resource(
            hyper_choice_path, model_selection, training, cp(point_quantifier), grid
        )

        # training
        t_init = time()
        withconf_quantifier = withconf_constructor(best_hyperparams)
        if hasattr(withconf_quantifier, 'temperature') and withconf_quantifier.temperature is None:
            print('calibrating temperature')
            train, val = training.split_stratified(train_prop=0.6, random_state=0)
            temperature = temp_calibration(withconf_quantifier, train, val, temp_grid=[.5, 1., 1.5, 2., 5., 10., 100.], n_jobs=-1)
            withconf_quantifier.temperature = temperature
        withconf_quantifier.fit(*training.Xy)
        tr_time = time() - t_init

        # test
        train_prevalence = training.prevalence()
        results = defaultdict(list)
        test_generator = UPP(test, repeats=100, random_state=0)
        pbar = tqdm(enumerate(test_generator()), total=test_generator.total())
        for i, (sample_X, true_prevalence) in pbar:
            t_init = time()
            point_estimate, region = withconf_quantifier.predict_conf(sample_X)
            ttime = time() - t_init

            results['true-prevs'].append(true_prevalence)
            results['point-estim'].append(point_estimate)
            results['shift'].append(qp.error.ae(true_prevalence, train_prevalence))
            results['ae'].append(qp.error.ae(prevs_true=true_prevalence, prevs_hat=point_estimate))
            results['rae'].append(qp.error.rae(prevs_true=true_prevalence, prevs_hat=point_estimate))
            results['coverage'].append(region.coverage(true_prevalence))
            results['amplitude'].append(region.montecarlo_proportion(n_trials=50_000))
            results['test-time'].append(ttime)
            results['samples'].append(region.samples)

            pbar.set_description(f'{method_name} '
                                 f'MAE={np.mean(results["ae"]):.5f} '
                                 f'Cov={np.mean(results["coverage"]):.5f} '
                                 f'AMP={np.mean(results["amplitude"]):.5f}')

        report = {
            'optim_hyper': best_hyperparams,
            'train_time': tr_time,
            'train-prev': train_prevalence,
            'results': {k: np.asarray(v) for k, v in results.items()}
        }

    return report
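
# Interpretation note for the two region statistics gathered in experiment() above (not part of the code):
# 'coverage' presumably records whether the true prevalence vector falls inside the confidence region returned
# by predict_conf, while 'amplitude' estimates the region's size as the Monte Carlo proportion of the
# probability simplex it occupies (50,000 trials). Smaller amplitude at comparable coverage indicates a
# tighter, more informative region.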

if __name__ == '__main__':

    result_dir = Path('./results')

    for setup in [multiclass]:  # [binary, multiclass]:
        qp.environ['SAMPLE_SIZE'] = setup['sample_size']

        for data_name in setup['datasets']:
            print(f'dataset={data_name}')
            # if data_name == 'breast-cancer' or data_name.startswith("cmc") or data_name.startswith("ctg"):
            #     print(f'skipping dataset: {data_name}')
            #     continue

            data = setup['fetch_fn'](data_name)
            is_binary = data.n_classes == 2
            result_subdir = result_dir / ('binary' if is_binary else 'multiclass')
            hyper_subdir = result_dir / 'hyperparams' / ('binary' if is_binary else 'multiclass')

            for method_name, surrogate_quant, hyper_params, withconf_constructor, method_scope in methods():
                if method_scope == 'only_binary' and not is_binary:
                    continue
                if method_scope == 'only_multiclass' and is_binary:
                    continue

                result_path = experiment_path(result_subdir, data_name, method_name)
                hyper_path = experiment_path(hyper_subdir, data_name, surrogate_quant.__class__.__name__)

                report = qp.util.pickled_resource(
                    result_path, experiment, data, surrogate_quant, method_name, hyper_params, withconf_constructor, hyper_path
                )

                print(f'dataset={data_name}, '
                      f'method={method_name}: '
                      f'mae={report["results"]["ae"].mean():.5f}, '
                      f'coverage={report["results"]["coverage"].mean():.5f}, '
                      f'amplitude={report["results"]["amplitude"].mean():.5f}')
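
# Illustrative sketch (commented out): reloading a cached report for offline inspection. This assumes that
# qp.util.pickled_resource stores a plain pickle at the given path; the dataset name below is hypothetical.
#
#   import pickle
#   path = experiment_path(Path('./results/multiclass'), 'some-dataset', 'BayEMQ-U-Temp*')
#   with open(path, 'rb') as f:
#       report = pickle.load(f)
#   print(report['optim_hyper'])
#   print(report['results']['ae'].mean(), report['results']['coverage'].mean())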