from pathlib import Path from sklearn.linear_model import LogisticRegression as LR from copy import deepcopy as cp import quapy as qp from BayesianKDEy._bayeisan_kdey import BayesianKDEy from BayesianKDEy._bayesian_mapls import BayesianMAPLS from BayesianKDEy.commons import experiment_path, KDEyCLR, FINEGRAINED, RESULT_DIR, DatasetHandler, \ UCIMulticlassHandler, LeQuaHandler from BayesianKDEy.temperature_calibration import temp_calibration from build.lib.quapy.data import LabelledCollection from quapy.method.aggregative import DistributionMatchingY as DMy, AggregativeQuantifier, EMQ, CC from quapy.model_selection import GridSearchQ from quapy.data import Dataset from quapy.method.confidence import BayesianCC, AggregativeBootstrap from quapy.method.aggregative import KDEyML, ACC from quapy.protocol import UPP import numpy as np from tqdm import tqdm from collections import defaultdict from time import time def methods(): """ Returns a tuple (name, quantifier, hyperparams, bayesian/bootstrap_constructor), where: - name: is a str representing the name of the method (e.g., 'BayesianKDEy') - quantifier: is the base model (e.g., KDEyML()) - hyperparams: is a dictionary for the quantifier (e.g., {'bandwidth': [0.001, 0.005, 0.01, 0.05, 0.1, 0.2]}) - bayesian/bootstrap_constructor: is a function that instantiates the bayesian o bootstrap method with the quantifier with optimized hyperparameters """ if FINEGRAINED: lr_hyper = {'classifier__C': np.logspace(-4,4,9), 'classifier__class_weight': ['balanced', None]} acc_hyper = lr_hyper emq_hyper = {'calib': ['nbvs', 'bcts', 'ts', 'vs'], **lr_hyper} hdy_hyper = {'nbins': [3,4,5,8,16,32], **lr_hyper} kdey_hyper = {'bandwidth': np.logspace(-3, -1, 10), **lr_hyper} kdey_hyper_clr = {'bandwidth': np.logspace(-2, 2, 10), **lr_hyper} else: acc_hyper = {} emq_hyper = {'calib': ['nbvs', 'bcts', 'ts', 'vs']} hdy_hyper = {'nbins': [3,4,5,8,16,32]} kdey_hyper = {'bandwidth': [0.001, 0.005, 0.01, 0.05, 0.1, 0.2]} kdey_hyper_clr = {'bandwidth': [0.05, 0.1, 0.5, 1., 2., 5.]} multiclass_method = 'multiclass' only_binary = 'only_binary' only_multiclass = 'only_multiclass' # Bootstrap approaches: # -------------------------------------------------------------------------------------------------------- #yield 'BootstrapACC', ACC(LR()), acc_hyper, lambda hyper: AggregativeBootstrap(ACC(LR()), n_test_samples=1000, random_state=0), multiclass_method #yield 'BootstrapEMQ', EMQ(LR(), on_calib_error='backup', val_split=5), emq_hyper, lambda hyper: AggregativeBootstrap(EMQ(LR(), on_calib_error='backup', calib=hyper['calib'], val_split=5), n_test_samples=1000, random_state=0), multiclass_method #yield 'BootstrapHDy', DMy(LR()), hdy_hyper, lambda hyper: AggregativeBootstrap(DMy(LR(), **hyper), n_test_samples=1000, random_state=0), multiclass_method #yield 'BootstrapKDEy', KDEyML(LR()), kdey_hyper, lambda hyper: AggregativeBootstrap(KDEyML(LR(), **hyper), n_test_samples=1000, random_state=0, verbose=True), multiclass_method # Bayesian approaches: # -------------------------------------------------------------------------------------------------------- # yield 'BayesianACC', ACC(LR()), acc_hyper, lambda hyper: BayesianCC(LR(), mcmc_seed=0), multiclass_method # yield 'BayesianHDy', DMy(LR()), hdy_hyper, lambda hyper: PQ(LR(), stan_seed=0, **hyper), only_binary # #yield f'BaKDE-numpyro', KDEyML(LR()), kdey_hyper, lambda hyper: BayesianKDEy(mcmc_seed=0, engine='numpyro', **hyper), multiclass_method # yield f'BaKDE-numpyro-T2', KDEyML(LR()), kdey_hyper, lambda hyper: BayesianKDEy(mcmc_seed=0, engine='numpyro', temperature=2., **hyper), multiclass_method # yield f'BaKDE-numpyro-T*', KDEyML(LR()), kdey_hyper, lambda hyper: BayesianKDEy(mcmc_seed=0, engine='numpyro', temperature=None, **hyper), multiclass_method #yield f'BaKDE-Ait-numpyro', KDEyCLR(LR()), kdey_hyper_clr, lambda hyper: BayesianKDEy(LR(), kernel='aitchison', mcmc_seed=0, engine='numpyro', **hyper), multiclass_method # yield f'BaKDE-Gau-numpyro', KDEyML(LR()), kdey_hyper, lambda hyper: BayesianKDEy(LR(), kernel='gaussian', mcmc_seed=0, engine='numpyro', **hyper), multiclass_method # yield f'BaKDE-Ait-T*', KDEyCLR(LR()), kdey_hyper_clr, lambda hyper: BayesianKDEy(LR(),kernel='aitchison', mcmc_seed=0, engine='numpyro', temperature=None, **hyper), multiclass_method # yield f'BaKDE-Gau-T*', KDEyML(LR()), kdey_hyper, lambda hyper: BayesianKDEy(LR(), kernel='gaussian', mcmc_seed=0, engine='numpyro', temperature=None, **hyper), multiclass_method yield 'BayEMQ-U-Temp1-2', CC(LR()), acc_hyper, lambda hyper: BayesianMAPLS(LR(), prior='uniform', temperature=1, exact_train_prev=True), multiclass_method yield 'BayEMQ-T*', CC(LR()), acc_hyper, lambda hyper: BayesianMAPLS(LR(), prior='uniform', temperature=None, exact_train_prev=True), multiclass_method def model_selection(dataset: DatasetHandler, point_quantifier: AggregativeQuantifier, grid: dict): with qp.util.temp_seed(0): print(f'performing model selection for {point_quantifier.__class__.__name__} with grid {grid}') # model selection if len(grid)>0: train, val_prot = dataset.get_train_valprot_for_modsel() mod_sel = GridSearchQ( model=point_quantifier, param_grid=grid, protocol=val_prot, refit=False, n_jobs=-1, verbose=True ).fit(*train.Xy) best_params = mod_sel.best_params_ else: best_params = {} return best_params def temperature_calibration(dataset: DatasetHandler, uncertainty_quantifier): if hasattr(uncertainty_quantifier, 'temperature') and uncertainty_quantifier.temperature is None: print('calibrating temperature') train, val_prot = dataset.get_train_valprot_for_modsel() temperature = temp_calibration(uncertainty_quantifier, train, val_prot, n_jobs=-1) uncertainty_quantifier.temperature = temperature def experiment(dataset: DatasetHandler, point_quantifier: AggregativeQuantifier, method_name:str, grid: dict, uncertainty_quant_constructor, hyper_choice_path: Path): with qp.util.temp_seed(0): # model selection best_hyperparams = qp.util.pickled_resource( hyper_choice_path, model_selection, dataset, cp(point_quantifier), grid ) t_init = time() uncertainty_quantifier = uncertainty_quant_constructor(best_hyperparams) temperature_calibration(dataset, uncertainty_quantifier) training, test_generator = dataset.get_train_testprot_for_eval() uncertainty_quantifier.fit(*training.Xy) tr_time = time() - t_init # test train_prevalence = training.prevalence() results = defaultdict(list) pbar = tqdm(enumerate(test_generator()), total=test_generator.total()) for i, (sample_X, true_prevalence) in pbar: t_init = time() point_estimate, region = uncertainty_quantifier.predict_conf(sample_X) ttime = time()-t_init results['true-prevs'].append(true_prevalence) results['point-estim'].append(point_estimate) results['shift'].append(qp.error.ae(true_prevalence, train_prevalence)) results['ae'].append(qp.error.ae(prevs_true=true_prevalence, prevs_hat=point_estimate)) results['rae'].append(qp.error.rae(prevs_true=true_prevalence, prevs_hat=point_estimate)) results['sre'].append(qp.error.sre(prevs_true=true_prevalence, prevs_hat=point_estimate, prevs_train=train_prevalence)) results['coverage'].append(region.coverage(true_prevalence)) results['amplitude'].append(region.montecarlo_proportion(n_trials=50_000)) results['test-time'].append(ttime) results['samples'].append(region.samples) pbar.set_description(f'{method_name} MAE={np.mean(results["ae"]):.5f} W={np.mean(results["sre"]):.5f} Cov={np.mean(results["coverage"]):.5f} AMP={np.mean(results["amplitude"]):.5f}') report = { 'optim_hyper': best_hyperparams, 'train_time': tr_time, 'train-prev': train_prevalence, 'results': {k:np.asarray(v) for k,v in results.items()} } return report def check_skip_experiment(method_scope, dataset: DatasetHandler): if method_scope == 'only_binary' and not dataset.is_binary(): return True if method_scope == 'only_multiclass' and dataset.is_binary(): return True return False if __name__ == '__main__': result_dir = RESULT_DIR for data_handler in [LeQuaHandler]:#, UCIMulticlassHandler]: for dataset in data_handler.iter(): qp.environ['SAMPLE_SIZE'] = dataset.sample_size() print(f'dataset={dataset}') problem_type = 'binary' if dataset.is_binary() else 'multiclass' for method_name, surrogate_quant, hyper_params, withconf_constructor, method_scope in methods(): if check_skip_experiment(method_scope, dataset): continue result_path = experiment_path(result_dir / problem_type, dataset.name(), method_name) hyper_path = experiment_path(result_dir / 'hyperparams' / problem_type, dataset.name(), surrogate_quant.__class__.__name__) report = qp.util.pickled_resource( result_path, experiment, dataset, surrogate_quant, method_name, hyper_params, withconf_constructor, hyper_path ) print(f'dataset={dataset}, ' f'method={method_name}: ' f'mae={report["results"]["ae"].mean():.5f}, ' f'W={report["results"]["sre"].mean():.5f}, ' f'coverage={report["results"]["coverage"].mean():.5f}, ' f'amplitude={report["results"]["amplitude"].mean():.5f}, ')