"""
Experiment driver for uncertainty-aware quantification methods (Bayesian/Bootstrap
variants of CC/ACC/EMQ/HDy/KDEy), evaluated over several dataset handlers with the
quapy protocol machinery. Results and hyperparameter choices are cached on disk via
``qp.util.pickled_resource``.
"""
from pathlib import Path
from sklearn.linear_model import LogisticRegression
from copy import deepcopy as cp
import quapy as qp
from BayesianKDEy.commons import KDEyReduce
from BayesianKDEy.methods import get_experimental_methods, MethodDescriptor
from _bayeisan_kdey import BayesianKDEy  # NOTE(review): module name looks misspelled ("bayeisan") — presumably matches the actual filename; confirm
from _bayesian_mapls import BayesianMAPLS
from commons import experiment_path, KDEyCLR, RESULT_DIR, MockClassifierFromPosteriors, KDEyScaledB, KDEyFresh
# import datasets
from datasets import LeQuaHandler, UCIMulticlassHandler, DatasetHandler, VisualDataHandler, CIFAR100Handler
from temperature_calibration import temp_calibration
# fixed: was "from build.lib.quapy.data import LabelledCollection", which imported a stale
# copy from the local build/ artifact directory instead of the installed quapy package
from quapy.data import LabelledCollection
from quapy.method.aggregative import DistributionMatchingY as DMy, AggregativeQuantifier, EMQ, CC
from quapy.model_selection import GridSearchQ
from quapy.data import Dataset
from quapy.method.confidence import BayesianCC, AggregativeBootstrap
from quapy.method.aggregative import KDEyML, ACC
from quapy.protocol import UPP
import numpy as np
from tqdm import tqdm
from collections import defaultdict
from time import time


def methods___depr():
    """
    DEPRECATED: superseded by ``get_experimental_methods`` (see BayesianKDEy.methods).

    Originally a generator of tuples (name, quantifier, hyperparams,
    bayesian/bootstrap_constructor, problem-type flag), where:
    - name: is a str representing the name of the method (e.g., 'BayesianKDEy')
    - quantifier: is the base model (e.g., KDEyML())
    - hyperparams: is a dictionary for the quantifier
      (e.g., {'bandwidth': [0.001, 0.005, 0.01, 0.05, 0.1, 0.2]})
    - bayesian/bootstrap_constructor: is a function that instantiates the bayesian or
      bootstrap method with the quantifier with optimized hyperparameters

    All ``yield`` statements are currently commented out, so calling this function
    only instantiates the surrogate quantifiers and returns ``None``. The commented
    lines are kept as a record of past experiment configurations.
    """
    Cls = LogisticRegression
    cls_hyper = {'classifier__C': np.logspace(-4,4,9), 'classifier__class_weight': ['balanced', None]}
    val_split = 5  # k-fold cross-validation
    cc_hyper = cls_hyper
    acc_hyper = cls_hyper
    # emq_hyper = {'calib': ['nbvs', 'bcts', 'ts', 'vs'], **cls_hyper}
    hdy_hyper = {'nbins': [3,4,5,8,16,32], **cls_hyper}
    kdey_hyper = {'bandwidth': np.logspace(-3, -1, 10), **cls_hyper}
    kdey_hyper_clr = {'bandwidth': np.logspace(-2, 2, 10), **cls_hyper}
    band = {'bandwidth': np.logspace(-3,-1,10)}
    multiclass_method = 'multiclass'
    only_binary = 'only_binary'
    only_multiclass = 'only_multiclass'

    # surrogate quantifiers (point estimators used during model selection)
    cc = CC(Cls())
    acc = ACC(Cls(), val_split=val_split)
    hdy = DMy(Cls(), val_split=val_split)
    kde_gau = KDEyML(Cls(), val_split=val_split)
    kde_gau_scale = KDEyScaledB(Cls(), val_split=val_split)
    kde_gau_pca = KDEyReduce(Cls(), val_split=val_split, n_components=5)
    kde_gau_pca10 = KDEyReduce(Cls(), val_split=val_split, n_components=10)
    kde_ait = KDEyCLR(Cls(), val_split=val_split)
    emq = EMQ(Cls(), exact_train_prev=False, val_split=val_split)

    # Bootstrap approaches:
    # --------------------------------------------------------------------------------------------------------
    # yield 'BootstrapCC', cc, cc_hyper, lambda hyper: AggregativeBootstrap(CC(Cls()), n_test_samples=1000, random_state=0), multiclass_method
    #yield 'BootstrapACC', acc, acc_hyper, lambda hyper: _AggregativeBootstrap(ACC(Cls()), n_test_samples=1000, random_state=0), multiclass_method
    #yield 'BootstrapEMQ', emq, on_calib_error='backup', val_split=5), emq_hyper, lambda hyper: _AggregativeBootstrap(EMQ(Cls(), on_calib_error='backup', calib=hyper['calib'], val_split=5), n_test_samples=1000, random_state=0), multiclass_method
    #yield 'BootstrapHDy', hdy, hdy_hyper, lambda hyper: _AggregativeBootstrap(DMy(Cls(), **hyper), n_test_samples=1000, random_state=0), multiclass_method
    #yield 'BootstrapKDEy', kde_gau, kdey_hyper, lambda hyper: _AggregativeBootstrap(KDEyML(Cls(), **hyper), n_test_samples=1000, random_state=0, verbose=True), multiclass_method

    # Bayesian approaches: (*=temp calibration auto threshold and coverage sim to nominal; +=temp calibration w/o amplitude coverage, for winkler criterion, !=same but alpha=0.005 for winkler)
    # --------------------------------------------------------------------------------------------------------
    # yield 'BayesianACC', acc, acc_hyper, lambda hyper: BayesianCC(Cls(), val_split=val_split, mcmc_seed=0), multiclass_method
    # yield 'BayesianACC*', acc, acc_hyper, lambda hyper: BayesianCC(Cls(), val_split=val_split, temperature=None, mcmc_seed=0), multiclass_method
    # yield 'BayesianACC+', acc, acc_hyper, lambda hyper: BayesianCC(Cls(), val_split=val_split, temperature=None, mcmc_seed=0), multiclass_method
    # yield 'BayesianACC!', acc, acc_hyper, lambda hyper: BayesianCC(Cls(), val_split=val_split, temperature=None, mcmc_seed=0), multiclass_method
    #yield 'BayesianHDy', hdy, hdy_hyper, lambda hyper: PQ(Cls(), val_split=val_split, stan_seed=0, **hyper), only_binary
    # yield f'BaKDE-Ait-numpyro', kde_ait, kdey_hyper_clr, lambda hyper: BayesianKDEy(Cls(), kernel='aitchison', mcmc_seed=0, engine='numpyro', val_split=val_split, **hyper), multiclass_method
    #yield f'BaKDE-Gau-numpyro', kde_gau, kdey_hyper, lambda hyper: BayesianKDEy(Cls(), kernel='gaussian', mcmc_seed=0, engine='numpyro', val_split=val_split, **hyper), multiclass_method
    #yield f'BaKDE-Gau-scale', kde_gau_scale, kdey_hyper, lambda hyper: BayesianKDEy(Cls(), kernel='gaussian', mcmc_seed=0, engine='numpyro', val_split=val_split, **hyper), multiclass_method
    #yield f'BaKDE-Gau-pca5', kde_gau_pca, band, lambda hyper: BayesianKDEy(Cls(), reduce=5, kernel='gaussian', mcmc_seed=0, engine='numpyro', val_split=val_split, **hyper), multiclass_method
    #yield f'BaKDE-Gau-pca5*', kde_gau_pca, band, lambda hyper: BayesianKDEy(Cls(), reduce=5, temperature=None, kernel='gaussian', mcmc_seed=0, engine='numpyro', val_split=val_split, **hyper), multiclass_method
    #yield f'BaKDE-Gau-pca10', kde_gau_pca10, band, lambda hyper: BayesianKDEy(Cls(), reduce=10, kernel='gaussian', mcmc_seed=0, engine='numpyro', val_split=val_split, **hyper), multiclass_method
    #yield f'BaKDE-Gau-pca10*', kde_gau_pca10, band, lambda hyper: BayesianKDEy(Cls(), reduce=10, temperature=None, kernel='gaussian', mcmc_seed=0, engine='numpyro', val_split=val_split, **hyper), multiclass_method
    # yield f'BaKDE-Gau-H0', KDEyFresh(Cls(), bandwidth=0.4), cls_hyper, lambda hyper: BayesianKDEy(Cls(), bandwidth=0.4, kernel='gaussian', mcmc_seed=0, engine='numpyro', **hyper), multiclass_method
    # yield f'BaKDE-Gau-H1', KDEyFresh(Cls(), bandwidth=1.), cls_hyper, lambda hyper: BayesianKDEy(Cls(), bandwidth=1., kernel='gaussian', mcmc_seed=0, engine='numpyro', **hyper), multiclass_method
    # yield f'BaKDE-Gau-H2', KDEyFresh(Cls(), bandwidth=1.5), cls_hyper, lambda hyper: BayesianKDEy(Cls(), bandwidth=1.5,
    #                                                                                               kernel='gaussian',
    #                                                                                               mcmc_seed=0,
    #                                                                                               engine='numpyro',
    #                                                                                               **hyper), multiclass_method
    # yield f'BaKDE-Ait-T*', kde_ait, kdey_hyper_clr, lambda hyper: BayesianKDEy(Cls(),kernel='aitchison', mcmc_seed=0, engine='numpyro', temperature=None, val_split=val_split, **hyper), multiclass_method
    # yield f'BaKDE-Ait-T!', kde_ait, kdey_hyper_clr, lambda hyper: BayesianKDEy(Cls(),kernel='aitchison', mcmc_seed=0, engine='numpyro', temperature=None, val_split=val_split, **hyper), multiclass_method
    #yield f'BaKDE-Gau-T*', kde_gau, kdey_hyper, lambda hyper: BayesianKDEy(Cls(), kernel='gaussian', mcmc_seed=0, engine='numpyro', temperature=None, val_split=val_split, **hyper), multiclass_method
    #yield f'BaKDE-Gau-T!', kde_gau, kdey_hyper, lambda hyper: BayesianKDEy(Cls(), kernel='gaussian', mcmc_seed=0, engine='numpyro', temperature=None, val_split=val_split, **hyper), multiclass_method
    # yield 'BayEMQ', emq, acc_hyper, lambda hyper: BayesianMAPLS(Cls(), prior='uniform', temperature=1, exact_train_prev=False, val_split=val_split), multiclass_method
    # yield 'BayEMQ*', emq, acc_hyper, lambda hyper: BayesianMAPLS(Cls(), prior='uniform', temperature=None, exact_train_prev=False, val_split=val_split), multiclass_method
    # yield 'BayEMQ!', emq, acc_hyper, lambda hyper: BayesianMAPLS(Cls(), prior='uniform', temperature=None, exact_train_prev=False, val_split=val_split), multiclass_method
    # yield 'BaEMQ', emq, acc_hyper, lambda hyper: BayesianMAPLS(Cls(**{k.replace('classifier__', ''): v for k, v in hyper.items()}), prior='uniform', temperature=1, exact_train_prev=False, val_split=val_split), multiclass_method
    # yield 'BaACC!', acc, acc_hyper, lambda hyper: BayesianCC(Cls(**{k.replace('classifier__', ''): v for k, v in hyper.items()}), temperature=None, mcmc_seed=0), multiclass_method
    # yield 'BaEMQ!', emq, acc_hyper, lambda hyper: BayesianMAPLS(Cls(**{k.replace('classifier__', ''): v for k, v in hyper.items()}), prior='uniform', temperature=None, exact_train_prev=False), multiclass_method


def model_selection(dataset: DatasetHandler, point_quantifier: AggregativeQuantifier, grid: dict) -> dict:
    """
    Select hyperparameters for ``point_quantifier`` via grid search on the dataset's
    validation protocol.

    :param dataset: handler providing ``get_train_valprot_for_modsel()``
    :param point_quantifier: surrogate quantifier to tune (deep-copied, not mutated)
    :param grid: hyperparameter grid; an empty grid skips the search entirely
    :return: the best hyperparameter combination found (``{}`` when ``grid`` is empty)
    """
    with qp.util.temp_seed(0):  # reproducible model selection
        point_quantifier = cp(point_quantifier)
        print(f'performing model selection for {point_quantifier.__class__.__name__} with grid {grid}')
        # model selection
        if len(grid)>0:
            train, val_prot = dataset.get_train_valprot_for_modsel()
            mod_sel = GridSearchQ(
                model=point_quantifier,
                param_grid=grid,
                protocol=val_prot,
                refit=False,
                n_jobs=-1,
                verbose=True
            ).fit(*train.Xy)
            best_params = mod_sel.best_params_
        else:
            best_params = {}
        return best_params


def temperature_calibration(dataset: DatasetHandler, uncertainty_quantifier):
    """
    Calibrate (or retrieve) the temperature of an uncertainty-aware quantifier.

    Only applies when the quantifier exposes a ``temperature`` attribute: a value of
    ``None`` triggers a grid search over candidate temperatures (Winkler criterion) on
    the validation protocol, and the chosen value is written back to the quantifier;
    otherwise the preset value is returned untouched.

    :param dataset: handler providing ``get_train_valprot_for_modsel()``
    :param uncertainty_quantifier: quantifier possibly exposing a ``temperature`` attribute
    :return: the temperature in effect, or ``None`` when the quantifier has no such attribute
    """
    temperature = None
    if hasattr(uncertainty_quantifier, 'temperature'):
        if uncertainty_quantifier.temperature is None:
            print('calibrating temperature')
            train, val_prot = dataset.get_train_valprot_for_modsel()
            temp_grid=[1., .5, 1.5, 2., 5., 10., 100., 1000.]
            temperature = temp_calibration(uncertainty_quantifier, train, val_prot, temp_grid=temp_grid, n_jobs=-1, amplitude_threshold=1., criterion='winkler')
            uncertainty_quantifier.temperature = temperature
        else:
            temperature = uncertainty_quantifier.temperature
    return temperature


def experiment(dataset: DatasetHandler, method: MethodDescriptor, hyper_choice_path: Path) -> dict:
    """
    Run one (dataset, method) experiment end-to-end and return a report dict.

    Steps: (1) hyperparameter selection (cached at ``hyper_choice_path``);
    (2) temperature calibration + training of the uncertainty-aware quantifier;
    (3) evaluation over the test protocol, collecting point-estimate errors
    (AE/RAE/SRE), confidence-region coverage and amplitude, and timings.

    :param dataset: handler providing train/test protocol access
    :param method: descriptor with surrogate/uncertainty-aware quantifier factories
    :param hyper_choice_path: pickle cache path for the model-selection result
    :return: report with keys 'optim_hyper', 'train_time', 'train-prev', 'results', 'temperature'
    """
    with qp.util.temp_seed(0):
        # model selection (result cached on disk; recomputed only if the cache is missing)
        best_hyperparams = qp.util.pickled_resource(
            hyper_choice_path, model_selection, dataset, method.surrogate_quantifier(), method.hyper_parameters
        )
        print(f'{best_hyperparams=}')

        t_init = time()
        uncertainty_quantifier = method.uncertainty_aware_quantifier(best_hyperparams)
        temperature = temperature_calibration(dataset, uncertainty_quantifier)
        training, test_generator = dataset.get_train_testprot_for_eval()
        uncertainty_quantifier.fit(*training.Xy)
        tr_time = time() - t_init

        # test: iterate over protocol-generated samples with known true prevalences
        train_prevalence = training.prevalence()
        results = defaultdict(list)
        pbar = tqdm(enumerate(test_generator()), total=test_generator.total())
        for i, (sample_X, true_prevalence) in pbar:
            t_init = time()
            point_estimate, region = uncertainty_quantifier.predict_conf(sample_X)
            ttime = time()-t_init
            results['true-prevs'].append(true_prevalence)
            results['point-estim'].append(point_estimate)
            # prior-probability shift of this sample w.r.t. the training prevalence
            results['shift'].append(qp.error.ae(true_prevalence, train_prevalence))
            results['ae'].append(qp.error.ae(prevs_true=true_prevalence, prevs_hat=point_estimate))
            results['rae'].append(qp.error.rae(prevs_true=true_prevalence, prevs_hat=point_estimate))
            results['sre'].append(qp.error.sre(prevs_true=true_prevalence, prevs_hat=point_estimate, prevs_train=train_prevalence))
            results['coverage'].append(region.coverage(true_prevalence))
            # amplitude = Monte Carlo estimate of the simplex proportion covered by the region
            results['amplitude'].append(region.montecarlo_proportion(n_trials=50_000))
            results['test-time'].append(ttime)
            results['samples'].append(region.samples)
            pbar.set_description(
                f'{method.name} '
                f'MAE={np.mean(results["ae"]):.5f} '
                f'W={np.mean(results["sre"]):.5f} '
                f'Cov={np.mean(results["coverage"]):.5f} '
                f'AMP={np.mean(results["amplitude"]):.5f}'
            )

    report = {
        'optim_hyper': best_hyperparams,
        'train_time': tr_time,
        'train-prev': train_prevalence,
        'results': {k:np.asarray(v) for k,v in results.items()},
        'temperature': temperature
    }
    return report


if __name__ == '__main__':
    result_dir = RESULT_DIR

    for data_handler in [LeQuaHandler]:  #, UCIMulticlassHandler, LeQuaHandler, VisualDataHandler, CIFAR100Handler]:
        for dataset in data_handler.iter():
            qp.environ['SAMPLE_SIZE'] = dataset.sample_size
            print(f'dataset={dataset.name}')
            problem_type = 'binary' if dataset.is_binary() else 'multiclass'
            for method in get_experimental_methods():
                # skip combination? (binary-only methods are not run on multiclass datasets)
                if method.binary_only() and not dataset.is_binary():
                    continue
                result_path = experiment_path(result_dir / problem_type, dataset.name, method.name)
                # hyperparams are cached per surrogate quantifier, so uncertainty-aware
                # variants sharing a surrogate reuse the same model-selection result
                hyper_path = experiment_path(result_dir / 'hyperparams' / problem_type, dataset.name, method.surrogate_quantifier_name())
                report = qp.util.pickled_resource(
                    result_path, experiment, dataset, method, hyper_path
                )
                print(f'dataset={dataset.name}, '
                      f'method={method.name}: '
                      f'mae={report["results"]["ae"].mean():.5f}, '
                      f'W={report["results"]["sre"].mean():.5f}, '
                      f'coverage={report["results"]["coverage"].mean():.5f}, '
                      f'amplitude={report["results"]["amplitude"].mean():.5f}, ')