diff --git a/BayesianKDEy/TODO.txt b/BayesianKDEy/TODO.txt index c75a2e1..f5be7d2 100644 --- a/BayesianKDEy/TODO.txt +++ b/BayesianKDEy/TODO.txt @@ -21,3 +21,4 @@ - consider W as a measure of quantification error (the current e.g., w-CI is the winkler...) - optimize also C and class_weight? [I don't think so, but could be done easily now] +- remove wikis from repo \ No newline at end of file diff --git a/BayesianKDEy/commons.py b/BayesianKDEy/commons.py new file mode 100644 index 0000000..c1abfc1 --- /dev/null +++ b/BayesianKDEy/commons.py @@ -0,0 +1,79 @@ +import os +from pathlib import Path + +from sklearn.base import BaseEstimator + +import quapy as qp +import numpy as np + +from method.aggregative import KDEyML +from quapy.functional import l1_norm, ILRtransformation +from scipy.stats import entropy + + + +def fetch_UCI_multiclass(data_name): + return qp.datasets.fetch_UCIMulticlassDataset(data_name, min_class_support=0.01) + + +def fetch_UCI_binary(data_name): + return qp.datasets.fetch_UCIBinaryDataset(data_name) + +# global configurations + +binary = { + 'datasets': qp.datasets.UCI_BINARY_DATASETS, + 'fetch_fn': fetch_UCI_binary, + 'sample_size': 500 +} + +multiclass = { + 'datasets': qp.datasets.UCI_MULTICLASS_DATASETS, + 'fetch_fn': fetch_UCI_multiclass, + 'sample_size': 1000 +} +multiclass['datasets'].remove('poker_hand') # random performance +multiclass['datasets'].remove('hcv') # random performance +multiclass['datasets'].remove('letter') # many classes +multiclass['datasets'].remove('isolet') # many classes + + +# utils +def experiment_path(dir:Path, dataset_name:str, method_name:str): + os.makedirs(dir, exist_ok=True) + return dir/f'{dataset_name}__{method_name}.pkl' + + +def normalized_entropy(p): + """ + Normalized Shannon entropy in [0, 1] + p: array-like, prevalence vector (sums to 1) + """ + p = np.asarray(p) + H = entropy(p) # Shannon entropy + H_max = np.log(len(p)) + return np.clip(H / H_max, 0, 1) + + +def antagonistic_prevalence(p, 
strength=1): + ilr = ILRtransformation() + z = ilr(p) + z_ant = - strength * z + p_ant = ilr.inverse(z_ant) + return p_ant + + +class KDEyCLR(KDEyML): + def __init__(self, classifier: BaseEstimator=None, fit_classifier=True, val_split=5, bandwidth=1., random_state=None): + super().__init__( + classifier=classifier, fit_classifier=fit_classifier, val_split=val_split, bandwidth=bandwidth, + random_state=random_state, kernel='aitchison' + ) + + +class KDEyILR(KDEyML): + def __init__(self, classifier: BaseEstimator=None, fit_classifier=True, val_split=5, bandwidth=1., random_state=None): + super().__init__( + classifier=classifier, fit_classifier=fit_classifier, val_split=val_split, bandwidth=bandwidth, + random_state=random_state, kernel='ilr' + ) diff --git a/BayesianKDEy/full_experiments.py b/BayesianKDEy/full_experiments.py index c2ac1c8..3a0c9e9 100644 --- a/BayesianKDEy/full_experiments.py +++ b/BayesianKDEy/full_experiments.py @@ -1,48 +1,23 @@ -import os -import warnings -from os.path import join from pathlib import Path -from sklearn.calibration import CalibratedClassifierCV from sklearn.linear_model import LogisticRegression as LR -from sklearn.model_selection import GridSearchCV, StratifiedKFold from copy import deepcopy as cp import quapy as qp from BayesianKDEy._bayeisan_kdey import BayesianKDEy +from BayesianKDEy.commons import multiclass, experiment_path, KDEyCLR from BayesianKDEy.temperature_calibration import temp_calibration from build.lib.quapy.data import LabelledCollection from quapy.method.aggregative import DistributionMatchingY as DMy, AggregativeQuantifier, EMQ -from quapy.method.base import BinaryQuantifier, BaseQuantifier from quapy.model_selection import GridSearchQ from quapy.data import Dataset # from BayesianKDEy.plot_simplex import plot_prev_points, plot_prev_points_matplot -from quapy.method.confidence import ConfidenceIntervals, BayesianCC, PQ, WithConfidenceABC, AggregativeBootstrap -from quapy.functional import strprev +from 
quapy.method.confidence import BayesianCC, AggregativeBootstrap from quapy.method.aggregative import KDEyML, ACC from quapy.protocol import UPP -import quapy.functional as F import numpy as np from tqdm import tqdm -from scipy.stats import dirichlet from collections import defaultdict from time import time -from sklearn.base import clone, BaseEstimator - - -class KDEyCLR(KDEyML): - def __init__(self, classifier: BaseEstimator=None, fit_classifier=True, val_split=5, bandwidth=1., random_state=None): - super().__init__( - classifier=classifier, fit_classifier=fit_classifier, val_split=val_split, bandwidth=bandwidth, - random_state=random_state, kernel='aitchison' - ) - - -class KDEyILR(KDEyML): - def __init__(self, classifier: BaseEstimator=None, fit_classifier=True, val_split=5, bandwidth=1., random_state=None): - super().__init__( - classifier=classifier, fit_classifier=fit_classifier, val_split=val_split, bandwidth=bandwidth, - random_state=random_state, kernel='ilr' - ) def methods(): @@ -160,33 +135,8 @@ def experiment(dataset: Dataset, point_quantifier: AggregativeQuantifier, method return report -def experiment_path(dir:Path, dataset_name:str, method_name:str): - os.makedirs(dir, exist_ok=True) - return dir/f'{dataset_name}__{method_name}.pkl' - - -def fetch_UCI_binary(data_name): - return qp.datasets.fetch_UCIBinaryDataset(data_name) - - -def fetch_UCI_multiclass(data_name): - return qp.datasets.fetch_UCIMulticlassDataset(data_name, min_class_support=0.01) - - if __name__ == '__main__': - binary = { - 'datasets': qp.datasets.UCI_BINARY_DATASETS, - 'fetch_fn': fetch_UCI_binary, - 'sample_size': 500 - } - - multiclass = { - 'datasets': qp.datasets.UCI_MULTICLASS_DATASETS, - 'fetch_fn': fetch_UCI_multiclass, - 'sample_size': 1000 - } - result_dir = Path('./results') for setup in [multiclass]: # [binary, multiclass]: @@ -200,15 +150,15 @@ if __name__ == '__main__': is_binary = data.n_classes==2 result_subdir = result_dir / ('binary' if is_binary else 
'multiclass') hyper_subdir = result_dir / 'hyperparams' / ('binary' if is_binary else 'multiclass') - for method_name, method, hyper_params, withconf_constructor, method_scope in methods(): + for method_name, surrogate_quant, hyper_params, withconf_constructor, method_scope in methods(): if method_scope == 'only_binary' and not is_binary: continue if method_scope == 'only_multiclass' and is_binary: continue result_path = experiment_path(result_subdir, data_name, method_name) - hyper_path = experiment_path(hyper_subdir, data_name, method.__class__.__name__) + hyper_path = experiment_path(hyper_subdir, data_name, surrogate_quant.__class__.__name__) report = qp.util.pickled_resource( - result_path, experiment, data, method, method_name, hyper_params, withconf_constructor, hyper_path + result_path, experiment, data, surrogate_quant, method_name, hyper_params, withconf_constructor, hyper_path ) print(f'dataset={data_name}, ' f'method={method_name}: ' diff --git a/BayesianKDEy/generate_results.py b/BayesianKDEy/generate_results.py index 0e9b90c..6a018a2 100644 --- a/BayesianKDEy/generate_results.py +++ b/BayesianKDEy/generate_results.py @@ -7,7 +7,7 @@ import pandas as pd from glob import glob from pathlib import Path import quapy as qp -from BayesianKDEy.full_experiments import fetch_UCI_multiclass, fetch_UCI_binary +from BayesianKDEy.commons import fetch_UCI_binary, fetch_UCI_multiclass from error import dist_aitchison from quapy.method.confidence import ConfidenceIntervals from quapy.method.confidence import ConfidenceEllipseSimplex, ConfidenceEllipseCLR, ConfidenceEllipseILR, ConfidenceIntervals, ConfidenceRegionABC diff --git a/BayesianKDEy/plot_simplex.py b/BayesianKDEy/plot_simplex.py index ec752be..91d8b9b 100644 --- a/BayesianKDEy/plot_simplex.py +++ b/BayesianKDEy/plot_simplex.py @@ -7,6 +7,7 @@ import matplotlib.pyplot as plt from matplotlib.colors import ListedColormap from scipy.stats import gaussian_kde +from BayesianKDEy.commons import 
antagonistic_prevalence from method.confidence import (ConfidenceIntervals as CI, ConfidenceEllipseSimplex as CE, ConfidenceEllipseCLR as CLR, @@ -302,6 +303,7 @@ def plot_simplex( plt.tight_layout() if save_path: + os.makedirs(Path(save_path).parent, exist_ok=True) plt.savefig(save_path) else: plt.show() @@ -377,17 +379,17 @@ if __name__ == '__main__': # ], # save_path=f'./plots/prior_test/uniform.png' # ) - - alpha = [40, 10, 10] - train_prevs = np.random.dirichlet(alpha=alpha, size=n) - test_prevs = np.random.dirichlet(alpha=alpha, size=n) - plot_simplex( - point_layers=[ - {"points": train_prevs, "label": "train", "style": train_style}, - {"points": test_prevs, "label": "test", "style": test_style}, - ], - save_path=f'./plots/prior_test/informative.png' - ) + # + # alpha = [40, 10, 10] + # train_prevs = np.random.dirichlet(alpha=alpha, size=n) + # test_prevs = np.random.dirichlet(alpha=alpha, size=n) + # plot_simplex( + # point_layers=[ + # {"points": train_prevs, "label": "train", "style": train_style}, + # {"points": test_prevs, "label": "test", "style": test_style}, + # ], + # save_path=f'./plots/prior_test/informative.png' + # ) # train_prevs = np.random.dirichlet(alpha=[8, 1, 1], size=n) # test_prevs = np.random.dirichlet(alpha=[1, 8, 1], size=n) @@ -402,13 +404,16 @@ if __name__ == '__main__': p = 0.6 K = 3 - alpha = [p] + [(1. - p) / (K - 1)] * (K - 1) + # alpha = [p] + [(1. 
- p) / (K - 1)] * (K - 1) + alpha = [0.095, 0.246, 0.658] alpha = np.array(alpha) - for c in [100, 500, 1_000]: - alpha_c = alpha * c - train_prevs = np.random.dirichlet(alpha=alpha_c, size=n) - test_prevs = np.random.dirichlet(alpha=alpha_c[::-1], size=n) + + for c in [50, 500, 5_000]: + alpha_tr = alpha * c + alpha_te = antagonistic_prevalence(alpha, strength=1) * c + train_prevs = np.random.dirichlet(alpha=alpha_tr, size=n) + test_prevs = np.random.dirichlet(alpha=alpha_te, size=n) plot_simplex( point_layers=[ {"points": train_prevs, "label": "train", "style": train_style}, diff --git a/BayesianKDEy/prior_effect.py b/BayesianKDEy/prior_effect.py index 6e0462b..0731475 100644 --- a/BayesianKDEy/prior_effect.py +++ b/BayesianKDEy/prior_effect.py @@ -1,11 +1,125 @@ -import numpy as np +from collections import defaultdict -n = 3 +import model_selection +import quapy as qp +from BayesianKDEy._bayeisan_kdey import BayesianKDEy +from BayesianKDEy.temperature_calibration import temp_calibration +from commons import * +from data import Dataset +from protocol import DirichletProtocol +from quapy.method.confidence import BayesianCC +from quapy.method.aggregative import ACC, AggregativeQuantifier +from sklearn.linear_model import LogisticRegression as LR +from copy import deepcopy as cp +from tqdm import tqdm -p = 0.5 -alpha = [p] + [(1.-p)/(n-1)]*(n-1) -alpha = np.array(alpha) +def select_imbalanced_datasets(top_m=5): + datasets_prevs = [] + # choose top-m imbalanced datasets + for data_name in multiclass['datasets']: + data_prev = multiclass['fetch_fn'](data_name).training.prevalence() + balance = normalized_entropy(data_prev) + datasets_prevs.append((data_name, balance)) + datasets_prevs.sort(key=lambda x: x[1]) + print(datasets_prevs) + data_selected = [data_name for data_name, balance in datasets_prevs[:top_m]] + return data_selected -for c in [1_000, 5_000, 10_000]: - print(alpha*c) \ No newline at end of file + +def methods(): + acc_hyper = {} + kdey_hyper = 
{'bandwidth': [0.001, 0.005, 0.01, 0.05, 0.1, 0.2]} + kdey_hyper_clr = {'bandwidth': [0.05, 0.1, 0.5, 1., 2., 5.]} + + #yield 'BayesianACC', ACC(LR()), acc_hyper, lambda hyper: BayesianCC(LR(), mcmc_seed=0) + yield f'BaKDE-Ait', KDEyCLR(LR()), kdey_hyper_clr, lambda hyper: BayesianKDEy(kernel='aitchison', + mcmc_seed=0, + engine='numpyro', + temperature=None, + prior='uniform', + **hyper) + + +def run_test(test, alpha_test, alpha_train, concentration, prior_type, bay_quant, train_prev, results): + test_generator = DirichletProtocol(test, alpha=alpha_test, repeats=100, random_state=0) + for i, (sample_X, true_prev) in tqdm(enumerate(test_generator()), total=test_generator.total(), + desc=f'{method_name} informative alpha with {concentration=}'): + estim_prev, region = bay_quant.predict_conf(sample_X) + + results['prior-type'].append(prior_type) + results['train-prev'].append(train_prev) + results['concentration'].append(concentration) + results['train-alpha'].append(alpha_train) + results['test-alpha'].append(alpha_test) + results['true-prevs'].append(true_prev) + results['point-estim'].append(estim_prev) + results['shift'].append(qp.error.ae(true_prev, train_prev)) + results['ae'].append(qp.error.ae(prevs_true=true_prev, prevs_hat=estim_prev)) + results['sre'].append(qp.error.sre(prevs_true=true_prev, prevs_hat=estim_prev, prevs_train=train_prev)) + results['rae'].append(qp.error.rae(prevs_true=true_prev, prevs_hat=estim_prev)) + results['coverage'].append(region.coverage(true_prev)) + results['amplitude'].append(region.montecarlo_proportion(n_trials=50_000)) + results['samples'].append(region.samples) + + +def experiment(dataset: Dataset, point_quantifier: AggregativeQuantifier, grid: dict, bay_constructor, hyper_choice_path: Path): + with qp.util.temp_seed(0): + + training, test = dataset.train_test + + # model selection + best_hyperparams = qp.util.pickled_resource( + hyper_choice_path, model_selection, training, cp(point_quantifier), grid + ) + + bay_quant = 
bay_constructor(best_hyperparams) + if hasattr(bay_quant, 'temperature') and bay_quant.temperature is None: + train, val = data.training.split_stratified(train_prop=0.6, random_state=0) + temperature = temp_calibration(bay_quant, train, val, temp_grid=[.5, 1., 1.5, 2., 5., 10., 100.], n_jobs=-1) + bay_quant.temperature = temperature + bay_quant.fit(*training.Xy) + + # test + train_prev = training.prevalence() + results = defaultdict(list) + + for concentration in [50, 500, 5_000]: + alpha_train = train_prev * concentration + bay_quant.prior = alpha_train + + # informative prior + alpha_test_informative = alpha_train + prior_type = 'informative' + run_test(test, alpha_test_informative, alpha_train, concentration, prior_type, bay_quant, train_prev, results) + + # informative prior + alpha_test_wrong = antagonistic_prevalence(train_prev, strength=1) * concentration + prior_type = 'wrong' + run_test(test, alpha_test_wrong, alpha_train, concentration, prior_type, bay_quant, train_prev, results) + + report = { + 'optim_hyper': best_hyperparams, + 'train-prev': train_prev, + 'results': {k: np.asarray(v) for k, v in results.items()} + } + + return report + + +if __name__ == '__main__': + result_dir = Path('./results/prior_effect') + selected = select_imbalanced_datasets() + qp.environ['SAMPLE_SIZE'] = multiclass['sample_size'] + for data_name in selected: + data = multiclass['fetch_fn'](data_name) + for method_name, surrogate_quant, hyper_params, bay_constructor, method_scope in methods(): + result_path = experiment_path(result_dir, data_name, method_name) + hyper_path = experiment_path(result_dir/'hyperparams', data_name, surrogate_quant.__class__.__name__) + + print(f'Launching {method_name} in dataset {data_name}') + experiment(dataset=data, + point_quantifier=surrogate_quant, + grid=hyper_params, + bay_constructor=bay_constructor, + hyper_choice_path=hyper_path) diff --git a/BayesianKDEy/single_experiment_debug.py b/BayesianKDEy/single_experiment_debug.py index 
18a8f8f..38f345f 100644 --- a/BayesianKDEy/single_experiment_debug.py +++ b/BayesianKDEy/single_experiment_debug.py @@ -10,7 +10,8 @@ from sklearn.model_selection import GridSearchCV, StratifiedKFold from copy import deepcopy as cp import quapy as qp from BayesianKDEy._bayeisan_kdey import BayesianKDEy -from BayesianKDEy.full_experiments import experiment, experiment_path, KDEyCLR +from BayesianKDEy.full_experiments import experiment +from BayesianKDEy.commons import experiment_path, KDEyCLR from build.lib.quapy.data import LabelledCollection from quapy.method.aggregative import DistributionMatchingY as DMy, AggregativeQuantifier from quapy.method.base import BinaryQuantifier, BaseQuantifier diff --git a/BayesianKDEy/temperature_bandwidth_corr.py b/BayesianKDEy/temperature_bandwidth_corr.py index 562649f..5095837 100644 --- a/BayesianKDEy/temperature_bandwidth_corr.py +++ b/BayesianKDEy/temperature_bandwidth_corr.py @@ -5,7 +5,7 @@ from pathlib import Path import pandas as pd import quapy as qp from BayesianKDEy._bayeisan_kdey import BayesianKDEy -from BayesianKDEy.full_experiments import experiment_path +from BayesianKDEy.commons import experiment_path from quapy.protocol import UPP import numpy as np from tqdm import tqdm diff --git a/BayesianKDEy/temperature_calibration.py b/BayesianKDEy/temperature_calibration.py index 6153c05..10ca3a1 100644 --- a/BayesianKDEy/temperature_calibration.py +++ b/BayesianKDEy/temperature_calibration.py @@ -13,7 +13,7 @@ import copy def temp_calibration(method:WithConfidenceABC, train:LabelledCollection, val:LabelledCollection, - temp_grid=[1, 1.5, 2], + temp_grid=[.5, 1., 1.5, 2., 5., 10., 100.], num_samples=100, nominal_coverage=0.95, amplitude_threshold='auto', diff --git a/quapy/tests/test_methods.py b/quapy/tests/test_methods.py index 3e4149e..fec7321 100644 --- a/quapy/tests/test_methods.py +++ b/quapy/tests/test_methods.py @@ -3,6 +3,7 @@ import unittest from sklearn.linear_model import LogisticRegression +import 
BayesianKDEy.commons import quapy as qp from quapy.method.aggregative import ACC from quapy.method.meta import Ensemble @@ -47,7 +48,7 @@ class TestMethods(unittest.TestCase): learner.fit(*dataset.training.Xy) for model in AGGREGATIVE_METHODS: - if not dataset.binary and model in BINARY_METHODS: + if not dataset.binary and model in BINARY_METHODS: print(f'skipping the test of binary model {model.__name__} on multiclass dataset {dataset.name}') continue @@ -61,7 +62,7 @@ for dataset in TestMethods.datasets: for model in NON_AGGREGATIVE_METHODS: - if not dataset.binary and model in BINARY_METHODS: + if not dataset.binary and model in BINARY_METHODS: print(f'skipping the test of binary model {model.__name__} on multiclass dataset {dataset.name}') continue @@ -76,7 +77,7 @@ base_quantifier = ACC(LogisticRegression()) for dataset, policy in itertools.product(TestMethods.datasets, Ensemble.VALID_POLICIES): - if not dataset.binary and policy == 'ds': + if not dataset.binary and policy == 'ds': print(f'skipping the test of binary policy ds on non-binary dataset {dataset}') continue