From c3fd92efde3431f9c28acc50787ef580240011ac Mon Sep 17 00:00:00 2001 From: Alejandro Moreo Date: Sat, 27 Sep 2025 17:41:12 +0200 Subject: [PATCH 01/41] experiment --- KDEyAitchison/commons.py | 52 +++++++++++++++ KDEyAitchison/show_results.py | 34 ++++++++++ KDEyAitchison/ucimulti_experiments.py | 94 +++++++++++++++++++++++++++ quapy/method/_kdey.py | 70 +++++++++++++++++--- 4 files changed, 242 insertions(+), 8 deletions(-) create mode 100644 KDEyAitchison/commons.py create mode 100644 KDEyAitchison/show_results.py create mode 100644 KDEyAitchison/ucimulti_experiments.py diff --git a/KDEyAitchison/commons.py b/KDEyAitchison/commons.py new file mode 100644 index 0000000..0beaea0 --- /dev/null +++ b/KDEyAitchison/commons.py @@ -0,0 +1,52 @@ +import numpy as np +import pandas as pd + +from quapy.method.aggregative import EMQ, KDEyML +from sklearn.linear_model import LogisticRegression + +METHODS = ['EMQ', + # 'KDEy-ML', + 'KDEy-MLA' + ] + + +# common hyperparameterss +hyper_LR = { + 'classifier__C': np.logspace(-3, 3, 7), + 'classifier__class_weight': ['balanced', None] +} + +hyper_kde = { + 'bandwidth': np.linspace(0.01, 0.2, 20) +} + +hyper_kde_aitchison = { + 'bandwidth': np.linspace(0.01, 2, 100) +} + +# instances a new quantifier based on a string name +def new_method(method, **lr_kwargs): + lr = LogisticRegression(**lr_kwargs) + + if method == 'KDEy-ML': + param_grid = {**hyper_kde, **hyper_LR} + quantifier = KDEyML(lr, kernel='gaussian') + elif method == 'KDEy-MLA': + param_grid = {**hyper_kde_aitchison, **hyper_LR} + quantifier = KDEyML(lr, kernel='aitchison') + elif method == 'EMQ': + param_grid = hyper_LR + quantifier = EMQ(lr) + else: + raise NotImplementedError('unknown method', method) + + return param_grid, quantifier + + +def show_results(result_path): + df = pd.read_csv(result_path+'.csv', sep='\t') + + pd.set_option('display.max_columns', None) + pd.set_option('display.max_rows', None) + pv = df.pivot_table(index='Dataset', columns="Method", values=["MAE", "MRAE"]) + print(pv) \ No newline at end of file diff --git a/KDEyAitchison/show_results.py b/KDEyAitchison/show_results.py new file mode 100644 index 0000000..db63e64 --- /dev/null +++ b/KDEyAitchison/show_results.py @@ -0,0 +1,34 @@ +import pickle +import os +import sys + +import pandas as pd + +import quapy as qp +from quapy.model_selection import GridSearchQ +from quapy.protocol import UPP +from commons import METHODS, new_method, show_results + + +SEED = 1 + + + +if __name__ == '__main__': + print(qp.datasets.UCI_MULTICLASS_DATASETS) + for optim in ['mae']: + result_dir = f'results/ucimulti/{optim}' + + for method in METHODS: + print() + global_result_path = f'{result_dir}/{method}' + print(f'Method\tDataset\tMAE\tMRAE\tKLD') + for dataset in qp.datasets.UCI_MULTICLASS_DATASETS: + print(dataset) + local_result_path = global_result_path + '_' + dataset + if os.path.exists(local_result_path + '.dataframe'): + report = pd.read_csv(local_result_path+'.dataframe') + print(f'{method}\t{dataset}\t{report["mae"].mean():.5f}') + else: + print(dataset, 'not found') + diff --git a/KDEyAitchison/ucimulti_experiments.py b/KDEyAitchison/ucimulti_experiments.py new file mode 100644 index 0000000..b882380 --- /dev/null +++ b/KDEyAitchison/ucimulti_experiments.py @@ -0,0 +1,94 @@ +import pickle +import os +import sys + +import pandas as pd + +import quapy as qp +from quapy.model_selection import GridSearchQ +from quapy.protocol import UPP +from commons import METHODS, new_method, show_results + + +SEED = 1 + + + +if __name__ == '__main__': + + qp.environ['SAMPLE_SIZE'] = 500 + qp.environ['N_JOBS'] = -1 + n_bags_val = 250 + n_bags_test = 1000 + for optim in ['mae']: + result_dir = f'results/ucimulti/{optim}' + + os.makedirs(result_dir, exist_ok=True) + + for method in METHODS: + + print('Init method', method) + + global_result_path = f'{result_dir}/{method}' + # show_results(global_result_path) + # sys.exit(0) + + if not os.path.exists(global_result_path + '.csv'): + with open(global_result_path + '.csv', 'wt') as csv: + csv.write(f'Method\tDataset\tMAE\tMRAE\tKLD\n') + + with open(global_result_path + '.csv', 'at') as csv: + + for dataset in qp.datasets.UCI_MULTICLASS_DATASETS: + + print('init', dataset) + + local_result_path = global_result_path + '_' + dataset + if os.path.exists(local_result_path + '.dataframe'): + print(f'result file {local_result_path}.dataframe already exist; skipping') + report = pd.read_csv(local_result_path+'.dataframe') + print(report["mae"].mean()) + # data = qp.datasets.fetch_UCIMulticlassDataset(dataset) + # csv.write(f'{method}\t{data.name}\t{report["mae"].mean():.5f}\t{report["mrae"].mean():.5f}\t{report["kld"].mean():.5f}\n') + continue + + with qp.util.temp_seed(SEED): + + param_grid, quantifier = new_method(method, max_iter=3000) + + data = qp.datasets.fetch_UCIMulticlassDataset(dataset) + + # model selection + train, test = data.train_test + train, val = train.split_stratified(random_state=SEED) + + protocol = UPP(val, repeats=n_bags_val) + modsel = GridSearchQ( + quantifier, param_grid, protocol, refit=True, n_jobs=-1, verbose=True, error=optim + ) + + try: + modsel.fit(*train.Xy) + + print(f'best params {modsel.best_params_}') + print(f'best score {modsel.best_score_}') + pickle.dump( + (modsel.best_params_, modsel.best_score_,), + open(f'{local_result_path}.hyper.pkl', 'wb'), pickle.HIGHEST_PROTOCOL) + + quantifier = modsel.best_model() + except: + print('something went wrong... trying to fit the default model') + quantifier.fit(*train.Xy) + + + protocol = UPP(test, repeats=n_bags_test) + report = qp.evaluation.evaluation_report( + quantifier, protocol, error_metrics=['mae', 'mrae', 'kld'], verbose=True + ) + report.to_csv(f'{local_result_path}.dataframe') + print(f'{method}\t{data.name}\t{report["mae"].mean():.5f}\t{report["mrae"].mean():.5f}\t{report["kld"].mean():.5f}\n') + csv.write(f'{method}\t{data.name}\t{report["mae"].mean():.5f}\t{report["mrae"].mean():.5f}\t{report["kld"].mean():.5f}\n') + csv.flush() + + show_results(global_result_path) \ No newline at end of file diff --git a/quapy/method/_kdey.py b/quapy/method/_kdey.py index 4208e4d..5932c06 100644 --- a/quapy/method/_kdey.py +++ b/quapy/method/_kdey.py @@ -11,12 +11,28 @@ import quapy.functional as F from sklearn.metrics.pairwise import rbf_kernel +# class KDE(KernelDensity): +# +# KERNELS = ['gaussian', 'aitchison'] +# +# def __init__(self, bandwidth, kernel): +# assert kernel in KDE.KERNELS, f'unknown {kernel=}' +# self.bandwidth = bandwidth +# self.kernel = kernel +# +# def + + + + class KDEBase: """ Common ancestor for KDE-based methods. Implements some common routines. """ BANDWIDTH_METHOD = ['scott', 'silverman'] + KERNELS = ['gaussian', 'aitchison'] + @classmethod def _check_bandwidth(cls, bandwidth): @@ -30,31 +46,62 @@ class KDEBase: f'invalid bandwidth, valid ones are {KDEBase.BANDWIDTH_METHOD} or float values' if isinstance(bandwidth, float): assert 0 < bandwidth < 1, \ - "the bandwith for KDEy should be in (0,1), since this method models the unit simplex" + "the bandwidth for KDEy should be in (0,1), since this method models the unit simplex" return bandwidth - def get_kde_function(self, X, bandwidth): + @classmethod + def _check_kernel(cls, kernel): + """ + Checks that the kernel parameter is correct + + :param kernel: str + :return: the validated kernel + """ + assert kernel in KDEBase.KERNELS, f'unknown {kernel=}' + return kernel + + @classmethod + def clr_transform(cls, P, eps=1e-7): + """ + Centered-Log Ratio (CLR) transform. + P: array (n_samples, n_classes), every row is a point in the probability simplex + eps: smoothing, to avoid log(0) + """ + X_safe = np.clip(P, eps, None) + X_safe /= X_safe.sum(axis=1, keepdims=True) # renormalize + gm = np.exp(np.mean(np.log(X_safe), axis=1, keepdims=True)) + return np.log(X_safe / gm) + + def get_kde_function(self, X, bandwidth, kernel): """ Wraps the KDE function from scikit-learn. :param X: data for which the density function is to be estimated :param bandwidth: the bandwidth of the kernel + :param kernel: the kernel (valid ones are in KDEBase.KERNELS) :return: a scikit-learn's KernelDensity object """ + if kernel == 'aitchison': + X = KDEBase.clr_transform(X) + return KernelDensity(bandwidth=bandwidth).fit(X) - def pdf(self, kde, X): + def pdf(self, kde, X, kernel): """ Wraps the density evalution of scikit-learn's KDE. Scikit-learn returns log-scores (s), so this function returns :math:`e^{s}` :param kde: a previously fit KDE function :param X: the data for which the density is to be estimated + :param kernel: the kernel (valid ones are in KDEBase.KERNELS) :return: np.ndarray with the densities """ + if kernel == 'aitchison': + X = KDEBase.clr_transform(X) + return np.exp(kde.score_samples(X)) - def get_mixture_components(self, X, y, classes, bandwidth): + def get_mixture_components(self, X, y, classes, bandwidth, kernel): """ Returns an array containing the mixture components, i.e., the KDE functions for each class. @@ -62,6 +109,7 @@ class KDEBase: :param y: the class labels :param n_classes: integer, the number of classes :param bandwidth: float, the bandwidth of the kernel + :param kernel: the kernel (valid ones are in KDEBase.KERNELS) :return: a list of KernelDensity objects, each fitted with the corresponding class-specific covariates """ class_cond_X = [] @@ -69,8 +117,12 @@ class KDEBase: selX = X[y==cat] if selX.size==0: selX = [F.uniform_prevalence(len(classes))] + + # if kernel == 'aitchison': + # this is already done within get_kde_function + # selX = KDEBase.clr_transform(selX) class_cond_X.append(selX) - return [self.get_kde_function(X_cond_yi, bandwidth) for X_cond_yi in class_cond_X] + return [self.get_kde_function(X_cond_yi, bandwidth, kernel) for X_cond_yi in class_cond_X] class KDEyML(AggregativeSoftQuantifier, KDEBase): @@ -109,17 +161,19 @@ class KDEyML(AggregativeSoftQuantifier, KDEBase): are to be generated in a `k`-fold cross-validation manner (with this integer indicating the value for `k`); or as a tuple (X,y) defining the specific set of data to use for validation. :param bandwidth: float, the bandwidth of the Kernel + :param kernel: kernel of KDE, valid ones are in KDEBase.KERNELS :param random_state: a seed to be set before fitting any base quantifier (default None) """ - def __init__(self, classifier: BaseEstimator=None, fit_classifier=True, val_split=5, bandwidth=0.1, + def __init__(self, classifier: BaseEstimator=None, fit_classifier=True, val_split=5, bandwidth=0.1, kernel='gaussian', random_state=None): super().__init__(classifier, fit_classifier, val_split) self.bandwidth = KDEBase._check_bandwidth(bandwidth) + self.kernel = self._check_kernel(kernel) self.random_state=random_state def aggregation_fit(self, classif_predictions, labels): - self.mix_densities = self.get_mixture_components(classif_predictions, labels, self.classes_, self.bandwidth) + self.mix_densities = self.get_mixture_components(classif_predictions, labels, self.classes_, self.bandwidth, self.kernel) return self def aggregate(self, posteriors: np.ndarray): @@ -133,7 +187,7 @@ class KDEyML(AggregativeSoftQuantifier, KDEBase): with qp.util.temp_seed(self.random_state): epsilon = 1e-10 n_classes = len(self.mix_densities) - test_densities = [self.pdf(kde_i, posteriors) for kde_i in self.mix_densities] + test_densities = [self.pdf(kde_i, posteriors, self.kernel) for kde_i in self.mix_densities] def neg_loglikelihood(prev): test_mixture_likelihood = sum(prev_i * dens_i for prev_i, dens_i in zip (prev, test_densities)) From 29bb261f62ee2b503dabdf2fb1748bd34fe3ac2a Mon Sep 17 00:00:00 2001 From: Alejandro Moreo Date: Sat, 27 Sep 2025 17:47:52 +0200 Subject: [PATCH 02/41] testing kde normal --- KDEyAitchison/commons.py | 4 ++-- KDEyAitchison/show_results.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/KDEyAitchison/commons.py b/KDEyAitchison/commons.py index 0beaea0..9e7b105 100644 --- a/KDEyAitchison/commons.py +++ b/KDEyAitchison/commons.py @@ -5,7 +5,7 @@ from quapy.method.aggregative import EMQ, KDEyML from sklearn.linear_model import LogisticRegression METHODS = ['EMQ', - # 'KDEy-ML', + 'KDEy-ML', 'KDEy-MLA' ] @@ -17,7 +17,7 @@ hyper_LR = { } hyper_kde = { - 'bandwidth': np.linspace(0.01, 0.2, 20) + 'bandwidth': np.linspace(0.001, 0.5, 100) } hyper_kde_aitchison = { diff --git a/KDEyAitchison/show_results.py b/KDEyAitchison/show_results.py index db63e64..3c18d6b 100644 --- a/KDEyAitchison/show_results.py +++ b/KDEyAitchison/show_results.py @@ -24,7 +24,7 @@ if __name__ == '__main__': global_result_path = f'{result_dir}/{method}' print(f'Method\tDataset\tMAE\tMRAE\tKLD') for dataset in qp.datasets.UCI_MULTICLASS_DATASETS: - print(dataset) + # print(dataset) local_result_path = global_result_path + '_' + dataset if os.path.exists(local_result_path + '.dataframe'): report = pd.read_csv(local_result_path+'.dataframe') From 27afea8bf1f5f24b6901fa0d8c16b1eada597c81 Mon Sep 17 00:00:00 2001 From: Alejandro Moreo Date: Sat, 27 Sep 2025 18:01:21 +0200 Subject: [PATCH 03/41] with tables in pdf --- KDEyAitchison/commons.py | 10 +++++++--- KDEyAitchison/show_results.py | 4 ++++ 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/KDEyAitchison/commons.py b/KDEyAitchison/commons.py index 9e7b105..d7e20bd 100644 --- a/KDEyAitchison/commons.py +++ b/KDEyAitchison/commons.py @@ -1,11 +1,12 @@ import numpy as np import pandas as pd -from quapy.method.aggregative import EMQ, KDEyML +from quapy.method.aggregative import EMQ, KDEyML, PACC from sklearn.linear_model import LogisticRegression -METHODS = ['EMQ', - 'KDEy-ML', +METHODS = ['PACC', + 'EMQ', + # 'KDEy-ML', 'KDEy-MLA' ] @@ -37,6 +38,9 @@ def new_method(method, **lr_kwargs): elif method == 'EMQ': param_grid = hyper_LR quantifier = EMQ(lr) + elif method == 'PACC': + param_grid = hyper_LR + quantifier = PACC(lr) else: raise NotImplementedError('unknown method', method) diff --git a/KDEyAitchison/show_results.py b/KDEyAitchison/show_results.py index 3c18d6b..a4b41f5 100644 --- a/KDEyAitchison/show_results.py +++ b/KDEyAitchison/show_results.py @@ -8,6 +8,7 @@ import quapy as qp from quapy.model_selection import GridSearchQ from quapy.protocol import UPP from commons import METHODS, new_method, show_results +from new_table import LatexTable SEED = 1 @@ -16,6 +17,7 @@ SEED = 1 if __name__ == '__main__': print(qp.datasets.UCI_MULTICLASS_DATASETS) + table = LatexTable() for optim in ['mae']: result_dir = f'results/ucimulti/{optim}' @@ -29,6 +31,8 @@ if __name__ == '__main__': if os.path.exists(local_result_path + '.dataframe'): report = pd.read_csv(local_result_path+'.dataframe') print(f'{method}\t{dataset}\t{report["mae"].mean():.5f}') + table.add(benchmark=dataset, method=method, v=report["mae"].values) else: print(dataset, 'not found') + table.latexPDF('./tables/mae.pdf', landscape=False) From 606ec2b89c894892b9ff30035c9aab748df7d10c Mon Sep 17 00:00:00 2001 From: Alejandro Moreo Date: Sat, 27 Sep 2025 18:01:39 +0200 Subject: [PATCH 04/41] with tables in pdf --- KDEyAitchison/show_results.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/KDEyAitchison/show_results.py b/KDEyAitchison/show_results.py index a4b41f5..3830c15 100644 --- a/KDEyAitchison/show_results.py +++ b/KDEyAitchison/show_results.py @@ -34,5 +34,5 @@ if __name__ == '__main__': table.add(benchmark=dataset, method=method, v=report["mae"].values) else: print(dataset, 'not found') - table.latexPDF('./tables/mae.pdf', landscape=False) + table.latexPDF(f'./tables/{optim}.pdf', landscape=False) From e4cb7868c7cfa49bf8437349b7fa7b49ea3acb22 Mon Sep 17 00:00:00 2001 From: Alejandro Moreo Date: Sat, 27 Sep 2025 18:03:24 +0200 Subject: [PATCH 05/41] adding mrae --- KDEyAitchison/commons.py | 2 +- KDEyAitchison/ucimulti_experiments.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/KDEyAitchison/commons.py b/KDEyAitchison/commons.py index d7e20bd..ce41b42 100644 --- a/KDEyAitchison/commons.py +++ b/KDEyAitchison/commons.py @@ -6,7 +6,7 @@ from sklearn.linear_model import LogisticRegression METHODS = ['PACC', 'EMQ', - # 'KDEy-ML', + 'KDEy-ML', 'KDEy-MLA' ] diff --git a/KDEyAitchison/ucimulti_experiments.py b/KDEyAitchison/ucimulti_experiments.py index b882380..0852edb 100644 --- a/KDEyAitchison/ucimulti_experiments.py +++ b/KDEyAitchison/ucimulti_experiments.py @@ -20,7 +20,7 @@ if __name__ == '__main__': qp.environ['N_JOBS'] = -1 n_bags_val = 250 n_bags_test = 1000 - for optim in ['mae']: + for optim in ['mae', 'mrae']: result_dir = f'results/ucimulti/{optim}' os.makedirs(result_dir, exist_ok=True) From 79f3709e6fe031aa548e0ff5b5b605217ca9b973 Mon Sep 17 00:00:00 2001 From: Alejandro Moreo Date: Sat, 27 Sep 2025 18:12:56 +0200 Subject: [PATCH 06/41] adding mrae --- KDEyAitchison/show_results.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/KDEyAitchison/show_results.py b/KDEyAitchison/show_results.py index 3830c15..44ed661 100644 --- a/KDEyAitchison/show_results.py +++ b/KDEyAitchison/show_results.py @@ -33,6 +33,6 @@ if __name__ == '__main__': print(f'{method}\t{dataset}\t{report["mae"].mean():.5f}') table.add(benchmark=dataset, method=method, v=report["mae"].values) else: - print(dataset, 'not found') + print(dataset, 'not found for method', method) table.latexPDF(f'./tables/{optim}.pdf', landscape=False) From bd9f8e2cb454e87dc86942caaa701e1eec9a2cd2 Mon Sep 17 00:00:00 2001 From: Alejandro Moreo Date: Tue, 30 Sep 2025 12:02:13 +0200 Subject: [PATCH 07/41] show results fix --- KDEyAitchison/commons.py | 2 +- KDEyAitchison/show_results.py | 10 +++++----- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/KDEyAitchison/commons.py b/KDEyAitchison/commons.py index ce41b42..5e3db50 100644 --- a/KDEyAitchison/commons.py +++ b/KDEyAitchison/commons.py @@ -53,4 +53,4 @@ def show_results(result_path): pd.set_option('display.max_columns', None) pd.set_option('display.max_rows', None) pv = df.pivot_table(index='Dataset', columns="Method", values=["MAE", "MRAE"]) - print(pv) \ No newline at end of file + print(pv) diff --git a/KDEyAitchison/show_results.py b/KDEyAitchison/show_results.py index 44ed661..d5e867c 100644 --- a/KDEyAitchison/show_results.py +++ b/KDEyAitchison/show_results.py @@ -17,8 +17,8 @@ SEED = 1 if __name__ == '__main__': print(qp.datasets.UCI_MULTICLASS_DATASETS) - table = LatexTable() - for optim in ['mae']: + for optim in ['mae', 'mrae']: + table = LatexTable() result_dir = f'results/ucimulti/{optim}' for method in METHODS: @@ -30,9 +30,9 @@ if __name__ == '__main__': local_result_path = global_result_path + '_' + dataset if os.path.exists(local_result_path + '.dataframe'): report = pd.read_csv(local_result_path+'.dataframe') - print(f'{method}\t{dataset}\t{report["mae"].mean():.5f}') - table.add(benchmark=dataset, method=method, v=report["mae"].values) + print(f'{method}\t{dataset}\t{report[optim].mean():.5f}') + table.add(benchmark=dataset, method=method, v=report[optim].values) else: print(dataset, 'not found for method', method) - table.latexPDF(f'./tables/{optim}.pdf', landscape=False) + table.latexPDF(f'./tables/{optim}.pdf', landscape=False) From 3e7a431d26e047abf7c37dd36a31c4e8c104b368 Mon Sep 17 00:00:00 2001 From: Alejandro Moreo Date: Thu, 13 Nov 2025 18:43:03 +0100 Subject: [PATCH 08/41] bayeisan kdey --- BayesianKDEy/bayesian_kdey.py | 99 +++++++++++++++++++++++++++++++++++ BayesianKDEy/plot_simplex.py | 94 +++++++++++++++++++++++++++++++++ 2 files changed, 193 insertions(+) create mode 100644 BayesianKDEy/bayesian_kdey.py create mode 100644 BayesianKDEy/plot_simplex.py diff --git a/BayesianKDEy/bayesian_kdey.py b/BayesianKDEy/bayesian_kdey.py new file mode 100644 index 0000000..228f547 --- /dev/null +++ b/BayesianKDEy/bayesian_kdey.py @@ -0,0 +1,99 @@ +from sklearn.linear_model import LogisticRegression +import quapy as qp +from BayesianKDEy.plot_simplex import plot_prev_points, plot_prev_points_matplot +from quapy.functional import strprev +from quapy.method.aggregative import KDEyML +from quapy.protocol import UPP +import quapy.functional as F +import numpy as np +from tqdm import tqdm + + +def bayesian(kdes, data, prob_classifier, init=None, MAX_ITER=100_000, warmup=3_000): + """ + Bayes: + P(prev|data) = P(data|prev) P(prev) / P(data) + i.e., + posterior = likelihood * prior / evidence + we assume the likelihood be: + prev @ [kde_i_likelihood(data) 1..i..n] + prior be uniform in simplex + """ + def pdf(kde, X): + # todo: remove exp, since we are then doing the log every time? (not sure) + return np.exp(kde.score_samples(X)) + + X = prob_classifier.predict_proba(data) + test_densities = [pdf(kde_i, X) for kde_i in kdes] + + def log_likelihood(prev, epsilon=1e-8): + # test_likelihoods = sum(prev_i*dens_i for prev_i, dens_i in zip (prev, test_densities)) + test_likelihoods = prev @ test_densities + test_loglikelihood = np.log(test_likelihoods + epsilon) + return np.sum(test_loglikelihood) + + def log_prior(prev): + # todo: adapt to arbitrary prior knowledge (e.g., something around training prevalence) + return 1 # it is not 1 but we assume uniform, son anyway is an useless constant + + def sample_neighbour(prev): + rand_prev = F.uniform_prevalence_sampling(n_classes=len(prev), size=1) + rand_direction = rand_prev - prev + neighbour = F.normalize_prevalence(prev + rand_direction*0.05) + return neighbour + + n_classes = len(prob_classifier.classes_) + current_prev = F.uniform_prevalence(n_classes) if init is None else init + current_likelihood = log_likelihood(current_prev) + log_prior(current_prev) + + # Metropolis-Hastings + samples = [] + for _ in tqdm(range(MAX_ITER), total=MAX_ITER): + proposed_prev = sample_neighbour(current_prev) + + # probability of acceptance + proposed_likelihood = log_likelihood(proposed_prev) + log_prior(proposed_prev) + acceptance = proposed_likelihood - current_likelihood + + # decide acceptance + if np.log(np.random.rand()) < acceptance: + # accept + current_prev = proposed_prev + current_likelihood = proposed_likelihood + + samples.append(current_prev) + + # remove "warmup" initial iterations + samples = np.asarray(samples[warmup:]) + return samples + + +if __name__ == '__main__': + qp.environ["SAMPLE_SIZE"] = 500 + cls = LogisticRegression() + kdey = KDEyML(cls) + + train, test = qp.datasets.fetch_UCIMulticlassDataset('waveform-v1', standardize=True).train_test + + with qp.util.temp_seed(0): + print('fitting KDEy') + kdey.fit(*train.Xy) + + shifted = test.sampling(500, *[0.1, 0.1, 0.8]) + prev_hat = kdey.predict(shifted.X) + mae = qp.error.mae(shifted.prevalence(), prev_hat) + print(f'true_prev={strprev(shifted.prevalence())}, prev_hat={strprev(prev_hat)}, {mae=:.4f}') + + kdes = kdey.mix_densities + h = kdey.classifier + samples = bayesian(kdes, shifted.X, h, init=prev_hat, MAX_ITER=100_000, warmup=0) + + print(f'mean posterior {strprev(samples.mean(axis=0))}') + + plot_prev_points(samples, true_prev=shifted.prevalence()) + + + # report = qp.evaluation.evaluation_report(kdey, protocol=UPP(test), verbose=True) + # print(report.mean(numeric_only=True)) + + diff --git a/BayesianKDEy/plot_simplex.py b/BayesianKDEy/plot_simplex.py new file mode 100644 index 0000000..ca94ae1 --- /dev/null +++ b/BayesianKDEy/plot_simplex.py @@ -0,0 +1,94 @@ +import numpy as np +import matplotlib.pyplot as plt +from scipy.stats import gaussian_kde + + +def plot_prev_points(prevs, true_prev): + + def cartesian(p): + dim = p.shape[-1] + p = p.reshape(-1,dim) + x = p[:, 1] + p[:, 2] * 0.5 + y = p[:, 2] * np.sqrt(3) / 2 + return x, y + + # simplex coordinates + v1 = np.array([0, 0]) + v2 = np.array([1, 0]) + v3 = np.array([0.5, np.sqrt(3)/2]) + + # transform (a,b,c) -> Cartesian coordinates + x, y = cartesian(prevs) + + # Plot + fig, ax = plt.subplots(figsize=(6, 6)) + ax.scatter(x, y, s=50, alpha=0.05, edgecolors='none') + ax.scatter(*cartesian(true_prev), s=5, alpha=1) + + # edges + triangle = np.array([v1, v2, v3, v1]) + ax.plot(triangle[:, 0], triangle[:, 1], color='black') + + # vertex labels + ax.text(-0.05, -0.05, "y=0", ha='right', va='top') + ax.text(1.05, -0.05, "y=1", ha='left', va='top') + ax.text(0.5, np.sqrt(3)/2 + 0.05, "y=2", ha='center', va='bottom') + + ax.set_aspect('equal') + ax.axis('off') + plt.show() + + +def plot_prev_points_matplot(points): + + # project 2D + v1 = np.array([0, 0]) + v2 = np.array([1, 0]) + v3 = np.array([0.5, np.sqrt(3) / 2]) + x = points[:, 1] + points[:, 2] * 0.5 + y = points[:, 2] * np.sqrt(3) / 2 + + # kde + xy = np.vstack([x, y]) + kde = gaussian_kde(xy) + xmin, xmax = 0, 1 + ymin, ymax = 0, np.sqrt(3) / 2 + + # grid + xx, yy = np.mgrid[xmin:xmax:200j, ymin:ymax:200j] + positions = np.vstack([xx.ravel(), yy.ravel()]) + zz = np.reshape(kde(positions).T, xx.shape) + + # mask points in simplex + def in_triangle(x, y): + return (y >= 0) & (y <= np.sqrt(3) * np.minimum(x, 1 - x)) + + mask = in_triangle(xx, yy) + zz_masked = np.ma.array(zz, mask=~mask) + + # plot + fig, ax = plt.subplots(figsize=(6, 6)) + ax.imshow( + np.rot90(zz_masked), + cmap=plt.cm.viridis, + extent=[xmin, xmax, ymin, ymax], + alpha=0.8, + ) + + # Bordes del triángulo + triangle = np.array([v1, v2, v3, v1]) + ax.plot(triangle[:, 0], triangle[:, 1], color='black', lw=2) + + # Puntos (opcional) + ax.scatter(x, y, s=5, c='white', alpha=0.3) + + # Etiquetas + ax.text(-0.05, -0.05, "A (1,0,0)", ha='right', va='top') + ax.text(1.05, -0.05, "B (0,1,0)", ha='left', va='top') + ax.text(0.5, np.sqrt(3) / 2 + 0.05, "C (0,0,1)", ha='center', va='bottom') + + ax.set_aspect('equal') + ax.axis('off') + plt.show() + + From 400edfdb631a3dfb4ada9c66dc363437281cab07 Mon Sep 17 00:00:00 2001 From: Alejandro Moreo Date: Thu, 13 Nov 2025 19:43:07 +0100 Subject: [PATCH 09/41] bayesian plot --- BayesianKDEy/bayesian_kdey.py | 39 +++++++++++++++++++++-------------- BayesianKDEy/plot_simplex.py | 34 +++++++++++++++++++++++------- quapy/method/_kdey.py | 3 ++- 3 files changed, 52 insertions(+), 24 deletions(-) diff --git a/BayesianKDEy/bayesian_kdey.py b/BayesianKDEy/bayesian_kdey.py index 228f547..419b974 100644 --- a/BayesianKDEy/bayesian_kdey.py +++ b/BayesianKDEy/bayesian_kdey.py @@ -7,9 +7,10 @@ from quapy.protocol import UPP import quapy.functional as F import numpy as np from tqdm import tqdm +from scipy.stats import dirichlet -def bayesian(kdes, data, prob_classifier, init=None, MAX_ITER=100_000, warmup=3_000): +def bayesian(kdes, data, probabilistic_classifier, init=None, MAX_ITER=100_000, warmup=3_000): """ Bayes: P(prev|data) = P(data|prev) P(prev) / P(data) @@ -23,26 +24,32 @@ def bayesian(kdes, data, prob_classifier, init=None, MAX_ITER=100_000, warmup=3_ # todo: remove exp, since we are then doing the log every time? (not sure) return np.exp(kde.score_samples(X)) - X = prob_classifier.predict_proba(data) - test_densities = [pdf(kde_i, X) for kde_i in kdes] + X = probabilistic_classifier.predict_proba(data) + test_densities = np.asarray([pdf(kde_i, X) for kde_i in kdes]) - def log_likelihood(prev, epsilon=1e-8): - # test_likelihoods = sum(prev_i*dens_i for prev_i, dens_i in zip (prev, test_densities)) + def log_likelihood(prev, epsilon=1e-10): test_likelihoods = prev @ test_densities test_loglikelihood = np.log(test_likelihoods + epsilon) return np.sum(test_loglikelihood) - def log_prior(prev): + # def log_prior(prev): # todo: adapt to arbitrary prior knowledge (e.g., something around training prevalence) - return 1 # it is not 1 but we assume uniform, son anyway is an useless constant + # return 1/np.sum((prev-init)**2) # it is not 1 but we assume uniform, son anyway is an useless constant + + # def log_prior(prev, alpha_scale=1000): + # alpha = np.array(init) * alpha_scale + # return dirichlet.logpdf(prev, alpha) + + def log_prior(prev): + return 0 def sample_neighbour(prev): - rand_prev = F.uniform_prevalence_sampling(n_classes=len(prev), size=1) - rand_direction = rand_prev - prev - neighbour = F.normalize_prevalence(prev + rand_direction*0.05) + dir_noise = np.random.normal(scale=0.05, size=len(prev)) + # neighbour = F.normalize_prevalence(prev + dir_noise, method='clip') + neighbour = F.normalize_prevalence(prev + dir_noise, method='mapsimplex') return neighbour - n_classes = len(prob_classifier.classes_) + n_classes = len(probabilistic_classifier.classes_) current_prev = F.uniform_prevalence(n_classes) if init is None else init current_likelihood = log_likelihood(current_prev) + log_prior(current_prev) @@ -74,23 +81,25 @@ if __name__ == '__main__': kdey = KDEyML(cls) train, test = qp.datasets.fetch_UCIMulticlassDataset('waveform-v1', standardize=True).train_test + # train, test = qp.datasets.fetch_UCIMulticlassDataset('phishing', standardize=True).train_test - with qp.util.temp_seed(0): + with qp.util.temp_seed(2): print('fitting KDEy') kdey.fit(*train.Xy) - shifted = test.sampling(500, *[0.1, 0.1, 0.8]) + shifted = test.sampling(500, *[0.7, 0.1, 0.2]) prev_hat = kdey.predict(shifted.X) mae = qp.error.mae(shifted.prevalence(), prev_hat) print(f'true_prev={strprev(shifted.prevalence())}, prev_hat={strprev(prev_hat)}, {mae=:.4f}') kdes = kdey.mix_densities h = kdey.classifier - samples = bayesian(kdes, shifted.X, h, init=prev_hat, MAX_ITER=100_000, warmup=0) + samples = bayesian(kdes, shifted.X, h, init=None, MAX_ITER=5_000, warmup=1_000) print(f'mean posterior {strprev(samples.mean(axis=0))}') - plot_prev_points(samples, true_prev=shifted.prevalence()) + plot_prev_points(samples, true_prev=shifted.prevalence(), point_estim=prev_hat, train_prev=train.prevalence()) + # plot_prev_points_matplot(samples) # report = qp.evaluation.evaluation_report(kdey, protocol=UPP(test), verbose=True) diff --git a/BayesianKDEy/plot_simplex.py b/BayesianKDEy/plot_simplex.py index ca94ae1..4d3044b 100644 --- a/BayesianKDEy/plot_simplex.py +++ b/BayesianKDEy/plot_simplex.py @@ -3,7 +3,15 @@ import matplotlib.pyplot as plt from scipy.stats import gaussian_kde -def plot_prev_points(prevs, true_prev): +def plot_prev_points(prevs, true_prev, point_estim, train_prev): + plt.rcParams.update({ + 'font.size': 10, # tamaño base de todo el texto + 'axes.titlesize': 12, # título del eje + 'axes.labelsize': 10, # etiquetas de ejes + 'xtick.labelsize': 8, # etiquetas de ticks + 'ytick.labelsize': 8, + 'legend.fontsize': 9, # leyenda + }) def cartesian(p): dim = p.shape[-1] @@ -17,13 +25,13 @@ def plot_prev_points(prevs, true_prev): v2 = np.array([1, 0]) v3 = np.array([0.5, np.sqrt(3)/2]) - # transform (a,b,c) -> Cartesian coordinates - x, y = cartesian(prevs) - # Plot fig, ax = plt.subplots(figsize=(6, 6)) - ax.scatter(x, y, s=50, alpha=0.05, edgecolors='none') - ax.scatter(*cartesian(true_prev), s=5, alpha=1) + ax.scatter(*cartesian(prevs), s=50, alpha=0.05, edgecolors='none', label='samples') + ax.scatter(*cartesian(prevs.mean(axis=0)), s=10, alpha=1, label='sample-mean', edgecolors='black') + ax.scatter(*cartesian(true_prev), s=10, alpha=1, label='true-prev', edgecolors='black') + ax.scatter(*cartesian(point_estim), s=10, alpha=1, label='KDEy-estim', edgecolors='black') + ax.scatter(*cartesian(train_prev), s=10, alpha=1, label='train-prev', edgecolors='black') # edges triangle = np.array([v1, v2, v3, v1]) @@ -36,6 +44,13 @@ def plot_prev_points(prevs, true_prev): ax.set_aspect('equal') ax.axis('off') + plt.legend( + loc='center left', + bbox_to_anchor=(1.05, 0.5), + # ncol=3, + # frameon=False + ) + plt.tight_layout() plt.show() @@ -50,7 +65,7 @@ def plot_prev_points_matplot(points): # kde xy = np.vstack([x, y]) - kde = gaussian_kde(xy) + kde = gaussian_kde(xy, bw_method=0.25) xmin, xmax = 0, 1 ymin, ymax = 0, np.sqrt(3) / 2 @@ -91,4 +106,7 @@ def plot_prev_points_matplot(points): ax.axis('off') plt.show() - +if __name__ == '__main__': + n = 1000 + points = np.random.dirichlet([2, 3, 4], size=n) + plot_prev_points_matplot(points) diff --git a/quapy/method/_kdey.py b/quapy/method/_kdey.py index 5932c06..f941e30 100644 --- a/quapy/method/_kdey.py +++ b/quapy/method/_kdey.py @@ -190,7 +190,8 @@ class KDEyML(AggregativeSoftQuantifier, KDEBase): test_densities = [self.pdf(kde_i, posteriors, self.kernel) for kde_i in self.mix_densities] def neg_loglikelihood(prev): - test_mixture_likelihood = sum(prev_i * dens_i for prev_i, dens_i in zip (prev, test_densities)) + # test_mixture_likelihood = sum(prev_i * dens_i for prev_i, dens_i in zip (prev, test_densities)) + test_mixture_likelihood = prev @ test_densities test_loglikelihood = np.log(test_mixture_likelihood + epsilon) return -np.sum(test_loglikelihood) From 7ad5311fac7fe0ec529b2eef0e1e17899749f5a4 Mon Sep 17 00:00:00 2001 From: Alejandro Moreo Date: Fri, 14 Nov 2025 12:07:32 +0100 Subject: [PATCH 10/41] merging from devel --- BayesianKDEy/bayesian_kdey.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/BayesianKDEy/bayesian_kdey.py b/BayesianKDEy/bayesian_kdey.py index 419b974..11aba80 100644 --- a/BayesianKDEy/bayesian_kdey.py +++ b/BayesianKDEy/bayesian_kdey.py @@ -1,6 +1,7 @@ from sklearn.linear_model import LogisticRegression import quapy as qp from BayesianKDEy.plot_simplex import plot_prev_points, plot_prev_points_matplot +from method.confidence import ConfidenceIntervals from quapy.functional import strprev from quapy.method.aggregative import KDEyML from quapy.protocol import UPP @@ -80,14 +81,14 @@ if __name__ == '__main__': cls = LogisticRegression() kdey = KDEyML(cls) - train, test = qp.datasets.fetch_UCIMulticlassDataset('waveform-v1', standardize=True).train_test - # train, test = qp.datasets.fetch_UCIMulticlassDataset('phishing', standardize=True).train_test + train, test = qp.datasets.fetch_UCIMulticlassDataset('dry-bean', standardize=True).train_test with qp.util.temp_seed(2): print('fitting KDEy') kdey.fit(*train.Xy) - shifted = test.sampling(500, *[0.7, 0.1, 0.2]) + # shifted = test.sampling(500, *[0.7, 0.1, 0.2]) + shifted = test.sampling(500, *test.prevalence()[::-1]) prev_hat = kdey.predict(shifted.X) mae = qp.error.mae(shifted.prevalence(), prev_hat) print(f'true_prev={strprev(shifted.prevalence())}, prev_hat={strprev(prev_hat)}, {mae=:.4f}') @@ -97,9 +98,12 @@ if __name__ == '__main__': samples = bayesian(kdes, shifted.X, h, init=None, MAX_ITER=5_000, warmup=1_000) print(f'mean posterior {strprev(samples.mean(axis=0))}') + conf_interval = ConfidenceIntervals(samples, confidence_level=0.95) + print() - plot_prev_points(samples, true_prev=shifted.prevalence(), point_estim=prev_hat, train_prev=train.prevalence()) - # plot_prev_points_matplot(samples) + if train.n_classes == 3: + plot_prev_points(samples, true_prev=shifted.prevalence(), point_estim=prev_hat, train_prev=train.prevalence()) + # plot_prev_points_matplot(samples) # report = qp.evaluation.evaluation_report(kdey, protocol=UPP(test), verbose=True) From ccb634fae5edcb2f16573d8a3abdeb757fde0f8f Mon Sep 17 00:00:00 2001 From: Alejandro Moreo Date: Fri, 14 Nov 2025 16:09:34 +0100 Subject: [PATCH 11/41] step rate adaptation --- BayesianKDEy/bayesian_kdey.py | 49 +++++++++++++++++++++++------------ BayesianKDEy/plot_simplex.py | 2 +- 2 files changed, 33 insertions(+), 18 deletions(-) diff --git a/BayesianKDEy/bayesian_kdey.py b/BayesianKDEy/bayesian_kdey.py index 11aba80..573b86b 100644 --- a/BayesianKDEy/bayesian_kdey.py +++ b/BayesianKDEy/bayesian_kdey.py @@ -11,7 +11,7 @@ from tqdm import tqdm from scipy.stats import dirichlet -def bayesian(kdes, data, probabilistic_classifier, init=None, MAX_ITER=100_000, warmup=3_000): +def bayesian(kdey, data, probabilistic_classifier, init=None, MAX_ITER=100_000, warmup=3_000): """ Bayes: P(prev|data) = P(data|prev) P(prev) / P(data) @@ -26,6 +26,7 @@ def bayesian(kdes, data, probabilistic_classifier, init=None, MAX_ITER=100_000, return np.exp(kde.score_samples(X)) X = probabilistic_classifier.predict_proba(data) + kdes = kdey.mix_densities test_densities = np.asarray([pdf(kde_i, X) for kde_i in kdes]) def log_likelihood(prev, epsilon=1e-10): @@ -44,9 +45,9 @@ def bayesian(kdes, data, probabilistic_classifier, init=None, MAX_ITER=100_000, def log_prior(prev): return 0 - def sample_neighbour(prev): - dir_noise = np.random.normal(scale=0.05, size=len(prev)) - # neighbour = F.normalize_prevalence(prev + dir_noise, method='clip') + def sample_neighbour(prev, step_size=0.05): + # random-walk Metropolis-Hastings + dir_noise = np.random.normal(scale=step_size, size=len(prev)) neighbour = F.normalize_prevalence(prev + dir_noise, method='mapsimplex') return neighbour @@ -54,22 +55,33 @@ def bayesian(kdes, data, probabilistic_classifier, init=None, MAX_ITER=100_000, current_prev = F.uniform_prevalence(n_classes) if init is None else init current_likelihood = log_likelihood(current_prev) + log_prior(current_prev) - # Metropolis-Hastings + # Metropolis-Hastings with adaptive rate + step_size = 0.05 + target_acceptance = 0.3 + adapt_rate = 0.01 + acceptance_history = [] + samples = [] - for _ in tqdm(range(MAX_ITER), total=MAX_ITER): - proposed_prev = sample_neighbour(current_prev) + for i in tqdm(range(MAX_ITER), total=MAX_ITER): + proposed_prev = sample_neighbour(current_prev, step_size) # probability of acceptance proposed_likelihood = log_likelihood(proposed_prev) + log_prior(proposed_prev) acceptance = proposed_likelihood - current_likelihood # decide acceptance - if np.log(np.random.rand()) < acceptance: - # accept + accepted = np.log(np.random.rand()) < acceptance + if accepted: current_prev = proposed_prev current_likelihood = proposed_likelihood samples.append(current_prev) + acceptance_history.append(1. if accepted else 0.) + + if i < warmup and i%10==0 and len(acceptance_history)>=100: + recent_accept_rate = np.mean(acceptance_history[-100:]) + # print(f'{i=} recent_accept_rate={recent_accept_rate:.4f} (current step_size={step_size:.4f})') + step_size *= np.exp(adapt_rate * (recent_accept_rate - target_acceptance)) # remove "warmup" initial iterations samples = np.asarray(samples[warmup:]) @@ -81,31 +93,34 @@ if __name__ == '__main__': cls = LogisticRegression() kdey = KDEyML(cls) - train, test = qp.datasets.fetch_UCIMulticlassDataset('dry-bean', standardize=True).train_test + train, test = qp.datasets.fetch_UCIMulticlassDataset('academic-success', standardize=True).train_test with qp.util.temp_seed(2): print('fitting KDEy') kdey.fit(*train.Xy) # shifted = test.sampling(500, *[0.7, 0.1, 0.2]) - shifted = test.sampling(500, *test.prevalence()[::-1]) + # shifted = test.sampling(500, *test.prevalence()[::-1]) + shifted = test.sampling(500, *F.uniform_prevalence_sampling(train.n_classes)) prev_hat = kdey.predict(shifted.X) mae = qp.error.mae(shifted.prevalence(), prev_hat) - print(f'true_prev={strprev(shifted.prevalence())}, prev_hat={strprev(prev_hat)}, {mae=:.4f}') + print(f'true_prev={strprev(shifted.prevalence())}') + print(f'prev_hat={strprev(prev_hat)}, {mae=:.4f}') - kdes = kdey.mix_densities h = kdey.classifier - samples = bayesian(kdes, shifted.X, h, init=None, MAX_ITER=5_000, warmup=1_000) + samples = bayesian(kdey, shifted.X, h, init=None, MAX_ITER=5_000, warmup=3_000) - print(f'mean posterior {strprev(samples.mean(axis=0))}') conf_interval = ConfidenceIntervals(samples, confidence_level=0.95) - print() + mae = qp.error.mae(shifted.prevalence(), conf_interval.point_estimate()) + print(f'mean posterior {strprev(samples.mean(axis=0))}, {mae=:.4f}') + print(f'CI={conf_interval}') + print(f'\tcontains true={conf_interval.coverage(true_value=shifted.prevalence())==1}') + print(f'\tamplitude={conf_interval.montecarlo_proportion(50_000)*100.:.20f}%') if train.n_classes == 3: plot_prev_points(samples, true_prev=shifted.prevalence(), point_estim=prev_hat, train_prev=train.prevalence()) # plot_prev_points_matplot(samples) - # report = qp.evaluation.evaluation_report(kdey, protocol=UPP(test), verbose=True) # print(report.mean(numeric_only=True)) diff --git a/BayesianKDEy/plot_simplex.py b/BayesianKDEy/plot_simplex.py index 4d3044b..515db3f 100644 --- a/BayesianKDEy/plot_simplex.py +++ b/BayesianKDEy/plot_simplex.py @@ -27,7 +27,7 @@ def plot_prev_points(prevs, true_prev, point_estim, train_prev): # Plot fig, ax = plt.subplots(figsize=(6, 6)) - ax.scatter(*cartesian(prevs), s=50, alpha=0.05, edgecolors='none', label='samples') + ax.scatter(*cartesian(prevs), s=10, alpha=0.5, edgecolors='none', label='samples') ax.scatter(*cartesian(prevs.mean(axis=0)), s=10, alpha=1, label='sample-mean', edgecolors='black') ax.scatter(*cartesian(true_prev), s=10, alpha=1, label='true-prev', edgecolors='black') ax.scatter(*cartesian(point_estim), s=10, alpha=1, label='KDEy-estim', edgecolors='black') From 4255098ba77365b6afc52ca2bbfd63fbbe7b3fb8 Mon Sep 17 00:00:00 2001 From: Alejandro Moreo Date: Sat, 15 Nov 2025 21:47:30 +0100 Subject: [PATCH 12/41] creating class BayesianKDEy --- BayesianKDEy/_bayeisan_kdey.py | 148 ++++++++++++++++++ BayesianKDEy/bayesian_kdey.py | 127 --------------- .../quick_experiment_bayesian_kdey.py | 56 +++++++ 3 files changed, 204 insertions(+), 127 deletions(-) create mode 100644 BayesianKDEy/_bayeisan_kdey.py delete mode 100644 BayesianKDEy/bayesian_kdey.py create mode 100644 BayesianKDEy/quick_experiment_bayesian_kdey.py diff --git a/BayesianKDEy/_bayeisan_kdey.py b/BayesianKDEy/_bayeisan_kdey.py new file mode 100644 index 0000000..d586bef --- /dev/null +++ b/BayesianKDEy/_bayeisan_kdey.py @@ -0,0 +1,148 @@ +from sklearn.base import BaseEstimator +import numpy as np +from quapy.method._kdey import KDEBase +from quapy.method.confidence import WithConfidenceABC, ConfidenceRegionABC +from quapy.method.aggregative import AggregativeSoftQuantifier +from tqdm import tqdm +import quapy.functional as F + + +class BayesianKDEy(AggregativeSoftQuantifier, KDEBase, WithConfidenceABC): + """ + `Bayesian version of KDEy. + + :param classifier: a scikit-learn's BaseEstimator, or None, in which case the classifier is taken to be + the one indicated in `qp.environ['DEFAULT_CLS']` + :param val_split: specifies the data used for generating classifier predictions. This specification + can be made as float in (0, 1) indicating the proportion of stratified held-out validation set to + be extracted from the training set; or as an integer (default 5), indicating that the predictions + are to be generated in a `k`-fold cross-validation manner (with this integer indicating the value + for `k`); or as a tuple `(X,y)` defining the specific set of data to use for validation. Set to + None when the method does not require any validation data, in order to avoid that some portion of + the training data be wasted. + :param num_warmup: number of warmup iterations for the MCMC sampler (default 500) + :param num_samples: number of samples to draw from the posterior (default 1000) + :param mcmc_seed: random seed for the MCMC sampler (default 0) + :param confidence_level: float in [0,1] to construct a confidence region around the point estimate (default 0.95) + :param region: string, set to `intervals` for constructing confidence intervals (default), or to + `ellipse` for constructing an ellipse in the probability simplex, or to `ellipse-clr` for + constructing an ellipse in the Centered-Log Ratio (CLR) unconstrained space. + :param verbose: bool, whether to display progress bar + """ + def __init__(self, + classifier: BaseEstimator=None, + fit_classifier=True, + val_split: int = 5, + kernel='gaussian', + bandwidth=0.1, + num_warmup: int = 500, + num_samples: int = 1_000, + mcmc_seed: int = 0, + confidence_level: float = 0.95, + region: str = 'intervals', + verbose: bool = False): + + if num_warmup <= 0: + raise ValueError(f'parameter {num_warmup=} must be a positive integer') + if num_samples <= 0: + raise ValueError(f'parameter {num_samples=} must be a positive integer') + + super().__init__(classifier, fit_classifier, val_split) + self.bandwidth = KDEBase._check_bandwidth(bandwidth) + self.kernel = self._check_kernel(kernel) + self.num_warmup = num_warmup + self.num_samples = num_samples + self.mcmc_seed = mcmc_seed + self.confidence_level = confidence_level + self.region = region + self.verbose = verbose + + def aggregation_fit(self, classif_predictions, labels): + self.mix_densities = self.get_mixture_components(classif_predictions, labels, self.classes_, self.bandwidth, self.kernel) + return self + + def aggregate(self, classif_predictions): + self.prevalence_samples = self._bayesian_kde(classif_predictions, init=None, verbose=self.verbose) + return self.prevalence_samples.mean(axis=0) + + def predict_conf(self, instances, confidence_level=None) -> (np.ndarray, ConfidenceRegionABC): + if confidence_level is None: + confidence_level = self.confidence_level + classif_predictions = self.classify(instances) + point_estimate = self.aggregate(classif_predictions) + samples = self.prevalence_samples # available after calling "aggregate" function + region = WithConfidenceABC.construct_region(samples, confidence_level=confidence_level, method=self.region) + return point_estimate, region + + def _bayesian_kde(self, X_probs, init=None, verbose=False): + """ + Bayes: + P(prev|data) = P(data|prev) P(prev) / P(data) + i.e., + posterior = likelihood * prior / evidence + we assume the likelihood be: + prev @ [kde_i_likelihood(data) 1..i..n] + prior be uniform in simplex + """ + + rng = np.random.default_rng(self.mcmc_seed) + kdes = self.mix_densities + test_densities = np.asarray([self.pdf(kde_i, X_probs, self.kernel) for kde_i in kdes]) + + def log_likelihood(prev, epsilon=1e-10): + test_likelihoods = prev @ test_densities + test_loglikelihood = np.log(test_likelihoods + epsilon) + return np.sum(test_loglikelihood) + + # def log_prior(prev): + # todo: adapt to arbitrary prior knowledge (e.g., something around training prevalence) + # return 1/np.sum((prev-init)**2) # it is not 1 but we assume uniform, son anyway is an useless constant + + # def log_prior(prev, alpha_scale=1000): + # alpha = np.array(init) * alpha_scale + # return dirichlet.logpdf(prev, alpha) + + def log_prior(prev): + return 0 + + def sample_neighbour(prev, step_size=0.05): + # random-walk Metropolis-Hastings + dir_noise = rng.normal(scale=step_size, size=len(prev)) + neighbour = F.normalize_prevalence(prev + dir_noise, method='mapsimplex') + return neighbour + + n_classes = X_probs.shape[1] + current_prev = F.uniform_prevalence(n_classes) if init is None else init + current_likelihood = log_likelihood(current_prev) + log_prior(current_prev) + + # Metropolis-Hastings with adaptive rate + step_size = 0.05 + target_acceptance = 0.3 + adapt_rate = 0.01 + acceptance_history = [] + + samples = [] + total_steps = self.num_samples + self.num_warmup + for i in tqdm(range(total_steps), total=total_steps, disable=not verbose): + proposed_prev = sample_neighbour(current_prev, step_size) + + # probability of acceptance + proposed_likelihood = log_likelihood(proposed_prev) + log_prior(proposed_prev) + acceptance = proposed_likelihood - current_likelihood + + # decide acceptance + accepted = np.log(rng.random()) < acceptance + if accepted: + current_prev = proposed_prev + current_likelihood = proposed_likelihood + + samples.append(current_prev) + acceptance_history.append(1. if accepted else 0.) + + if i < self.num_warmup and i%10==0 and len(acceptance_history)>=100: + recent_accept_rate = np.mean(acceptance_history[-100:]) + step_size *= np.exp(adapt_rate * (recent_accept_rate - target_acceptance)) + + # remove "warmup" initial iterations + samples = np.asarray(samples[self.num_warmup:]) + return samples \ No newline at end of file diff --git a/BayesianKDEy/bayesian_kdey.py b/BayesianKDEy/bayesian_kdey.py deleted file mode 100644 index 573b86b..0000000 --- a/BayesianKDEy/bayesian_kdey.py +++ /dev/null @@ -1,127 +0,0 @@ -from sklearn.linear_model import LogisticRegression -import quapy as qp -from BayesianKDEy.plot_simplex import plot_prev_points, plot_prev_points_matplot -from method.confidence import ConfidenceIntervals -from quapy.functional import strprev -from quapy.method.aggregative import KDEyML -from quapy.protocol import UPP -import quapy.functional as F -import numpy as np -from tqdm import tqdm -from scipy.stats import dirichlet - - -def bayesian(kdey, data, probabilistic_classifier, init=None, MAX_ITER=100_000, warmup=3_000): - """ - Bayes: - P(prev|data) = P(data|prev) P(prev) / P(data) - i.e., - posterior = likelihood * prior / evidence - we assume the likelihood be: - prev @ [kde_i_likelihood(data) 1..i..n] - prior be uniform in simplex - """ - def pdf(kde, X): - # todo: remove exp, since we are then doing the log every time? (not sure) - return np.exp(kde.score_samples(X)) - - X = probabilistic_classifier.predict_proba(data) - kdes = kdey.mix_densities - test_densities = np.asarray([pdf(kde_i, X) for kde_i in kdes]) - - def log_likelihood(prev, epsilon=1e-10): - test_likelihoods = prev @ test_densities - test_loglikelihood = np.log(test_likelihoods + epsilon) - return np.sum(test_loglikelihood) - - # def log_prior(prev): - # todo: adapt to arbitrary prior knowledge (e.g., something around training prevalence) - # return 1/np.sum((prev-init)**2) # it is not 1 but we assume uniform, son anyway is an useless constant - - # def log_prior(prev, alpha_scale=1000): - # alpha = np.array(init) * alpha_scale - # return dirichlet.logpdf(prev, alpha) - - def log_prior(prev): - return 0 - - def sample_neighbour(prev, step_size=0.05): - # random-walk Metropolis-Hastings - dir_noise = np.random.normal(scale=step_size, size=len(prev)) - neighbour = F.normalize_prevalence(prev + dir_noise, method='mapsimplex') - return neighbour - - n_classes = len(probabilistic_classifier.classes_) - current_prev = F.uniform_prevalence(n_classes) if init is None else init - current_likelihood = log_likelihood(current_prev) + log_prior(current_prev) - - # Metropolis-Hastings with adaptive rate - step_size = 0.05 - target_acceptance = 0.3 - adapt_rate = 0.01 - acceptance_history = [] - - samples = [] - for i in tqdm(range(MAX_ITER), total=MAX_ITER): - proposed_prev = sample_neighbour(current_prev, step_size) - - # probability of acceptance - proposed_likelihood = log_likelihood(proposed_prev) + log_prior(proposed_prev) - acceptance = proposed_likelihood - current_likelihood - - # decide acceptance - accepted = np.log(np.random.rand()) < acceptance - if accepted: - current_prev = proposed_prev - current_likelihood = proposed_likelihood - - samples.append(current_prev) - acceptance_history.append(1. if accepted else 0.) - - if i < warmup and i%10==0 and len(acceptance_history)>=100: - recent_accept_rate = np.mean(acceptance_history[-100:]) - # print(f'{i=} recent_accept_rate={recent_accept_rate:.4f} (current step_size={step_size:.4f})') - step_size *= np.exp(adapt_rate * (recent_accept_rate - target_acceptance)) - - # remove "warmup" initial iterations - samples = np.asarray(samples[warmup:]) - return samples - - -if __name__ == '__main__': - qp.environ["SAMPLE_SIZE"] = 500 - cls = LogisticRegression() - kdey = KDEyML(cls) - - train, test = qp.datasets.fetch_UCIMulticlassDataset('academic-success', standardize=True).train_test - - with qp.util.temp_seed(2): - print('fitting KDEy') - kdey.fit(*train.Xy) - - # shifted = test.sampling(500, *[0.7, 0.1, 0.2]) - # shifted = test.sampling(500, *test.prevalence()[::-1]) - shifted = test.sampling(500, *F.uniform_prevalence_sampling(train.n_classes)) - prev_hat = kdey.predict(shifted.X) - mae = qp.error.mae(shifted.prevalence(), prev_hat) - print(f'true_prev={strprev(shifted.prevalence())}') - print(f'prev_hat={strprev(prev_hat)}, {mae=:.4f}') - - h = kdey.classifier - samples = bayesian(kdey, shifted.X, h, init=None, MAX_ITER=5_000, warmup=3_000) - - conf_interval = ConfidenceIntervals(samples, confidence_level=0.95) - mae = qp.error.mae(shifted.prevalence(), conf_interval.point_estimate()) - print(f'mean posterior {strprev(samples.mean(axis=0))}, {mae=:.4f}') - print(f'CI={conf_interval}') - print(f'\tcontains true={conf_interval.coverage(true_value=shifted.prevalence())==1}') - print(f'\tamplitude={conf_interval.montecarlo_proportion(50_000)*100.:.20f}%') - - if train.n_classes == 3: - plot_prev_points(samples, true_prev=shifted.prevalence(), point_estim=prev_hat, train_prev=train.prevalence()) - # plot_prev_points_matplot(samples) - - # report = qp.evaluation.evaluation_report(kdey, protocol=UPP(test), verbose=True) - # print(report.mean(numeric_only=True)) - - diff --git a/BayesianKDEy/quick_experiment_bayesian_kdey.py b/BayesianKDEy/quick_experiment_bayesian_kdey.py new file mode 100644 index 0000000..b63665c --- /dev/null +++ b/BayesianKDEy/quick_experiment_bayesian_kdey.py @@ -0,0 +1,56 @@ +import warnings + +from sklearn.linear_model import LogisticRegression +import quapy as qp +from BayesianKDEy._bayeisan_kdey import BayesianKDEy +from BayesianKDEy.plot_simplex import plot_prev_points, plot_prev_points_matplot +from method.confidence import ConfidenceIntervals +from quapy.functional import strprev +from quapy.method.aggregative import KDEyML +from quapy.protocol import UPP +import quapy.functional as F +import numpy as np +from tqdm import tqdm +from scipy.stats import dirichlet + + + + +if __name__ == '__main__': + qp.environ["SAMPLE_SIZE"] = 500 + cls = LogisticRegression() + bayes_kdey = BayesianKDEy(cls, bandwidth=.3, kernel='aitchison', mcmc_seed=0) + + datasets = qp.datasets.UCI_BINARY_DATASETS + train, test = qp.datasets.fetch_UCIBinaryDataset(datasets[0]).train_test + + # train, test = qp.datasets.fetch_UCIMulticlassDataset('academic-success', standardize=True).train_test + + with qp.util.temp_seed(0): + print('fitting KDEy') + bayes_kdey.fit(*train.Xy) + + shifted = test.sampling(500, *[0.2, 0.8]) + # shifted = test.sampling(500, *test.prevalence()[::-1]) + # shifted = test.sampling(500, *F.uniform_prevalence_sampling(train.n_classes)) + prev_hat = bayes_kdey.predict(shifted.X) + mae = qp.error.mae(shifted.prevalence(), prev_hat) + print(f'true_prev={strprev(shifted.prevalence())}') + print(f'prev_hat={strprev(prev_hat)}, {mae=:.4f}') + + prev_hat, conf_interval = bayes_kdey.predict_conf(shifted.X) + + mae = qp.error.mae(shifted.prevalence(), prev_hat) + print(f'mean posterior {strprev(prev_hat)}, {mae=:.4f}') + print(f'CI={conf_interval}') + print(f'\tcontains true={conf_interval.coverage(true_value=shifted.prevalence())==1}') + print(f'\tamplitude={conf_interval.montecarlo_proportion(50_000)*100.:.3f}%') + + if train.n_classes == 3: + plot_prev_points(bayes_kdey.prevalence_samples, true_prev=shifted.prevalence(), point_estim=prev_hat, train_prev=train.prevalence()) + # plot_prev_points_matplot(samples) + + # report = qp.evaluation.evaluation_report(kdey, protocol=UPP(test), verbose=True) + # print(report.mean(numeric_only=True)) + + From 2f83a520c7773f7bbe8fcb16fe19f0a1ee6c369b Mon Sep 17 00:00:00 2001 From: Alejandro Moreo Date: Sun, 16 Nov 2025 12:42:26 +0100 Subject: [PATCH 13/41] drafting experiments --- BayesianKDEy/full_experiments.py | 75 ++++++++++++++++++++++++++++++++ 1 file changed, 75 insertions(+) create mode 100644 BayesianKDEy/full_experiments.py diff --git a/BayesianKDEy/full_experiments.py b/BayesianKDEy/full_experiments.py new file mode 100644 index 0000000..ec5acaa --- /dev/null +++ b/BayesianKDEy/full_experiments.py @@ -0,0 +1,75 @@ +import warnings + +from sklearn.linear_model import LogisticRegression +import quapy as qp +from BayesianKDEy._bayeisan_kdey import BayesianKDEy +from method.aggregative import AggregativeQuantifier +from quapy.model_selection import GridSearchQ +from quapy.data import Dataset +# from BayesianKDEy.plot_simplex import plot_prev_points, plot_prev_points_matplot +from quapy.method.confidence import ConfidenceIntervals, BayesianCC, PQ +from quapy.functional import strprev +from quapy.method.aggregative import KDEyML +from quapy.protocol import UPP +import quapy.functional as F +import numpy as np +from tqdm import tqdm +from scipy.stats import dirichlet + + +def new_classifier(): + lr_hyper = { + 'classifier__C': np.logspace(-3,3,7), + 'classifier__class_weight': ['balanced', None] + } + lr = LogisticRegression() + return lr, lr_hyper + +def methods(): + cls, cls_hyper = new_classifier() + # yield 'BayesianACC', BayesianCC(cls, mcmc_seed=0), cls_hyper + # yield 'BayesianHDy', PQ(cls, stan_seed=0), {**cls_hyper, 'n_bins': [3,4,5,8,16,32]} + yield 'BayesianKDEy', BayesianKDEy(cls, mcmc_seed=0), {**cls_hyper, 'bandwidth': [0.001, 0.005, 0.01, 0.05, 0.1, 0.2]} + + +def experiment(dataset: Dataset, method: AggregativeQuantifier, method_name: str, grid: dict): + with qp.util.temp_seed(0): + # model selection + train, test = dataset.train_test + train, val = train.split_stratified(train_prop=0.6, random_state=0) + mod_sel = GridSearchQ( + model=method, + param_grid=grid, + protocol=qp.protocol.UPP(val, repeats=250, random_state=0), + refit=True, + n_jobs=-1, + verbose=True + ).fit(*train.Xy) + optim_quantifier = mod_sel.best_model() + optim_hyper = mod_sel.best_params_ + print(f'model_selection for {method_name} ended: chosen hyper-params {optim_hyper}') + + # test + report = qp.evaluation.evaluation_report( + optim_quantifier, + protocol=UPP(test, repeats=500, random_state=0), + verbose=True + ) + + return report + + + +if __name__ == '__main__': + qp.environ["SAMPLE_SIZE"] = 500 + + datasets = qp.datasets.UCI_BINARY_DATASETS + for dataset in datasets: + data = qp.datasets.fetch_UCIBinaryDataset(dataset) + for method_name, method, hyper_params in methods(): + report = experiment(data, method, method_name, hyper_params) + print(f'{method_name=} got {report.mean(numeric_only=True)}') + + + + From 12b431ef4bf9ab291c8dc790561d426b38862567 Mon Sep 17 00:00:00 2001 From: Alejandro Moreo Date: Mon, 17 Nov 2025 12:22:40 +0100 Subject: [PATCH 14/41] scripting experiments binary and multiclass --- BayesianKDEy/full_experiments.py | 128 +++++++++++++++++++++++-------- BayesianKDEy/generate_results.py | 31 ++++++++ quapy/method/confidence.py | 2 +- 3 files changed, 126 insertions(+), 35 deletions(-) create mode 100644 BayesianKDEy/generate_results.py diff --git a/BayesianKDEy/full_experiments.py b/BayesianKDEy/full_experiments.py index ec5acaa..2fd7080 100644 --- a/BayesianKDEy/full_experiments.py +++ b/BayesianKDEy/full_experiments.py @@ -1,13 +1,16 @@ +import os import warnings +from os.path import join +from pathlib import Path from sklearn.linear_model import LogisticRegression import quapy as qp from BayesianKDEy._bayeisan_kdey import BayesianKDEy -from method.aggregative import AggregativeQuantifier +from quapy.method.base import BinaryQuantifier from quapy.model_selection import GridSearchQ from quapy.data import Dataset # from BayesianKDEy.plot_simplex import plot_prev_points, plot_prev_points_matplot -from quapy.method.confidence import ConfidenceIntervals, BayesianCC, PQ +from quapy.method.confidence import ConfidenceIntervals, BayesianCC, PQ, WithConfidenceABC from quapy.functional import strprev from quapy.method.aggregative import KDEyML from quapy.protocol import UPP @@ -15,60 +18,117 @@ import quapy.functional as F import numpy as np from tqdm import tqdm from scipy.stats import dirichlet +from collections import defaultdict +from time import time +from sklearn.base import clone def new_classifier(): - lr_hyper = { - 'classifier__C': np.logspace(-3,3,7), - 'classifier__class_weight': ['balanced', None] - } + # lr_hyper = { + # 'classifier__C': np.logspace(-3,3,7), + # 'classifier__class_weight': ['balanced', None] + # } + lr_hyper = {} lr = LogisticRegression() return lr, lr_hyper def methods(): cls, cls_hyper = new_classifier() - # yield 'BayesianACC', BayesianCC(cls, mcmc_seed=0), cls_hyper - # yield 'BayesianHDy', PQ(cls, stan_seed=0), {**cls_hyper, 'n_bins': [3,4,5,8,16,32]} - yield 'BayesianKDEy', BayesianKDEy(cls, mcmc_seed=0), {**cls_hyper, 'bandwidth': [0.001, 0.005, 0.01, 0.05, 0.1, 0.2]} + # yield 'BayesianACC', BayesianCC(clone(cls), mcmc_seed=0), cls_hyper + # yield 'BayesianHDy', PQ(clone(cls), stan_seed=0), {**cls_hyper, 'n_bins': [3,4,5,8,16,32]} + yield 'BayesianKDEy', BayesianKDEy(clone(cls), mcmc_seed=0), {**cls_hyper, 'bandwidth': [0.001, 0.005, 0.01, 0.05, 0.1, 0.2]} -def experiment(dataset: Dataset, method: AggregativeQuantifier, method_name: str, grid: dict): +def experiment(dataset: Dataset, method: WithConfidenceABC, grid: dict): with qp.util.temp_seed(0): # model selection train, test = dataset.train_test - train, val = train.split_stratified(train_prop=0.6, random_state=0) - mod_sel = GridSearchQ( - model=method, - param_grid=grid, - protocol=qp.protocol.UPP(val, repeats=250, random_state=0), - refit=True, - n_jobs=-1, - verbose=True - ).fit(*train.Xy) - optim_quantifier = mod_sel.best_model() - optim_hyper = mod_sel.best_params_ - print(f'model_selection for {method_name} ended: chosen hyper-params {optim_hyper}') + train_prevalence = train.prevalence() + if len(grid)>0: + train, val = train.split_stratified(train_prop=0.6, random_state=0) + mod_sel = GridSearchQ( + model=method, + param_grid=grid, + protocol=qp.protocol.UPP(val, repeats=250, random_state=0), + refit=True, + n_jobs=-1, + verbose=True + ).fit(*train.Xy) + optim_quantifier = mod_sel.best_model() + best_params = mod_sel.best_params_ + best_score = mod_sel.best_score_ + tr_time = mod_sel.refit_time_ + else: + t_init = time() + method.fit(*train.Xy) + tr_time = time() - t_init + best_params, best_score = {}, -1 + optim_quantifier = method # test - report = qp.evaluation.evaluation_report( - optim_quantifier, - protocol=UPP(test, repeats=500, random_state=0), - verbose=True - ) + results = defaultdict(list) + test_generator = UPP(test, repeats=500, random_state=0) + for i, (sample_X, true_prevalence) in enumerate(test_generator()): + t_init = time() + point_estimate, region = optim_quantifier.predict_conf(sample_X) + ttime = time()-t_init + results['true-prevs'].append(true_prevalence) + results['point-estim'].append(point_estimate) + results['shift'].append(qp.error.ae(true_prevalence, train_prevalence)) + results['ae'].append(qp.error.ae(prevs_true=true_prevalence, prevs_hat=point_estimate)) + results['rae'].append(qp.error.rae(prevs_true=true_prevalence, prevs_hat=point_estimate)) + results['coverage'].append(region.coverage(true_prevalence)) + results['amplitude'].append(region.montecarlo_proportion(n_trials=50_000)) + results['test-time'].append(ttime) + + report = { + 'optim_hyper': best_params, + 'optim_score': best_score, + 'refit_time': tr_time, + 'train-prev': train_prevalence, + 'results': {k:np.asarray(v) for k,v in results.items()} + } return report +def experiment_path(dir:Path, dataset_name:str, method_name:str): + os.makedirs(dir, exist_ok=True) + return dir/f'{dataset_name}__{method_name}.pkl' + if __name__ == '__main__': - qp.environ["SAMPLE_SIZE"] = 500 - datasets = qp.datasets.UCI_BINARY_DATASETS - for dataset in datasets: - data = qp.datasets.fetch_UCIBinaryDataset(dataset) - for method_name, method, hyper_params in methods(): - report = experiment(data, method, method_name, hyper_params) - print(f'{method_name=} got {report.mean(numeric_only=True)}') + binary = { + 'datasets': qp.datasets.UCI_BINARY_DATASETS, + 'fetch_fn': qp.datasets.fetch_UCIBinaryDataset, + 'sample_size': 500 + } + + multiclass = { + 'datasets': qp.datasets.UCI_MULTICLASS_DATASETS, + 'fetch_fn': qp.datasets.fetch_UCIMulticlassDataset, + 'sample_size': 1000 + } + + result_dir = Path('./results') + + for setup in [binary, multiclass]: + qp.environ['SAMPLE_SIZE'] = setup['sample_size'] + for data_name in setup['datasets']: + data = setup['fetch_fn'](data_name) + is_binary = data.n_classes==2 + result_subdir = result_dir / ('binary' if is_binary else 'multiclass') + for method_name, method, hyper_params in methods(): + if isinstance(method, BinaryQuantifier) and not is_binary: + continue + result_path = experiment_path(result_subdir, data_name, method_name) + report = qp.util.pickled_resource(result_path, experiment, data, method, hyper_params) + print(f'dataset={data_name}, ' + f'method={method_name}: ' + f'mae={report["results"]["ae"].mean():.3f}, ' + f'coverage={report["results"]["coverage"].mean():.3f}, ' + f'amplitude={report["results"]["amplitude"].mean():.3f}, ') diff --git a/BayesianKDEy/generate_results.py b/BayesianKDEy/generate_results.py new file mode 100644 index 0000000..6d13849 --- /dev/null +++ b/BayesianKDEy/generate_results.py @@ -0,0 +1,31 @@ +import pickle +from collections import defaultdict + +import pandas as pd +from glob import glob +from pathlib import Path + +for setup in ['binary', 'multiclass']: + path = f'./results/{setup}/*.pkl' + table = defaultdict(list) + for file in glob(path): + file = Path(file) + dataset, method = file.name.replace('.pkl', '').split('__') + report = pickle.load(open(file, 'rb')) + results = report['results'] + n_samples = len(results['ae']) + table['method'].extend([method] * n_samples) + table['dataset'].extend([dataset] * n_samples) + table['ae'].extend(results['ae']) + table['coverage'].extend(results['coverage']) + table['amplitude'].extend(results['amplitude']) + + pd.set_option('display.max_columns', None) + pd.set_option('display.width', 1000) + pd.set_option('display.max_rows', None) + df = pd.DataFrame(table) + pv = pd.pivot_table(df, index='dataset', columns='method', values=['ae', 'coverage', 'amplitude']) + print(f'{setup=}') + print(pv) + print() + diff --git a/quapy/method/confidence.py b/quapy/method/confidence.py index fd8c84d..88588f1 100644 --- a/quapy/method/confidence.py +++ b/quapy/method/confidence.py @@ -112,7 +112,7 @@ class WithConfidenceABC(ABC): return self.predict_conf(instances=instances, confidence_level=confidence_level) @classmethod - def construct_region(cls, prev_estims, confidence_level=0.95, method='intervals'): + def construct_region(cls, prev_estims, confidence_level=0.95, method='intervals')->ConfidenceRegionABC: """ Construct a confidence region given many prevalence estimations. From fd62e73d2d7ec7ba44dc7d0b95d023333b77aed6 Mon Sep 17 00:00:00 2001 From: Alejandro Moreo Date: Mon, 17 Nov 2025 12:37:12 +0100 Subject: [PATCH 15/41] adding bootstrap variants --- BayesianKDEy/full_experiments.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/BayesianKDEy/full_experiments.py b/BayesianKDEy/full_experiments.py index 2fd7080..a2f3c9c 100644 --- a/BayesianKDEy/full_experiments.py +++ b/BayesianKDEy/full_experiments.py @@ -6,13 +6,14 @@ from pathlib import Path from sklearn.linear_model import LogisticRegression import quapy as qp from BayesianKDEy._bayeisan_kdey import BayesianKDEy +from method.aggregative import DistributionMatchingY as DMy from quapy.method.base import BinaryQuantifier from quapy.model_selection import GridSearchQ from quapy.data import Dataset # from BayesianKDEy.plot_simplex import plot_prev_points, plot_prev_points_matplot -from quapy.method.confidence import ConfidenceIntervals, BayesianCC, PQ, WithConfidenceABC +from quapy.method.confidence import ConfidenceIntervals, BayesianCC, PQ, WithConfidenceABC, AggregativeBootstrap from quapy.functional import strprev -from quapy.method.aggregative import KDEyML +from quapy.method.aggregative import KDEyML, ACC from quapy.protocol import UPP import quapy.functional as F import numpy as np @@ -34,9 +35,14 @@ def new_classifier(): def methods(): cls, cls_hyper = new_classifier() + hdy_hyper = {**cls_hyper, 'n_bins': [3,4,5,8,16,32]} + kdey_hyper = {**cls_hyper, 'bandwidth': [0.001, 0.005, 0.01, 0.05, 0.1, 0.2]} + yield 'BootstrapACC', AggregativeBootstrap(ACC(clone(cls)), n_test_samples=1000), cls_hyper + yield 'BootstrapHDy', AggregativeBootstrap(DMy(clone(cls), divergence='HD'), n_test_samples=1000), hdy_hyper + yield 'BootstrapKDEy', AggregativeBootstrap(KDEyML(clone(cls)), n_test_samples=1000), kdey_hyper # yield 'BayesianACC', BayesianCC(clone(cls), mcmc_seed=0), cls_hyper - # yield 'BayesianHDy', PQ(clone(cls), stan_seed=0), {**cls_hyper, 'n_bins': [3,4,5,8,16,32]} - yield 'BayesianKDEy', BayesianKDEy(clone(cls), mcmc_seed=0), {**cls_hyper, 'bandwidth': [0.001, 0.005, 0.01, 0.05, 0.1, 0.2]} + # yield 'BayesianHDy', PQ(clone(cls), stan_seed=0), hdy_hyper + # yield 'BayesianKDEy', BayesianKDEy(clone(cls), mcmc_seed=0), kdey_hyper def experiment(dataset: Dataset, method: WithConfidenceABC, grid: dict): From fdc0560cccb310de13dd46bfe6a0059d28490baa Mon Sep 17 00:00:00 2001 From: Alejandro Moreo Date: Mon, 17 Nov 2025 17:47:14 +0100 Subject: [PATCH 16/41] no optim classifier --- BayesianKDEy/full_experiments.py | 52 ++++++++++++++++++++------------ 1 file changed, 33 insertions(+), 19 deletions(-) diff --git a/BayesianKDEy/full_experiments.py b/BayesianKDEy/full_experiments.py index a2f3c9c..6ec372d 100644 --- a/BayesianKDEy/full_experiments.py +++ b/BayesianKDEy/full_experiments.py @@ -3,7 +3,10 @@ import warnings from os.path import join from pathlib import Path -from sklearn.linear_model import LogisticRegression +from sklearn.calibration import CalibratedClassifierCV +from sklearn.linear_model import LogisticRegression as LR +from sklearn.model_selection import GridSearchCV, StratifiedKFold +from copy import deepcopy as cp import quapy as qp from BayesianKDEy._bayeisan_kdey import BayesianKDEy from method.aggregative import DistributionMatchingY as DMy @@ -24,25 +27,35 @@ from time import time from sklearn.base import clone -def new_classifier(): - # lr_hyper = { - # 'classifier__C': np.logspace(-3,3,7), - # 'classifier__class_weight': ['balanced', None] - # } - lr_hyper = {} - lr = LogisticRegression() - return lr, lr_hyper +# def new_classifier(training): +# print('optimizing hyperparameters of Logistic Regression') +# mod_sel = GridSearchCV( +# estimator=LogisticRegression(), +# param_grid={ +# 'C': np.logspace(-4, 4, 9), +# 'class_weight': ['balanced', None] +# }, +# cv=StratifiedKFold(n_splits=10, shuffle=True, random_state=0), +# n_jobs=-1, +# refit=False, +# ) +# mod_sel.fit(*training.Xy) +# # optim = LogisticRegression(**mod_sel.best_params_) +# print(f'Done: hyperparameters chosen={mod_sel.best_params_}') +# # calib = CalibratedClassifierCV(optim, cv=10, n_jobs=-1, ensemble=False).fit(*training.Xy) +# # return calib +# return LogisticRegression(**mod_sel.best_params_) def methods(): - cls, cls_hyper = new_classifier() - hdy_hyper = {**cls_hyper, 'n_bins': [3,4,5,8,16,32]} - kdey_hyper = {**cls_hyper, 'bandwidth': [0.001, 0.005, 0.01, 0.05, 0.1, 0.2]} - yield 'BootstrapACC', AggregativeBootstrap(ACC(clone(cls)), n_test_samples=1000), cls_hyper - yield 'BootstrapHDy', AggregativeBootstrap(DMy(clone(cls), divergence='HD'), n_test_samples=1000), hdy_hyper - yield 'BootstrapKDEy', AggregativeBootstrap(KDEyML(clone(cls)), n_test_samples=1000), kdey_hyper - # yield 'BayesianACC', BayesianCC(clone(cls), mcmc_seed=0), cls_hyper - # yield 'BayesianHDy', PQ(clone(cls), stan_seed=0), hdy_hyper - # yield 'BayesianKDEy', BayesianKDEy(clone(cls), mcmc_seed=0), kdey_hyper + acc_hyper = {} + hdy_hyper = {'n_bins': [3,4,5,8,16,32]} + kdey_hyper = {'bandwidth': [0.001, 0.005, 0.01, 0.05, 0.1, 0.2]} + yield 'BootstrapACC', AggregativeBootstrap(ACC(LR()), n_test_samples=1000, random_state=0), acc_hyper + # yield 'BootstrapHDy', AggregativeBootstrap(DMy(LR(), divergence='HD'), n_test_samples=1000, random_state=0), hdy_hyper + # yield 'BootstrapKDEy', AggregativeBootstrap(KDEyML(LR()), n_test_samples=1000, random_state=0), kdey_hyper + # yield 'BayesianACC', BayesianCC(LR(), mcmc_seed=0), acc_hyper + # yield 'BayesianHDy', PQ(LR(), stan_seed=0), hdy_hyper + yield 'BayesianKDEy', BayesianKDEy(LR(), mcmc_seed=0), kdey_hyper def experiment(dataset: Dataset, method: WithConfidenceABC, grid: dict): @@ -74,7 +87,7 @@ def experiment(dataset: Dataset, method: WithConfidenceABC, grid: dict): # test results = defaultdict(list) test_generator = UPP(test, repeats=500, random_state=0) - for i, (sample_X, true_prevalence) in enumerate(test_generator()): + for i, (sample_X, true_prevalence) in tqdm(enumerate(test_generator()), total=test_generator.total(), desc=f'{method_name} predictions'): t_init = time() point_estimate, region = optim_quantifier.predict_conf(sample_X) ttime = time()-t_init @@ -86,6 +99,7 @@ def experiment(dataset: Dataset, method: WithConfidenceABC, grid: dict): results['coverage'].append(region.coverage(true_prevalence)) results['amplitude'].append(region.montecarlo_proportion(n_trials=50_000)) results['test-time'].append(ttime) + results['samples'].append(optim_quantifier.) report = { 'optim_hyper': best_params, From 9da4fd57db8e9d43182d9f579a035a8b60aa965c Mon Sep 17 00:00:00 2001 From: Alejandro Moreo Date: Mon, 17 Nov 2025 17:53:56 +0100 Subject: [PATCH 17/41] added property samples to confidence regions --- quapy/method/confidence.py | 53 ++++++++++++++++++++++++++------------ 1 file changed, 37 insertions(+), 16 deletions(-) diff --git a/quapy/method/confidence.py b/quapy/method/confidence.py index 07b2b1e..c9e24c4 100644 --- a/quapy/method/confidence.py +++ b/quapy/method/confidence.py @@ -80,6 +80,12 @@ class ConfidenceRegionABC(ABC): proportion = np.clip(self.coverage(uniform_simplex), 0., 1.) return proportion + @property + @abstractmethod + def samples(self): + """ Returns internal samples """ + ... + class WithConfidenceABC(ABC): """ @@ -185,30 +191,35 @@ class ConfidenceEllipseSimplex(ConfidenceRegionABC): """ Instantiates a Confidence Ellipse in the probability simplex. - :param X: np.ndarray of shape (n_bootstrap_samples, n_classes) + :param samples: np.ndarray of shape (n_bootstrap_samples, n_classes) :param confidence_level: float, the confidence level (default 0.95) """ - def __init__(self, X, confidence_level=0.95): + def __init__(self, samples, confidence_level=0.95): assert 0. < confidence_level < 1., f'{confidence_level=} must be in range(0,1)' - X = np.asarray(X) + samples = np.asarray(samples) - self.mean_ = X.mean(axis=0) - self.cov_ = np.cov(X, rowvar=False, ddof=1) + self.mean_ = samples.mean(axis=0) + self.cov_ = np.cov(samples, rowvar=False, ddof=1) try: self.precision_matrix_ = np.linalg.inv(self.cov_) except: self.precision_matrix_ = None - self.dim = X.shape[-1] + self.dim = samples.shape[-1] self.ddof = self.dim - 1 # critical chi-square value self.confidence_level = confidence_level self.chi2_critical_ = chi2.ppf(confidence_level, df=self.ddof) + self._samples = samples + + @property + def samples(self): + return self._samples def point_estimate(self): """ @@ -234,16 +245,21 @@ class ConfidenceEllipseCLR(ConfidenceRegionABC): """ Instantiates a Confidence Ellipse in the Centered-Log Ratio (CLR) space. - :param X: np.ndarray of shape (n_bootstrap_samples, n_classes) + :param samples: np.ndarray of shape (n_bootstrap_samples, n_classes) :param confidence_level: float, the confidence level (default 0.95) """ - def __init__(self, X, confidence_level=0.95): - X = np.asarray(X) + def __init__(self, samples, confidence_level=0.95): + samples = np.asarray(samples) self.clr = CLRtransformation() - Z = self.clr(X) - self.mean_ = np.mean(X, axis=0) + Z = self.clr(samples) + self.mean_ = np.mean(samples, axis=0) self.conf_region_clr = ConfidenceEllipseSimplex(Z, confidence_level=confidence_level) + self._samples = samples + + @property + def samples(self): + return self._samples def point_estimate(self): """ @@ -273,19 +289,24 @@ class ConfidenceIntervals(ConfidenceRegionABC): """ Instantiates a region based on (independent) Confidence Intervals. - :param X: np.ndarray of shape (n_bootstrap_samples, n_classes) + :param samples: np.ndarray of shape (n_bootstrap_samples, n_classes) :param confidence_level: float, the confidence level (default 0.95) """ - def __init__(self, X, confidence_level=0.95): + def __init__(self, samples, confidence_level=0.95): assert 0 < confidence_level < 1, f'{confidence_level=} must be in range(0,1)' - X = np.asarray(X) + samples = np.asarray(samples) - self.means_ = X.mean(axis=0) + self.means_ = samples.mean(axis=0) alpha = 1-confidence_level low_perc = (alpha/2.)*100 high_perc = (1-alpha/2.)*100 - self.I_low, self.I_high = np.percentile(X, q=[low_perc, high_perc], axis=0) + self.I_low, self.I_high = np.percentile(samples, q=[low_perc, high_perc], axis=0) + self._samples = samples + + @property + def samples(self): + return self._samples def point_estimate(self): """ From 277a2e617ff3418813b30657aa418809139c973b Mon Sep 17 00:00:00 2001 From: Alejandro Moreo Date: Tue, 18 Nov 2025 10:12:41 +0100 Subject: [PATCH 18/41] added aggregative bootstrap --- BayesianKDEy/full_experiments.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/BayesianKDEy/full_experiments.py b/BayesianKDEy/full_experiments.py index 6ec372d..c60b37c 100644 --- a/BayesianKDEy/full_experiments.py +++ b/BayesianKDEy/full_experiments.py @@ -9,7 +9,7 @@ from sklearn.model_selection import GridSearchCV, StratifiedKFold from copy import deepcopy as cp import quapy as qp from BayesianKDEy._bayeisan_kdey import BayesianKDEy -from method.aggregative import DistributionMatchingY as DMy +from quapy.method.aggregative import DistributionMatchingY as DMy from quapy.method.base import BinaryQuantifier from quapy.model_selection import GridSearchQ from quapy.data import Dataset @@ -50,12 +50,13 @@ def methods(): acc_hyper = {} hdy_hyper = {'n_bins': [3,4,5,8,16,32]} kdey_hyper = {'bandwidth': [0.001, 0.005, 0.01, 0.05, 0.1, 0.2]} - yield 'BootstrapACC', AggregativeBootstrap(ACC(LR()), n_test_samples=1000, random_state=0), acc_hyper - # yield 'BootstrapHDy', AggregativeBootstrap(DMy(LR(), divergence='HD'), n_test_samples=1000, random_state=0), hdy_hyper - # yield 'BootstrapKDEy', AggregativeBootstrap(KDEyML(LR()), n_test_samples=1000, random_state=0), kdey_hyper + wrap_hyper = lambda dic: {f'quantifier__{k}':v for k,v in dic.items()} + # yield 'BootstrapACC', AggregativeBootstrap(ACC(LR()), n_test_samples=1000, random_state=0), wrap_hyper(acc_hyper) + yield 'BootstrapHDy', AggregativeBootstrap(DMy(LR(), divergence='HD'), n_test_samples=1000, random_state=0), wrap_hyper(hdy_hyper) + #yield 'BootstrapKDEy', AggregativeBootstrap(KDEyML(LR()), n_test_samples=1000, random_state=0), wrap_hyper(kdey_hyper) # yield 'BayesianACC', BayesianCC(LR(), mcmc_seed=0), acc_hyper # yield 'BayesianHDy', PQ(LR(), stan_seed=0), hdy_hyper - yield 'BayesianKDEy', BayesianKDEy(LR(), mcmc_seed=0), kdey_hyper + # yield 'BayesianKDEy', BayesianKDEy(LR(), mcmc_seed=0), kdey_hyper def experiment(dataset: Dataset, method: WithConfidenceABC, grid: dict): @@ -99,7 +100,7 @@ def experiment(dataset: Dataset, method: WithConfidenceABC, grid: dict): results['coverage'].append(region.coverage(true_prevalence)) results['amplitude'].append(region.montecarlo_proportion(n_trials=50_000)) results['test-time'].append(ttime) - results['samples'].append(optim_quantifier.) + results['samples'].append(region.samples) report = { 'optim_hyper': best_params, From 6db659e3c4eb30005042ba32e25c79839e723db3 Mon Sep 17 00:00:00 2001 From: Alejandro Moreo Date: Tue, 18 Nov 2025 10:13:34 +0100 Subject: [PATCH 19/41] unifying n_bins in PQ and DMy --- quapy/method/confidence.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/quapy/method/confidence.py b/quapy/method/confidence.py index a630fe6..c997fb3 100644 --- a/quapy/method/confidence.py +++ b/quapy/method/confidence.py @@ -624,7 +624,7 @@ class PQ(AggregativeSoftQuantifier, BinaryAggregativeQuantifier): classifier: BaseEstimator=None, fit_classifier=True, val_split: int = 5, - n_bins: int = 4, + nbins: int = 4, fixed_bins: bool = False, num_warmup: int = 500, num_samples: int = 1_000, @@ -642,7 +642,7 @@ class PQ(AggregativeSoftQuantifier, BinaryAggregativeQuantifier): "Run `$ pip install quapy[bayes]` to install them.") super().__init__(classifier, fit_classifier, val_split) - self.n_bins = n_bins + self.nbins = nbins self.fixed_bins = fixed_bins self.num_warmup = num_warmup self.num_samples = num_samples @@ -657,10 +657,10 @@ class PQ(AggregativeSoftQuantifier, BinaryAggregativeQuantifier): # Compute bin limits if self.fixed_bins: # Uniform bins in [0,1] - self.bin_limits = np.linspace(0, 1, self.n_bins + 1) + self.bin_limits = np.linspace(0, 1, self.nbins + 1) else: # Quantile bins - self.bin_limits = np.quantile(y_pred, np.linspace(0, 1, self.n_bins + 1)) + self.bin_limits = np.quantile(y_pred, np.linspace(0, 1, self.nbins + 1)) # Assign each prediction to a bin bin_indices = np.digitize(y_pred, self.bin_limits[1:-1], right=True) @@ -670,14 +670,14 @@ class PQ(AggregativeSoftQuantifier, BinaryAggregativeQuantifier): neg_mask = ~pos_mask # Count positives and negatives per bin - self.pos_hist = np.bincount(bin_indices[pos_mask], minlength=self.n_bins) - self.neg_hist = np.bincount(bin_indices[neg_mask], minlength=self.n_bins) + self.pos_hist = np.bincount(bin_indices[pos_mask], minlength=self.nbins) + self.neg_hist = np.bincount(bin_indices[neg_mask], minlength=self.nbins) def aggregate(self, classif_predictions): Px_test = classif_predictions[:, self.pos_label] test_hist, _ = np.histogram(Px_test, bins=self.bin_limits) prevs = _bayesian.pq_stan( - self.stan_code, self.n_bins, self.pos_hist, self.neg_hist, test_hist, + self.stan_code, self.nbins, self.pos_hist, self.neg_hist, test_hist, self.num_samples, self.num_warmup, self.stan_seed ).flatten() self.prev_distribution = np.vstack([1-prevs, prevs]).T From 881e1033f14315b2c7e66be660bc1b24a66aad07 Mon Sep 17 00:00:00 2001 From: Alejandro Moreo Date: Wed, 26 Nov 2025 15:19:08 +0100 Subject: [PATCH 20/41] lauching experiments --- BayesianKDEy/full_experiments.py | 12 ++++++++---- quapy/method/_bayesian.py | 25 +++++++++++++++++++++++-- quapy/method/confidence.py | 1 + 3 files changed, 32 insertions(+), 6 deletions(-) diff --git a/BayesianKDEy/full_experiments.py b/BayesianKDEy/full_experiments.py index c60b37c..2109127 100644 --- a/BayesianKDEy/full_experiments.py +++ b/BayesianKDEy/full_experiments.py @@ -48,14 +48,14 @@ from sklearn.base import clone def methods(): acc_hyper = {} - hdy_hyper = {'n_bins': [3,4,5,8,16,32]} + hdy_hyper = {'nbins': [3,4,5,8,16,32]} kdey_hyper = {'bandwidth': [0.001, 0.005, 0.01, 0.05, 0.1, 0.2]} wrap_hyper = lambda dic: {f'quantifier__{k}':v for k,v in dic.items()} # yield 'BootstrapACC', AggregativeBootstrap(ACC(LR()), n_test_samples=1000, random_state=0), wrap_hyper(acc_hyper) - yield 'BootstrapHDy', AggregativeBootstrap(DMy(LR(), divergence='HD'), n_test_samples=1000, random_state=0), wrap_hyper(hdy_hyper) - #yield 'BootstrapKDEy', AggregativeBootstrap(KDEyML(LR()), n_test_samples=1000, random_state=0), wrap_hyper(kdey_hyper) + # yield 'BootstrapHDy', AggregativeBootstrap(DMy(LR(), divergence='HD'), n_test_samples=1000, random_state=0), wrap_hyper(hdy_hyper) + # yield 'BootstrapKDEy', AggregativeBootstrap(KDEyML(LR()), n_test_samples=1000, random_state=0), wrap_hyper(kdey_hyper) # yield 'BayesianACC', BayesianCC(LR(), mcmc_seed=0), acc_hyper - # yield 'BayesianHDy', PQ(LR(), stan_seed=0), hdy_hyper + yield 'BayesianHDy', PQ(LR(), stan_seed=0), hdy_hyper # yield 'BayesianKDEy', BayesianKDEy(LR(), mcmc_seed=0), kdey_hyper @@ -137,6 +137,10 @@ if __name__ == '__main__': for setup in [binary, multiclass]: qp.environ['SAMPLE_SIZE'] = setup['sample_size'] for data_name in setup['datasets']: + print(f'dataset={data_name}') + if data_name=='breast-cancer' or data_name.startswith("cmc") or data_name.startswith("ctg"): + print(f'skipping dataset: {data_name}') + continue data = setup['fetch_fn'](data_name) is_binary = data.n_classes==2 result_subdir = result_dir / ('binary' if is_binary else 'multiclass') diff --git a/quapy/method/_bayesian.py b/quapy/method/_bayesian.py index da65eed..23507f7 100644 --- a/quapy/method/_bayesian.py +++ b/quapy/method/_bayesian.py @@ -1,6 +1,10 @@ """ Utility functions for `Bayesian quantification `_ methods. """ +import contextlib +import os +import sys + import numpy as np import importlib.resources @@ -10,6 +14,8 @@ try: import numpyro import numpyro.distributions as dist import stan + import logging + import stan.common DEPENDENCIES_INSTALLED = True except ImportError: @@ -86,6 +92,18 @@ def sample_posterior( def load_stan_file(): return importlib.resources.files('quapy.method').joinpath('stan/pq.stan').read_text(encoding='utf-8') + +@contextlib.contextmanager +def _suppress_stan_logging(): + with open(os.devnull, "w") as devnull: + old_stderr = sys.stderr + sys.stderr = devnull + try: + yield + finally: + sys.stderr = old_stderr + + def pq_stan(stan_code, n_bins, pos_hist, neg_hist, test_hist, number_of_samples, num_warmup, stan_seed): """ Perform Bayesian prevalence estimation using a Stan model for probabilistic quantification. @@ -121,6 +139,8 @@ def pq_stan(stan_code, n_bins, pos_hist, neg_hist, test_hist, number_of_samples, Each element corresponds to one draw from the posterior distribution. """ + logging.getLogger("stan.common").setLevel(logging.ERROR) + stan_data = { 'n_bucket': n_bins, 'train_neg': neg_hist.tolist(), @@ -129,7 +149,8 @@ def pq_stan(stan_code, n_bins, pos_hist, neg_hist, test_hist, number_of_samples, 'posterior': 1 } - stan_model = stan.build(stan_code, data=stan_data, random_seed=stan_seed) - fit = stan_model.sample(num_chains=1, num_samples=number_of_samples,num_warmup=num_warmup) + with _suppress_stan_logging(): + stan_model = stan.build(stan_code, data=stan_data, random_seed=stan_seed) + fit = stan_model.sample(num_chains=1, num_samples=number_of_samples,num_warmup=num_warmup) return fit['prev'] diff --git a/quapy/method/confidence.py b/quapy/method/confidence.py index e55dde5..b5ebf7e 100644 --- a/quapy/method/confidence.py +++ b/quapy/method/confidence.py @@ -642,6 +642,7 @@ class PQ(AggregativeSoftQuantifier, BinaryAggregativeQuantifier): "Run `$ pip install quapy[bayes]` to install them.") super().__init__(classifier, fit_classifier, val_split) + self.nbins = nbins self.fixed_bins = fixed_bins self.num_warmup = num_warmup From 23608f20382ed82a75e1afbbfff951167615ba59 Mon Sep 17 00:00:00 2001 From: Alejandro Moreo Date: Thu, 4 Dec 2025 10:24:02 +0100 Subject: [PATCH 21/41] adding exploration in CLR --- BayesianKDEy/TODO.txt | 3 + BayesianKDEy/_bayeisan_kdey.py | 35 +++++-- BayesianKDEy/full_experiments.py | 128 ++++++++++++++---------- BayesianKDEy/generate_results.py | 103 +++++++++++++++++-- BayesianKDEy/single_experiment_debug.py | 95 ++++++++++++++++++ quapy/functional.py | 4 +- quapy/method/_kdey.py | 13 +-- quapy/method/confidence.py | 44 +++++++- quapy/model_selection.py | 2 +- 9 files changed, 343 insertions(+), 84 deletions(-) create mode 100644 BayesianKDEy/TODO.txt create mode 100644 BayesianKDEy/single_experiment_debug.py diff --git a/BayesianKDEy/TODO.txt b/BayesianKDEy/TODO.txt new file mode 100644 index 0000000..ac989dd --- /dev/null +++ b/BayesianKDEy/TODO.txt @@ -0,0 +1,3 @@ +- Add other methods that natively provide uncertainty quantification methods? +- Explore neighbourhood in the CLR space instead than in the simplex! +- \ No newline at end of file diff --git a/BayesianKDEy/_bayeisan_kdey.py b/BayesianKDEy/_bayeisan_kdey.py index d586bef..16401bf 100644 --- a/BayesianKDEy/_bayeisan_kdey.py +++ b/BayesianKDEy/_bayeisan_kdey.py @@ -1,7 +1,7 @@ from sklearn.base import BaseEstimator import numpy as np from quapy.method._kdey import KDEBase -from quapy.method.confidence import WithConfidenceABC, ConfidenceRegionABC +from quapy.method.confidence import WithConfidenceABC, ConfidenceRegionABC, CLRtransformation from quapy.method.aggregative import AggregativeSoftQuantifier from tqdm import tqdm import quapy.functional as F @@ -40,6 +40,8 @@ class BayesianKDEy(AggregativeSoftQuantifier, KDEBase, WithConfidenceABC): mcmc_seed: int = 0, confidence_level: float = 0.95, region: str = 'intervals', + explore_CLR=False, + step_size=0.05, verbose: bool = False): if num_warmup <= 0: @@ -48,13 +50,15 @@ class BayesianKDEy(AggregativeSoftQuantifier, KDEBase, WithConfidenceABC): raise ValueError(f'parameter {num_samples=} must be a positive integer') super().__init__(classifier, fit_classifier, val_split) - self.bandwidth = KDEBase._check_bandwidth(bandwidth) + self.bandwidth = KDEBase._check_bandwidth(bandwidth, kernel) self.kernel = self._check_kernel(kernel) self.num_warmup = num_warmup self.num_samples = num_samples self.mcmc_seed = mcmc_seed self.confidence_level = confidence_level self.region = region + self.explore_CLR = explore_CLR + self.step_size = step_size self.verbose = verbose def aggregation_fit(self, classif_predictions, labels): @@ -105,10 +109,19 @@ class BayesianKDEy(AggregativeSoftQuantifier, KDEBase, WithConfidenceABC): def log_prior(prev): return 0 - def sample_neighbour(prev, step_size=0.05): + def sample_neighbour(prev, step_size): # random-walk Metropolis-Hastings - dir_noise = rng.normal(scale=step_size, size=len(prev)) - neighbour = F.normalize_prevalence(prev + dir_noise, method='mapsimplex') + d = len(prev) + if not self.explore_CLR: + dir_noise = rng.normal(scale=step_size/np.sqrt(d), size=d) + neighbour = F.normalize_prevalence(prev + dir_noise, method='mapsimplex') + else: + clr = CLRtransformation() + clr_point = clr(prev) + dir_noise = rng.normal(scale=step_size, size=d) + clr_neighbour = clr_point+dir_noise + neighbour = clr.inverse(clr_neighbour) + assert in_simplex(neighbour), 'wrong CLR transformation' return neighbour n_classes = X_probs.shape[1] @@ -116,9 +129,9 @@ class BayesianKDEy(AggregativeSoftQuantifier, KDEBase, WithConfidenceABC): current_likelihood = log_likelihood(current_prev) + log_prior(current_prev) # Metropolis-Hastings with adaptive rate - step_size = 0.05 + step_size = self.step_size target_acceptance = 0.3 - adapt_rate = 0.01 + adapt_rate = 0.05 acceptance_history = [] samples = [] @@ -142,7 +155,13 @@ class BayesianKDEy(AggregativeSoftQuantifier, KDEBase, WithConfidenceABC): if i < self.num_warmup and i%10==0 and len(acceptance_history)>=100: recent_accept_rate = np.mean(acceptance_history[-100:]) step_size *= np.exp(adapt_rate * (recent_accept_rate - target_acceptance)) + # step_size = float(np.clip(step_size, min_step, max_step)) + print(f'acceptance-rate={recent_accept_rate*100:.3f}%, step-size={step_size:.5f}') # remove "warmup" initial iterations samples = np.asarray(samples[self.num_warmup:]) - return samples \ No newline at end of file + return samples + + +def in_simplex(x): + return np.all(x >= 0) and np.isclose(x.sum(), 1) \ No newline at end of file diff --git a/BayesianKDEy/full_experiments.py b/BayesianKDEy/full_experiments.py index 2109127..2e0c493 100644 --- a/BayesianKDEy/full_experiments.py +++ b/BayesianKDEy/full_experiments.py @@ -9,8 +9,9 @@ from sklearn.model_selection import GridSearchCV, StratifiedKFold from copy import deepcopy as cp import quapy as qp from BayesianKDEy._bayeisan_kdey import BayesianKDEy -from quapy.method.aggregative import DistributionMatchingY as DMy -from quapy.method.base import BinaryQuantifier +from build.lib.quapy.data import LabelledCollection +from quapy.method.aggregative import DistributionMatchingY as DMy, AggregativeQuantifier +from quapy.method.base import BinaryQuantifier, BaseQuantifier from quapy.model_selection import GridSearchQ from quapy.data import Dataset # from BayesianKDEy.plot_simplex import plot_prev_points, plot_prev_points_matplot @@ -24,73 +25,95 @@ from tqdm import tqdm from scipy.stats import dirichlet from collections import defaultdict from time import time -from sklearn.base import clone +from sklearn.base import clone, BaseEstimator -# def new_classifier(training): -# print('optimizing hyperparameters of Logistic Regression') -# mod_sel = GridSearchCV( -# estimator=LogisticRegression(), -# param_grid={ -# 'C': np.logspace(-4, 4, 9), -# 'class_weight': ['balanced', None] -# }, -# cv=StratifiedKFold(n_splits=10, shuffle=True, random_state=0), -# n_jobs=-1, -# refit=False, -# ) -# mod_sel.fit(*training.Xy) -# # optim = LogisticRegression(**mod_sel.best_params_) -# print(f'Done: hyperparameters chosen={mod_sel.best_params_}') -# # calib = CalibratedClassifierCV(optim, cv=10, n_jobs=-1, ensemble=False).fit(*training.Xy) -# # return calib -# return LogisticRegression(**mod_sel.best_params_) +class KDEyCLR(KDEyML): + def __init__(self, classifier: BaseEstimator=None, fit_classifier=True, val_split=5, bandwidth=1., random_state=None): + super().__init__( + classifier=classifier, fit_classifier=fit_classifier, val_split=val_split, bandwidth=bandwidth, + random_state=random_state, kernel='aitchison' + ) -def methods(): +def methods__(): acc_hyper = {} hdy_hyper = {'nbins': [3,4,5,8,16,32]} - kdey_hyper = {'bandwidth': [0.001, 0.005, 0.01, 0.05, 0.1, 0.2]} + kdey_hyper = {'bandwidth': [0.001, 0.005, 0.01, 0.05, 0.1, 0.2], 'classifier__C':[1]} wrap_hyper = lambda dic: {f'quantifier__{k}':v for k,v in dic.items()} # yield 'BootstrapACC', AggregativeBootstrap(ACC(LR()), n_test_samples=1000, random_state=0), wrap_hyper(acc_hyper) # yield 'BootstrapHDy', AggregativeBootstrap(DMy(LR(), divergence='HD'), n_test_samples=1000, random_state=0), wrap_hyper(hdy_hyper) - # yield 'BootstrapKDEy', AggregativeBootstrap(KDEyML(LR()), n_test_samples=1000, random_state=0), wrap_hyper(kdey_hyper) + yield 'BootstrapKDEy', AggregativeBootstrap(KDEyML(LR()), n_test_samples=1000, random_state=0), wrap_hyper(kdey_hyper) # yield 'BayesianACC', BayesianCC(LR(), mcmc_seed=0), acc_hyper - yield 'BayesianHDy', PQ(LR(), stan_seed=0), hdy_hyper + # yield 'BayesianHDy', PQ(LR(), stan_seed=0), hdy_hyper # yield 'BayesianKDEy', BayesianKDEy(LR(), mcmc_seed=0), kdey_hyper -def experiment(dataset: Dataset, method: WithConfidenceABC, grid: dict): +def methods(): + """ + Returns a tuple (name, quantifier, hyperparams, bayesian/bootstrap_constructor), where: + - name: is a str representing the name of the method (e.g., 'BayesianKDEy') + - quantifier: is the base model (e.g., KDEyML()) + - hyperparams: is a dictionary for the quantifier (e.g., {'bandwidth': [0.001, 0.005, 0.01, 0.05, 0.1, 0.2]}) + - bayesian/bootstrap_constructor: is a function that instantiates the bayesian o bootstrap method with the + quantifier with optimized hyperparameters + """ + acc_hyper = {} + hdy_hyper = {'nbins': [3,4,5,8,16,32]} + kdey_hyper = {'bandwidth': [0.001, 0.005, 0.01, 0.05, 0.1, 0.2]} + kdey_hyper_clr = {'bandwidth': [0.05, 0.1, 0.5, 1., 2., 5.]} + + yield 'BootstrapACC', ACC(LR()), acc_hyper, lambda hyper: AggregativeBootstrap(ACC(LR()), n_test_samples=1000, random_state=0), + yield 'BayesianACC', ACC(LR()), acc_hyper, lambda hyper: BayesianCC(LR(), mcmc_seed=0) + + yield 'BootstrapHDy', DMy(LR()), hdy_hyper, lambda hyper: AggregativeBootstrap(DMy(LR(), **hyper), n_test_samples=1000, random_state=0), + + yield 'BootstrapKDEy', KDEyML(LR()), kdey_hyper, lambda hyper: AggregativeBootstrap(KDEyML(LR(), **hyper), n_test_samples=1000, random_state=0, verbose=True), + yield 'BayesianKDEy', KDEyML(LR()), kdey_hyper, lambda hyper: BayesianKDEy(mcmc_seed=0, **hyper), + yield 'BayesianKDEy*', KDEyCLR(LR()), kdey_hyper_clr, lambda hyper: BayesianKDEy(kernel='aitchison', mcmc_seed=0, **hyper), + + +def model_selection(train: LabelledCollection, point_quantifier: AggregativeQuantifier, grid: dict): with qp.util.temp_seed(0): + print(f'performing model selection for {point_quantifier.__class__.__name__} with grid {grid}') # model selection - train, test = dataset.train_test - train_prevalence = train.prevalence() if len(grid)>0: train, val = train.split_stratified(train_prop=0.6, random_state=0) mod_sel = GridSearchQ( - model=method, + model=point_quantifier, param_grid=grid, protocol=qp.protocol.UPP(val, repeats=250, random_state=0), - refit=True, + refit=False, n_jobs=-1, verbose=True ).fit(*train.Xy) - optim_quantifier = mod_sel.best_model() best_params = mod_sel.best_params_ - best_score = mod_sel.best_score_ - tr_time = mod_sel.refit_time_ else: - t_init = time() - method.fit(*train.Xy) - tr_time = time() - t_init - best_params, best_score = {}, -1 - optim_quantifier = method + best_params = {} + + return best_params + + +def experiment(dataset: Dataset, point_quantifier: AggregativeQuantifier, method_name:str, grid: dict, withconf_constructor, hyper_choice_path: Path): + with qp.util.temp_seed(0): + + training, test = dataset.train_test + + # model selection + best_hyperparams = qp.util.pickled_resource( + hyper_choice_path, model_selection, training, cp(point_quantifier), grid + ) + + t_init = time() + withconf_quantifier = withconf_constructor(best_hyperparams).fit(*training.Xy) + tr_time = time() - t_init # test + train_prevalence = training.prevalence() results = defaultdict(list) - test_generator = UPP(test, repeats=500, random_state=0) + test_generator = UPP(test, repeats=100, random_state=0) for i, (sample_X, true_prevalence) in tqdm(enumerate(test_generator()), total=test_generator.total(), desc=f'{method_name} predictions'): t_init = time() - point_estimate, region = optim_quantifier.predict_conf(sample_X) + point_estimate, region = withconf_quantifier.predict_conf(sample_X) ttime = time()-t_init results['true-prevs'].append(true_prevalence) results['point-estim'].append(point_estimate) @@ -103,9 +126,8 @@ def experiment(dataset: Dataset, method: WithConfidenceABC, grid: dict): results['samples'].append(region.samples) report = { - 'optim_hyper': best_params, - 'optim_score': best_score, - 'refit_time': tr_time, + 'optim_hyper': best_hyperparams, + 'train_time': tr_time, 'train-prev': train_prevalence, 'results': {k:np.asarray(v) for k,v in results.items()} } @@ -134,26 +156,30 @@ if __name__ == '__main__': result_dir = Path('./results') - for setup in [binary, multiclass]: + for setup in [binary, multiclass]: # [binary, multiclass]: qp.environ['SAMPLE_SIZE'] = setup['sample_size'] for data_name in setup['datasets']: print(f'dataset={data_name}') - if data_name=='breast-cancer' or data_name.startswith("cmc") or data_name.startswith("ctg"): - print(f'skipping dataset: {data_name}') - continue + # if data_name=='breast-cancer' or data_name.startswith("cmc") or data_name.startswith("ctg"): + # print(f'skipping dataset: {data_name}') + # continue data = setup['fetch_fn'](data_name) is_binary = data.n_classes==2 result_subdir = result_dir / ('binary' if is_binary else 'multiclass') - for method_name, method, hyper_params in methods(): + hyper_subdir = result_dir / 'hyperparams' / ('binary' if is_binary else 'multiclass') + for method_name, method, hyper_params, withconf_constructor in methods(): if isinstance(method, BinaryQuantifier) and not is_binary: continue result_path = experiment_path(result_subdir, data_name, method_name) - report = qp.util.pickled_resource(result_path, experiment, data, method, hyper_params) + hyper_path = experiment_path(hyper_subdir, data_name, method.__class__.__name__) + report = qp.util.pickled_resource( + result_path, experiment, data, method, method_name, hyper_params, withconf_constructor, hyper_path + ) print(f'dataset={data_name}, ' f'method={method_name}: ' f'mae={report["results"]["ae"].mean():.3f}, ' - f'coverage={report["results"]["coverage"].mean():.3f}, ' - f'amplitude={report["results"]["amplitude"].mean():.3f}, ') + f'coverage={report["results"]["coverage"].mean():.5f}, ' + f'amplitude={report["results"]["amplitude"].mean():.5f}, ') diff --git a/BayesianKDEy/generate_results.py b/BayesianKDEy/generate_results.py index 6d13849..1cd309d 100644 --- a/BayesianKDEy/generate_results.py +++ b/BayesianKDEy/generate_results.py @@ -1,31 +1,112 @@ import pickle from collections import defaultdict +from joblib import Parallel, delayed +from tqdm import tqdm import pandas as pd from glob import glob from pathlib import Path +import quapy as qp +from quapy.method.confidence import ConfidenceEllipseSimplex, ConfidenceEllipseCLR + +pd.set_option('display.max_columns', None) +pd.set_option('display.width', 2000) +pd.set_option('display.max_rows', None) +pd.set_option("display.expand_frame_repr", False) +pd.set_option("display.precision", 4) +pd.set_option("display.float_format", "{:.4f}".format) + + +def compute_coverage_amplitude(region_constructor): + all_samples = results['samples'] + all_true_prevs = results['true-prevs'] + + def process_one(samples, true_prevs): + ellipse = region_constructor(samples) + return ellipse.coverage(true_prevs), ellipse.montecarlo_proportion() + + out = Parallel(n_jobs=3)( + delayed(process_one)(samples, true_prevs) + for samples, true_prevs in tqdm( + zip(all_samples, all_true_prevs), + total=len(all_samples), + desc='constructing ellipses' + ) + ) + + # unzip results + coverage, amplitude = zip(*out) + return list(coverage), list(amplitude) + + +def update_pickle(report, pickle_path, updated_dict:dict): + for k,v in updated_dict.items(): + report[k]=v + pickle.dump(report, open(pickle_path, 'wb'), protocol=pickle.HIGHEST_PROTOCOL) + for setup in ['binary', 'multiclass']: path = f'./results/{setup}/*.pkl' table = defaultdict(list) - for file in glob(path): + for file in tqdm(glob(path), desc='processing results', total=len(glob(path))): file = Path(file) dataset, method = file.name.replace('.pkl', '').split('__') report = pickle.load(open(file, 'rb')) results = report['results'] n_samples = len(results['ae']) - table['method'].extend([method] * n_samples) + table['method'].extend([method.replace('Bayesian','Ba').replace('Bootstrap', 'Bo')] * n_samples) table['dataset'].extend([dataset] * n_samples) table['ae'].extend(results['ae']) - table['coverage'].extend(results['coverage']) - table['amplitude'].extend(results['amplitude']) + table['c-CI'].extend(results['coverage']) + table['a-CI'].extend(results['amplitude']) + + if 'coverage-CE' not in report: + covCE, ampCE = compute_coverage_amplitude(ConfidenceEllipseSimplex) + covCLR, ampCLR = compute_coverage_amplitude(ConfidenceEllipseCLR) + + update_fields = { + 'coverage-CE': covCE, + 'amplitude-CE': ampCE, + 'coverage-CLR': covCLR, + 'amplitude-CLR': ampCLR + } + + update_pickle(report, file, update_fields) + + table['c-CE'].extend(report['coverage-CE']) + table['a-CE'].extend(report['amplitude-CE']) + + table['c-CLR'].extend(report['coverage-CLR']) + table['a-CLR'].extend(report['amplitude-CLR']) + - pd.set_option('display.max_columns', None) - pd.set_option('display.width', 1000) - pd.set_option('display.max_rows', None) df = pd.DataFrame(table) - pv = pd.pivot_table(df, index='dataset', columns='method', values=['ae', 'coverage', 'amplitude']) - print(f'{setup=}') - print(pv) - print() + + n_classes = {} + tr_size = {} + for dataset in df['dataset'].unique(): + fetch_fn = { + 'binary': qp.datasets.fetch_UCIBinaryDataset, + 'multiclass': qp.datasets.fetch_UCIMulticlassDataset + }[setup] + data = fetch_fn(dataset) + n_classes[dataset] = data.n_classes + tr_size[dataset] = len(data.training) + + # remove datasets with more than max_classes classes + max_classes = 30 + for data_name, n in n_classes.items(): + if n > max_classes: + df = df[df["dataset"] != data_name] + + for region in ['CI', 'CE', 'CLR']: + pv = pd.pivot_table( + df, index='dataset', columns='method', values=['ae', f'c-{region}', f'a-{region}'], margins=True + ) + pv['n_classes'] = pv.index.map(n_classes).astype('Int64') + pv['tr_size'] = pv.index.map(tr_size).astype('Int64') + pv = pv.drop(columns=[col for col in pv.columns if col[-1] == "All"]) + print(f'{setup=}') + print(pv) + print('-'*80) diff --git a/BayesianKDEy/single_experiment_debug.py b/BayesianKDEy/single_experiment_debug.py new file mode 100644 index 0000000..6cbe8a7 --- /dev/null +++ b/BayesianKDEy/single_experiment_debug.py @@ -0,0 +1,95 @@ +import os +import warnings +from os.path import join +from pathlib import Path + +from sklearn.calibration import CalibratedClassifierCV +from sklearn.linear_model import LogisticRegression as LR +from sklearn.model_selection import GridSearchCV, StratifiedKFold +from copy import deepcopy as cp +import quapy as qp +from BayesianKDEy._bayeisan_kdey import BayesianKDEy +from BayesianKDEy.full_experiments import experiment, experiment_path, KDEyCLR +from build.lib.quapy.data import LabelledCollection +from quapy.method.aggregative import DistributionMatchingY as DMy, AggregativeQuantifier +from quapy.method.base import BinaryQuantifier, BaseQuantifier +from quapy.model_selection import GridSearchQ +from quapy.data import Dataset +# from BayesianKDEy.plot_simplex import plot_prev_points, plot_prev_points_matplot +from quapy.method.confidence import ConfidenceIntervals, BayesianCC, PQ, WithConfidenceABC, AggregativeBootstrap +from quapy.functional import strprev +from quapy.method.aggregative import KDEyML, ACC +from quapy.protocol import UPP +import quapy.functional as F +import numpy as np +from tqdm import tqdm +from scipy.stats import dirichlet +from collections import defaultdict +from time import time +from sklearn.base import clone, BaseEstimator + + +def method(): + """ + Returns a tuple (name, quantifier, hyperparams, bayesian/bootstrap_constructor), where: + - name: is a str representing the name of the method (e.g., 'BayesianKDEy') + - quantifier: is the base model (e.g., KDEyML()) + - hyperparams: is a dictionary for the quantifier (e.g., {'bandwidth': [0.001, 0.005, 0.01, 0.05, 0.1, 0.2]}) + - bayesian/bootstrap_constructor: is a function that instantiates the bayesian o bootstrap method with the + quantifier with optimized hyperparameters + """ + acc_hyper = {} + hdy_hyper = {'nbins': [3,4,5,8,16,32]} + kdey_hyper = {'bandwidth': [0.001, 0.005, 0.01, 0.05, 0.1, 0.2]} + kdey_hyper_clr = {'bandwidth': [0.05, 0.1, 0.5, 1., 2., 5.]} + + wrap_hyper = lambda dic: {f'quantifier__{k}':v for k,v in dic.items()} + + # yield 'BootstrapKDEy', KDEyML(LR()), kdey_hyper, lambda hyper: AggregativeBootstrap(KDEyML(LR(), **hyper), n_test_samples=1000, random_state=0, verbose=True), + # yield 'BayesianKDEy', KDEyML(LR()), kdey_hyper, lambda hyper: BayesianKDEy(mcmc_seed=0, **hyper), + return 'BayKDE*CLR', KDEyCLR(LR()), kdey_hyper_clr, lambda hyper: BayesianKDEy(kernel='aitchison', mcmc_seed=0, + explore_CLR=True, + step_size=.15, + # num_warmup = 5000, + # num_samples = 10_000, + # region='ellipse', + **hyper), + + + +if __name__ == '__main__': + + binary = { + 'datasets': qp.datasets.UCI_BINARY_DATASETS, + 'fetch_fn': qp.datasets.fetch_UCIBinaryDataset, + 'sample_size': 500 + } + + multiclass = { + 'datasets': qp.datasets.UCI_MULTICLASS_DATASETS, + 'fetch_fn': qp.datasets.fetch_UCIMulticlassDataset, + 'sample_size': 1000 + } + + result_dir = Path('./results') + + setup = multiclass + qp.environ['SAMPLE_SIZE'] = setup['sample_size'] + data_name = 'digits' + print(f'dataset={data_name}') + data = setup['fetch_fn'](data_name) + is_binary = data.n_classes==2 + hyper_subdir = result_dir / 'hyperparams' / ('binary' if is_binary else 'multiclass') + method_name, method, hyper_params, withconf_constructor = method() + hyper_path = experiment_path(hyper_subdir, data_name, method.__class__.__name__) + report = experiment(data, method, method_name, hyper_params, withconf_constructor, hyper_path) + + print(f'dataset={data_name}, ' + f'method={method_name}: ' + f'mae={report["results"]["ae"].mean():.3f}, ' + f'coverage={report["results"]["coverage"].mean():.5f}, ' + f'amplitude={report["results"]["amplitude"].mean():.5f}, ') + + + + diff --git a/quapy/functional.py b/quapy/functional.py index 408c62a..5f6b915 100644 --- a/quapy/functional.py +++ b/quapy/functional.py @@ -583,8 +583,8 @@ def solve_adjustment( """ Function that tries to solve for :math:`p` the equation :math:`q = M p`, where :math:`q` is the vector of `unadjusted counts` (as estimated, e.g., via classify and count) with :math:`q_i` an estimate of - :math:`P(\hat{Y}=y_i)`, and where :math:`M` is the matrix of `class-conditional rates` with :math:`M_{ij}` an - estimate of :math:`P(\hat{Y}=y_i|Y=y_j)`. + :math:`P(\\hat{Y}=y_i)`, and where :math:`M` is the matrix of `class-conditional rates` with :math:`M_{ij}` an + estimate of :math:`P(\\hat{Y}=y_i|Y=y_j)`. :param class_conditional_rates: array of shape `(n_classes, n_classes,)` with entry `(i,j)` being the estimate of :math:`P(\hat{Y}=y_i|Y=y_j)`, that is, the probability that an instance that belongs to class :math:`y_j` diff --git a/quapy/method/_kdey.py b/quapy/method/_kdey.py index 5f80f8d..8475ee7 100644 --- a/quapy/method/_kdey.py +++ b/quapy/method/_kdey.py @@ -33,7 +33,7 @@ class KDEBase: @classmethod - def _check_bandwidth(cls, bandwidth): + def _check_bandwidth(cls, bandwidth, kernel): """ Checks that the bandwidth parameter is correct @@ -43,8 +43,9 @@ class KDEBase: assert bandwidth in KDEBase.BANDWIDTH_METHOD or isinstance(bandwidth, float), \ f'invalid bandwidth, valid ones are {KDEBase.BANDWIDTH_METHOD} or float values' if isinstance(bandwidth, float): - assert 0 < bandwidth < 1, \ - "the bandwidth for KDEy should be in (0,1), since this method models the unit simplex" + assert kernel!='gaussian' or (0 < bandwidth < 1), \ + ("the bandwidth for a Gaussian kernel in KDEy should be in (0,1), " + "since this method models the unit simplex") return bandwidth @classmethod @@ -166,7 +167,7 @@ class KDEyML(AggregativeSoftQuantifier, KDEBase): def __init__(self, classifier: BaseEstimator=None, fit_classifier=True, val_split=5, bandwidth=0.1, kernel='gaussian', random_state=None): super().__init__(classifier, fit_classifier, val_split) - self.bandwidth = KDEBase._check_bandwidth(bandwidth) + self.bandwidth = KDEBase._check_bandwidth(bandwidth, kernel) self.kernel = self._check_kernel(kernel) self.random_state=random_state @@ -246,7 +247,7 @@ class KDEyHD(AggregativeSoftQuantifier, KDEBase): super().__init__(classifier, fit_classifier, val_split) self.divergence = divergence - self.bandwidth = KDEBase._check_bandwidth(bandwidth) + self.bandwidth = KDEBase._check_bandwidth(bandwidth, kernel='gaussian') self.random_state=random_state self.montecarlo_trials = montecarlo_trials @@ -333,7 +334,7 @@ class KDEyCS(AggregativeSoftQuantifier): def __init__(self, classifier: BaseEstimator=None, fit_classifier=True, val_split=5, bandwidth=0.1): super().__init__(classifier, fit_classifier, val_split) - self.bandwidth = KDEBase._check_bandwidth(bandwidth) + self.bandwidth = KDEBase._check_bandwidth(bandwidth, kernel='gaussian') def gram_matrix_mix_sum(self, X, Y=None): # this adapts the output of the rbf_kernel function (pairwise evaluations of Gaussian kernels k(x,y)) diff --git a/quapy/method/confidence.py b/quapy/method/confidence.py index b5ebf7e..c308dfb 100644 --- a/quapy/method/confidence.py +++ b/quapy/method/confidence.py @@ -1,4 +1,5 @@ import numpy as np +from joblib import Parallel, delayed from sklearn.base import BaseEstimator from sklearn.metrics import confusion_matrix @@ -13,6 +14,7 @@ from abc import ABC, abstractmethod from scipy.special import softmax, factorial import copy from functools import lru_cache +from tqdm import tqdm """ This module provides implementation of different types of confidence regions, and the implementation of Bootstrap @@ -399,7 +401,8 @@ class AggregativeBootstrap(WithConfidenceABC, AggregativeQuantifier): n_test_samples=500, confidence_level=0.95, region='intervals', - random_state=None): + random_state=None, + verbose=False): assert isinstance(quantifier, AggregativeQuantifier), \ f'base quantifier does not seem to be an instance of {AggregativeQuantifier.__name__}' @@ -416,6 +419,7 @@ class AggregativeBootstrap(WithConfidenceABC, AggregativeQuantifier): self.confidence_level = confidence_level self.region = region self.random_state = random_state + self.verbose = verbose def aggregation_fit(self, classif_predictions, labels): data = LabelledCollection(classif_predictions, labels, classes=self.classes_) @@ -441,6 +445,24 @@ class AggregativeBootstrap(WithConfidenceABC, AggregativeQuantifier): prev_mean, self.confidence = self.aggregate_conf(classif_predictions) return prev_mean + def aggregate_conf_sequential__(self, classif_predictions: np.ndarray, confidence_level=None): + if confidence_level is None: + confidence_level = self.confidence_level + + n_samples = classif_predictions.shape[0] + prevs = [] + with qp.util.temp_seed(self.random_state): + for quantifier in self.quantifiers: + for i in tqdm(range(self.n_test_samples), desc='resampling', total=self.n_test_samples, disable=not self.verbose): + sample_i = resample(classif_predictions, n_samples=n_samples) + prev_i = quantifier.aggregate(sample_i) + prevs.append(prev_i) + + conf = WithConfidenceABC.construct_region(prevs, confidence_level, method=self.region) + prev_estim = conf.point_estimate() + + return prev_estim, conf + def aggregate_conf(self, classif_predictions: np.ndarray, confidence_level=None): if confidence_level is None: confidence_level = self.confidence_level @@ -449,10 +471,15 @@ class AggregativeBootstrap(WithConfidenceABC, AggregativeQuantifier): prevs = [] with qp.util.temp_seed(self.random_state): for quantifier in self.quantifiers: - for i in range(self.n_test_samples): - sample_i = resample(classif_predictions, n_samples=n_samples) - prev_i = quantifier.aggregate(sample_i) - prevs.append(prev_i) + results = Parallel(n_jobs=-1)( + delayed(bootstrap_once)(i, classif_predictions, quantifier, n_samples) + for i in range(self.n_test_samples) + ) + prevs.extend(results) + # for i in tqdm(range(self.n_test_samples), desc='resampling', total=self.n_test_samples, disable=not self.verbose): + # sample_i = resample(classif_predictions, n_samples=n_samples) + # prev_i = quantifier.aggregate(sample_i) + # prevs.append(prev_i) conf = WithConfidenceABC.construct_region(prevs, confidence_level, method=self.region) prev_estim = conf.point_estimate() @@ -477,6 +504,13 @@ class AggregativeBootstrap(WithConfidenceABC, AggregativeQuantifier): return self.quantifier._classifier_method() +def bootstrap_once(i, classif_predictions, quantifier, n_samples): + idx = np.random.randint(0, len(classif_predictions), n_samples) + sample = classif_predictions[idx] + prev = quantifier.aggregate(sample) + return prev + + class BayesianCC(AggregativeCrispQuantifier, WithConfidenceABC): """ `Bayesian quantification `_ method (by Albert Ziegler and Paweł Czyż), diff --git a/quapy/model_selection.py b/quapy/model_selection.py index 0937fa8..c357082 100644 --- a/quapy/model_selection.py +++ b/quapy/model_selection.py @@ -410,7 +410,7 @@ def group_params(param_grid: dict): """ classifier_params, quantifier_params = {}, {} for key, values in param_grid.items(): - if key.startswith('classifier__') or key == 'val_split': + if 'classifier__' in key or key == 'val_split': classifier_params[key] = values else: quantifier_params[key] = values From bfb64824105026329d418e8ecb9b58240ae9d864 Mon Sep 17 00:00:00 2001 From: Alejandro Moreo Date: Thu, 4 Dec 2025 10:34:38 +0100 Subject: [PATCH 22/41] launching kde** --- BayesianKDEy/full_experiments.py | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) diff --git a/BayesianKDEy/full_experiments.py b/BayesianKDEy/full_experiments.py index 2e0c493..ad8bbf4 100644 --- a/BayesianKDEy/full_experiments.py +++ b/BayesianKDEy/full_experiments.py @@ -35,18 +35,6 @@ class KDEyCLR(KDEyML): random_state=random_state, kernel='aitchison' ) -def methods__(): - acc_hyper = {} - hdy_hyper = {'nbins': [3,4,5,8,16,32]} - kdey_hyper = {'bandwidth': [0.001, 0.005, 0.01, 0.05, 0.1, 0.2], 'classifier__C':[1]} - wrap_hyper = lambda dic: {f'quantifier__{k}':v for k,v in dic.items()} - # yield 'BootstrapACC', AggregativeBootstrap(ACC(LR()), n_test_samples=1000, random_state=0), wrap_hyper(acc_hyper) - # yield 'BootstrapHDy', AggregativeBootstrap(DMy(LR(), divergence='HD'), n_test_samples=1000, random_state=0), wrap_hyper(hdy_hyper) - yield 'BootstrapKDEy', AggregativeBootstrap(KDEyML(LR()), n_test_samples=1000, random_state=0), wrap_hyper(kdey_hyper) - # yield 'BayesianACC', BayesianCC(LR(), mcmc_seed=0), acc_hyper - # yield 'BayesianHDy', PQ(LR(), stan_seed=0), hdy_hyper - # yield 'BayesianKDEy', BayesianKDEy(LR(), mcmc_seed=0), kdey_hyper - def methods(): """ @@ -66,10 +54,12 @@ def methods(): yield 'BayesianACC', ACC(LR()), acc_hyper, lambda hyper: BayesianCC(LR(), mcmc_seed=0) yield 'BootstrapHDy', DMy(LR()), hdy_hyper, lambda hyper: AggregativeBootstrap(DMy(LR(), **hyper), n_test_samples=1000, random_state=0), + # yield 'BayesianHDy', PQ(LR(), stan_seed=0), hdy_hyper yield 'BootstrapKDEy', KDEyML(LR()), kdey_hyper, lambda hyper: AggregativeBootstrap(KDEyML(LR(), **hyper), n_test_samples=1000, random_state=0, verbose=True), yield 'BayesianKDEy', KDEyML(LR()), kdey_hyper, lambda hyper: BayesianKDEy(mcmc_seed=0, **hyper), yield 'BayesianKDEy*', KDEyCLR(LR()), kdey_hyper_clr, lambda hyper: BayesianKDEy(kernel='aitchison', mcmc_seed=0, **hyper), + yield 'BayKDEy*CLR', KDEyCLR(LR()), kdey_hyper_clr, lambda hyper: BayesianKDEy(kernel='aitchison', mcmc_seed=0, explore_CLR=True, step_size=.15, **hyper), def model_selection(train: LabelledCollection, point_quantifier: AggregativeQuantifier, grid: dict): From 1080973d255864830e5d9c47f70bd63dafad805f Mon Sep 17 00:00:00 2001 From: Alejandro Moreo Date: Thu, 4 Dec 2025 15:42:00 +0100 Subject: [PATCH 23/41] recomputing confidence regions --- BayesianKDEy/TODO.txt | 1 + BayesianKDEy/generate_results.py | 20 +++--- BayesianKDEy/plot_simplex.py | 117 ++++++++++++++++++++++++++----- quapy/method/confidence.py | 35 ++++++++- 4 files changed, 143 insertions(+), 30 deletions(-) diff --git a/BayesianKDEy/TODO.txt b/BayesianKDEy/TODO.txt index ac989dd..5198f39 100644 --- a/BayesianKDEy/TODO.txt +++ b/BayesianKDEy/TODO.txt @@ -1,3 +1,4 @@ - Add other methods that natively provide uncertainty quantification methods? - Explore neighbourhood in the CLR space instead than in the simplex! +- CI con Bonferroni - \ No newline at end of file diff --git a/BayesianKDEy/generate_results.py b/BayesianKDEy/generate_results.py index 1cd309d..5aef993 100644 --- a/BayesianKDEy/generate_results.py +++ b/BayesianKDEy/generate_results.py @@ -60,18 +60,18 @@ for setup in ['binary', 'multiclass']: table['c-CI'].extend(results['coverage']) table['a-CI'].extend(results['amplitude']) - if 'coverage-CE' not in report: - covCE, ampCE = compute_coverage_amplitude(ConfidenceEllipseSimplex) - covCLR, ampCLR = compute_coverage_amplitude(ConfidenceEllipseCLR) + # if 'coverage-CE' not in report: + covCE, ampCE = compute_coverage_amplitude(ConfidenceEllipseSimplex) + covCLR, ampCLR = compute_coverage_amplitude(ConfidenceEllipseCLR) - update_fields = { - 'coverage-CE': covCE, - 'amplitude-CE': ampCE, - 'coverage-CLR': covCLR, - 'amplitude-CLR': ampCLR - } + update_fields = { + 'coverage-CE': covCE, + 'amplitude-CE': ampCE, + 'coverage-CLR': covCLR, + 'amplitude-CLR': ampCLR + } - update_pickle(report, file, update_fields) + update_pickle(report, file, update_fields) table['c-CE'].extend(report['coverage-CE']) table['a-CE'].extend(report['amplitude-CE']) diff --git a/BayesianKDEy/plot_simplex.py b/BayesianKDEy/plot_simplex.py index 515db3f..ebf84c7 100644 --- a/BayesianKDEy/plot_simplex.py +++ b/BayesianKDEy/plot_simplex.py @@ -1,9 +1,21 @@ +import os +import pickle +from pathlib import Path + import numpy as np import matplotlib.pyplot as plt +from matplotlib.colors import ListedColormap from scipy.stats import gaussian_kde +from method.confidence import ConfidenceIntervals, ConfidenceEllipseSimplex, ConfidenceEllipseCLR + + +def plot_prev_points(prevs=None, true_prev=None, point_estim=None, train_prev=None, show_mean=True, show_legend=True, + region=None, + region_resolution=1000, + confine_region_in_simplex=False, + save_path=None): -def plot_prev_points(prevs, true_prev, point_estim, train_prev): plt.rcParams.update({ 'font.size': 10, # tamaño base de todo el texto 'axes.titlesize': 12, # título del eje @@ -20,6 +32,15 @@ def plot_prev_points(prevs, true_prev, point_estim, train_prev): y = p[:, 2] * np.sqrt(3) / 2 return x, y + def barycentric_from_xy(x, y): + """ + Given cartesian (x,y) in simplex returns baricentric coordinates (p1,p2,p3). + """ + p3 = 2 * y / np.sqrt(3) + p2 = x - 0.5 * p3 + p1 = 1 - p2 - p3 + return np.stack([p1, p2, p3], axis=-1) + # simplex coordinates v1 = np.array([0, 0]) v2 = np.array([1, 0]) @@ -27,31 +48,78 @@ def plot_prev_points(prevs, true_prev, point_estim, train_prev): # Plot fig, ax = plt.subplots(figsize=(6, 6)) - ax.scatter(*cartesian(prevs), s=10, alpha=0.5, edgecolors='none', label='samples') - ax.scatter(*cartesian(prevs.mean(axis=0)), s=10, alpha=1, label='sample-mean', edgecolors='black') - ax.scatter(*cartesian(true_prev), s=10, alpha=1, label='true-prev', edgecolors='black') - ax.scatter(*cartesian(point_estim), s=10, alpha=1, label='KDEy-estim', edgecolors='black') - ax.scatter(*cartesian(train_prev), s=10, alpha=1, label='train-prev', edgecolors='black') + + if region is not None: + # rectangular mesh + xs = np.linspace(0, 1, region_resolution) + ys = np.linspace(0, np.sqrt(3)/2, region_resolution) + grid_x, grid_y = np.meshgrid(xs, ys) + + # 2 barycentric + pts_bary = barycentric_from_xy(grid_x, grid_y) + + # mask within simplex + if confine_region_in_simplex: + in_simplex = np.all(pts_bary >= 0, axis=-1) + else: + in_simplex = np.full(shape=(region_resolution, region_resolution), fill_value=True, dtype=bool) + + # evaluar la región solo en puntos válidos + mask = np.zeros_like(in_simplex, dtype=float) + valid_pts = pts_bary[in_simplex] + + mask_vals = np.array([float(region(p)) for p in valid_pts]) + mask[in_simplex] = mask_vals + + # pintar el fondo + + white_and_color = ListedColormap([ + (1, 1, 1, 1), # color for value 0 + (0.7, .0, .0, .5) # color for value 1 + ]) + + + ax.pcolormesh( + xs, ys, mask, + shading='auto', + cmap=white_and_color, + alpha=0.5 + ) + + ax.scatter(*cartesian(prevs), s=15, alpha=0.5, edgecolors='none', label='samples') + if show_mean: + ax.scatter(*cartesian(prevs.mean(axis=0)), s=10, alpha=1, label='sample-mean', edgecolors='black') + if train_prev is not None: + ax.scatter(*cartesian(true_prev), s=10, alpha=1, label='true-prev', edgecolors='black') + if point_estim is not None: + ax.scatter(*cartesian(point_estim), s=10, alpha=1, label='KDEy-estim', edgecolors='black') + if train_prev is not None: + ax.scatter(*cartesian(train_prev), s=10, alpha=1, label='train-prev', edgecolors='black') + + # edges triangle = np.array([v1, v2, v3, v1]) ax.plot(triangle[:, 0], triangle[:, 1], color='black') # vertex labels - ax.text(-0.05, -0.05, "y=0", ha='right', va='top') - ax.text(1.05, -0.05, "y=1", ha='left', va='top') - ax.text(0.5, np.sqrt(3)/2 + 0.05, "y=2", ha='center', va='bottom') + ax.text(-0.05, -0.05, "Y=1", ha='right', va='top') + ax.text(1.05, -0.05, "Y=2", ha='left', va='top') + ax.text(0.5, np.sqrt(3)/2 + 0.05, "Y=3", ha='center', va='bottom') ax.set_aspect('equal') ax.axis('off') - plt.legend( - loc='center left', - bbox_to_anchor=(1.05, 0.5), - # ncol=3, - # frameon=False - ) + if show_legend: + plt.legend( + loc='center left', + bbox_to_anchor=(1.05, 0.5), + ) plt.tight_layout() - plt.show() + if save_path is None: + plt.show() + else: + os.makedirs(Path(save_path).parent, exist_ok=True) + plt.savefig(save_path) def plot_prev_points_matplot(points): @@ -107,6 +175,19 @@ def plot_prev_points_matplot(points): plt.show() if __name__ == '__main__': + np.random.seed(1) + n = 1000 - points = np.random.dirichlet([2, 3, 4], size=n) - plot_prev_points_matplot(points) + alpha = [3,5,10] + prevs = np.random.dirichlet(alpha, size=n) + + def regions(): + yield 'CI', ConfidenceIntervals(prevs) + yield 'CE', ConfidenceEllipseSimplex(prevs) + yield 'CLR', ConfidenceEllipseCLR(prevs) + + alpha_str = ','.join([f'{str(i)}' for i in alpha]) + for crname, cr in regions(): + plot_prev_points(prevs, show_mean=True, show_legend=False, region=cr.coverage, region_resolution=5000, + save_path=f'./plots/simplex_{crname}_alpha{alpha_str}.png') + diff --git a/quapy/method/confidence.py b/quapy/method/confidence.py index c308dfb..6ff713a 100644 --- a/quapy/method/confidence.py +++ b/quapy/method/confidence.py @@ -155,7 +155,7 @@ def simplex_volume(n): return 1 / factorial(n) -def within_ellipse_prop(values, mean, prec_matrix, chi2_critical): +def within_ellipse_prop__(values, mean, prec_matrix, chi2_critical): """ Checks the proportion of values that belong to the ellipse with center `mean` and precision matrix `prec_matrix` at a distance `chi2_critical`. @@ -188,6 +188,36 @@ def within_ellipse_prop(values, mean, prec_matrix, chi2_critical): return within_elipse * 1.0 +def within_ellipse_prop(values, mean, prec_matrix, chi2_critical): + """ + Checks the proportion of values that belong to the ellipse with center `mean` and precision matrix `prec_matrix` + at a distance `chi2_critical`. + + :param values: a np.ndarray of shape (n_dim,) or (n_values, n_dim,) + :param mean: a np.ndarray of shape (n_dim,) with the center of the ellipse + :param prec_matrix: a np.ndarray with the precision matrix (inverse of the + covariance matrix) of the ellipse. If this inverse cannot be computed + then None must be passed + :param chi2_critical: float, the chi2 critical value + + :return: float in [0,1], the fraction of values that are contained in the ellipse + defined by the mean (center), the precision matrix (shape), and the chi2_critical value (distance). + If `values` is only one value, then either 0. (not contained) or 1. (contained) is returned. + """ + if prec_matrix is None: + return 0. + + values = np.atleast_2d(values) + diff = values - mean + d_M_squared = np.sum(diff @ prec_matrix * diff, axis=-1) + within_ellipse = d_M_squared <= chi2_critical + + if len(within_ellipse) == 1: + return float(within_ellipse[0]) + else: + return float(np.mean(within_ellipse)) + + class ConfidenceEllipseSimplex(ConfidenceRegionABC): """ Instantiates a Confidence Ellipse in the probability simplex. @@ -206,7 +236,7 @@ class ConfidenceEllipseSimplex(ConfidenceRegionABC): self.cov_ = np.cov(samples, rowvar=False, ddof=1) try: - self.precision_matrix_ = np.linalg.inv(self.cov_) + self.precision_matrix_ = np.linalg.pinv(self.cov_) except: self.precision_matrix_ = None @@ -354,6 +384,7 @@ class CLRtransformation: G = np.exp(np.mean(np.log(X), axis=-1, keepdims=True)) # geometric mean return np.log(X / G) + def inverse(self, X): """ Inverse function. However, clr.inverse(clr(X)) does not exactly coincide with X due to smoothing. From 78fd05ab337152a65b5933b7050d1d048a9bd1e3 Mon Sep 17 00:00:00 2001 From: Alejandro Moreo Date: Thu, 4 Dec 2025 15:43:31 +0100 Subject: [PATCH 24/41] repairing safe check --- BayesianKDEy/generate_results.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/BayesianKDEy/generate_results.py b/BayesianKDEy/generate_results.py index 5aef993..1cd309d 100644 --- a/BayesianKDEy/generate_results.py +++ b/BayesianKDEy/generate_results.py @@ -60,18 +60,18 @@ for setup in ['binary', 'multiclass']: table['c-CI'].extend(results['coverage']) table['a-CI'].extend(results['amplitude']) - # if 'coverage-CE' not in report: - covCE, ampCE = compute_coverage_amplitude(ConfidenceEllipseSimplex) - covCLR, ampCLR = compute_coverage_amplitude(ConfidenceEllipseCLR) + if 'coverage-CE' not in report: + covCE, ampCE = compute_coverage_amplitude(ConfidenceEllipseSimplex) + covCLR, ampCLR = compute_coverage_amplitude(ConfidenceEllipseCLR) - update_fields = { - 'coverage-CE': covCE, - 'amplitude-CE': ampCE, - 'coverage-CLR': covCLR, - 'amplitude-CLR': ampCLR - } + update_fields = { + 'coverage-CE': covCE, + 'amplitude-CE': ampCE, + 'coverage-CLR': covCLR, + 'amplitude-CLR': ampCLR + } - update_pickle(report, file, update_fields) + update_pickle(report, file, update_fields) table['c-CE'].extend(report['coverage-CE']) table['a-CE'].extend(report['amplitude-CE']) From 90981088b0b9b5956b3beefb78bdd300a6c275ae Mon Sep 17 00:00:00 2001 From: Alejandro Moreo Date: Thu, 4 Dec 2025 18:00:01 +0100 Subject: [PATCH 25/41] launching PQ --- BayesianKDEy/full_experiments.py | 5 +++-- quapy/method/confidence.py | 5 ++++- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/BayesianKDEy/full_experiments.py b/BayesianKDEy/full_experiments.py index ad8bbf4..3c99bc5 100644 --- a/BayesianKDEy/full_experiments.py +++ b/BayesianKDEy/full_experiments.py @@ -54,12 +54,13 @@ def methods(): yield 'BayesianACC', ACC(LR()), acc_hyper, lambda hyper: BayesianCC(LR(), mcmc_seed=0) yield 'BootstrapHDy', DMy(LR()), hdy_hyper, lambda hyper: AggregativeBootstrap(DMy(LR(), **hyper), n_test_samples=1000, random_state=0), - # yield 'BayesianHDy', PQ(LR(), stan_seed=0), hdy_hyper + yield 'BayesianHDy', DMy(LR()), hdy_hyper, lambda hyper: PQ(LR(), stan_seed=0, **hyper), yield 'BootstrapKDEy', KDEyML(LR()), kdey_hyper, lambda hyper: AggregativeBootstrap(KDEyML(LR(), **hyper), n_test_samples=1000, random_state=0, verbose=True), yield 'BayesianKDEy', KDEyML(LR()), kdey_hyper, lambda hyper: BayesianKDEy(mcmc_seed=0, **hyper), yield 'BayesianKDEy*', KDEyCLR(LR()), kdey_hyper_clr, lambda hyper: BayesianKDEy(kernel='aitchison', mcmc_seed=0, **hyper), - yield 'BayKDEy*CLR', KDEyCLR(LR()), kdey_hyper_clr, lambda hyper: BayesianKDEy(kernel='aitchison', mcmc_seed=0, explore_CLR=True, step_size=.15, **hyper), + yield 'BayKDEy*CLR', KDEyCLR(LR()), kdey_hyper_clr, lambda hyper: BayesianKDEy(kernel='aitchison', mcmc_seed=0, explore_CLR=True, step_size=.1, **hyper), + # yield 'BayKDEy*CLR2', KDEyCLR(LR()), kdey_hyper_clr, lambda hyper: BayesianKDEy(kernel='aitchison', mcmc_seed=0, explore_CLR=True, step_size=.15, **hyper), def model_selection(train: LabelledCollection, point_quantifier: AggregativeQuantifier, grid: dict): diff --git a/quapy/method/confidence.py b/quapy/method/confidence.py index 6ff713a..7a0161b 100644 --- a/quapy/method/confidence.py +++ b/quapy/method/confidence.py @@ -323,13 +323,16 @@ class ConfidenceIntervals(ConfidenceRegionABC): :param samples: np.ndarray of shape (n_bootstrap_samples, n_classes) :param confidence_level: float, the confidence level (default 0.95) """ - def __init__(self, samples, confidence_level=0.95): + def __init__(self, samples, confidence_level=0.95, bonferroni_correction=False): assert 0 < confidence_level < 1, f'{confidence_level=} must be in range(0,1)' samples = np.asarray(samples) self.means_ = samples.mean(axis=0) alpha = 1-confidence_level + if bonferroni_correction: + n_classes = samples.shape[-1] + alpha = alpha/n_classes low_perc = (alpha/2.)*100 high_perc = (1-alpha/2.)*100 self.I_low, self.I_high = np.percentile(samples, q=[low_perc, high_perc], axis=0) From d87625bd097b33394b6d0c72822fc68711337705 Mon Sep 17 00:00:00 2001 From: Alejandro Moreo Date: Thu, 4 Dec 2025 18:27:15 +0100 Subject: [PATCH 26/41] adding new config for bayesian --- BayesianKDEy/full_experiments.py | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/BayesianKDEy/full_experiments.py b/BayesianKDEy/full_experiments.py index 3c99bc5..9f83f29 100644 --- a/BayesianKDEy/full_experiments.py +++ b/BayesianKDEy/full_experiments.py @@ -49,18 +49,20 @@ def methods(): hdy_hyper = {'nbins': [3,4,5,8,16,32]} kdey_hyper = {'bandwidth': [0.001, 0.005, 0.01, 0.05, 0.1, 0.2]} kdey_hyper_clr = {'bandwidth': [0.05, 0.1, 0.5, 1., 2., 5.]} + only_binary = True + multiclass_method = False - yield 'BootstrapACC', ACC(LR()), acc_hyper, lambda hyper: AggregativeBootstrap(ACC(LR()), n_test_samples=1000, random_state=0), - yield 'BayesianACC', ACC(LR()), acc_hyper, lambda hyper: BayesianCC(LR(), mcmc_seed=0) + yield 'BootstrapACC', ACC(LR()), acc_hyper, lambda hyper: AggregativeBootstrap(ACC(LR()), n_test_samples=1000, random_state=0), multiclass_method + yield 'BayesianACC', ACC(LR()), acc_hyper, lambda hyper: BayesianCC(LR(), mcmc_seed=0), multiclass_method - yield 'BootstrapHDy', DMy(LR()), hdy_hyper, lambda hyper: AggregativeBootstrap(DMy(LR(), **hyper), n_test_samples=1000, random_state=0), - yield 'BayesianHDy', DMy(LR()), hdy_hyper, lambda hyper: PQ(LR(), stan_seed=0, **hyper), + yield 'BootstrapHDy', DMy(LR()), hdy_hyper, lambda hyper: AggregativeBootstrap(DMy(LR(), **hyper), n_test_samples=1000, random_state=0), multiclass_method + yield 'BayesianHDy', DMy(LR()), hdy_hyper, lambda hyper: PQ(LR(), stan_seed=0, **hyper), only_binary - yield 'BootstrapKDEy', KDEyML(LR()), kdey_hyper, lambda hyper: AggregativeBootstrap(KDEyML(LR(), **hyper), n_test_samples=1000, random_state=0, verbose=True), - yield 'BayesianKDEy', KDEyML(LR()), kdey_hyper, lambda hyper: BayesianKDEy(mcmc_seed=0, **hyper), - yield 'BayesianKDEy*', KDEyCLR(LR()), kdey_hyper_clr, lambda hyper: BayesianKDEy(kernel='aitchison', mcmc_seed=0, **hyper), - yield 'BayKDEy*CLR', KDEyCLR(LR()), kdey_hyper_clr, lambda hyper: BayesianKDEy(kernel='aitchison', mcmc_seed=0, explore_CLR=True, step_size=.1, **hyper), - # yield 'BayKDEy*CLR2', KDEyCLR(LR()), kdey_hyper_clr, lambda hyper: BayesianKDEy(kernel='aitchison', mcmc_seed=0, explore_CLR=True, step_size=.15, **hyper), + yield 'BootstrapKDEy', KDEyML(LR()), kdey_hyper, lambda hyper: AggregativeBootstrap(KDEyML(LR(), **hyper), n_test_samples=1000, random_state=0, verbose=True), multiclass_method + yield 'BayesianKDEy', KDEyML(LR()), kdey_hyper, lambda hyper: BayesianKDEy(mcmc_seed=0, **hyper), multiclass_method + yield 'BayesianKDEy*', KDEyCLR(LR()), kdey_hyper_clr, lambda hyper: BayesianKDEy(kernel='aitchison', mcmc_seed=0, **hyper), multiclass_method + yield 'BayKDEy*CLR', KDEyCLR(LR()), kdey_hyper_clr, lambda hyper: BayesianKDEy(kernel='aitchison', mcmc_seed=0, explore_CLR=True, step_size=.15, **hyper), multiclass_method + yield 'BayKDEy*CLR2', KDEyCLR(LR()), kdey_hyper_clr, lambda hyper: BayesianKDEy(kernel='aitchison', mcmc_seed=0, explore_CLR=True, step_size=.05, **hyper), multiclass_method def model_selection(train: LabelledCollection, point_quantifier: AggregativeQuantifier, grid: dict): @@ -158,8 +160,8 @@ if __name__ == '__main__': is_binary = data.n_classes==2 result_subdir = result_dir / ('binary' if is_binary else 'multiclass') hyper_subdir = result_dir / 'hyperparams' / ('binary' if is_binary else 'multiclass') - for method_name, method, hyper_params, withconf_constructor in methods(): - if isinstance(method, BinaryQuantifier) and not is_binary: + for method_name, method, hyper_params, withconf_constructor, only_binary in methods(): + if only_binary and not is_binary: continue result_path = experiment_path(result_subdir, data_name, method_name) hyper_path = experiment_path(hyper_subdir, data_name, method.__class__.__name__) From b180aae16c933659e482478b3ff68de6ce85585d Mon Sep 17 00:00:00 2001 From: Alejandro Moreo Date: Thu, 4 Dec 2025 19:16:52 +0100 Subject: [PATCH 27/41] added ILR to conf regions --- BayesianKDEy/generate_results.py | 33 ++++--- BayesianKDEy/plot_simplex.py | 8 +- quapy/method/confidence.py | 158 ++++++++++++++++++++++++------- 3 files changed, 146 insertions(+), 53 deletions(-) diff --git a/BayesianKDEy/generate_results.py b/BayesianKDEy/generate_results.py index 1cd309d..b8127e3 100644 --- a/BayesianKDEy/generate_results.py +++ b/BayesianKDEy/generate_results.py @@ -7,7 +7,7 @@ import pandas as pd from glob import glob from pathlib import Path import quapy as qp -from quapy.method.confidence import ConfidenceEllipseSimplex, ConfidenceEllipseCLR +from quapy.method.confidence import ConfidenceEllipseSimplex, ConfidenceEllipseCLR, ConfidenceEllipseILR pd.set_option('display.max_columns', None) pd.set_option('display.width', 2000) @@ -45,6 +45,17 @@ def update_pickle(report, pickle_path, updated_dict:dict): pickle.dump(report, open(pickle_path, 'wb'), protocol=pickle.HIGHEST_PROTOCOL) +def update_pickle_with_region(report, file, conf_name, conf_region_class): + if f'coverage-{conf_name}' not in report: + cov, amp = compute_coverage_amplitude(conf_region_class) + + update_fields = { + f'coverage-{conf_name}': cov, + f'amplitude-{conf_name}': amp, + } + + update_pickle(report, file, update_fields) + for setup in ['binary', 'multiclass']: path = f'./results/{setup}/*.pkl' table = defaultdict(list) @@ -60,18 +71,9 @@ for setup in ['binary', 'multiclass']: table['c-CI'].extend(results['coverage']) table['a-CI'].extend(results['amplitude']) - if 'coverage-CE' not in report: - covCE, ampCE = compute_coverage_amplitude(ConfidenceEllipseSimplex) - covCLR, ampCLR = compute_coverage_amplitude(ConfidenceEllipseCLR) - - update_fields = { - 'coverage-CE': covCE, - 'amplitude-CE': ampCE, - 'coverage-CLR': covCLR, - 'amplitude-CLR': ampCLR - } - - update_pickle(report, file, update_fields) + update_pickle_with_region(report, file, conf_name='CE', conf_region_class=ConfidenceEllipseSimplex) + update_pickle_with_region(report, file, conf_name='CLR', conf_region_class=ConfidenceEllipseCLR) + update_pickle_with_region(report, file, conf_name='ILR', conf_region_class=ConfidenceEllipseILR) table['c-CE'].extend(report['coverage-CE']) table['a-CE'].extend(report['amplitude-CE']) @@ -79,6 +81,9 @@ for setup in ['binary', 'multiclass']: table['c-CLR'].extend(report['coverage-CLR']) table['a-CLR'].extend(report['amplitude-CLR']) + table['c-ILR'].extend(report['coverage-ILR']) + table['a-ILR'].extend(report['amplitude-ILR']) + df = pd.DataFrame(table) @@ -99,7 +104,7 @@ for setup in ['binary', 'multiclass']: if n > max_classes: df = df[df["dataset"] != data_name] - for region in ['CI', 'CE', 'CLR']: + for region in ['CI', 'CE', 'CLR', 'ILR']: pv = pd.pivot_table( df, index='dataset', columns='method', values=['ae', f'c-{region}', f'a-{region}'], margins=True ) diff --git a/BayesianKDEy/plot_simplex.py b/BayesianKDEy/plot_simplex.py index ebf84c7..8ca8e8c 100644 --- a/BayesianKDEy/plot_simplex.py +++ b/BayesianKDEy/plot_simplex.py @@ -7,7 +7,7 @@ import matplotlib.pyplot as plt from matplotlib.colors import ListedColormap from scipy.stats import gaussian_kde -from method.confidence import ConfidenceIntervals, ConfidenceEllipseSimplex, ConfidenceEllipseCLR +from method.confidence import ConfidenceIntervals, ConfidenceEllipseSimplex, ConfidenceEllipseCLR, ConfidenceEllipseILR def plot_prev_points(prevs=None, true_prev=None, point_estim=None, train_prev=None, show_mean=True, show_legend=True, @@ -185,9 +185,11 @@ if __name__ == '__main__': yield 'CI', ConfidenceIntervals(prevs) yield 'CE', ConfidenceEllipseSimplex(prevs) yield 'CLR', ConfidenceEllipseCLR(prevs) + yield 'ILR', ConfidenceEllipseILR(prevs) + resolution = 100 alpha_str = ','.join([f'{str(i)}' for i in alpha]) for crname, cr in regions(): - plot_prev_points(prevs, show_mean=True, show_legend=False, region=cr.coverage, region_resolution=5000, - save_path=f'./plots/simplex_{crname}_alpha{alpha_str}.png') + plot_prev_points(prevs, show_mean=True, show_legend=False, region=cr.coverage, region_resolution=resolution, + save_path=f'./plots/simplex_{crname}_alpha{alpha_str}_res{resolution}.png') diff --git a/quapy/method/confidence.py b/quapy/method/confidence.py index 7a0161b..33a475d 100644 --- a/quapy/method/confidence.py +++ b/quapy/method/confidence.py @@ -218,6 +218,98 @@ def within_ellipse_prop(values, mean, prec_matrix, chi2_critical): return float(np.mean(within_ellipse)) +class CompositionalTransformation(ABC): + """ + Abstract class of transformations from compositional data. + Basically, callable functions with an "inverse" function. + """ + @abstractmethod + def __call__(self, X): ... + + @abstractmethod + def inverse(self, Z): ... + + EPSILON=1e-6 + + + +class CLRtransformation(CompositionalTransformation): + """ + Centered log-ratio (CLR), from compositional analysis + """ + def __call__(self, X): + """ + Applies the CLR function to X thus mapping the instances, which are contained in `\\mathcal{R}^{n}` but + actually lie on a `\\mathcal{R}^{n-1}` simplex, onto an unrestricted space in :math:`\\mathcal{R}^{n}` + + :param X: np.ndarray of (n_instances, n_dimensions) to be transformed + :param epsilon: small float for prevalence smoothing + :return: np.ndarray of (n_instances, n_dimensions), the CLR-transformed points + """ + X = np.asarray(X) + X = qp.error.smooth(X, self.EPSILON) + G = np.exp(np.mean(np.log(X), axis=-1, keepdims=True)) # geometric mean + return np.log(X / G) + + def inverse(self, Z): + """ + Inverse function. However, clr.inverse(clr(X)) does not exactly coincide with X due to smoothing. + + :param Z: np.ndarray of (n_instances, n_dimensions) to be transformed + :return: np.ndarray of (n_instances, n_dimensions), the CLR-transformed points + """ + return softmax(Z, axis=-1) + + +class ILRtransformation(CompositionalTransformation): + """ + Isometric log-ratio (ILR), from compositional analysis + """ + def __call__(self, X): + X = np.asarray(X) + X = qp.error.smooth(X, self.EPSILON) + k = X.shape[-1] + V = self.get_V(k) # (k-1, k) + logp = np.log(X) + return logp @ V.T + + def inverse(self, Z): + Z = np.asarray(Z) + # get dimension + k_minus_1 = Z.shape[-1] + k = k_minus_1 + 1 + V = self.get_V(k) # (k-1, k) + logp = Z @ V + p = np.exp(logp) + p = p / np.sum(p, axis=-1, keepdims=True) + return p + + @lru_cache(maxsize=None) + def get_V(self, k): + def helmert_matrix(k): + """ + Returns the (k x k) Helmert matrix. + """ + H = np.zeros((k, k)) + for i in range(1, k): + H[i, :i] = 1 + H[i, i] = -(i) + H[i] = H[i] / np.sqrt(i * (i + 1)) + # row 0 stays zeros; will be discarded + return H + + def ilr_basis(k): + """ + Constructs an orthonormal ILR basis using the Helmert submatrix. + Output shape: (k-1, k) + """ + H = helmert_matrix(k) + V = H[1:, :] # remove first row of zeros + return V + + return ilr_basis(k) + + class ConfidenceEllipseSimplex(ConfidenceRegionABC): """ Instantiates a Confidence Ellipse in the probability simplex. @@ -272,20 +364,20 @@ class ConfidenceEllipseSimplex(ConfidenceRegionABC): return within_ellipse_prop(true_value, self.mean_, self.precision_matrix_, self.chi2_critical_) -class ConfidenceEllipseCLR(ConfidenceRegionABC): +class ConfidenceEllipseTransformed(ConfidenceRegionABC): """ - Instantiates a Confidence Ellipse in the Centered-Log Ratio (CLR) space. + Instantiates a Confidence Ellipse in a transformed space. :param samples: np.ndarray of shape (n_bootstrap_samples, n_classes) :param confidence_level: float, the confidence level (default 0.95) """ - def __init__(self, samples, confidence_level=0.95): + def __init__(self, samples, transformation: CompositionalTransformation, confidence_level=0.95): samples = np.asarray(samples) - self.clr = CLRtransformation() - Z = self.clr(samples) + self.transformation = transformation + Z = self.transformation(samples) self.mean_ = np.mean(samples, axis=0) - self.conf_region_clr = ConfidenceEllipseSimplex(Z, confidence_level=confidence_level) + self.conf_region_z = ConfidenceEllipseSimplex(Z, confidence_level=confidence_level) self._samples = samples @property @@ -312,8 +404,30 @@ class ConfidenceEllipseCLR(ConfidenceRegionABC): :param true_value: a np.ndarray of shape (n_classes,) or shape (n_values, n_classes,) :return: float in [0,1] """ - transformed_values = self.clr(true_value) - return self.conf_region_clr.coverage(transformed_values) + transformed_values = self.transformation(true_value) + return self.conf_region_z.coverage(transformed_values) + + +class ConfidenceEllipseCLR(ConfidenceEllipseTransformed): + """ + Instantiates a Confidence Ellipse in the Centered-Log Ratio (CLR) space. + + :param samples: np.ndarray of shape (n_bootstrap_samples, n_classes) + :param confidence_level: float, the confidence level (default 0.95) + """ + def __init__(self, samples, confidence_level=0.95): + super().__init__(samples, CLRtransformation(), confidence_level=confidence_level) + + +class ConfidenceEllipseILR(ConfidenceEllipseTransformed): + """ + Instantiates a Confidence Ellipse in the Isometric-Log Ratio (CLR) space. + + :param samples: np.ndarray of shape (n_bootstrap_samples, n_classes) + :param confidence_level: float, the confidence level (default 0.95) + """ + def __init__(self, samples, confidence_level=0.95): + super().__init__(samples, ILRtransformation(), confidence_level=confidence_level) class ConfidenceIntervals(ConfidenceRegionABC): @@ -369,34 +483,6 @@ class ConfidenceIntervals(ConfidenceRegionABC): return '['+', '.join(f'({low:.4f}, {high:.4f})' for (low,high) in zip(self.I_low, self.I_high))+']' -class CLRtransformation: - """ - Centered log-ratio, from component analysis - """ - def __call__(self, X, epsilon=1e-6): - """ - Applies the CLR function to X thus mapping the instances, which are contained in `\\mathcal{R}^{n}` but - actually lie on a `\\mathcal{R}^{n-1}` simplex, onto an unrestricted space in :math:`\\mathcal{R}^{n}` - - :param X: np.ndarray of (n_instances, n_dimensions) to be transformed - :param epsilon: small float for prevalence smoothing - :return: np.ndarray of (n_instances, n_dimensions), the CLR-transformed points - """ - X = np.asarray(X) - X = qp.error.smooth(X, epsilon) - G = np.exp(np.mean(np.log(X), axis=-1, keepdims=True)) # geometric mean - return np.log(X / G) - - - def inverse(self, X): - """ - Inverse function. However, clr.inverse(clr(X)) does not exactly coincide with X due to smoothing. - - :param X: np.ndarray of (n_instances, n_dimensions) to be transformed - :return: np.ndarray of (n_instances, n_dimensions), the CLR-transformed points - """ - return softmax(X, axis=-1) - class AggregativeBootstrap(WithConfidenceABC, AggregativeQuantifier): """ From e8d175106f112f8afd33745366866f9929932f70 Mon Sep 17 00:00:00 2001 From: Alejandro Moreo Date: Thu, 4 Dec 2025 20:02:26 +0100 Subject: [PATCH 28/41] adding experiment with ILR --- BayesianKDEy/_bayeisan_kdey.py | 20 +++++++++++++++----- BayesianKDEy/full_experiments.py | 5 +++-- BayesianKDEy/generate_results.py | 2 ++ BayesianKDEy/single_experiment_debug.py | 2 +- 4 files changed, 21 insertions(+), 8 deletions(-) diff --git a/BayesianKDEy/_bayeisan_kdey.py b/BayesianKDEy/_bayeisan_kdey.py index 16401bf..37fd280 100644 --- a/BayesianKDEy/_bayeisan_kdey.py +++ b/BayesianKDEy/_bayeisan_kdey.py @@ -1,7 +1,7 @@ from sklearn.base import BaseEstimator import numpy as np from quapy.method._kdey import KDEBase -from quapy.method.confidence import WithConfidenceABC, ConfidenceRegionABC, CLRtransformation +from quapy.method.confidence import WithConfidenceABC, ConfidenceRegionABC, CLRtransformation, ILRtransformation from quapy.method.aggregative import AggregativeSoftQuantifier from tqdm import tqdm import quapy.functional as F @@ -40,7 +40,7 @@ class BayesianKDEy(AggregativeSoftQuantifier, KDEBase, WithConfidenceABC): mcmc_seed: int = 0, confidence_level: float = 0.95, region: str = 'intervals', - explore_CLR=False, + explore='simplex', step_size=0.05, verbose: bool = False): @@ -48,6 +48,8 @@ class BayesianKDEy(AggregativeSoftQuantifier, KDEBase, WithConfidenceABC): raise ValueError(f'parameter {num_warmup=} must be a positive integer') if num_samples <= 0: raise ValueError(f'parameter {num_samples=} must be a positive integer') + assert explore in ['simplex', 'clr', 'ilr'], \ + f'unexpected value for param {explore=}; valid ones are "simplex", "clr", and "ilr"' super().__init__(classifier, fit_classifier, val_split) self.bandwidth = KDEBase._check_bandwidth(bandwidth, kernel) @@ -57,7 +59,7 @@ class BayesianKDEy(AggregativeSoftQuantifier, KDEBase, WithConfidenceABC): self.mcmc_seed = mcmc_seed self.confidence_level = confidence_level self.region = region - self.explore_CLR = explore_CLR + self.explore = explore self.step_size = step_size self.verbose = verbose @@ -112,16 +114,24 @@ class BayesianKDEy(AggregativeSoftQuantifier, KDEBase, WithConfidenceABC): def sample_neighbour(prev, step_size): # random-walk Metropolis-Hastings d = len(prev) - if not self.explore_CLR: + neighbour = None + if self.explore=='simplex': dir_noise = rng.normal(scale=step_size/np.sqrt(d), size=d) neighbour = F.normalize_prevalence(prev + dir_noise, method='mapsimplex') - else: + elif self.explore=='clr': clr = CLRtransformation() clr_point = clr(prev) dir_noise = rng.normal(scale=step_size, size=d) clr_neighbour = clr_point+dir_noise neighbour = clr.inverse(clr_neighbour) assert in_simplex(neighbour), 'wrong CLR transformation' + elif self.explore=='ilr': + ilr = ILRtransformation() + ilr_point = ilr(prev) + dir_noise = rng.normal(scale=step_size, size=d) + ilr_neighbour = ilr_point + dir_noise + neighbour = ilr.inverse(ilr_neighbour) + assert in_simplex(neighbour), 'wrong ILR transformation' return neighbour n_classes = X_probs.shape[1] diff --git a/BayesianKDEy/full_experiments.py b/BayesianKDEy/full_experiments.py index 9f83f29..d821cb5 100644 --- a/BayesianKDEy/full_experiments.py +++ b/BayesianKDEy/full_experiments.py @@ -61,8 +61,9 @@ def methods(): yield 'BootstrapKDEy', KDEyML(LR()), kdey_hyper, lambda hyper: AggregativeBootstrap(KDEyML(LR(), **hyper), n_test_samples=1000, random_state=0, verbose=True), multiclass_method yield 'BayesianKDEy', KDEyML(LR()), kdey_hyper, lambda hyper: BayesianKDEy(mcmc_seed=0, **hyper), multiclass_method yield 'BayesianKDEy*', KDEyCLR(LR()), kdey_hyper_clr, lambda hyper: BayesianKDEy(kernel='aitchison', mcmc_seed=0, **hyper), multiclass_method - yield 'BayKDEy*CLR', KDEyCLR(LR()), kdey_hyper_clr, lambda hyper: BayesianKDEy(kernel='aitchison', mcmc_seed=0, explore_CLR=True, step_size=.15, **hyper), multiclass_method - yield 'BayKDEy*CLR2', KDEyCLR(LR()), kdey_hyper_clr, lambda hyper: BayesianKDEy(kernel='aitchison', mcmc_seed=0, explore_CLR=True, step_size=.05, **hyper), multiclass_method + yield 'BayKDEy*CLR', KDEyCLR(LR()), kdey_hyper_clr, lambda hyper: BayesianKDEy(kernel='aitchison', mcmc_seed=0, explore='clr', step_size=.15, **hyper), multiclass_method + # yield 'BayKDEy*CLR2', KDEyCLR(LR()), kdey_hyper_clr, lambda hyper: BayesianKDEy(kernel='aitchison', mcmc_seed=0, explore='clr', step_size=.05, **hyper), multiclass_method + yield 'BayKDEy*ILR', KDEyCLR(LR()), kdey_hyper_clr, lambda hyper: BayesianKDEy(kernel='aitchison', mcmc_seed=0, explore='ilr', step_size=.15, **hyper), multiclass_method def model_selection(train: LabelledCollection, point_quantifier: AggregativeQuantifier, grid: dict): diff --git a/BayesianKDEy/generate_results.py b/BayesianKDEy/generate_results.py index b8127e3..b9b0e4e 100644 --- a/BayesianKDEy/generate_results.py +++ b/BayesianKDEy/generate_results.py @@ -105,6 +105,8 @@ for setup in ['binary', 'multiclass']: df = df[df["dataset"] != data_name] for region in ['CI', 'CE', 'CLR', 'ILR']: + if setup == 'binary' and region=='ILR': + continue pv = pd.pivot_table( df, index='dataset', columns='method', values=['ae', f'c-{region}', f'a-{region}'], margins=True ) diff --git a/BayesianKDEy/single_experiment_debug.py b/BayesianKDEy/single_experiment_debug.py index 6cbe8a7..fb3be01 100644 --- a/BayesianKDEy/single_experiment_debug.py +++ b/BayesianKDEy/single_experiment_debug.py @@ -48,7 +48,7 @@ def method(): # yield 'BootstrapKDEy', KDEyML(LR()), kdey_hyper, lambda hyper: AggregativeBootstrap(KDEyML(LR(), **hyper), n_test_samples=1000, random_state=0, verbose=True), # yield 'BayesianKDEy', KDEyML(LR()), kdey_hyper, lambda hyper: BayesianKDEy(mcmc_seed=0, **hyper), return 'BayKDE*CLR', KDEyCLR(LR()), kdey_hyper_clr, lambda hyper: BayesianKDEy(kernel='aitchison', mcmc_seed=0, - explore_CLR=True, + explore=True, step_size=.15, # num_warmup = 5000, # num_samples = 10_000, From a6974b762490faf2f8ffab886da18ade0cd2b06a Mon Sep 17 00:00:00 2001 From: Alejandro Moreo Date: Thu, 4 Dec 2025 20:03:30 +0100 Subject: [PATCH 29/41] adding experiment with ILR --- BayesianKDEy/_bayeisan_kdey.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/BayesianKDEy/_bayeisan_kdey.py b/BayesianKDEy/_bayeisan_kdey.py index 37fd280..8f77fb1 100644 --- a/BayesianKDEy/_bayeisan_kdey.py +++ b/BayesianKDEy/_bayeisan_kdey.py @@ -166,7 +166,7 @@ class BayesianKDEy(AggregativeSoftQuantifier, KDEBase, WithConfidenceABC): recent_accept_rate = np.mean(acceptance_history[-100:]) step_size *= np.exp(adapt_rate * (recent_accept_rate - target_acceptance)) # step_size = float(np.clip(step_size, min_step, max_step)) - print(f'acceptance-rate={recent_accept_rate*100:.3f}%, step-size={step_size:.5f}') + # print(f'acceptance-rate={recent_accept_rate*100:.3f}%, step-size={step_size:.5f}') # remove "warmup" initial iterations samples = np.asarray(samples[self.num_warmup:]) From 59ef17c86cf28d0f469700fbb594c5d39f8ddb2d Mon Sep 17 00:00:00 2001 From: Alejandro Moreo Date: Thu, 4 Dec 2025 20:10:25 +0100 Subject: [PATCH 30/41] ilr only on multiclass --- BayesianKDEy/_bayeisan_kdey.py | 2 +- BayesianKDEy/full_experiments.py | 14 +++++++++----- 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/BayesianKDEy/_bayeisan_kdey.py b/BayesianKDEy/_bayeisan_kdey.py index 8f77fb1..89f63ad 100644 --- a/BayesianKDEy/_bayeisan_kdey.py +++ b/BayesianKDEy/_bayeisan_kdey.py @@ -128,7 +128,7 @@ class BayesianKDEy(AggregativeSoftQuantifier, KDEBase, WithConfidenceABC): elif self.explore=='ilr': ilr = ILRtransformation() ilr_point = ilr(prev) - dir_noise = rng.normal(scale=step_size, size=d) + dir_noise = rng.normal(scale=step_size, size=d-1) ilr_neighbour = ilr_point + dir_noise neighbour = ilr.inverse(ilr_neighbour) assert in_simplex(neighbour), 'wrong ILR transformation' diff --git a/BayesianKDEy/full_experiments.py b/BayesianKDEy/full_experiments.py index d821cb5..20958fd 100644 --- a/BayesianKDEy/full_experiments.py +++ b/BayesianKDEy/full_experiments.py @@ -49,8 +49,10 @@ def methods(): hdy_hyper = {'nbins': [3,4,5,8,16,32]} kdey_hyper = {'bandwidth': [0.001, 0.005, 0.01, 0.05, 0.1, 0.2]} kdey_hyper_clr = {'bandwidth': [0.05, 0.1, 0.5, 1., 2., 5.]} - only_binary = True - multiclass_method = False + + multiclass_method = 'multiclass' + only_binary = 'only_binary' + only_multiclass = 'only_multiclass' yield 'BootstrapACC', ACC(LR()), acc_hyper, lambda hyper: AggregativeBootstrap(ACC(LR()), n_test_samples=1000, random_state=0), multiclass_method yield 'BayesianACC', ACC(LR()), acc_hyper, lambda hyper: BayesianCC(LR(), mcmc_seed=0), multiclass_method @@ -63,7 +65,7 @@ def methods(): yield 'BayesianKDEy*', KDEyCLR(LR()), kdey_hyper_clr, lambda hyper: BayesianKDEy(kernel='aitchison', mcmc_seed=0, **hyper), multiclass_method yield 'BayKDEy*CLR', KDEyCLR(LR()), kdey_hyper_clr, lambda hyper: BayesianKDEy(kernel='aitchison', mcmc_seed=0, explore='clr', step_size=.15, **hyper), multiclass_method # yield 'BayKDEy*CLR2', KDEyCLR(LR()), kdey_hyper_clr, lambda hyper: BayesianKDEy(kernel='aitchison', mcmc_seed=0, explore='clr', step_size=.05, **hyper), multiclass_method - yield 'BayKDEy*ILR', KDEyCLR(LR()), kdey_hyper_clr, lambda hyper: BayesianKDEy(kernel='aitchison', mcmc_seed=0, explore='ilr', step_size=.15, **hyper), multiclass_method + yield 'BayKDEy*ILR', KDEyCLR(LR()), kdey_hyper_clr, lambda hyper: BayesianKDEy(kernel='aitchison', mcmc_seed=0, explore='ilr', step_size=.15, **hyper), only_multiclass def model_selection(train: LabelledCollection, point_quantifier: AggregativeQuantifier, grid: dict): @@ -161,8 +163,10 @@ if __name__ == '__main__': is_binary = data.n_classes==2 result_subdir = result_dir / ('binary' if is_binary else 'multiclass') hyper_subdir = result_dir / 'hyperparams' / ('binary' if is_binary else 'multiclass') - for method_name, method, hyper_params, withconf_constructor, only_binary in methods(): - if only_binary and not is_binary: + for method_name, method, hyper_params, withconf_constructor, method_scope in methods(): + if method_scope == 'only_binary' and not is_binary: + continue + if method_scope == 'only_multiclass' and is_binary: continue result_path = experiment_path(result_subdir, data_name, method_name) hyper_path = experiment_path(hyper_subdir, data_name, method.__class__.__name__) From d52fc40d2b24d3e889806c9ee2de42de446882db Mon Sep 17 00:00:00 2001 From: Alejandro Moreo Date: Sat, 6 Dec 2025 13:20:11 +0100 Subject: [PATCH 31/41] ilr added --- BayesianKDEy/_bayeisan_kdey.py | 3 +- BayesianKDEy/full_experiments.py | 9 +++ BayesianKDEy/plot_simplex.py | 93 +++++++++++++++++++----------- quapy/functional.py | 98 ++++++++++++++++++++++++++++++++ quapy/method/_kdey.py | 49 ++++++---------- quapy/method/aggregative.py | 2 +- quapy/method/confidence.py | 95 +------------------------------ 7 files changed, 190 insertions(+), 159 deletions(-) diff --git a/BayesianKDEy/_bayeisan_kdey.py b/BayesianKDEy/_bayeisan_kdey.py index 89f63ad..f4da22f 100644 --- a/BayesianKDEy/_bayeisan_kdey.py +++ b/BayesianKDEy/_bayeisan_kdey.py @@ -1,7 +1,8 @@ from sklearn.base import BaseEstimator import numpy as np from quapy.method._kdey import KDEBase -from quapy.method.confidence import WithConfidenceABC, ConfidenceRegionABC, CLRtransformation, ILRtransformation +from quapy.method.confidence import WithConfidenceABC, ConfidenceRegionABC +from functional import CLRtransformation, ILRtransformation from quapy.method.aggregative import AggregativeSoftQuantifier from tqdm import tqdm import quapy.functional as F diff --git a/BayesianKDEy/full_experiments.py b/BayesianKDEy/full_experiments.py index 20958fd..e592214 100644 --- a/BayesianKDEy/full_experiments.py +++ b/BayesianKDEy/full_experiments.py @@ -36,6 +36,14 @@ class KDEyCLR(KDEyML): ) +class KDEyILR(KDEyML): + def __init__(self, classifier: BaseEstimator=None, fit_classifier=True, val_split=5, bandwidth=1., random_state=None): + super().__init__( + classifier=classifier, fit_classifier=fit_classifier, val_split=val_split, bandwidth=bandwidth, + random_state=random_state, kernel='ilr' + ) + + def methods(): """ Returns a tuple (name, quantifier, hyperparams, bayesian/bootstrap_constructor), where: @@ -66,6 +74,7 @@ def methods(): yield 'BayKDEy*CLR', KDEyCLR(LR()), kdey_hyper_clr, lambda hyper: BayesianKDEy(kernel='aitchison', mcmc_seed=0, explore='clr', step_size=.15, **hyper), multiclass_method # yield 'BayKDEy*CLR2', KDEyCLR(LR()), kdey_hyper_clr, lambda hyper: BayesianKDEy(kernel='aitchison', mcmc_seed=0, explore='clr', step_size=.05, **hyper), multiclass_method yield 'BayKDEy*ILR', KDEyCLR(LR()), kdey_hyper_clr, lambda hyper: BayesianKDEy(kernel='aitchison', mcmc_seed=0, explore='ilr', step_size=.15, **hyper), only_multiclass + yield 'BayKDEy*ILR2', KDEyILR(LR()), kdey_hyper_clr, lambda hyper: BayesianKDEy(kernel='ilr', mcmc_seed=0, explore='ilr', step_size=.1, **hyper), only_multiclass def model_selection(train: LabelledCollection, point_quantifier: AggregativeQuantifier, grid: dict): diff --git a/BayesianKDEy/plot_simplex.py b/BayesianKDEy/plot_simplex.py index 8ca8e8c..7397a99 100644 --- a/BayesianKDEy/plot_simplex.py +++ b/BayesianKDEy/plot_simplex.py @@ -7,13 +7,37 @@ import matplotlib.pyplot as plt from matplotlib.colors import ListedColormap from scipy.stats import gaussian_kde -from method.confidence import ConfidenceIntervals, ConfidenceEllipseSimplex, ConfidenceEllipseCLR, ConfidenceEllipseILR +from method.confidence import (ConfidenceIntervals as CI, + ConfidenceEllipseSimplex as CE, + ConfidenceEllipseCLR as CLR, + ConfidenceEllipseILR as ILR) + + + +def get_region_colormap(name="blue", alpha=0.40): + name = name.lower() + if name == "blue": + base = (76/255, 114/255, 176/255) + elif name == "orange": + base = (221/255, 132/255, 82/255) + elif name == "violet": + base = (129/255, 114/255, 178/255) + else: + raise ValueError(f"Unknown palette name: {name}") + + cmap = ListedColormap([ + (1, 1, 1, 0), # 0: transparent white + (base[0], base[1], base[2], alpha) # 1: color + ]) + + return cmap def plot_prev_points(prevs=None, true_prev=None, point_estim=None, train_prev=None, show_mean=True, show_legend=True, region=None, region_resolution=1000, confine_region_in_simplex=False, + color='blue', save_path=None): plt.rcParams.update({ @@ -49,13 +73,22 @@ def plot_prev_points(prevs=None, true_prev=None, point_estim=None, train_prev=No # Plot fig, ax = plt.subplots(figsize=(6, 6)) + if region is not None: + if callable(region): + region_list = [("region", region)] + else: + region_list = region # lista de (name, fn) + if region is not None: # rectangular mesh - xs = np.linspace(0, 1, region_resolution) - ys = np.linspace(0, np.sqrt(3)/2, region_resolution) + x_min, x_max = -0.2, 1.2 + y_min, y_max = -0.2, np.sqrt(3) / 2 + 0.2 + + xs = np.linspace(x_min, x_max, region_resolution) + ys = np.linspace(y_min, y_max, region_resolution) grid_x, grid_y = np.meshgrid(xs, ys) - # 2 barycentric + # barycentric pts_bary = barycentric_from_xy(grid_x, grid_y) # mask within simplex @@ -64,29 +97,23 @@ def plot_prev_points(prevs=None, true_prev=None, point_estim=None, train_prev=No else: in_simplex = np.full(shape=(region_resolution, region_resolution), fill_value=True, dtype=bool) - # evaluar la región solo en puntos válidos - mask = np.zeros_like(in_simplex, dtype=float) - valid_pts = pts_bary[in_simplex] + # --- Colormap 0 → blanco, 1 → rojo semitransparente --- - mask_vals = np.array([float(region(p)) for p in valid_pts]) - mask[in_simplex] = mask_vals + # iterar sobre todas las regiones + for (rname, rfun) in region_list: + mask = np.zeros_like(in_simplex, dtype=float) + valid_pts = pts_bary[in_simplex] + mask_vals = np.array([float(rfun(p)) for p in valid_pts]) + mask[in_simplex] = mask_vals - # pintar el fondo + ax.pcolormesh( + xs, ys, mask, + shading='auto', + cmap=get_region_colormap(color), + alpha=0.3, + ) - white_and_color = ListedColormap([ - (1, 1, 1, 1), # color for value 0 - (0.7, .0, .0, .5) # color for value 1 - ]) - - - ax.pcolormesh( - xs, ys, mask, - shading='auto', - cmap=white_and_color, - alpha=0.5 - ) - - ax.scatter(*cartesian(prevs), s=15, alpha=0.5, edgecolors='none', label='samples') + ax.scatter(*cartesian(prevs), s=15, alpha=0.5, edgecolors='none', label='samples', color='black', linewidth=0.5) if show_mean: ax.scatter(*cartesian(prevs.mean(axis=0)), s=10, alpha=1, label='sample-mean', edgecolors='black') if train_prev is not None: @@ -96,8 +123,6 @@ def plot_prev_points(prevs=None, true_prev=None, point_estim=None, train_prev=No if train_prev is not None: ax.scatter(*cartesian(train_prev), s=10, alpha=1, label='train-prev', edgecolors='black') - - # edges triangle = np.array([v1, v2, v3, v1]) ax.plot(triangle[:, 0], triangle[:, 1], color='black') @@ -179,17 +204,21 @@ if __name__ == '__main__': n = 1000 alpha = [3,5,10] + # alpha = [10,1,1] prevs = np.random.dirichlet(alpha, size=n) def regions(): - yield 'CI', ConfidenceIntervals(prevs) - yield 'CE', ConfidenceEllipseSimplex(prevs) - yield 'CLR', ConfidenceEllipseCLR(prevs) - yield 'ILR', ConfidenceEllipseILR(prevs) + confs = [0.99, 0.95, 0.90] + yield 'CI', [(f'{int(c*100)}%', CI(prevs, confidence_level=c).coverage) for c in confs] + yield 'CI-b', [(f'{int(c * 100)}%', CI(prevs, confidence_level=c, bonferroni_correction=True).coverage) for c in confs] + yield 'CE', [(f'{int(c*100)}%', CE(prevs, confidence_level=c).coverage) for c in confs] + yield 'CLR', [(f'{int(c*100)}%', CLR(prevs, confidence_level=c).coverage) for c in confs] + yield 'ILR', [(f'{int(c*100)}%', ILR(prevs, confidence_level=c).coverage) for c in confs] - resolution = 100 + resolution = 1000 alpha_str = ','.join([f'{str(i)}' for i in alpha]) for crname, cr in regions(): - plot_prev_points(prevs, show_mean=True, show_legend=False, region=cr.coverage, region_resolution=resolution, + plot_prev_points(prevs, show_mean=True, show_legend=False, region=cr, region_resolution=resolution, + color='blue', save_path=f'./plots/simplex_{crname}_alpha{alpha_str}_res{resolution}.png') diff --git a/quapy/functional.py b/quapy/functional.py index 5f6b915..30c1e39 100644 --- a/quapy/functional.py +++ b/quapy/functional.py @@ -1,10 +1,15 @@ import warnings +from abc import ABC, abstractmethod from collections import defaultdict +from functools import lru_cache from typing import Literal, Union, Callable from numpy.typing import ArrayLike import scipy import numpy as np +from scipy.special import softmax + +import quapy as qp # ------------------------------------------------------------------------------------------ @@ -649,3 +654,96 @@ def solve_adjustment( raise ValueError(f'unknown {solver=}') +# ------------------------------------------------------------------------------------------ +# Transformations from Compositional analysis +# ------------------------------------------------------------------------------------------ + +class CompositionalTransformation(ABC): + """ + Abstract class of transformations from compositional data. + Basically, callable functions with an "inverse" function. + """ + @abstractmethod + def __call__(self, X): ... + + @abstractmethod + def inverse(self, Z): ... + + EPSILON=1e-6 + + +class CLRtransformation(CompositionalTransformation): + """ + Centered log-ratio (CLR), from compositional analysis + """ + def __call__(self, X): + """ + Applies the CLR function to X thus mapping the instances, which are contained in `\\mathcal{R}^{n}` but + actually lie on a `\\mathcal{R}^{n-1}` simplex, onto an unrestricted space in :math:`\\mathcal{R}^{n}` + + :param X: np.ndarray of (n_instances, n_dimensions) to be transformed + :param epsilon: small float for prevalence smoothing + :return: np.ndarray of (n_instances, n_dimensions), the CLR-transformed points + """ + X = np.asarray(X) + X = qp.error.smooth(X, self.EPSILON) + G = np.exp(np.mean(np.log(X), axis=-1, keepdims=True)) # geometric mean + return np.log(X / G) + + def inverse(self, Z): + """ + Inverse function. However, clr.inverse(clr(X)) does not exactly coincide with X due to smoothing. + + :param Z: np.ndarray of (n_instances, n_dimensions) to be transformed + :return: np.ndarray of (n_instances, n_dimensions), the CLR-transformed points + """ + return softmax(Z, axis=-1) + + +class ILRtransformation(CompositionalTransformation): + """ + Isometric log-ratio (ILR), from compositional analysis + """ + def __call__(self, X): + X = np.asarray(X) + X = qp.error.smooth(X, self.EPSILON) + k = X.shape[-1] + V = self.get_V(k) # (k-1, k) + logp = np.log(X) + return logp @ V.T + + def inverse(self, Z): + Z = np.asarray(Z) + # get dimension + k_minus_1 = Z.shape[-1] + k = k_minus_1 + 1 + V = self.get_V(k) # (k-1, k) + logp = Z @ V + p = np.exp(logp) + p = p / np.sum(p, axis=-1, keepdims=True) + return p + + @lru_cache(maxsize=None) + def get_V(self, k): + def helmert_matrix(k): + """ + Returns the (k x k) Helmert matrix. + """ + H = np.zeros((k, k)) + for i in range(1, k): + H[i, :i] = 1 + H[i, i] = -(i) + H[i] = H[i] / np.sqrt(i * (i + 1)) + # row 0 stays zeros; will be discarded + return H + + def ilr_basis(k): + """ + Constructs an orthonormal ILR basis using the Helmert submatrix. + Output shape: (k-1, k) + """ + H = helmert_matrix(k) + V = H[1:, :] # remove first row of zeros + return V + + return ilr_basis(k) diff --git a/quapy/method/_kdey.py b/quapy/method/_kdey.py index 8475ee7..2f12eb8 100644 --- a/quapy/method/_kdey.py +++ b/quapy/method/_kdey.py @@ -9,27 +9,13 @@ import quapy.functional as F from sklearn.metrics.pairwise import rbf_kernel -# class KDE(KernelDensity): -# -# KERNELS = ['gaussian', 'aitchison'] -# -# def __init__(self, bandwidth, kernel): -# assert kernel in KDE.KERNELS, f'unknown {kernel=}' -# self.bandwidth = bandwidth -# self.kernel = kernel -# -# def - - - - class KDEBase: """ Common ancestor for KDE-based methods. Implements some common routines. """ BANDWIDTH_METHOD = ['scott', 'silverman'] - KERNELS = ['gaussian', 'aitchison'] + KERNELS = ['gaussian', 'aitchison', 'ilr'] @classmethod @@ -59,18 +45,6 @@ class KDEBase: assert kernel in KDEBase.KERNELS, f'unknown {kernel=}' return kernel - @classmethod - def clr_transform(cls, P, eps=1e-7): - """ - Centered-Log Ratio (CLR) transform. - P: array (n_samples, n_classes), every row is a point in the probability simplex - eps: smoothing, to avoid log(0) - """ - X_safe = np.clip(P, eps, None) - X_safe /= X_safe.sum(axis=1, keepdims=True) # renormalize - gm = np.exp(np.mean(np.log(X_safe), axis=1, keepdims=True)) - return np.log(X_safe / gm) - def get_kde_function(self, X, bandwidth, kernel): """ Wraps the KDE function from scikit-learn. @@ -81,7 +55,9 @@ class KDEBase: :return: a scikit-learn's KernelDensity object """ if kernel == 'aitchison': - X = KDEBase.clr_transform(X) + X = self.clr_transform(X) + elif kernel == 'ilr': + X = self.ilr_transform(X) return KernelDensity(bandwidth=bandwidth).fit(X) @@ -96,7 +72,9 @@ class KDEBase: :return: np.ndarray with the densities """ if kernel == 'aitchison': - X = KDEBase.clr_transform(X) + X = self.clr_transform(X) + elif kernel == 'ilr': + X = self.ilr_transform(X) return np.exp(kde.score_samples(X)) @@ -117,12 +95,19 @@ class KDEBase: if selX.size==0: selX = [F.uniform_prevalence(len(classes))] - # if kernel == 'aitchison': - # this is already done within get_kde_function - # selX = KDEBase.clr_transform(selX) class_cond_X.append(selX) return [self.get_kde_function(X_cond_yi, bandwidth, kernel) for X_cond_yi in class_cond_X] + def clr_transform(self, X): + if not hasattr(self, 'clr'): + self.clr = F.CLRtransformation() + return self.clr(X) + + def ilr_transform(self, X): + if not hasattr(self, 'ilr'): + self.ilr = F.ILRtransformation() + return self.ilr(X) + class KDEyML(AggregativeSoftQuantifier, KDEBase): """ diff --git a/quapy/method/aggregative.py b/quapy/method/aggregative.py index 25fc1ef..d565140 100644 --- a/quapy/method/aggregative.py +++ b/quapy/method/aggregative.py @@ -673,7 +673,7 @@ class PACC(AggregativeSoftQuantifier): class EMQ(AggregativeSoftQuantifier): """ `Expectation Maximization for Quantification `_ (EMQ), - aka `Saerens-Latinne-Decaestecker` (SLD) algorithm. + aka `Saerens-Latinne-Decaestecker` (SLD) algorithm, or `Maximum Likelihood Label Shif` (MLLS). EMQ consists of using the well-known `Expectation Maximization algorithm` to iteratively update the posterior probabilities generated by a probabilistic classifier and the class prevalence estimates obtained via maximum-likelihood estimation, in a mutually recursive way, until convergence. diff --git a/quapy/method/confidence.py b/quapy/method/confidence.py index 33a475d..bb7d9cd 100644 --- a/quapy/method/confidence.py +++ b/quapy/method/confidence.py @@ -5,13 +5,14 @@ from sklearn.metrics import confusion_matrix import quapy as qp import quapy.functional as F +from functional import CompositionalTransformation, CLRtransformation, ILRtransformation from quapy.method import _bayesian from quapy.data import LabelledCollection from quapy.method.aggregative import AggregativeQuantifier, AggregativeCrispQuantifier, AggregativeSoftQuantifier, BinaryAggregativeQuantifier from scipy.stats import chi2 from sklearn.utils import resample from abc import ABC, abstractmethod -from scipy.special import softmax, factorial +from scipy.special import factorial import copy from functools import lru_cache from tqdm import tqdm @@ -218,98 +219,6 @@ def within_ellipse_prop(values, mean, prec_matrix, chi2_critical): return float(np.mean(within_ellipse)) -class CompositionalTransformation(ABC): - """ - Abstract class of transformations from compositional data. - Basically, callable functions with an "inverse" function. - """ - @abstractmethod - def __call__(self, X): ... - - @abstractmethod - def inverse(self, Z): ... - - EPSILON=1e-6 - - - -class CLRtransformation(CompositionalTransformation): - """ - Centered log-ratio (CLR), from compositional analysis - """ - def __call__(self, X): - """ - Applies the CLR function to X thus mapping the instances, which are contained in `\\mathcal{R}^{n}` but - actually lie on a `\\mathcal{R}^{n-1}` simplex, onto an unrestricted space in :math:`\\mathcal{R}^{n}` - - :param X: np.ndarray of (n_instances, n_dimensions) to be transformed - :param epsilon: small float for prevalence smoothing - :return: np.ndarray of (n_instances, n_dimensions), the CLR-transformed points - """ - X = np.asarray(X) - X = qp.error.smooth(X, self.EPSILON) - G = np.exp(np.mean(np.log(X), axis=-1, keepdims=True)) # geometric mean - return np.log(X / G) - - def inverse(self, Z): - """ - Inverse function. However, clr.inverse(clr(X)) does not exactly coincide with X due to smoothing. - - :param Z: np.ndarray of (n_instances, n_dimensions) to be transformed - :return: np.ndarray of (n_instances, n_dimensions), the CLR-transformed points - """ - return softmax(Z, axis=-1) - - -class ILRtransformation(CompositionalTransformation): - """ - Isometric log-ratio (ILR), from compositional analysis - """ - def __call__(self, X): - X = np.asarray(X) - X = qp.error.smooth(X, self.EPSILON) - k = X.shape[-1] - V = self.get_V(k) # (k-1, k) - logp = np.log(X) - return logp @ V.T - - def inverse(self, Z): - Z = np.asarray(Z) - # get dimension - k_minus_1 = Z.shape[-1] - k = k_minus_1 + 1 - V = self.get_V(k) # (k-1, k) - logp = Z @ V - p = np.exp(logp) - p = p / np.sum(p, axis=-1, keepdims=True) - return p - - @lru_cache(maxsize=None) - def get_V(self, k): - def helmert_matrix(k): - """ - Returns the (k x k) Helmert matrix. - """ - H = np.zeros((k, k)) - for i in range(1, k): - H[i, :i] = 1 - H[i, i] = -(i) - H[i] = H[i] / np.sqrt(i * (i + 1)) - # row 0 stays zeros; will be discarded - return H - - def ilr_basis(k): - """ - Constructs an orthonormal ILR basis using the Helmert submatrix. - Output shape: (k-1, k) - """ - H = helmert_matrix(k) - V = H[1:, :] # remove first row of zeros - return V - - return ilr_basis(k) - - class ConfidenceEllipseSimplex(ConfidenceRegionABC): """ Instantiates a Confidence Ellipse in the probability simplex. From 2e0068b6ae1023459c1373d6e994d39aab71a53c Mon Sep 17 00:00:00 2001 From: Alejandro Moreo Date: Sat, 6 Dec 2025 13:36:13 +0100 Subject: [PATCH 32/41] import fix --- BayesianKDEy/_bayeisan_kdey.py | 8 ++++-- BayesianKDEy/single_experiment_debug.py | 37 ++++++++++--------------- quapy/method/confidence.py | 2 +- 3 files changed, 22 insertions(+), 25 deletions(-) diff --git a/BayesianKDEy/_bayeisan_kdey.py b/BayesianKDEy/_bayeisan_kdey.py index f4da22f..3a202d7 100644 --- a/BayesianKDEy/_bayeisan_kdey.py +++ b/BayesianKDEy/_bayeisan_kdey.py @@ -43,6 +43,7 @@ class BayesianKDEy(AggregativeSoftQuantifier, KDEBase, WithConfidenceABC): region: str = 'intervals', explore='simplex', step_size=0.05, + temperature=1., verbose: bool = False): if num_warmup <= 0: @@ -51,6 +52,7 @@ class BayesianKDEy(AggregativeSoftQuantifier, KDEBase, WithConfidenceABC): raise ValueError(f'parameter {num_samples=} must be a positive integer') assert explore in ['simplex', 'clr', 'ilr'], \ f'unexpected value for param {explore=}; valid ones are "simplex", "clr", and "ilr"' + assert temperature>0., f'temperature must be >0' super().__init__(classifier, fit_classifier, val_split) self.bandwidth = KDEBase._check_bandwidth(bandwidth, kernel) @@ -62,6 +64,7 @@ class BayesianKDEy(AggregativeSoftQuantifier, KDEBase, WithConfidenceABC): self.region = region self.explore = explore self.step_size = step_size + self.temperature = temperature self.verbose = verbose def aggregation_fit(self, classif_predictions, labels): @@ -99,7 +102,7 @@ class BayesianKDEy(AggregativeSoftQuantifier, KDEBase, WithConfidenceABC): def log_likelihood(prev, epsilon=1e-10): test_likelihoods = prev @ test_densities test_loglikelihood = np.log(test_likelihoods + epsilon) - return np.sum(test_loglikelihood) + return (1./self.temperature) * np.sum(test_loglikelihood) # def log_prior(prev): # todo: adapt to arbitrary prior knowledge (e.g., something around training prevalence) @@ -167,7 +170,8 @@ class BayesianKDEy(AggregativeSoftQuantifier, KDEBase, WithConfidenceABC): recent_accept_rate = np.mean(acceptance_history[-100:]) step_size *= np.exp(adapt_rate * (recent_accept_rate - target_acceptance)) # step_size = float(np.clip(step_size, min_step, max_step)) - # print(f'acceptance-rate={recent_accept_rate*100:.3f}%, step-size={step_size:.5f}') + if i %100==0: + print(f'acceptance-rate={recent_accept_rate*100:.3f}%, step-size={step_size:.5f}') # remove "warmup" initial iterations samples = np.asarray(samples[self.num_warmup:]) diff --git a/BayesianKDEy/single_experiment_debug.py b/BayesianKDEy/single_experiment_debug.py index fb3be01..78a0d92 100644 --- a/BayesianKDEy/single_experiment_debug.py +++ b/BayesianKDEy/single_experiment_debug.py @@ -29,7 +29,7 @@ from time import time from sklearn.base import clone, BaseEstimator -def method(): +def methods(): """ Returns a tuple (name, quantifier, hyperparams, bayesian/bootstrap_constructor), where: - name: is a str representing the name of the method (e.g., 'BayesianKDEy') @@ -47,48 +47,41 @@ def method(): # yield 'BootstrapKDEy', KDEyML(LR()), kdey_hyper, lambda hyper: AggregativeBootstrap(KDEyML(LR(), **hyper), n_test_samples=1000, random_state=0, verbose=True), # yield 'BayesianKDEy', KDEyML(LR()), kdey_hyper, lambda hyper: BayesianKDEy(mcmc_seed=0, **hyper), - return 'BayKDE*CLR', KDEyCLR(LR()), kdey_hyper_clr, lambda hyper: BayesianKDEy(kernel='aitchison', mcmc_seed=0, - explore=True, - step_size=.15, - # num_warmup = 5000, - # num_samples = 10_000, - # region='ellipse', - **hyper), + for T in [1., 10., 100., 1000.]: + yield (f'BaKDE-CLR-T{T}', KDEyCLR(LR()), kdey_hyper_clr, + lambda hyper: BayesianKDEy(kernel='aitchison', mcmc_seed=0, temperature=T, step_size=.15, **hyper)), if __name__ == '__main__': binary = { - 'datasets': qp.datasets.UCI_BINARY_DATASETS, 'fetch_fn': qp.datasets.fetch_UCIBinaryDataset, 'sample_size': 500 } multiclass = { - 'datasets': qp.datasets.UCI_MULTICLASS_DATASETS, 'fetch_fn': qp.datasets.fetch_UCIMulticlassDataset, 'sample_size': 1000 } - result_dir = Path('./results') - setup = multiclass - qp.environ['SAMPLE_SIZE'] = setup['sample_size'] data_name = 'digits' + + qp.environ['SAMPLE_SIZE'] = setup['sample_size'] print(f'dataset={data_name}') data = setup['fetch_fn'](data_name) is_binary = data.n_classes==2 - hyper_subdir = result_dir / 'hyperparams' / ('binary' if is_binary else 'multiclass') - method_name, method, hyper_params, withconf_constructor = method() - hyper_path = experiment_path(hyper_subdir, data_name, method.__class__.__name__) - report = experiment(data, method, method_name, hyper_params, withconf_constructor, hyper_path) + hyper_subdir = Path('./results') / 'hyperparams' / ('binary' if is_binary else 'multiclass') + for method_name, method, hyper_params, withconf_constructor in methods(): + hyper_path = experiment_path(hyper_subdir, data_name, method.__class__.__name__) + report = experiment(data, method, method_name, hyper_params, withconf_constructor, hyper_path) - print(f'dataset={data_name}, ' - f'method={method_name}: ' - f'mae={report["results"]["ae"].mean():.3f}, ' - f'coverage={report["results"]["coverage"].mean():.5f}, ' - f'amplitude={report["results"]["amplitude"].mean():.5f}, ') + print(f'dataset={data_name}, ' + f'method={method_name}: ' + f'mae={report["results"]["ae"].mean():.3f}, ' + f'coverage={report["results"]["coverage"].mean():.5f}, ' + f'amplitude={report["results"]["amplitude"].mean():.5f}, ') diff --git a/quapy/method/confidence.py b/quapy/method/confidence.py index bb7d9cd..7a845d8 100644 --- a/quapy/method/confidence.py +++ b/quapy/method/confidence.py @@ -5,7 +5,7 @@ from sklearn.metrics import confusion_matrix import quapy as qp import quapy.functional as F -from functional import CompositionalTransformation, CLRtransformation, ILRtransformation +from quapy.functional import CompositionalTransformation, CLRtransformation, ILRtransformation from quapy.method import _bayesian from quapy.data import LabelledCollection from quapy.method.aggregative import AggregativeQuantifier, AggregativeCrispQuantifier, AggregativeSoftQuantifier, BinaryAggregativeQuantifier From 7c7af2cfaf0edf5667c0f6649a833dd2361fdfcf Mon Sep 17 00:00:00 2001 From: Alejandro Moreo Date: Sat, 6 Dec 2025 13:50:15 +0100 Subject: [PATCH 33/41] added temperature, and coverage increases! --- BayesianKDEy/single_experiment_debug.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/BayesianKDEy/single_experiment_debug.py b/BayesianKDEy/single_experiment_debug.py index 78a0d92..e9b71b5 100644 --- a/BayesianKDEy/single_experiment_debug.py +++ b/BayesianKDEy/single_experiment_debug.py @@ -48,8 +48,7 @@ def methods(): # yield 'BootstrapKDEy', KDEyML(LR()), kdey_hyper, lambda hyper: AggregativeBootstrap(KDEyML(LR(), **hyper), n_test_samples=1000, random_state=0, verbose=True), # yield 'BayesianKDEy', KDEyML(LR()), kdey_hyper, lambda hyper: BayesianKDEy(mcmc_seed=0, **hyper), for T in [1., 10., 100., 1000.]: - yield (f'BaKDE-CLR-T{T}', KDEyCLR(LR()), kdey_hyper_clr, - lambda hyper: BayesianKDEy(kernel='aitchison', mcmc_seed=0, temperature=T, step_size=.15, **hyper)), + yield f'BaKDE-CLR-T{T}', KDEyCLR(LR()), kdey_hyper_clr, lambda hyper: BayesianKDEy(kernel='aitchison', mcmc_seed=0, temperature=T, step_size=.15, **hyper), @@ -66,7 +65,7 @@ if __name__ == '__main__': } setup = multiclass - data_name = 'digits' + data_name = 'isolet' qp.environ['SAMPLE_SIZE'] = setup['sample_size'] print(f'dataset={data_name}') From 84c11d956b04ecc541bffe0f2957cf7972b691b8 Mon Sep 17 00:00:00 2001 From: Alejandro Moreo Date: Sat, 6 Dec 2025 13:51:49 +0100 Subject: [PATCH 34/41] added temperature, and coverage increases! --- BayesianKDEy/_bayeisan_kdey.py | 3 ++- BayesianKDEy/single_experiment_debug.py | 4 ++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/BayesianKDEy/_bayeisan_kdey.py b/BayesianKDEy/_bayeisan_kdey.py index 3a202d7..8fb7d64 100644 --- a/BayesianKDEy/_bayeisan_kdey.py +++ b/BayesianKDEy/_bayeisan_kdey.py @@ -166,7 +166,8 @@ class BayesianKDEy(AggregativeSoftQuantifier, KDEBase, WithConfidenceABC): samples.append(current_prev) acceptance_history.append(1. if accepted else 0.) - if i < self.num_warmup and i%10==0 and len(acceptance_history)>=100: + # if i < self.num_warmup and i%10==0 and len(acceptance_history)>=100: + if i % 10 == 0 and len(acceptance_history) >= 100: recent_accept_rate = np.mean(acceptance_history[-100:]) step_size *= np.exp(adapt_rate * (recent_accept_rate - target_acceptance)) # step_size = float(np.clip(step_size, min_step, max_step)) diff --git a/BayesianKDEy/single_experiment_debug.py b/BayesianKDEy/single_experiment_debug.py index e9b71b5..ca929eb 100644 --- a/BayesianKDEy/single_experiment_debug.py +++ b/BayesianKDEy/single_experiment_debug.py @@ -47,8 +47,8 @@ def methods(): # yield 'BootstrapKDEy', KDEyML(LR()), kdey_hyper, lambda hyper: AggregativeBootstrap(KDEyML(LR(), **hyper), n_test_samples=1000, random_state=0, verbose=True), # yield 'BayesianKDEy', KDEyML(LR()), kdey_hyper, lambda hyper: BayesianKDEy(mcmc_seed=0, **hyper), - for T in [1., 10., 100., 1000.]: - yield f'BaKDE-CLR-T{T}', KDEyCLR(LR()), kdey_hyper_clr, lambda hyper: BayesianKDEy(kernel='aitchison', mcmc_seed=0, temperature=T, step_size=.15, **hyper), + for T in [100., 500, 1000.]: + yield f'BaKDE-CLR-T{T}', KDEyCLR(LR()), kdey_hyper_clr, lambda hyper: BayesianKDEy(kernel='aitchison', explore='ilr', mcmc_seed=0, temperature=T, step_size=.1, **hyper), From 602b89bd2154b8f75d61402ae22897ccf04c386c Mon Sep 17 00:00:00 2001 From: Alejandro Moreo Date: Sat, 6 Dec 2025 13:53:34 +0100 Subject: [PATCH 35/41] added temperature, and coverage increases! --- BayesianKDEy/_bayeisan_kdey.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/BayesianKDEy/_bayeisan_kdey.py b/BayesianKDEy/_bayeisan_kdey.py index 8fb7d64..290779e 100644 --- a/BayesianKDEy/_bayeisan_kdey.py +++ b/BayesianKDEy/_bayeisan_kdey.py @@ -2,7 +2,7 @@ from sklearn.base import BaseEstimator import numpy as np from quapy.method._kdey import KDEBase from quapy.method.confidence import WithConfidenceABC, ConfidenceRegionABC -from functional import CLRtransformation, ILRtransformation +from quapy.functional import CLRtransformation, ILRtransformation from quapy.method.aggregative import AggregativeSoftQuantifier from tqdm import tqdm import quapy.functional as F From b2750ab6ea99520ca927902115676d7350238b62 Mon Sep 17 00:00:00 2001 From: Alejandro Moreo Date: Sat, 6 Dec 2025 14:06:14 +0100 Subject: [PATCH 36/41] added temperature, and coverage increases! --- BayesianKDEy/single_experiment_debug.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/BayesianKDEy/single_experiment_debug.py b/BayesianKDEy/single_experiment_debug.py index ca929eb..3fa4739 100644 --- a/BayesianKDEy/single_experiment_debug.py +++ b/BayesianKDEy/single_experiment_debug.py @@ -47,8 +47,8 @@ def methods(): # yield 'BootstrapKDEy', KDEyML(LR()), kdey_hyper, lambda hyper: AggregativeBootstrap(KDEyML(LR(), **hyper), n_test_samples=1000, random_state=0, verbose=True), # yield 'BayesianKDEy', KDEyML(LR()), kdey_hyper, lambda hyper: BayesianKDEy(mcmc_seed=0, **hyper), - for T in [100., 500, 1000.]: - yield f'BaKDE-CLR-T{T}', KDEyCLR(LR()), kdey_hyper_clr, lambda hyper: BayesianKDEy(kernel='aitchison', explore='ilr', mcmc_seed=0, temperature=T, step_size=.1, **hyper), + for T in [10, 20, 50, 100., 500]: + yield f'BaKDE-CLR-T{T}', KDEyCLR(LR()), kdey_hyper_clr, lambda hyper: BayesianKDEy(kernel='aitchison', explore='ilr', mcmc_seed=0, temperature=T, num_warmup=3000, num_samples=1000, step_size=.1, **hyper), From 7342e57cda5547658e9bb4f55cfd2dbd44ab9af8 Mon Sep 17 00:00:00 2001 From: Alejandro Moreo Date: Mon, 8 Dec 2025 12:14:43 +0100 Subject: [PATCH 37/41] trying with emcee with no success... --- BayesianKDEy/TODO.txt | 8 +++--- BayesianKDEy/_bayeisan_kdey.py | 34 ++++++++++++++++++++++- BayesianKDEy/generate_results.py | 36 ++++++++++++++++--------- BayesianKDEy/single_experiment_debug.py | 10 ++++--- quapy/functional.py | 2 +- quapy/method/confidence.py | 34 ++++++++++++++++++++++- 6 files changed, 103 insertions(+), 21 deletions(-) diff --git a/BayesianKDEy/TODO.txt b/BayesianKDEy/TODO.txt index 5198f39..2337fe8 100644 --- a/BayesianKDEy/TODO.txt +++ b/BayesianKDEy/TODO.txt @@ -1,4 +1,6 @@ - Add other methods that natively provide uncertainty quantification methods? -- Explore neighbourhood in the CLR space instead than in the simplex! -- CI con Bonferroni -- \ No newline at end of file +- MPIW (Mean Prediction Interval Width): is the average of the amplitudes (w/o aggregating coverage whatsoever) +- Implement Interval Score or Winkler Score +- Add (w-hat_w)**2/N measure +- analyze across shift + diff --git a/BayesianKDEy/_bayeisan_kdey.py b/BayesianKDEy/_bayeisan_kdey.py index 290779e..5451e2b 100644 --- a/BayesianKDEy/_bayeisan_kdey.py +++ b/BayesianKDEy/_bayeisan_kdey.py @@ -1,11 +1,16 @@ +from numpy.ma.core import shape from sklearn.base import BaseEstimator import numpy as np + +import quapy.util from quapy.method._kdey import KDEBase from quapy.method.confidence import WithConfidenceABC, ConfidenceRegionABC from quapy.functional import CLRtransformation, ILRtransformation from quapy.method.aggregative import AggregativeSoftQuantifier from tqdm import tqdm import quapy.functional as F +import emcee + class BayesianKDEy(AggregativeSoftQuantifier, KDEBase, WithConfidenceABC): @@ -72,7 +77,8 @@ class BayesianKDEy(AggregativeSoftQuantifier, KDEBase, WithConfidenceABC): return self def aggregate(self, classif_predictions): - self.prevalence_samples = self._bayesian_kde(classif_predictions, init=None, verbose=self.verbose) + # self.prevalence_samples = self._bayesian_kde(classif_predictions, init=None, verbose=self.verbose) + self.prevalence_samples = self._bayesian_emcee(classif_predictions) return self.prevalence_samples.mean(axis=0) def predict_conf(self, instances, confidence_level=None) -> (np.ndarray, ConfidenceRegionABC): @@ -178,6 +184,32 @@ class BayesianKDEy(AggregativeSoftQuantifier, KDEBase, WithConfidenceABC): samples = np.asarray(samples[self.num_warmup:]) return samples + def _bayesian_emcee(self, X_probs): + ndim = X_probs.shape[1] + nwalkers = 32 + + f = CLRtransformation() + + def log_likelihood(unconstrained, test_densities, epsilon=1e-10): + prev = f.inverse(unconstrained) + test_likelihoods = prev @ test_densities + test_loglikelihood = np.log(test_likelihoods + epsilon) + return np.sum(test_loglikelihood) + + kdes = self.mix_densities + test_densities = np.asarray([self.pdf(kde_i, X_probs, self.kernel) for kde_i in kdes]) + + # p0 = np.random.normal(nwalkers, ndim) + p0 = F.uniform_prevalence_sampling(ndim, nwalkers) + p0 = f(p0) + sampler = emcee.EnsembleSampler(nwalkers, ndim, log_likelihood, args=[test_densities]) + + state = sampler.run_mcmc(p0, self.num_warmup, skip_initial_state_check=True) + sampler.reset() + sampler.run_mcmc(state, self.num_samples, skip_initial_state_check=True) + samples = sampler.get_chain(flat=True) + samples = f.inverse(samples) + return samples def in_simplex(x): return np.all(x >= 0) and np.isclose(x.sum(), 1) \ No newline at end of file diff --git a/BayesianKDEy/generate_results.py b/BayesianKDEy/generate_results.py index b9b0e4e..cbe1e9f 100644 --- a/BayesianKDEy/generate_results.py +++ b/BayesianKDEy/generate_results.py @@ -7,6 +7,7 @@ import pandas as pd from glob import glob from pathlib import Path import quapy as qp +from method.confidence import ConfidenceIntervals from quapy.method.confidence import ConfidenceEllipseSimplex, ConfidenceEllipseCLR, ConfidenceEllipseILR pd.set_option('display.max_columns', None) @@ -17,13 +18,17 @@ pd.set_option("display.precision", 4) pd.set_option("display.float_format", "{:.4f}".format) -def compute_coverage_amplitude(region_constructor): +def compute_coverage_amplitude(region_constructor, **kwargs): all_samples = results['samples'] all_true_prevs = results['true-prevs'] def process_one(samples, true_prevs): - ellipse = region_constructor(samples) - return ellipse.coverage(true_prevs), ellipse.montecarlo_proportion() + region = region_constructor(samples, **kwargs) + if isinstance(region, ConfidenceIntervals): + winkler = region.mean_winkler_score(true_prevs) + else: + winkler = None + return region.coverage(true_prevs), region.montecarlo_proportion(), winkler out = Parallel(n_jobs=3)( delayed(process_one)(samples, true_prevs) @@ -35,8 +40,8 @@ def compute_coverage_amplitude(region_constructor): ) # unzip results - coverage, amplitude = zip(*out) - return list(coverage), list(amplitude) + coverage, amplitude, winkler = zip(*out) + return list(coverage), list(amplitude), list(winkler) def update_pickle(report, pickle_path, updated_dict:dict): @@ -45,18 +50,20 @@ def update_pickle(report, pickle_path, updated_dict:dict): pickle.dump(report, open(pickle_path, 'wb'), protocol=pickle.HIGHEST_PROTOCOL) -def update_pickle_with_region(report, file, conf_name, conf_region_class): +def update_pickle_with_region(report, file, conf_name, conf_region_class, **kwargs): if f'coverage-{conf_name}' not in report: - cov, amp = compute_coverage_amplitude(conf_region_class) + covs, amps, winkler = compute_coverage_amplitude(conf_region_class, **kwargs) update_fields = { - f'coverage-{conf_name}': cov, - f'amplitude-{conf_name}': amp, + f'coverage-{conf_name}': covs, + f'amplitude-{conf_name}': amps, + f'winkler-{conf_name}': winkler } update_pickle(report, file, update_fields) -for setup in ['binary', 'multiclass']: + +for setup in ['multiclass', 'binary']: path = f'./results/{setup}/*.pkl' table = defaultdict(list) for file in tqdm(glob(path), desc='processing results', total=len(glob(path))): @@ -68,13 +75,18 @@ for setup in ['binary', 'multiclass']: table['method'].extend([method.replace('Bayesian','Ba').replace('Bootstrap', 'Bo')] * n_samples) table['dataset'].extend([dataset] * n_samples) table['ae'].extend(results['ae']) - table['c-CI'].extend(results['coverage']) - table['a-CI'].extend(results['amplitude']) + # table['c-CI'].extend(results['coverage']) + # table['a-CI'].extend(results['amplitude']) + update_pickle_with_region(report, file, conf_name='CI', conf_region_class=ConfidenceIntervals, bonferroni_correction=True) update_pickle_with_region(report, file, conf_name='CE', conf_region_class=ConfidenceEllipseSimplex) update_pickle_with_region(report, file, conf_name='CLR', conf_region_class=ConfidenceEllipseCLR) update_pickle_with_region(report, file, conf_name='ILR', conf_region_class=ConfidenceEllipseILR) + table['c-CI'].extend(report['coverage-CI']) + table['a-CI'].extend(report['amplitude-CI']) + table['w-CI'].extend(report['winkler-CI']) + table['c-CE'].extend(report['coverage-CE']) table['a-CE'].extend(report['amplitude-CE']) diff --git a/BayesianKDEy/single_experiment_debug.py b/BayesianKDEy/single_experiment_debug.py index 3fa4739..a33df67 100644 --- a/BayesianKDEy/single_experiment_debug.py +++ b/BayesianKDEy/single_experiment_debug.py @@ -47,8 +47,10 @@ def methods(): # yield 'BootstrapKDEy', KDEyML(LR()), kdey_hyper, lambda hyper: AggregativeBootstrap(KDEyML(LR(), **hyper), n_test_samples=1000, random_state=0, verbose=True), # yield 'BayesianKDEy', KDEyML(LR()), kdey_hyper, lambda hyper: BayesianKDEy(mcmc_seed=0, **hyper), - for T in [10, 20, 50, 100., 500]: - yield f'BaKDE-CLR-T{T}', KDEyCLR(LR()), kdey_hyper_clr, lambda hyper: BayesianKDEy(kernel='aitchison', explore='ilr', mcmc_seed=0, temperature=T, num_warmup=3000, num_samples=1000, step_size=.1, **hyper), + # for T in [10, 20, 50, 100., 500]: + # yield f'BaKDE-CLR-T{T}', KDEyCLR(LR()), kdey_hyper_clr, lambda hyper: BayesianKDEy(kernel='aitchison', explore='ilr', mcmc_seed=0, temperature=T, num_warmup=3000, num_samples=1000, step_size=.1, **hyper), + + yield f'BaKDE-emcee', KDEyML(LR()), kdey_hyper, lambda hyper: BayesianKDEy(mcmc_seed=0, num_warmup=100, num_samples=100, step_size=.1, **hyper), @@ -65,7 +67,9 @@ if __name__ == '__main__': } setup = multiclass - data_name = 'isolet' + # data_name = 'isolet' + # data_name = 'cmc' + data_name = 'abalone' qp.environ['SAMPLE_SIZE'] = setup['sample_size'] print(f'dataset={data_name}') diff --git a/quapy/functional.py b/quapy/functional.py index 30c1e39..29fe137 100644 --- a/quapy/functional.py +++ b/quapy/functional.py @@ -697,7 +697,7 @@ class CLRtransformation(CompositionalTransformation): :param Z: np.ndarray of (n_instances, n_dimensions) to be transformed :return: np.ndarray of (n_instances, n_dimensions), the CLR-transformed points """ - return softmax(Z, axis=-1) + return scipy.special.softmax(Z, axis=-1) class ILRtransformation(CompositionalTransformation): diff --git a/quapy/method/confidence.py b/quapy/method/confidence.py index 7a845d8..ee2ec33 100644 --- a/quapy/method/confidence.py +++ b/quapy/method/confidence.py @@ -345,6 +345,11 @@ class ConfidenceIntervals(ConfidenceRegionABC): :param samples: np.ndarray of shape (n_bootstrap_samples, n_classes) :param confidence_level: float, the confidence level (default 0.95) + :param bonferroni_correction: bool (default False), if True, a Bonferroni correction + is applied to the significance level (`alpha`) before computing confidence intervals. + The correction consists of replacing `alpha` with `alpha/n_classes`. When + `n_classes=2` the correction is not applied because there is only one verification test + since the other class is constrained. This is not necessarily true for n_classes>2. """ def __init__(self, samples, confidence_level=0.95, bonferroni_correction=False): assert 0 < confidence_level < 1, f'{confidence_level=} must be in range(0,1)' @@ -355,11 +360,13 @@ class ConfidenceIntervals(ConfidenceRegionABC): alpha = 1-confidence_level if bonferroni_correction: n_classes = samples.shape[-1] - alpha = alpha/n_classes + if n_classes>2: + alpha = alpha/n_classes low_perc = (alpha/2.)*100 high_perc = (1-alpha/2.)*100 self.I_low, self.I_high = np.percentile(samples, q=[low_perc, high_perc], axis=0) self._samples = samples + self.alpha = alpha @property def samples(self): @@ -391,6 +398,31 @@ class ConfidenceIntervals(ConfidenceRegionABC): def __repr__(self): return '['+', '.join(f'({low:.4f}, {high:.4f})' for (low,high) in zip(self.I_low, self.I_high))+']' + @property + def n_dim(self): + return len(self.I_low) + + def winkler_scores(self, true_prev): + true_prev = np.asarray(true_prev) + assert true_prev.ndim == 1, 'unexpected dimensionality for true_prev' + assert len(true_prev)==self.n_dim, \ + f'unexpected number of dimensions; found {true_prev.ndim}, expected {self.n_dim}' + + def winkler_score(low, high, true_val, alpha): + amp = high-low + scale_cost = 1./alpha + cost = np.max([0, low-true_val], axis=0) + np.max([0, true_val-high], axis=0) + return amp + scale_cost*cost + + return np.asarray( + [winkler_score(low_i, high_i, true_v, self.alpha) + for (low_i, high_i, true_v) in zip(self.I_low, self.I_high, true_prev)] + ) + + def mean_winkler_score(self, true_prev): + return np.mean(self.winkler_scores(true_prev)) + + class AggregativeBootstrap(WithConfidenceABC, AggregativeQuantifier): From 7b75954f9bdba7df3c163d0fcfdfa5793ee9e54c Mon Sep 17 00:00:00 2001 From: Alejandro Moreo Date: Mon, 8 Dec 2025 12:17:54 +0100 Subject: [PATCH 38/41] adding bootstrap-emq --- BayesianKDEy/full_experiments.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/BayesianKDEy/full_experiments.py b/BayesianKDEy/full_experiments.py index e592214..82876c6 100644 --- a/BayesianKDEy/full_experiments.py +++ b/BayesianKDEy/full_experiments.py @@ -10,7 +10,7 @@ from copy import deepcopy as cp import quapy as qp from BayesianKDEy._bayeisan_kdey import BayesianKDEy from build.lib.quapy.data import LabelledCollection -from quapy.method.aggregative import DistributionMatchingY as DMy, AggregativeQuantifier +from quapy.method.aggregative import DistributionMatchingY as DMy, AggregativeQuantifier, EMQ from quapy.method.base import BinaryQuantifier, BaseQuantifier from quapy.model_selection import GridSearchQ from quapy.data import Dataset @@ -54,6 +54,7 @@ def methods(): quantifier with optimized hyperparameters """ acc_hyper = {} + emq_hyper = {'calib': [None, 'nbvs', 'bcts', 'ts', 'vs']} hdy_hyper = {'nbins': [3,4,5,8,16,32]} kdey_hyper = {'bandwidth': [0.001, 0.005, 0.01, 0.05, 0.1, 0.2]} kdey_hyper_clr = {'bandwidth': [0.05, 0.1, 0.5, 1., 2., 5.]} @@ -65,6 +66,8 @@ def methods(): yield 'BootstrapACC', ACC(LR()), acc_hyper, lambda hyper: AggregativeBootstrap(ACC(LR()), n_test_samples=1000, random_state=0), multiclass_method yield 'BayesianACC', ACC(LR()), acc_hyper, lambda hyper: BayesianCC(LR(), mcmc_seed=0), multiclass_method + yield 'BootstrapEMQ', EMQ(LR(), on_calib_error='backup'), emq_hyper, lambda hyper: AggregativeBootstrap(EMQ(LR(), on_calib_error='backup'), n_test_samples=1000, random_state=0), multiclass_method + yield 'BootstrapHDy', DMy(LR()), hdy_hyper, lambda hyper: AggregativeBootstrap(DMy(LR(), **hyper), n_test_samples=1000, random_state=0), multiclass_method yield 'BayesianHDy', DMy(LR()), hdy_hyper, lambda hyper: PQ(LR(), stan_seed=0, **hyper), only_binary From 7cb4bd550ff09e62528d61fd4531265acbd8a97f Mon Sep 17 00:00:00 2001 From: Alejandro Moreo Date: Mon, 8 Dec 2025 12:31:23 +0100 Subject: [PATCH 39/41] adding bootstrap-emq --- BayesianKDEy/full_experiments.py | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/BayesianKDEy/full_experiments.py b/BayesianKDEy/full_experiments.py index 82876c6..9e03062 100644 --- a/BayesianKDEy/full_experiments.py +++ b/BayesianKDEy/full_experiments.py @@ -63,21 +63,21 @@ def methods(): only_binary = 'only_binary' only_multiclass = 'only_multiclass' - yield 'BootstrapACC', ACC(LR()), acc_hyper, lambda hyper: AggregativeBootstrap(ACC(LR()), n_test_samples=1000, random_state=0), multiclass_method - yield 'BayesianACC', ACC(LR()), acc_hyper, lambda hyper: BayesianCC(LR(), mcmc_seed=0), multiclass_method + # yield 'BootstrapACC', ACC(LR()), acc_hyper, lambda hyper: AggregativeBootstrap(ACC(LR()), n_test_samples=1000, random_state=0), multiclass_method + # yield 'BayesianACC', ACC(LR()), acc_hyper, lambda hyper: BayesianCC(LR(), mcmc_seed=0), multiclass_method - yield 'BootstrapEMQ', EMQ(LR(), on_calib_error='backup'), emq_hyper, lambda hyper: AggregativeBootstrap(EMQ(LR(), on_calib_error='backup'), n_test_samples=1000, random_state=0), multiclass_method + yield 'BootstrapEMQ', EMQ(LR(), on_calib_error='backup'), emq_hyper, lambda hyper: AggregativeBootstrap(EMQ(LR(), on_calib_error='backup', calib=hyper['calib'], val_split=None if hyper['calib'] is None else 5z), n_test_samples=1000, random_state=0), multiclass_method - yield 'BootstrapHDy', DMy(LR()), hdy_hyper, lambda hyper: AggregativeBootstrap(DMy(LR(), **hyper), n_test_samples=1000, random_state=0), multiclass_method - yield 'BayesianHDy', DMy(LR()), hdy_hyper, lambda hyper: PQ(LR(), stan_seed=0, **hyper), only_binary - - yield 'BootstrapKDEy', KDEyML(LR()), kdey_hyper, lambda hyper: AggregativeBootstrap(KDEyML(LR(), **hyper), n_test_samples=1000, random_state=0, verbose=True), multiclass_method - yield 'BayesianKDEy', KDEyML(LR()), kdey_hyper, lambda hyper: BayesianKDEy(mcmc_seed=0, **hyper), multiclass_method - yield 'BayesianKDEy*', KDEyCLR(LR()), kdey_hyper_clr, lambda hyper: BayesianKDEy(kernel='aitchison', mcmc_seed=0, **hyper), multiclass_method - yield 'BayKDEy*CLR', KDEyCLR(LR()), kdey_hyper_clr, lambda hyper: BayesianKDEy(kernel='aitchison', mcmc_seed=0, explore='clr', step_size=.15, **hyper), multiclass_method + # yield 'BootstrapHDy', DMy(LR()), hdy_hyper, lambda hyper: AggregativeBootstrap(DMy(LR(), **hyper), n_test_samples=1000, random_state=0), multiclass_method + # yield 'BayesianHDy', DMy(LR()), hdy_hyper, lambda hyper: PQ(LR(), stan_seed=0, **hyper), only_binary + # + # yield 'BootstrapKDEy', KDEyML(LR()), kdey_hyper, lambda hyper: AggregativeBootstrap(KDEyML(LR(), **hyper), n_test_samples=1000, random_state=0, verbose=True), multiclass_method + # yield 'BayesianKDEy', KDEyML(LR()), kdey_hyper, lambda hyper: BayesianKDEy(mcmc_seed=0, **hyper), multiclass_method + # yield 'BayesianKDEy*', KDEyCLR(LR()), kdey_hyper_clr, lambda hyper: BayesianKDEy(kernel='aitchison', mcmc_seed=0, **hyper), multiclass_method + # yield 'BayKDEy*CLR', KDEyCLR(LR()), kdey_hyper_clr, lambda hyper: BayesianKDEy(kernel='aitchison', mcmc_seed=0, explore='clr', step_size=.15, **hyper), multiclass_method # yield 'BayKDEy*CLR2', KDEyCLR(LR()), kdey_hyper_clr, lambda hyper: BayesianKDEy(kernel='aitchison', mcmc_seed=0, explore='clr', step_size=.05, **hyper), multiclass_method - yield 'BayKDEy*ILR', KDEyCLR(LR()), kdey_hyper_clr, lambda hyper: BayesianKDEy(kernel='aitchison', mcmc_seed=0, explore='ilr', step_size=.15, **hyper), only_multiclass - yield 'BayKDEy*ILR2', KDEyILR(LR()), kdey_hyper_clr, lambda hyper: BayesianKDEy(kernel='ilr', mcmc_seed=0, explore='ilr', step_size=.1, **hyper), only_multiclass + # yield 'BayKDEy*ILR', KDEyCLR(LR()), kdey_hyper_clr, lambda hyper: BayesianKDEy(kernel='aitchison', mcmc_seed=0, explore='ilr', step_size=.15, **hyper), only_multiclass + # yield 'BayKDEy*ILR2', KDEyILR(LR()), kdey_hyper_clr, lambda hyper: BayesianKDEy(kernel='ilr', mcmc_seed=0, explore='ilr', step_size=.1, **hyper), only_multiclass def model_selection(train: LabelledCollection, point_quantifier: AggregativeQuantifier, grid: dict): From c492415977b86464de7169d2bc2568580d3769fb Mon Sep 17 00:00:00 2001 From: Alejandro Moreo Date: Mon, 8 Dec 2025 12:40:44 +0100 Subject: [PATCH 40/41] adding bootstrap-emq --- BayesianKDEy/full_experiments.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/BayesianKDEy/full_experiments.py b/BayesianKDEy/full_experiments.py index 9e03062..4ec0fd6 100644 --- a/BayesianKDEy/full_experiments.py +++ b/BayesianKDEy/full_experiments.py @@ -54,7 +54,7 @@ def methods(): quantifier with optimized hyperparameters """ acc_hyper = {} - emq_hyper = {'calib': [None, 'nbvs', 'bcts', 'ts', 'vs']} + emq_hyper = {'calib': ['nbvs', 'bcts', 'ts', 'vs']} hdy_hyper = {'nbins': [3,4,5,8,16,32]} kdey_hyper = {'bandwidth': [0.001, 0.005, 0.01, 0.05, 0.1, 0.2]} kdey_hyper_clr = {'bandwidth': [0.05, 0.1, 0.5, 1., 2., 5.]} @@ -66,7 +66,7 @@ def methods(): # yield 'BootstrapACC', ACC(LR()), acc_hyper, lambda hyper: AggregativeBootstrap(ACC(LR()), n_test_samples=1000, random_state=0), multiclass_method # yield 'BayesianACC', ACC(LR()), acc_hyper, lambda hyper: BayesianCC(LR(), mcmc_seed=0), multiclass_method - yield 'BootstrapEMQ', EMQ(LR(), on_calib_error='backup'), emq_hyper, lambda hyper: AggregativeBootstrap(EMQ(LR(), on_calib_error='backup', calib=hyper['calib'], val_split=None if hyper['calib'] is None else 5z), n_test_samples=1000, random_state=0), multiclass_method + yield 'BootstrapEMQ', EMQ(LR(), on_calib_error='backup', val_split=5), emq_hyper, lambda hyper: AggregativeBootstrap(EMQ(LR(), on_calib_error='backup', calib=hyper['calib'], val_split=5), n_test_samples=1000, random_state=0), multiclass_method # yield 'BootstrapHDy', DMy(LR()), hdy_hyper, lambda hyper: AggregativeBootstrap(DMy(LR(), **hyper), n_test_samples=1000, random_state=0), multiclass_method # yield 'BayesianHDy', DMy(LR()), hdy_hyper, lambda hyper: PQ(LR(), stan_seed=0, **hyper), only_binary @@ -151,7 +151,7 @@ def experiment_path(dir:Path, dataset_name:str, method_name:str): if __name__ == '__main__': binary = { - 'datasets': qp.datasets.UCI_BINARY_DATASETS, + 'datasets': qp.datasets.UCI_BINARY_DATASETS[1:], 'fetch_fn': qp.datasets.fetch_UCIBinaryDataset, 'sample_size': 500 } From 89a8cad0b3f1bbfc8e05c693a07d75aea39dc72e Mon Sep 17 00:00:00 2001 From: Alejandro Moreo Date: Tue, 9 Dec 2025 18:12:06 +0100 Subject: [PATCH 41/41] atichison distance and tests for evaluating regions --- BayesianKDEy/TODO.txt | 3 +- BayesianKDEy/_bayeisan_kdey.py | 4 +- BayesianKDEy/full_experiments.py | 2 +- BayesianKDEy/generate_results.py | 57 +++++- BayesianKDEy/plot_simplex.py | 64 ++++-- CHANGE_LOG.txt | 4 + quapy/error.py | 59 +++++- quapy/method/confidence.py | 335 +++++++++++++++++++++---------- 8 files changed, 392 insertions(+), 136 deletions(-) diff --git a/BayesianKDEy/TODO.txt b/BayesianKDEy/TODO.txt index 2337fe8..9aea717 100644 --- a/BayesianKDEy/TODO.txt +++ b/BayesianKDEy/TODO.txt @@ -1,6 +1,7 @@ - Add other methods that natively provide uncertainty quantification methods? - MPIW (Mean Prediction Interval Width): is the average of the amplitudes (w/o aggregating coverage whatsoever) - Implement Interval Score or Winkler Score -- Add (w-hat_w)**2/N measure - analyze across shift +- add Bayesian EM +- optimize also C and class_weight? diff --git a/BayesianKDEy/_bayeisan_kdey.py b/BayesianKDEy/_bayeisan_kdey.py index 5451e2b..e20db79 100644 --- a/BayesianKDEy/_bayeisan_kdey.py +++ b/BayesianKDEy/_bayeisan_kdey.py @@ -9,7 +9,7 @@ from quapy.functional import CLRtransformation, ILRtransformation from quapy.method.aggregative import AggregativeSoftQuantifier from tqdm import tqdm import quapy.functional as F -import emcee +#import emcee @@ -212,4 +212,4 @@ class BayesianKDEy(AggregativeSoftQuantifier, KDEBase, WithConfidenceABC): return samples def in_simplex(x): - return np.all(x >= 0) and np.isclose(x.sum(), 1) \ No newline at end of file + return np.all(x >= 0) and np.isclose(x.sum(), 1) diff --git a/BayesianKDEy/full_experiments.py b/BayesianKDEy/full_experiments.py index 4ec0fd6..0a1dad2 100644 --- a/BayesianKDEy/full_experiments.py +++ b/BayesianKDEy/full_experiments.py @@ -151,7 +151,7 @@ def experiment_path(dir:Path, dataset_name:str, method_name:str): if __name__ == '__main__': binary = { - 'datasets': qp.datasets.UCI_BINARY_DATASETS[1:], + 'datasets': qp.datasets.UCI_BINARY_DATASETS, 'fetch_fn': qp.datasets.fetch_UCIBinaryDataset, 'sample_size': 500 } diff --git a/BayesianKDEy/generate_results.py b/BayesianKDEy/generate_results.py index cbe1e9f..04dedd9 100644 --- a/BayesianKDEy/generate_results.py +++ b/BayesianKDEy/generate_results.py @@ -7,8 +7,9 @@ import pandas as pd from glob import glob from pathlib import Path import quapy as qp -from method.confidence import ConfidenceIntervals -from quapy.method.confidence import ConfidenceEllipseSimplex, ConfidenceEllipseCLR, ConfidenceEllipseILR +from error import dist_aitchison +from quapy.method.confidence import ConfidenceIntervals +from quapy.method.confidence import ConfidenceEllipseSimplex, ConfidenceEllipseCLR, ConfidenceEllipseILR, ConfidenceIntervals, ConfidenceRegionABC pd.set_option('display.max_columns', None) pd.set_option('display.width', 2000) @@ -18,6 +19,17 @@ pd.set_option("display.precision", 4) pd.set_option("display.float_format", "{:.4f}".format) +def region_score(true_prev, region: ConfidenceRegionABC): + amp = region.montecarlo_proportion(50_000) + if true_prev in region: + cost = 0 + else: + scale_cost = 1/region.alpha + cost = scale_cost * dist_aitchison(true_prev, region.closest_point_in_region(true_prev)) + return amp + cost + + + def compute_coverage_amplitude(region_constructor, **kwargs): all_samples = results['samples'] all_true_prevs = results['true-prevs'] @@ -62,19 +74,24 @@ def update_pickle_with_region(report, file, conf_name, conf_region_class, **kwar update_pickle(report, file, update_fields) +methods = None # show all methods +# methods = ['BayesianACC', 'BayesianKDEy'] -for setup in ['multiclass', 'binary']: +for setup in ['multiclass']: path = f'./results/{setup}/*.pkl' table = defaultdict(list) for file in tqdm(glob(path), desc='processing results', total=len(glob(path))): file = Path(file) dataset, method = file.name.replace('.pkl', '').split('__') + if methods is not None and method not in methods: + continue report = pickle.load(open(file, 'rb')) results = report['results'] n_samples = len(results['ae']) table['method'].extend([method.replace('Bayesian','Ba').replace('Bootstrap', 'Bo')] * n_samples) table['dataset'].extend([dataset] * n_samples) table['ae'].extend(results['ae']) + table['rae'].extend(results['rae']) # table['c-CI'].extend(results['coverage']) # table['a-CI'].extend(results['amplitude']) @@ -96,6 +113,14 @@ for setup in ['multiclass', 'binary']: table['c-ILR'].extend(report['coverage-ILR']) table['a-ILR'].extend(report['amplitude-ILR']) + table['aitch'].extend(qp.error.dist_aitchison(results['true-prevs'], results['point-estim'])) + # table['aitch-well'].extend(qp.error.dist_aitchison(results['true-prevs'], [ConfidenceEllipseILR(samples).mean_ for samples in results['samples']])) + # table['aitch'].extend() + table['reg-score-ILR'].extend( + [region_score(true_prev, ConfidenceEllipseILR(samples)) for true_prev, samples in zip(results['true-prevs'], results['samples'])] + ) + + df = pd.DataFrame(table) @@ -111,16 +136,30 @@ for setup in ['multiclass', 'binary']: tr_size[dataset] = len(data.training) # remove datasets with more than max_classes classes - max_classes = 30 - for data_name, n in n_classes.items(): - if n > max_classes: - df = df[df["dataset"] != data_name] + # max_classes = 30 + # min_train = 1000 + # for data_name, n in n_classes.items(): + # if n > max_classes: + # df = df[df["dataset"] != data_name] + # for data_name, n in tr_size.items(): + # if n < min_train: + # df = df[df["dataset"] != data_name] - for region in ['CI', 'CE', 'CLR', 'ILR']: + for region in ['ILR']: # , 'CI', 'CE', 'CLR', 'ILR']: if setup == 'binary' and region=='ILR': continue + # pv = pd.pivot_table( + # df, index='dataset', columns='method', values=['ae', f'c-{region}', f'a-{region}'], margins=True + # ) pv = pd.pivot_table( - df, index='dataset', columns='method', values=['ae', f'c-{region}', f'a-{region}'], margins=True + df, index='dataset', columns='method', values=[ + #f'w-{region}', + # 'ae', + # 'rae', + # f'aitch', + # f'aitch-well' + 'reg-score-ILR', + ], margins=True ) pv['n_classes'] = pv.index.map(n_classes).astype('Int64') pv['tr_size'] = pv.index.map(tr_size).astype('Int64') diff --git a/BayesianKDEy/plot_simplex.py b/BayesianKDEy/plot_simplex.py index 7397a99..d0128ba 100644 --- a/BayesianKDEy/plot_simplex.py +++ b/BayesianKDEy/plot_simplex.py @@ -33,7 +33,10 @@ def get_region_colormap(name="blue", alpha=0.40): return cmap -def plot_prev_points(prevs=None, true_prev=None, point_estim=None, train_prev=None, show_mean=True, show_legend=True, +def plot_prev_points(samples=None, + show_samples=True, + true_prev=None, + point_estim=None, train_prev=None, show_mean=True, show_legend=True, region=None, region_resolution=1000, confine_region_in_simplex=False, @@ -113,9 +116,17 @@ def plot_prev_points(prevs=None, true_prev=None, point_estim=None, train_prev=No alpha=0.3, ) - ax.scatter(*cartesian(prevs), s=15, alpha=0.5, edgecolors='none', label='samples', color='black', linewidth=0.5) - if show_mean: - ax.scatter(*cartesian(prevs.mean(axis=0)), s=10, alpha=1, label='sample-mean', edgecolors='black') + if samples is not None: + if show_samples: + ax.scatter(*cartesian(samples), s=15, alpha=0.5, edgecolors='none', label='samples', color='black', linewidth=0.5) + if show_mean is not None: + if isinstance(show_mean, np.ndarray): + ax.scatter(*cartesian(show_mean), s=10, alpha=1, label='sample-mean', edgecolors='black') + elif show_mean==True and samples is not None: + ax.scatter(*cartesian(samples.mean(axis=0)), s=10, alpha=1, label='sample-mean', edgecolors='black') + else: + raise ValueError(f'show_mean should either be a boolean (if True, then samples must be provided) or ' + f'the mean point itself') if train_prev is not None: ax.scatter(*cartesian(true_prev), s=10, alpha=1, label='true-prev', edgecolors='black') if point_estim is not None: @@ -203,22 +214,45 @@ if __name__ == '__main__': np.random.seed(1) n = 1000 - alpha = [3,5,10] - # alpha = [10,1,1] + # alpha = [3,5,10] + alpha = [10,1,1] prevs = np.random.dirichlet(alpha, size=n) def regions(): confs = [0.99, 0.95, 0.90] yield 'CI', [(f'{int(c*100)}%', CI(prevs, confidence_level=c).coverage) for c in confs] - yield 'CI-b', [(f'{int(c * 100)}%', CI(prevs, confidence_level=c, bonferroni_correction=True).coverage) for c in confs] - yield 'CE', [(f'{int(c*100)}%', CE(prevs, confidence_level=c).coverage) for c in confs] - yield 'CLR', [(f'{int(c*100)}%', CLR(prevs, confidence_level=c).coverage) for c in confs] - yield 'ILR', [(f'{int(c*100)}%', ILR(prevs, confidence_level=c).coverage) for c in confs] + # yield 'CI-b', [(f'{int(c * 100)}%', CI(prevs, confidence_level=c, bonferroni_correction=True).coverage) for c in confs] + # yield 'CE', [(f'{int(c*100)}%', CE(prevs, confidence_level=c).coverage) for c in confs] + # yield 'CLR', [(f'{int(c*100)}%', CLR(prevs, confidence_level=c).coverage) for c in confs] + # yield 'ILR', [(f'{int(c*100)}%', ILR(prevs, confidence_level=c).coverage) for c in confs] + + # resolution = 1000 + # alpha_str = ','.join([f'{str(i)}' for i in alpha]) + # for crname, cr in regions(): + # plot_prev_points(prevs, show_mean=True, show_legend=False, region=cr, region_resolution=resolution, + # color='blue', + # save_path=f'./plots/simplex_{crname}_alpha{alpha_str}_res{resolution}.png', + # ) + + + def regions(): + confs = [0.99, 0.95, 0.90] + yield 'CI', [(f'{int(c*100)}%', CI(prevs, confidence_level=c).coverage) for c in confs] + # yield 'CI-b', [(f'{int(c * 100)}%', CI(prevs, confidence_level=c, bonferroni_correction=True).coverage) for c in confs] + # yield 'CE', [(f'{int(c*100)}%', CE(prevs, confidence_level=c).coverage) for c in confs] + # yield 'CLR', [(f'{int(c*100)}%', CLR(prevs, confidence_level=c).coverage) for c in confs] + # yield 'ILR', [(f'{int(c*100)}%', ILR(prevs, confidence_level=c).coverage) for c in confs] resolution = 1000 alpha_str = ','.join([f'{str(i)}' for i in alpha]) - for crname, cr in regions(): - plot_prev_points(prevs, show_mean=True, show_legend=False, region=cr, region_resolution=resolution, - color='blue', - save_path=f'./plots/simplex_{crname}_alpha{alpha_str}_res{resolution}.png') - + region = ILR(prevs, confidence_level=.99) + p = np.asarray([0.1, 0.8, 0.1]) + plot_prev_points(prevs, show_samples=False, + show_mean=region.mean_, + # show_mean=prevs.mean(axis=0), + show_legend=False, region=[('', region.coverage)], region_resolution=resolution, + color='blue', + true_prev=p, + train_prev=region.closest_point_in_region(p), + save_path=f'./plots3/simplex_ilr.png', + ) diff --git a/CHANGE_LOG.txt b/CHANGE_LOG.txt index b6066b9..9761c29 100644 --- a/CHANGE_LOG.txt +++ b/CHANGE_LOG.txt @@ -1,9 +1,13 @@ Change Log 0.2.1 ----------------- +- Added squared ratio error. +- Improved efficiency of confidence regions coverage functions +- Added Precise Quantifier to WithConfidence methods (a Bayesian adaptation of HDy) - Improved documentation of confidence regions. - Added ReadMe method by Daniel Hopkins and Gary King - Internal index in LabelledCollection is now "lazy", and is only constructed if required. +- I have added dist_aitchison and mean_dist_aitchison as a new error evaluation metric Change Log 0.2.0 ----------------- diff --git a/quapy/error.py b/quapy/error.py index eb42cd6..2407eb0 100644 --- a/quapy/error.py +++ b/quapy/error.py @@ -3,6 +3,7 @@ import numpy as np from sklearn.metrics import f1_score import quapy as qp +from functional import CLRtransformation def from_name(err_name): @@ -128,6 +129,59 @@ def se(prevs_true, prevs_hat): return ((prevs_hat - prevs_true) ** 2).mean(axis=-1) +def sre(prevs_true, prevs_hat, prevs_train, eps=0.): + """ + The squared ratio error is defined as + :math:`SRE(p,\\hat{p},p^{tr})=\\frac{1}{|\\mathcal{Y}|}\\sum_{i \\in \\mathcal{Y}}(w_i-\\hat{w}_i)^2`, + where + :math:`w_i=\\frac{p_i}{p^{tr}_i}=\\frac{Q(Y=i)}{P(Y=i)}`, + and `\\hat{w}_i` is the estimate obtained by replacing the true test prior with an estimate, and + :math:`\\mathcal{Y}` are the classes of interest + :param prevs_true: array-like, true prevalence values + :param prevs_hat: array-like, estimated prevalence values + :param prevs_train: array-like, training prevalence values + :param eps: float, for smoothing the prevalence values. It is 0 by default (no smoothing), meaning the + training prevalence is expected to be >0 everywhere. + :return: the squared ratio error + """ + prevs_true = np.asarray(prevs_true) + prevs_hat = np.asarray(prevs_hat) + prevs_train = np.asarray(prevs_train) + assert prevs_true.shape == prevs_hat.shape, f'wrong shape {prevs_true.shape=} vs {prevs_hat.shape=}' + assert prevs_true.shape[-1]==prevs_train.shape[-1], 'wrong shape for training prevalence' + if eps>0: + prevs_true = smooth(prevs_true, eps) + prevs_hat = smooth(prevs_hat, eps) + prevs_train = smooth(prevs_train, eps) + + N = prevs_true.shape[-1] + w = prevs_true / prevs_train + w_hat = prevs_hat / prevs_train + return (1./N) * (w - w_hat)**2. + + +def msre(prevs_true, prevs_hat, prevs_train, eps=0.): + """ + Returns the mean across all experiments. See :function:`sre`. + :param prevs_true: array-like, true prevalence values of shape (n_experiments, n_classes,) or (n_classes,) + :param prevs_hat: array-like, estimated prevalence values of shape equal to prevs_true + :param prevs_train: array-like, training prevalence values of (n_experiments, n_classes,) or (n_classes,) + :param eps: float, for smoothing the prevalence values. It is 0 by default (no smoothing), meaning the + training prevalence is expected to be >0 everywhere. + :return: the squared ratio error + """ + return np.mean(squared_ratio_error(prevs_true, prevs_hat, prevs_train, eps)) + + +def dist_aitchison(prevs_true, prevs_hat): + clr = CLRtransformation() + return np.linalg.norm(clr(prevs_true) - clr(prevs_hat), axis=-1) + + +def mean_dist_aitchison(prevs_true, prevs_hat): + return np.mean(dist_aitchison(prevs_true, prevs_hat)) + + def mkld(prevs_true, prevs_hat, eps=None): """Computes the mean Kullback-Leibler divergence (see :meth:`quapy.error.kld`) across the sample pairs. The distributions are smoothed using the `eps` factor @@ -374,8 +428,8 @@ def __check_eps(eps=None): CLASSIFICATION_ERROR = {f1e, acce} -QUANTIFICATION_ERROR = {mae, mnae, mrae, mnrae, mse, mkld, mnkld} -QUANTIFICATION_ERROR_SINGLE = {ae, nae, rae, nrae, se, kld, nkld} +QUANTIFICATION_ERROR = {mae, mnae, mrae, mnrae, mse, mkld, mnkld, msre, mean_dist_aitchison} +QUANTIFICATION_ERROR_SINGLE = {ae, nae, rae, nrae, se, kld, nkld, sre, dist_aitchison} QUANTIFICATION_ERROR_SMOOTH = {kld, nkld, rae, nrae, mkld, mnkld, mrae} CLASSIFICATION_ERROR_NAMES = {func.__name__ for func in CLASSIFICATION_ERROR} QUANTIFICATION_ERROR_NAMES = {func.__name__ for func in QUANTIFICATION_ERROR} @@ -387,6 +441,7 @@ ERROR_NAMES = \ f1_error = f1e acc_error = acce mean_absolute_error = mae +squared_ratio_error = sre absolute_error = ae mean_relative_absolute_error = mrae relative_absolute_error = rae diff --git a/quapy/method/confidence.py b/quapy/method/confidence.py index ee2ec33..914b814 100644 --- a/quapy/method/confidence.py +++ b/quapy/method/confidence.py @@ -88,6 +88,52 @@ class ConfidenceRegionABC(ABC): """ Returns internal samples """ ... + def __contains__(self, p): + """ + Overloads in operator, checks if `p` is contained in the region + + :param p: array-like + :return: boolean + """ + p = np.asarray(p) + assert p.ndim==1, f'unexpected shape for point parameter' + return self.coverage(p)==1. + + def closest_point_in_region(self, p, tol=1e-6, max_iter=30): + """ + Finds the closes point to p that belongs to the region. Assumes the region is convex. + + :param p: array-like, the point + :param tol: float, error tolerance + :param max_iter: int, max number of iterations + :returns: array-like, the closes point to p in the segment between p and the center of the region, that + belongs to the region + """ + p = np.asarray(p, dtype=float) + + # if p in region, returns p itself + if p in self: + return p.copy() + + # center of the region + c = self.point_estimate() + + # binary search in [0,1], interpolation parameter + # low=closest to p, high=closest to c + low, high = 0.0, 1.0 + for _ in range(max_iter): + mid = 0.5 * (low + high) + x = p*(1-mid) + c*mid + if x in self: + high = mid + else: + low = mid + if high - low < tol: + break + + in_boundary = p*(1-high) + c*high + return in_boundary + class WithConfidenceABC(ABC): """ @@ -219,124 +265,61 @@ def within_ellipse_prop(values, mean, prec_matrix, chi2_critical): return float(np.mean(within_ellipse)) -class ConfidenceEllipseSimplex(ConfidenceRegionABC): +def closest_point_on_ellipsoid(p, mean, cov, chi2_critical, tol=1e-9, max_iter=100): """ - Instantiates a Confidence Ellipse in the probability simplex. - - :param samples: np.ndarray of shape (n_bootstrap_samples, n_classes) - :param confidence_level: float, the confidence level (default 0.95) + Computes the closest point on the ellipsoid defined by: + (x - mean)^T cov^{-1} (x - mean) = chi2_critical """ - def __init__(self, samples, confidence_level=0.95): + p = np.asarray(p) + mean = np.asarray(mean) + Sigma = np.asarray(cov) - assert 0. < confidence_level < 1., f'{confidence_level=} must be in range(0,1)' + # Precompute precision matrix + P = np.linalg.pinv(Sigma) + d = P.shape[0] - samples = np.asarray(samples) + # Define v = p - mean + v = p - mean - self.mean_ = samples.mean(axis=0) - self.cov_ = np.cov(samples, rowvar=False, ddof=1) + # If p is inside the ellipsoid, return p itself + M_dist = v @ P @ v + if M_dist <= chi2_critical: + return p.copy() - try: - self.precision_matrix_ = np.linalg.pinv(self.cov_) - except: - self.precision_matrix_ = None + # Function to compute x(lambda) + def x_lambda(lmbda): + A = np.eye(d) + lmbda * P + return mean + np.linalg.solve(A, v) - self.dim = samples.shape[-1] - self.ddof = self.dim - 1 + # Function whose root we want: f(lambda) = Mahalanobis distance - chi2 + def f(lmbda): + x = x_lambda(lmbda) + diff = x - mean + return diff @ P @ diff - chi2_critical - # critical chi-square value - self.confidence_level = confidence_level - self.chi2_critical_ = chi2.ppf(confidence_level, df=self.ddof) - self._samples = samples + # Bisection search over lambda >= 0 + l_low, l_high = 0.0, 1.0 - @property - def samples(self): - return self._samples + # Increase high until f(high) < 0 + while f(l_high) > 0: + l_high *= 2 + if l_high > 1e12: + raise RuntimeError("Failed to bracket the root.") - def point_estimate(self): - """ - Returns the point estimate, the center of the ellipse. + # Bisection + for _ in range(max_iter): + l_mid = 0.5 * (l_low + l_high) + fm = f(l_mid) + if abs(fm) < tol: + break + if fm > 0: + l_low = l_mid + else: + l_high = l_mid - :return: np.ndarray of shape (n_classes,) - """ - return self.mean_ - - def coverage(self, true_value): - """ - Checks whether a value, or a sets of values, are contained in the confidence region. The method computes the - fraction of these that are contained in the region, if more than one value is passed. If only one value is - passed, then it either returns 1.0 or 0.0, for indicating the value is in the region or not, respectively. - - :param true_value: a np.ndarray of shape (n_classes,) or shape (n_values, n_classes,) - :return: float in [0,1] - """ - return within_ellipse_prop(true_value, self.mean_, self.precision_matrix_, self.chi2_critical_) - - -class ConfidenceEllipseTransformed(ConfidenceRegionABC): - """ - Instantiates a Confidence Ellipse in a transformed space. - - :param samples: np.ndarray of shape (n_bootstrap_samples, n_classes) - :param confidence_level: float, the confidence level (default 0.95) - """ - - def __init__(self, samples, transformation: CompositionalTransformation, confidence_level=0.95): - samples = np.asarray(samples) - self.transformation = transformation - Z = self.transformation(samples) - self.mean_ = np.mean(samples, axis=0) - self.conf_region_z = ConfidenceEllipseSimplex(Z, confidence_level=confidence_level) - self._samples = samples - - @property - def samples(self): - return self._samples - - def point_estimate(self): - """ - Returns the point estimate, the center of the ellipse. - - :return: np.ndarray of shape (n_classes,) - """ - # The inverse of the CLR does not coincide with the true mean, because the geometric mean - # requires smoothing the prevalence vectors and this affects the softmax (inverse); - # return self.clr.inverse(self.mean_) # <- does not coincide - return self.mean_ - - def coverage(self, true_value): - """ - Checks whether a value, or a sets of values, are contained in the confidence region. The method computes the - fraction of these that are contained in the region, if more than one value is passed. If only one value is - passed, then it either returns 1.0 or 0.0, for indicating the value is in the region or not, respectively. - - :param true_value: a np.ndarray of shape (n_classes,) or shape (n_values, n_classes,) - :return: float in [0,1] - """ - transformed_values = self.transformation(true_value) - return self.conf_region_z.coverage(transformed_values) - - -class ConfidenceEllipseCLR(ConfidenceEllipseTransformed): - """ - Instantiates a Confidence Ellipse in the Centered-Log Ratio (CLR) space. - - :param samples: np.ndarray of shape (n_bootstrap_samples, n_classes) - :param confidence_level: float, the confidence level (default 0.95) - """ - def __init__(self, samples, confidence_level=0.95): - super().__init__(samples, CLRtransformation(), confidence_level=confidence_level) - - -class ConfidenceEllipseILR(ConfidenceEllipseTransformed): - """ - Instantiates a Confidence Ellipse in the Isometric-Log Ratio (CLR) space. - - :param samples: np.ndarray of shape (n_bootstrap_samples, n_classes) - :param confidence_level: float, the confidence level (default 0.95) - """ - def __init__(self, samples, confidence_level=0.95): - super().__init__(samples, ILRtransformation(), confidence_level=confidence_level) + l_opt = l_mid + return x_lambda(l_opt) class ConfidenceIntervals(ConfidenceRegionABC): @@ -423,6 +406,146 @@ class ConfidenceIntervals(ConfidenceRegionABC): return np.mean(self.winkler_scores(true_prev)) +class ConfidenceEllipseSimplex(ConfidenceRegionABC): + """ + Instantiates a Confidence Ellipse in the probability simplex. + + :param samples: np.ndarray of shape (n_bootstrap_samples, n_classes) + :param confidence_level: float, the confidence level (default 0.95) + """ + + def __init__(self, samples, confidence_level=0.95): + + assert 0. < confidence_level < 1., f'{confidence_level=} must be in range(0,1)' + + samples = np.asarray(samples) + + self.mean_ = samples.mean(axis=0) + self.cov_ = np.cov(samples, rowvar=False, ddof=1) + + try: + self.precision_matrix_ = np.linalg.pinv(self.cov_) + except: + self.precision_matrix_ = None + + self.dim = samples.shape[-1] + self.ddof = self.dim - 1 + + # critical chi-square value + self.confidence_level = confidence_level + self.chi2_critical_ = chi2.ppf(confidence_level, df=self.ddof) + self._samples = samples + self.alpha = 1.-confidence_level + + @property + def samples(self): + return self._samples + + def point_estimate(self): + """ + Returns the point estimate, the center of the ellipse. + + :return: np.ndarray of shape (n_classes,) + """ + return self.mean_ + + def coverage(self, true_value): + """ + Checks whether a value, or a sets of values, are contained in the confidence region. The method computes the + fraction of these that are contained in the region, if more than one value is passed. If only one value is + passed, then it either returns 1.0 or 0.0, for indicating the value is in the region or not, respectively. + + :param true_value: a np.ndarray of shape (n_classes,) or shape (n_values, n_classes,) + :return: float in [0,1] + """ + return within_ellipse_prop(true_value, self.mean_, self.precision_matrix_, self.chi2_critical_) + + def closest_point_in_region(self, p, tol=1e-6, max_iter=30): + return closest_point_on_ellipsoid( + p, + mean=self.mean_, + cov=self.cov_, + chi2_critical=self.chi2_critical_ + ) + + +class ConfidenceEllipseTransformed(ConfidenceRegionABC): + """ + Instantiates a Confidence Ellipse in a transformed space. + + :param samples: np.ndarray of shape (n_bootstrap_samples, n_classes) + :param confidence_level: float, the confidence level (default 0.95) + """ + + def __init__(self, samples, transformation: CompositionalTransformation, confidence_level=0.95): + samples = np.asarray(samples) + self.transformation = transformation + Z = self.transformation(samples) + # self.mean_ = np.mean(samples, axis=0) + self.mean_ = self.transformation.inverse(np.mean(Z, axis=0)) + self.conf_region_z = ConfidenceEllipseSimplex(Z, confidence_level=confidence_level) + self._samples = samples + self.alpha = 1.-confidence_level + + @property + def samples(self): + return self._samples + + def point_estimate(self): + """ + Returns the point estimate, the center of the ellipse. + + :return: np.ndarray of shape (n_classes,) + """ + # The inverse of the CLR does not coincide with the true mean, because the geometric mean + # requires smoothing the prevalence vectors and this affects the softmax (inverse); + # return self.clr.inverse(self.mean_) # <- does not coincide + return self.mean_ + + def coverage(self, true_value): + """ + Checks whether a value, or a sets of values, are contained in the confidence region. The method computes the + fraction of these that are contained in the region, if more than one value is passed. If only one value is + passed, then it either returns 1.0 or 0.0, for indicating the value is in the region or not, respectively. + + :param true_value: a np.ndarray of shape (n_classes,) or shape (n_values, n_classes,) + :return: float in [0,1] + """ + transformed_values = self.transformation(true_value) + return self.conf_region_z.coverage(transformed_values) + + def closest_point_in_region(self, p, tol=1e-6, max_iter=30): + p_prime = self.transformation(p) + b_prime = self.conf_region_z.closest_point_in_region(p_prime, tol=tol, max_iter=max_iter) + b = self.transformation.inverse(b_prime) + return b + + +class ConfidenceEllipseCLR(ConfidenceEllipseTransformed): + """ + Instantiates a Confidence Ellipse in the Centered-Log Ratio (CLR) space. + + :param samples: np.ndarray of shape (n_bootstrap_samples, n_classes) + :param confidence_level: float, the confidence level (default 0.95) + """ + def __init__(self, samples, confidence_level=0.95): + super().__init__(samples, CLRtransformation(), confidence_level=confidence_level) + + +class ConfidenceEllipseILR(ConfidenceEllipseTransformed): + """ + Instantiates a Confidence Ellipse in the Isometric-Log Ratio (CLR) space. + + :param samples: np.ndarray of shape (n_bootstrap_samples, n_classes) + :param confidence_level: float, the confidence level (default 0.95) + """ + def __init__(self, samples, confidence_level=0.95): + super().__init__(samples, ILRtransformation(), confidence_level=confidence_level) + + + + + class AggregativeBootstrap(WithConfidenceABC, AggregativeQuantifier):