adding lequa datasets

This commit is contained in:
Alejandro Moreo Fernandez 2026-01-20 10:53:16 +01:00
parent 9ae65ab09a
commit a6336218e2
7 changed files with 313 additions and 205 deletions

View File

@ -1,4 +1,13 @@
- Add other methods that natively provide uncertainty quantification? (e.g., Ratio estimator, Card & Smith)
- Things to try:
- init chain helps? [seems irrelevant in MAPLS...]
- Aitchison kernel is better?
- other classifiers?
- optimize classifier?
- use all datasets?
- improve KDE on wine-quality?
- Add other methods that natively provide uncertainty quantification?
Ratio estimator
Card & Smith
- MPIW (Mean Prediction Interval Width): the average of the interval amplitudes, without aggregating coverage in any way
- Implement the Interval Score (a.k.a. Winkler Score)
- analyze results across amounts of prior shift
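As a reference for the two scoring TODOs above, a minimal sketch under the standard definitions (the names mpiw and interval_score are illustrative, not part of the codebase):

import numpy as np

def mpiw(lows, highs):
    # Mean Prediction Interval Width: average amplitude of the intervals,
    # irrespective of whether they actually cover the true value
    return float(np.mean(np.asarray(highs) - np.asarray(lows)))

def interval_score(low, high, true_value, alpha=0.05):
    # Winkler/interval score of a central (1-alpha) interval: its width
    # plus a 2/alpha penalty per unit the true value falls outside it
    penalty = (2 / alpha) * (max(0.0, low - true_value) + max(0.0, true_value - high))
    return (high - low) + penalty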

View File

@ -59,7 +59,8 @@ class BayesianKDEy(AggregativeSoftQuantifier, KDEBase, WithConfidenceABC):
temperature=1.,
engine='numpyro',
prior='uniform',
verbose: bool = False):
verbose: bool = False,
**kwargs):
if num_warmup <= 0:
raise ValueError(f'parameter {num_warmup=} must be a positive integer')
@ -74,6 +75,9 @@ class BayesianKDEy(AggregativeSoftQuantifier, KDEBase, WithConfidenceABC):
assert engine in ['rw-mh', 'emcee', 'numpyro']
super().__init__(classifier, fit_classifier, val_split)
assert all(k.startswith('classifier__') for k in kwargs.keys()), 'unexpected kwargs; keys must start with "classifier__"'
self.classifier.set_params(**{k.removeprefix('classifier__'): v for k, v in kwargs.items()})  # strip only the leading prefix (str.replace would drop every occurrence)
self.bandwidth = KDEBase._check_bandwidth(bandwidth, kernel)
self.kernel = self._check_kernel(kernel)
self.num_warmup = num_warmup
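A hypothetical usage sketch of the new **kwargs forwarding (the constructor takes further parameters not shown in this hunk):

from sklearn.linear_model import LogisticRegression

# keys prefixed with 'classifier__' are stripped of the prefix and
# forwarded to the wrapped classifier via set_params
bkde = BayesianKDEy(
    LogisticRegression(),
    classifier__C=10.0,
    classifier__class_weight='balanced',
)
# equivalent to configuring LogisticRegression(C=10.0, class_weight='balanced')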

View File

@ -39,6 +39,7 @@ class BayesianMAPLS(AggregativeSoftQuantifier, WithConfidenceABC):
region: str = 'intervals',
temperature=1.,
prior='uniform',
mapls_chain_init=True,
verbose=False
):
@ -53,6 +54,7 @@ class BayesianMAPLS(AggregativeSoftQuantifier, WithConfidenceABC):
self.region = region
self.temperature = temperature
self.prior = prior
self.mapls_chain_init = mapls_chain_init
self.verbose = verbose
def aggregation_fit(self, classif_predictions, labels):
@ -74,7 +76,7 @@ class BayesianMAPLS(AggregativeSoftQuantifier, WithConfidenceABC):
return_lambda=True
)
# pi_star: MAP in simplex (shape: [K]), convert to ILR space
# pi_star: MAP estimate on the simplex, shape (n_classes,); convert it to ILR space
z0 = self.ilr(pi_star)
if self.prior == 'uniform':
@ -107,7 +109,7 @@ class BayesianMAPLS(AggregativeSoftQuantifier, WithConfidenceABC):
random.PRNGKey(self.mcmc_seed),
test_posteriors=classif_predictions,
alpha=alpha,
init_params={"z": z0}
init_params={"z": z0} if self.mapls_chain_init else None
)
samples = mcmc.get_samples()["z"]

View File

@ -14,11 +14,134 @@ import numpy as np
from method.aggregative import KDEyML
from quapy.functional import l1_norm, ILRtransformation
from scipy.stats import entropy
from abc import ABC, abstractmethod
FINEGRAINED = True
RESULT_DIR = Path('results_finegrained') if FINEGRAINED else Path('results')
def fetch_UCI_multiclass(data_name):
return qp.datasets.fetch_UCIMulticlassDataset(data_name, min_class_support=0.01)
class DatasetHandler(ABC):
def __init__(self, name:str, sample_size:int):
self._name = name
self._sample_size = sample_size
@abstractmethod
def get_training(self): ...
@abstractmethod
def get_train_testprot_for_eval(self): ...
@abstractmethod
def get_train_valprot_for_modsel(self): ...
def sample_size(self):
return self._sample_size
def name(self):
return self._name
@classmethod
@abstractmethod
def iter(cls): ...
def __repr__(self):
return self.__class__.__name__
@classmethod
@abstractmethod
def is_binary(cls):
...
class UCIMulticlassHandler(DatasetHandler):
DATASETS = qp.datasets.UCI_MULTICLASS_DATASETS.copy()
def __init__(self, name, n_val_samples=100, n_test_samples=100):
super().__init__(name, sample_size=1000)
self._dataset = None # lazy fetch
self.n_val_samples = n_val_samples
self.n_test_samples = n_test_samples
def get_training(self):
return self.dataset().training
def get_train_testprot_for_eval(self):
training, test = self.dataset().train_test
test_generator = qp.protocol.UPP(test, repeats=self.n_test_samples, random_state=0)
return training, test_generator
def get_train_valprot_for_modsel(self):
training = self.dataset().training
training, val = training.split_stratified(train_prop=0.6, random_state=0)
val_generator = qp.protocol.UPP(val, repeats=self.n_val_samples, random_state=0)
return training, val_generator
def dataset(self):  # cached manually via self._dataset
if self._dataset is None:
self._dataset = qp.datasets.fetch_UCIMulticlassDataset(self.name(), min_class_support=0.01)
return self._dataset
def __repr__(self):
return "" # self.dataset().__repr__()
@classmethod
def iter(cls):
for name in cls.DATASETS:
yield cls(name)
@classmethod
def is_binary(cls):
return False
class LeQuaHandler(DatasetHandler):
DATASETS = ['LeQua2022', 'LeQua2024']
def __init__(self, name):
super().__init__(name, sample_size=1000)
self._dataset = None # lazy fetch
def get_training(self):
return self.dataset()[0]
def get_train_testprot_for_eval(self):
training, _, test_generator = self.dataset()
return training, test_generator
def get_train_valprot_for_modsel(self):
training, val_generator, _ = self.dataset()
return training, val_generator
def dataset(self):  # cached manually via self._dataset
if self._dataset is None:
if self.name()=='LeQua2022':
self._dataset = qp.datasets.fetch_lequa2022(task='T1B')
elif self.name()=='LeQua2024':
self._dataset = qp.datasets.fetch_lequa2024(task='T2')
else:
raise ValueError(f'unexpected dataset name {self.name()}; valid ones are {self.DATASETS}')
return self._dataset
def __repr__(self):
return self.dataset().__repr__()
@classmethod
def iter(cls):
for name in cls.DATASETS:
yield cls(name)
@classmethod
def is_binary(cls):
return False
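For orientation, a sketch of how a handler is consumed (mirroring the driver loop later in this commit):

for handler in LeQuaHandler.iter():
    qp.environ['SAMPLE_SIZE'] = handler.sample_size()
    # evaluation: full training set plus a protocol generating test samples
    training, test_prot = handler.get_train_testprot_for_eval()
    # model selection: reduced training set plus a validation protocol
    tr_sel, val_prot = handler.get_train_valprot_for_modsel()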
# def fetch_UCI_multiclass(data_name):
# return qp.datasets.fetch_UCIMulticlassDataset(data_name, min_class_support=0.01)
def fetch_UCI_binary(data_name):
@ -32,18 +155,20 @@ binary = {
'sample_size': 500
}
multiclass = {
'datasets': qp.datasets.UCI_MULTICLASS_DATASETS.copy(),
'fetch_fn': fetch_UCI_multiclass,
'sample_size': 1000
}
try:
multiclass['datasets'].remove('poker_hand') # random performance
multiclass['datasets'].remove('hcv') # random performance
multiclass['datasets'].remove('letter') # many classes
multiclass['datasets'].remove('isolet') # many classes
except ValueError:
pass
# multiclass = {
# 'datasets': qp.datasets.UCI_MULTICLASS_DATASETS.copy(),
# 'fetch_fn': fetch_UCI_multiclass,
# 'sample_size': 1000
# }
# try:
# multiclass['datasets'].remove('poker_hand') # random performance
# multiclass['datasets'].remove('hcv') # random performance
# multiclass['datasets'].remove('letter') # many classes
# multiclass['datasets'].remove('isolet') # many classes
# except ValueError:
# pass
# utils

View File

@ -5,13 +5,13 @@ from copy import deepcopy as cp
import quapy as qp
from BayesianKDEy._bayeisan_kdey import BayesianKDEy
from BayesianKDEy._bayesian_mapls import BayesianMAPLS
from BayesianKDEy.commons import multiclass, experiment_path, KDEyCLR
from BayesianKDEy.commons import experiment_path, KDEyCLR, FINEGRAINED, RESULT_DIR, DatasetHandler, \
UCIMulticlassHandler, LeQuaHandler
from BayesianKDEy.temperature_calibration import temp_calibration
from quapy.data import LabelledCollection
from quapy.method.aggregative import DistributionMatchingY as DMy, AggregativeQuantifier, EMQ, CC
from quapy.model_selection import GridSearchQ
from quapy.data import Dataset
# from BayesianKDEy.plot_simplex import plot_prev_points, plot_prev_points_matplot
from quapy.method.confidence import BayesianCC, AggregativeBootstrap
from quapy.method.aggregative import KDEyML, ACC
from quapy.protocol import UPP
@ -21,6 +21,7 @@ from collections import defaultdict
from time import time
def methods():
"""
Yields tuples (name, quantifier, hyperparams, bayesian/bootstrap_constructor, method_scope), where:
@ -30,68 +31,58 @@ def methods():
- bayesian/bootstrap_constructor: a function that instantiates the bayesian or bootstrap method with the
quantifier's optimized hyperparameters
- method_scope: one of 'multiclass', 'only_binary', 'only_multiclass', indicating the datasets the method applies to
"""
acc_hyper = {}
emq_hyper = {'calib': ['nbvs', 'bcts', 'ts', 'vs']}
hdy_hyper = {'nbins': [3,4,5,8,16,32]}
kdey_hyper = {'bandwidth': [0.001, 0.005, 0.01, 0.05, 0.1, 0.2]}
kdey_hyper_clr = {'bandwidth': [0.05, 0.1, 0.5, 1., 2., 5.]}
if FINEGRAINED:
lr_hyper = {'classifier__C': np.logspace(-4,4,9), 'classifier__class_weight': ['balanced', None]}
acc_hyper = lr_hyper
emq_hyper = {'calib': ['nbvs', 'bcts', 'ts', 'vs'], **lr_hyper}
hdy_hyper = {'nbins': [3,4,5,8,16,32], **lr_hyper}
kdey_hyper = {'bandwidth': np.logspace(-3, -1, 10), **lr_hyper}
kdey_hyper_clr = {'bandwidth': np.logspace(-2, 2, 10), **lr_hyper}
else:
acc_hyper = {}
emq_hyper = {'calib': ['nbvs', 'bcts', 'ts', 'vs']}
hdy_hyper = {'nbins': [3,4,5,8,16,32]}
kdey_hyper = {'bandwidth': [0.001, 0.005, 0.01, 0.05, 0.1, 0.2]}
kdey_hyper_clr = {'bandwidth': [0.05, 0.1, 0.5, 1., 2., 5.]}
multiclass_method = 'multiclass'
only_binary = 'only_binary'
only_multiclass = 'only_multiclass'
yield 'BootstrapACC', ACC(LR()), acc_hyper, lambda hyper: AggregativeBootstrap(ACC(LR()), n_test_samples=1000, random_state=0), multiclass_method
yield 'BayesianACC', ACC(LR()), acc_hyper, lambda hyper: BayesianCC(LR(), mcmc_seed=0), multiclass_method
# Bootstrap approaches:
# --------------------------------------------------------------------------------------------------------
#yield 'BootstrapACC', ACC(LR()), acc_hyper, lambda hyper: AggregativeBootstrap(ACC(LR()), n_test_samples=1000, random_state=0), multiclass_method
#yield 'BootstrapEMQ', EMQ(LR(), on_calib_error='backup', val_split=5), emq_hyper, lambda hyper: AggregativeBootstrap(EMQ(LR(), on_calib_error='backup', calib=hyper['calib'], val_split=5), n_test_samples=1000, random_state=0), multiclass_method
#yield 'BootstrapHDy', DMy(LR()), hdy_hyper, lambda hyper: AggregativeBootstrap(DMy(LR(), **hyper), n_test_samples=1000, random_state=0), multiclass_method
#yield 'BootstrapKDEy', KDEyML(LR()), kdey_hyper, lambda hyper: AggregativeBootstrap(KDEyML(LR(), **hyper), n_test_samples=1000, random_state=0, verbose=True), multiclass_method
yield 'BootstrapEMQ', EMQ(LR(), on_calib_error='backup', val_split=5), emq_hyper, lambda hyper: AggregativeBootstrap(EMQ(LR(), on_calib_error='backup', calib=hyper['calib'], val_split=5), n_test_samples=1000, random_state=0), multiclass_method
yield 'BootstrapHDy', DMy(LR()), hdy_hyper, lambda hyper: AggregativeBootstrap(DMy(LR(), **hyper), n_test_samples=1000, random_state=0), multiclass_method
# Bayesian approaches:
# --------------------------------------------------------------------------------------------------------
# yield 'BayesianACC', ACC(LR()), acc_hyper, lambda hyper: BayesianCC(LR(), mcmc_seed=0), multiclass_method
# yield 'BayesianHDy', DMy(LR()), hdy_hyper, lambda hyper: PQ(LR(), stan_seed=0, **hyper), only_binary
#
yield 'BootstrapKDEy', KDEyML(LR()), kdey_hyper, lambda hyper: AggregativeBootstrap(KDEyML(LR(), **hyper), n_test_samples=1000, random_state=0, verbose=True), multiclass_method
# yield 'BayesianKDEy', KDEyML(LR()), kdey_hyper, lambda hyper: BayesianKDEy(mcmc_seed=0, **hyper), multiclass_method
# yield 'BayesianKDEy*', KDEyCLR(LR()), kdey_hyper_clr, lambda hyper: BayesianKDEy(kernel='aitchison', mcmc_seed=0, **hyper), multiclass_method
# yield 'BayKDEy*CLR', KDEyCLR(LR()), kdey_hyper_clr, lambda hyper: BayesianKDEy(kernel='aitchison', mcmc_seed=0, explore='clr', step_size=.15, **hyper), multiclass_method
# yield 'BayKDEy*CLR2', KDEyCLR(LR()), kdey_hyper_clr, lambda hyper: BayesianKDEy(kernel='aitchison', mcmc_seed=0, explore='clr', step_size=.05, **hyper), multiclass_method
# yield 'BayKDEy*ILR', KDEyCLR(LR()), kdey_hyper_clr, lambda hyper: BayesianKDEy(kernel='aitchison', mcmc_seed=0, explore='ilr', step_size=.15, **hyper), only_multiclass
# yield 'BayKDEy*ILR2', KDEyILR(LR()), kdey_hyper_clr, lambda hyper: BayesianKDEy(kernel='ilr', mcmc_seed=0, explore='ilr', step_size=.1, **hyper), only_multiclass
# yield f'BaKDE-emcee', KDEyML(LR()), kdey_hyper, lambda hyper: BayesianKDEy(mcmc_seed=0, num_warmup=100, num_samples=100, step_size=.1, engine='emcee', **hyper), multiclass_method
# yield f'BaKDE-numpyro', KDEyML(LR()), kdey_hyper, lambda hyper: BayesianKDEy( mcmc_seed=0, engine='numpyro', **hyper), multiclass_method
#yield f'BaKDE-numpyro', KDEyML(LR()), kdey_hyper, lambda hyper: BayesianKDEy(mcmc_seed=0, engine='numpyro', **hyper), multiclass_method
# yield f'BaKDE-numpyro-T2', KDEyML(LR()), kdey_hyper, lambda hyper: BayesianKDEy(mcmc_seed=0, engine='numpyro', temperature=2., **hyper), multiclass_method
# yield f'BaKDE-numpyro-T*', KDEyML(LR()), kdey_hyper, lambda hyper: BayesianKDEy(mcmc_seed=0, engine='numpyro', temperature=None, **hyper), multiclass_method
# yield f'BaKDE-Ait-numpyro', KDEyCLR(LR()), kdey_hyper_clr, lambda hyper: BayesianKDEy(kernel='aitchison', mcmc_seed=0, engine='numpyro', **hyper), multiclass_method
# yield f'BaKDE-Ait-numpyro-T*', KDEyCLR(LR()), kdey_hyper_clr, lambda hyper: BayesianKDEy(kernel='aitchison', mcmc_seed=0, engine='numpyro', temperature=None, **hyper), multiclass_method
yield f'BaKDE-Ait-numpyro-T*-U', KDEyCLR(LR()), kdey_hyper_clr, lambda hyper: BayesianKDEy(kernel='aitchison', mcmc_seed=0, engine='numpyro', temperature=None, prior='uniform', **hyper), multiclass_method
# yield f'BaKDE-Ait-numpyro-T*ILR', KDEyCLR(LR()), kdey_hyper_clr, lambda hyper: BayesianKDEy(kernel='aitchison', mcmc_seed=0, engine='numpyro', temperature=None, region='ellipse-ilr', **hyper), multiclass_method
# yield f'BaKDE-numpyro-T10', KDEyML(LR()), kdey_hyper, lambda hyper: BayesianKDEy(mcmc_seed=0, engine='numpyro', temperature=10., **hyper), multiclass_method
# yield f'BaKDE-numpyro*CLR', KDEyCLR(LR()), kdey_hyper_clr, lambda hyper: BayesianKDEy(kernel='aitchison', mcmc_seed=0, engine='numpyro', **hyper), multiclass_method
# yield f'BaKDE-numpyro*ILR', KDEyILR(LR()), kdey_hyper_clr, lambda hyper: BayesianKDEy(kernel='ilr', mcmc_seed=0, engine='numpyro', **hyper), multiclass_method
# yield 'BayEMQ', CC(LR()), acc_hyper, lambda hyper: BayesianMAPLS(LR(), prior='uniform', exact_train_prev=True), multiclass_method
# yield 'BayEMQ*', CC(LR()), acc_hyper, lambda hyper: BayesianMAPLS(LR(), prior='map', exact_train_prev=True), multiclass_method
# yield 'BayEMQ*2', CC(LR()), acc_hyper, lambda hyper: BayesianMAPLS(LR(), prior='map2', exact_train_prev=True), multiclass_method
# yield 'BayEMQ*2T*', CC(LR()), acc_hyper, lambda hyper: BayesianMAPLS(LR(), prior='map2', temperature=None, exact_train_prev=True), multiclass_method
# yield 'BayEMQ*2T01', CC(LR()), acc_hyper, lambda hyper: BayesianMAPLS(LR(), prior='map2', temperature=0.1, exact_train_prev=True), multiclass_method
# yield 'BayEMQ*2T10000', CC(LR()), acc_hyper, lambda hyper: BayesianMAPLS(LR(), prior='map2', temperature=10000, exact_train_prev=True), multiclass_method
# yield 'BayEMQ*2T100000', CC(LR()), acc_hyper, lambda hyper: BayesianMAPLS(LR(), prior='map2', temperature=100000,
# exact_train_prev=True), multiclass_method
# yield 'BayEMQ-U-Temp1-2', CC(LR()), acc_hyper, lambda hyper: BayesianMAPLS(LR(), prior='uniform', temperature=1, exact_train_prev=True), multiclass_method
yield 'BayEMQ-U-Temp*', CC(LR()), acc_hyper, lambda hyper: BayesianMAPLS(LR(), prior='uniform', temperature=None, exact_train_prev=True), multiclass_method
# yield 'BayEMQ*Temp1', CC(LR()), acc_hyper, lambda hyper: BayesianMAPLS(LR(), prior='map', temperature=1, exact_train_prev=True), multiclass_method
# yield 'BayEMQ*Temp10', CC(LR()), acc_hyper, lambda hyper: BayesianMAPLS(LR(), prior='map', temperature=10, exact_train_prev=True), multiclass_method
# yield 'BayEMQ*Temp100', CC(LR()), acc_hyper, lambda hyper: BayesianMAPLS(LR(), prior='map', temperature=100, exact_train_prev=True), multiclass_method
# yield 'BayEMQ*Temp1000', CC(LR()), acc_hyper, lambda hyper: BayesianMAPLS(LR(), prior='map', temperature=1000, exact_train_prev=True), multiclass_method
#yield f'BaKDE-Ait-numpyro', KDEyCLR(LR()), kdey_hyper_clr, lambda hyper: BayesianKDEy(LR(), kernel='aitchison', mcmc_seed=0, engine='numpyro', **hyper), multiclass_method
# yield f'BaKDE-Gau-numpyro', KDEyML(LR()), kdey_hyper, lambda hyper: BayesianKDEy(LR(), kernel='gaussian', mcmc_seed=0, engine='numpyro', **hyper), multiclass_method
# yield f'BaKDE-Ait-T*', KDEyCLR(LR()), kdey_hyper_clr, lambda hyper: BayesianKDEy(LR(),kernel='aitchison', mcmc_seed=0, engine='numpyro', temperature=None, **hyper), multiclass_method
# yield f'BaKDE-Gau-T*', KDEyML(LR()), kdey_hyper, lambda hyper: BayesianKDEy(LR(), kernel='gaussian', mcmc_seed=0, engine='numpyro', temperature=None, **hyper), multiclass_method
yield 'BayEMQ-U-Temp1-2', CC(LR()), acc_hyper, lambda hyper: BayesianMAPLS(LR(), prior='uniform', temperature=1, exact_train_prev=True), multiclass_method
yield 'BayEMQ-T*', CC(LR()), acc_hyper, lambda hyper: BayesianMAPLS(LR(), prior='uniform', temperature=None, exact_train_prev=True), multiclass_method
def model_selection(train: LabelledCollection, point_quantifier: AggregativeQuantifier, grid: dict):
def model_selection(dataset: DatasetHandler, point_quantifier: AggregativeQuantifier, grid: dict):
with qp.util.temp_seed(0):
print(f'performing model selection for {point_quantifier.__class__.__name__} with grid {grid}')
# model selection
if len(grid)>0:
train, val = train.split_stratified(train_prop=0.6, random_state=0)
train, val_prot = dataset.get_train_valprot_for_modsel()
mod_sel = GridSearchQ(
model=point_quantifier,
param_grid=grid,
protocol=qp.protocol.UPP(val, repeats=250, random_state=0),
protocol=val_prot,
refit=False,
n_jobs=-1,
verbose=True
@ -103,46 +94,51 @@ def model_selection(train: LabelledCollection, point_quantifier: AggregativeQuan
return best_params
def experiment(dataset: Dataset, point_quantifier: AggregativeQuantifier, method_name:str, grid: dict, withconf_constructor, hyper_choice_path: Path):
with qp.util.temp_seed(0):
def temperature_calibration(dataset: DatasetHandler, uncertainty_quantifier):
if hasattr(uncertainty_quantifier, 'temperature') and uncertainty_quantifier.temperature is None:
print('calibrating temperature')
train, val_prot = dataset.get_train_valprot_for_modsel()
temperature = temp_calibration(uncertainty_quantifier, train, val_prot, n_jobs=-1)
uncertainty_quantifier.temperature = temperature
training, test = dataset.train_test
def experiment(dataset: DatasetHandler, point_quantifier: AggregativeQuantifier, method_name:str, grid: dict, uncertainty_quant_constructor, hyper_choice_path: Path):
with qp.util.temp_seed(0):
# model selection
best_hyperparams = qp.util.pickled_resource(
hyper_choice_path, model_selection, training, cp(point_quantifier), grid
hyper_choice_path, model_selection, dataset, cp(point_quantifier), grid
)
t_init = time()
withconf_quantifier = withconf_constructor(best_hyperparams)
if hasattr(withconf_quantifier, 'temperature') and withconf_quantifier.temperature is None:
print('calibrating temperature')
train, val = data.training.split_stratified(train_prop=0.6, random_state=0)
temperature = temp_calibration(withconf_quantifier, train, val, temp_grid=[.5, 1., 1.5, 2., 5., 10., 100.], n_jobs=-1)
withconf_quantifier.temperature = temperature
withconf_quantifier.fit(*training.Xy)
uncertainty_quantifier = uncertainty_quant_constructor(best_hyperparams)
temperature_calibration(dataset, uncertainty_quantifier)
training, test_generator = dataset.get_train_testprot_for_eval()
uncertainty_quantifier.fit(*training.Xy)
tr_time = time() - t_init
# test
train_prevalence = training.prevalence()
results = defaultdict(list)
test_generator = UPP(test, repeats=100, random_state=0)
pbar = tqdm(enumerate(test_generator()), total=test_generator.total())
for i, (sample_X, true_prevalence) in pbar:
t_init = time()
point_estimate, region = withconf_quantifier.predict_conf(sample_X)
point_estimate, region = uncertainty_quantifier.predict_conf(sample_X)
ttime = time()-t_init
results['true-prevs'].append(true_prevalence)
results['point-estim'].append(point_estimate)
results['shift'].append(qp.error.ae(true_prevalence, train_prevalence))
results['ae'].append(qp.error.ae(prevs_true=true_prevalence, prevs_hat=point_estimate))
results['rae'].append(qp.error.rae(prevs_true=true_prevalence, prevs_hat=point_estimate))
results['sre'].append(qp.error.sre(prevs_true=true_prevalence, prevs_hat=point_estimate, prevs_train=train_prevalence))
results['coverage'].append(region.coverage(true_prevalence))
results['amplitude'].append(region.montecarlo_proportion(n_trials=50_000))
results['test-time'].append(ttime)
results['samples'].append(region.samples)
pbar.set_description(f'{method_name} MAE={np.mean(results["ae"]):.5f} Cov={np.mean(results["coverage"]):.5f} AMP={np.mean(results["amplitude"]):.5f}')
pbar.set_description(f'{method_name} MAE={np.mean(results["ae"]):.5f} W={np.mean(results["sre"]):.5f} Cov={np.mean(results["coverage"]):.5f} AMP={np.mean(results["amplitude"]):.5f}')
report = {
'optim_hyper': best_hyperparams,
@ -154,34 +150,40 @@ def experiment(dataset: Dataset, point_quantifier: AggregativeQuantifier, method
return report
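For interpreting the metrics logged above: 'coverage' records whether the true prevalence falls inside the confidence region, and 'amplitude' estimates the share of the simplex the region occupies. A hedged sketch of such a Monte Carlo estimator (the actual ConfidenceRegionABC.montecarlo_proportion may sample differently):

import numpy as np

def montecarlo_proportion(region, n_classes, n_trials=50_000, seed=0):
    # estimate the region's share of the simplex by drawing prevalence
    # vectors uniformly (flat Dirichlet) and testing membership, which
    # mirrors the 'true_prev in region' checks used elsewhere in the code
    rng = np.random.default_rng(seed)
    points = rng.dirichlet(np.ones(n_classes), size=n_trials)
    return float(np.mean([p in region for p in points]))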
def check_skip_experiment(method_scope, dataset: DatasetHandler):
if method_scope == 'only_binary' and not dataset.is_binary():
return True
if method_scope == 'only_multiclass' and dataset.is_binary():
return True
return False
if __name__ == '__main__':
result_dir = Path('./results')
result_dir = RESULT_DIR
for data_handler in [LeQuaHandler]:#, UCIMulticlassHandler]:
for dataset in data_handler.iter():
qp.environ['SAMPLE_SIZE'] = dataset.sample_size()
print(f'dataset={dataset}')
problem_type = 'binary' if dataset.is_binary() else 'multiclass'
for setup in [multiclass]: # [binary, multiclass]:
qp.environ['SAMPLE_SIZE'] = setup['sample_size']
for data_name in setup['datasets']:
print(f'dataset={data_name}')
# if data_name=='breast-cancer' or data_name.startswith("cmc") or data_name.startswith("ctg"):
# print(f'skipping dataset: {data_name}')
# continue
data = setup['fetch_fn'](data_name)
is_binary = data.n_classes==2
result_subdir = result_dir / ('binary' if is_binary else 'multiclass')
hyper_subdir = result_dir / 'hyperparams' / ('binary' if is_binary else 'multiclass')
for method_name, surrogate_quant, hyper_params, withconf_constructor, method_scope in methods():
if method_scope == 'only_binary' and not is_binary:
if check_skip_experiment(method_scope, dataset):
continue
if method_scope == 'only_multiclass' and is_binary:
continue
result_path = experiment_path(result_subdir, data_name, method_name)
hyper_path = experiment_path(hyper_subdir, data_name, surrogate_quant.__class__.__name__)
result_path = experiment_path(result_dir / problem_type, dataset.name(), method_name)
hyper_path = experiment_path(result_dir / 'hyperparams' / problem_type, dataset.name(), surrogate_quant.__class__.__name__)
report = qp.util.pickled_resource(
result_path, experiment, data, surrogate_quant, method_name, hyper_params, withconf_constructor, hyper_path
result_path, experiment, dataset, surrogate_quant, method_name, hyper_params, withconf_constructor, hyper_path
)
print(f'dataset={data_name}, '
print(f'dataset={dataset}, '
f'method={method_name}: '
f'mae={report["results"]["ae"].mean():.5f}, '
f'W={report["results"]["sre"].mean():.5f}, '
f'coverage={report["results"]["coverage"].mean():.5f}, '
f'amplitude={report["results"]["amplitude"].mean():.5f}, ')

View File

@ -7,10 +7,11 @@ import pandas as pd
from glob import glob
from pathlib import Path
import quapy as qp
from BayesianKDEy.commons import fetch_UCI_binary, fetch_UCI_multiclass
from BayesianKDEy.commons import RESULT_DIR, UCIMulticlassHandler
from error import dist_aitchison
from quapy.method.confidence import ConfidenceIntervals
from quapy.method.confidence import ConfidenceEllipseSimplex, ConfidenceEllipseCLR, ConfidenceEllipseILR, ConfidenceIntervals, ConfidenceRegionABC
import quapy.functional as F
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 2000)
@ -20,6 +21,31 @@ pd.set_option("display.precision", 4)
pd.set_option("display.float_format", "{:.4f}".format)
# methods = None # show all methods
methods = ['BayesianACC',
#'BayesianKDEy',
#'BaKDE-emcee',
# 'BaKDE-numpyro',
# 'BaKDE-numpyro-T2',
# 'BaKDE-numpyro-T10',
# 'BaKDE-numpyro-T*',
'BaKDE-Ait-numpyro',
'BaKDE-Ait-T*',
'BaKDE-Gau-numpyro',
'BaKDE-Gau-T*',
'BayEMQ-U-Temp1-2',
'BayEMQ-T*',
#'BayEMQ-NoInit',
#'BayEMQ-U-Temp*',
# 'BayEMQ*2Temp1',
# 'BayEMQ*2Temp*'
# 'BootstrapACC',
# 'BootstrapHDy',
# 'BootstrapKDEy',
# 'BootstrapEMQ'
]
def region_score(true_prev, region: ConfidenceRegionABC):
amp = region.montecarlo_proportion(50_000)
if true_prev in region:
@ -80,26 +106,6 @@ def update_pickle_with_region(report, file, conf_name, conf_region_class, **kwar
update_pickle(report, file, update_fields)
# methods = None # show all methods
methods = ['BayesianACC', #'BayesianKDEy',
#'BaKDE-emcee',
# 'BaKDE-numpyro',
# 'BaKDE-numpyro-T2',
# 'BaKDE-numpyro-T10',
# 'BaKDE-numpyro-T*',
# 'BaKDE-Ait-numpyro',
# 'BaKDE-Ait-numpyro-T*',
'BaKDE-Ait-numpyro-T*-U',
'BayEMQ-U-Temp1-2',
'BayEMQ-U-Temp*',
# 'BayEMQ*2Temp1',
# 'BayEMQ*2Temp*'
# 'BootstrapACC',
# 'BootstrapHDy',
# 'BootstrapKDEy',
# 'BootstrapEMQ'
]
def nicer(name:str):
replacements = {
@ -112,14 +118,19 @@ def nicer(name:str):
name = name.replace(k,v)
return name
for setup in ['multiclass']:
path = f'./results/{setup}/*.pkl'
base_dir = RESULT_DIR
for dataset_handler in [UCIMulticlassHandler]:
problem_type = 'binary' if dataset_handler.is_binary() else 'multiclass'
path = f'./{base_dir}/{problem_type}/*.pkl'
table = defaultdict(list)
for file in tqdm(glob(path), desc='processing results'):
file = Path(file)
dataset, method = file.name.replace('.pkl', '').split('__')
if methods is not None and method not in methods:
if method not in methods:
continue
report = pickle.load(open(file, 'rb'))
results = report['results']
n_samples = len(results['ae'])
@ -166,53 +177,42 @@ for setup in ['multiclass']:
n_classes = {}
tr_size = {}
for dataset in df['dataset'].unique():
fetch_fn = {
'binary': fetch_UCI_binary,
'multiclass': fetch_UCI_multiclass
}[setup]
data = fetch_fn(dataset)
n_classes[dataset] = data.n_classes
tr_size[dataset] = len(data.training)
tr_prev = {}
for dataset in dataset_handler.iter():
train = dataset.get_training()
n_classes[dataset.name()] = train.n_classes  # key by name: the df 'dataset' column holds name strings
tr_size[dataset.name()] = len(train)
tr_prev[dataset.name()] = F.strprev(train.prevalence())
# remove datasets with more than max_classes classes
max_classes = 25
min_train = 500
ignore_datasets = ['poker_hand', 'hcv']
for data_name, n in n_classes.items():
if n > max_classes:
df = df[df["dataset"] != data_name]
for data_name, n in tr_size.items():
if n < min_train:
df = df[df["dataset"] != data_name]
for data_name, n in tr_size.items():
if data_name in ignore_datasets:
df = df[df["dataset"] != data_name]
# max_classes = 25
# min_train = 500
# ignore_datasets = ['poker_hand', 'hcv']
# for data_name, n in n_classes.items():
# if n > max_classes:
# df = df[df["dataset"] != data_name]
# for data_name, n in tr_size.items():
# if n < min_train:
# df = df[df["dataset"] != data_name]
# for data_name, n in tr_size.items():
# if data_name in ignore_datasets:
# df = df[df["dataset"] != data_name]
for region in ['CI']: #, 'CLR', 'ILR', 'CI']:
if setup == 'binary' and region=='ILR':
if problem_type == 'binary' and region=='ILR':
continue
# pv = pd.pivot_table(
# df, index='dataset', columns='method', values=['ae', f'c-{region}', f'a-{region}'], margins=True
# )
pv = pd.pivot_table(
df, index='dataset', columns='method', values=[
# f'amperr-{region}',
# f'a-{region}',
f'c-{region}',
# f'w-{region}',
'ae',
'SRE',
# 'rae',
# f'aitch',
# f'aitch-well'
# 'reg-score-ILR',
], margins=True
)
pv['n_classes'] = pv.index.map(n_classes).astype('Int64')
pv['tr_size'] = pv.index.map(tr_size).astype('Int64')
pv = pv.drop(columns=[col for col in pv.columns if col[-1] == "All"])
print(f'{setup=}')
print(pv)
print('-'*80)
for column in [f'a-{region}', f'c-{region}', 'ae', 'SRE']:
pv = pd.pivot_table(
df, index='dataset', columns='method', values=column, margins=True
)
pv['n_classes'] = pv.index.map(n_classes).astype('Int64')
pv['tr_size'] = pv.index.map(tr_size).astype('Int64')
#pv['tr-prev'] = pv.index.map(tr_prev)
pv = pv.drop(columns=[col for col in pv.columns if col[-1] == "All"])
print(f'{problem_type=} {column=}')
print(pv)
print('-'*80)

View File

@ -1,6 +1,6 @@
from quapy.data import LabelledCollection
from quapy.method.confidence import WithConfidenceABC
from quapy.protocol import UPP
from quapy.protocol import AbstractProtocol
import numpy as np
from tqdm import tqdm
import quapy as qp
@ -8,16 +8,12 @@ from joblib import Parallel, delayed
import copy
def temp_calibration(method:WithConfidenceABC,
train:LabelledCollection,
val:LabelledCollection,
val_prot:AbstractProtocol,
temp_grid=[.5, 1., 1.5, 2., 5., 10., 100.],
num_samples=100,
nominal_coverage=0.95,
amplitude_threshold='auto',
random_state=0,
n_jobs=1,
verbose=True):
@ -31,41 +27,7 @@ def temp_calibration(method:WithConfidenceABC,
if isinstance(amplitude_threshold, float) and amplitude_threshold > 0.1:
print(f'warning: the {amplitude_threshold=} is too large; this may lead to uninformative regions')
method.fit(*train.Xy)
label_shift_prot = UPP(val, repeats=num_samples, random_state=random_state)
# results = []
# temp_grid = sorted(temp_grid)
# for temp in temp_grid:
# method.temperature = temp
# coverage = 0
# amplitudes = []
# errs = []
# pbar = tqdm(enumerate(label_shift_prot()), total=label_shift_prot.total(), disable=not verbose)
# for i, (sample, prev) in pbar:
# point_estim, conf_region = method.predict_conf(sample)
# if prev in conf_region:
# coverage += 1
# amplitudes.append(conf_region.montecarlo_proportion(n_trials=50_000))
# errs.append(qp.error.mae(prev, point_estim))
# if verbose:
# pbar.set_description(
# f'temperature={temp:.2f}, '
# f'coverage={coverage/(i+1):.4f}, '
# f'amplitude={np.mean(amplitudes):.4f},'
# f'mae={np.mean(errs):.4f}'
# )
#
# mean_coverage = coverage / label_shift_prot.total()
# mean_amplitude = np.mean(amplitudes)
#
# if mean_amplitude < amplitude_threshold:
# results.append((temp, mean_coverage, mean_amplitude))
# else:
# break
def evaluate_temperature(temp):
def evaluate_temperature_job(temp):
local_method = copy.deepcopy(method)
local_method.temperature = temp
@ -73,7 +35,7 @@ def temp_calibration(method:WithConfidenceABC,
amplitudes = []
errs = []
for i, (sample, prev) in enumerate(label_shift_prot()):
for i, (sample, prev) in enumerate(val_prot()):
point_estim, conf_region = local_method.predict_conf(sample)
if prev in conf_region:
@ -82,15 +44,19 @@ def temp_calibration(method:WithConfidenceABC,
amplitudes.append(conf_region.montecarlo_proportion(n_trials=50_000))
errs.append(qp.error.mae(prev, point_estim))
mean_coverage = coverage / label_shift_prot.total()
mean_coverage = coverage / val_prot.total()
mean_amplitude = np.mean(amplitudes)
if verbose:
print(f'Temperature={temp} got coverage={mean_coverage*100:.2f}% amplitude={mean_amplitude*100:.2f}%')
return temp, mean_coverage, mean_amplitude
temp_grid = sorted(temp_grid)
method.fit(*train.Xy)
raw_results = Parallel(n_jobs=n_jobs, backend="loky")(
delayed(evaluate_temperature)(temp)
delayed(evaluate_temperature_job)(temp)
for temp in tqdm(temp_grid, disable=not verbose)
)
results = [