calibrating temperature and coming back to Aitchison kernel

2026-01-09 17:20:05 +01:00 · 2026-01-09 17:20:05 +01:00 · ae9503a43b
parent e33b291357
commit ae9503a43b
10 changed files with 376 additions and 46 deletions
--- a/BayesianKDEy/TODO.txt
+++ b/BayesianKDEy/TODO.txt
@ -1,7 +1,23 @@
- Add other methods that natively provide uncertainty quantification methods?
+- Add other methods that natively provide uncertainty quantification methods? (e.g., Ratio estimator, Card & Smith)
 - MPIW (Mean Prediction Interval Width): is the average of the amplitudes (w/o aggregating coverage whatsoever)
 - Implement Interval Score or Winkler Score
 - analyze across shift
- add Bayesian EM
+- add Bayesian EM:
- optimize also C and class_weight?
+    - https://github.com/ChangkunYe/MAPLS/blob/main/label_shift/mapls.py
    - take this opportunity to add RLLS: https://github.com/ChangkunYe/MAPLS/blob/main/label_shift/rlls.py
 - add CIFAR10 and MNIST? Maybe consider also previously tested types of shift (tweak-one-out, etc.)? from RLLS paper
    - https://github.com/Angie-Liu/labelshift/tree/master
    - https://github.com/Angie-Liu/labelshift/blob/master/cifar10_for_labelshift.py
    - Note: MNIST is downloadable from https://archive.ics.uci.edu/dataset/683/mnist+database+of+handwritten+digits
 - consider prior knowledge in experiments:
    - One scenario in which our prior is uninformative (i.e., uniform)
    - One scenario in which our prior is wrong (e.g., alpha-prior = (3,2,1), protocol-prior=(1,1,5))
    - One scenario in which our prior is very good (e.g., alpha-prior = (3,2,1), protocol-prior=(3,2,1))
    - Do all my baseline methods come with the option to inform a prior?
 - consider different bandwidths within the bayesian approach?
 - how to improve the coverage (or how to increase the amplitude)?
    - Added temperature-calibration, improve things.
    - Is temperature-calibration actually not equivalent to using a larger bandwidth in the kernels?
 - consider W as a measure of quantification error (the current e.g., w-CI is the winkler...)
 - optimize also C and class_weight? [I don't think so, but could be done easily now]
--- a/BayesianKDEy/_bayeisan_kdey.py
+++ b/BayesianKDEy/_bayeisan_kdey.py
@ -66,7 +66,7 @@ class BayesianKDEy(AggregativeSoftQuantifier, KDEBase, WithConfidenceABC):
            raise ValueError(f'parameter {num_samples=} must be a positive integer')
        assert explore in ['simplex', 'clr', 'ilr'], \
            f'unexpected value for param {explore=}; valid ones are "simplex", "clr", and "ilr"'
-        assert temperature>0., f'temperature must be >0'
+        # assert temperature>0., f'temperature must be >0'
        assert engine in ['rw-mh', 'emcee', 'numpyro']
        super().__init__(classifier, fit_classifier, val_split)
--- a/BayesianKDEy/full_experiments.py
+++ b/BayesianKDEy/full_experiments.py
@ -9,6 +9,7 @@ from sklearn.model_selection import GridSearchCV, StratifiedKFold
 from copy import deepcopy as cp
 import quapy as qp
 from BayesianKDEy._bayeisan_kdey import BayesianKDEy
 from BayesianKDEy.temperature_calibration import temp_calibration
 from build.lib.quapy.data import LabelledCollection
 from quapy.method.aggregative import DistributionMatchingY as DMy, AggregativeQuantifier, EMQ
 from quapy.method.base import BinaryQuantifier, BaseQuantifier
@ -72,14 +73,22 @@ def methods():
    # yield 'BayesianHDy', DMy(LR()), hdy_hyper, lambda hyper: PQ(LR(), stan_seed=0, **hyper), only_binary
    #
    # yield 'BootstrapKDEy', KDEyML(LR()), kdey_hyper, lambda hyper: AggregativeBootstrap(KDEyML(LR(), **hyper), n_test_samples=1000, random_state=0, verbose=True), multiclass_method
-    # yield 'BayesianKDEy', KDEyML(LR()), kdey_hyper, lambda hyper: BayesianKDEy(mcmc_seed=0, **hyper), multiclass_method
+    yield 'BayesianKDEy', KDEyML(LR()), kdey_hyper, lambda hyper: BayesianKDEy(mcmc_seed=0, **hyper), multiclass_method
    # yield 'BayesianKDEy*', KDEyCLR(LR()), kdey_hyper_clr, lambda hyper: BayesianKDEy(kernel='aitchison', mcmc_seed=0, **hyper), multiclass_method
    # yield 'BayKDEy*CLR', KDEyCLR(LR()), kdey_hyper_clr, lambda hyper: BayesianKDEy(kernel='aitchison', mcmc_seed=0, explore='clr', step_size=.15, **hyper), multiclass_method
    # yield 'BayKDEy*CLR2', KDEyCLR(LR()), kdey_hyper_clr, lambda hyper: BayesianKDEy(kernel='aitchison', mcmc_seed=0, explore='clr', step_size=.05, **hyper), multiclass_method
    # yield 'BayKDEy*ILR', KDEyCLR(LR()), kdey_hyper_clr, lambda hyper: BayesianKDEy(kernel='aitchison', mcmc_seed=0, explore='ilr', step_size=.15, **hyper), only_multiclass
    # yield 'BayKDEy*ILR2', KDEyILR(LR()), kdey_hyper_clr, lambda hyper: BayesianKDEy(kernel='ilr', mcmc_seed=0, explore='ilr', step_size=.1, **hyper), only_multiclass
    yield f'BaKDE-emcee', KDEyML(LR()), kdey_hyper, lambda hyper: BayesianKDEy(mcmc_seed=0, num_warmup=100, num_samples=100, step_size=.1, engine='emcee', **hyper), multiclass_method
-    yield f'BaKDE-numpyro', KDEyML(LR()), kdey_hyper, lambda hyper: BayesianKDEy(mcmc_seed=0, step_size=.1, engine='numpyro', **hyper), multiclass_method
+    yield f'BaKDE-numpyro', KDEyML(LR()), kdey_hyper, lambda hyper: BayesianKDEy( mcmc_seed=0, engine='numpyro', **hyper), multiclass_method
    yield f'BaKDE-numpyro-T2', KDEyML(LR()), kdey_hyper, lambda hyper: BayesianKDEy(mcmc_seed=0, engine='numpyro', temperature=2., **hyper), multiclass_method
    yield f'BaKDE-numpyro-T*', KDEyML(LR()), kdey_hyper, lambda hyper: BayesianKDEy(mcmc_seed=0, engine='numpyro', temperature=None, **hyper), multiclass_method
    yield f'BaKDE-Ait-numpyro', KDEyCLR(LR()), kdey_hyper_clr, lambda hyper: BayesianKDEy(kernel='aitchison', mcmc_seed=0, engine='numpyro',  **hyper), multiclass_method
    yield f'BaKDE-Ait-numpyro-T*', KDEyCLR(LR()), kdey_hyper_clr, lambda hyper: BayesianKDEy(kernel='aitchison', mcmc_seed=0, engine='numpyro', temperature=None, **hyper), multiclass_method
    yield f'BaKDE-Ait-numpyro-T*ILR', KDEyCLR(LR()), kdey_hyper_clr, lambda hyper: BayesianKDEy(kernel='aitchison', mcmc_seed=0, engine='numpyro', temperature=None, region='ellipse-ilr', **hyper), multiclass_method
    yield f'BaKDE-numpyro-T10', KDEyML(LR()), kdey_hyper, lambda hyper: BayesianKDEy(mcmc_seed=0, engine='numpyro', temperature=10., **hyper), multiclass_method
    yield f'BaKDE-numpyro*CLR', KDEyCLR(LR()), kdey_hyper_clr, lambda hyper: BayesianKDEy(kernel='aitchison', mcmc_seed=0, engine='numpyro', **hyper), multiclass_method
    yield f'BaKDE-numpyro*ILR', KDEyILR(LR()), kdey_hyper_clr, lambda hyper: BayesianKDEy(kernel='ilr', mcmc_seed=0, engine='numpyro', **hyper), multiclass_method
 def model_selection(train: LabelledCollection, point_quantifier: AggregativeQuantifier, grid: dict):
@ -114,7 +123,12 @@ def experiment(dataset: Dataset, point_quantifier: AggregativeQuantifier, method
        )
        t_init = time()
-        withconf_quantifier = withconf_constructor(best_hyperparams).fit(*training.Xy)
+        withconf_quantifier = withconf_constructor(best_hyperparams)
        if hasattr(withconf_quantifier, 'temperature') and withconf_quantifier.temperature is None:
            train, val = data.training.split_stratified(train_prop=0.6, random_state=0)
            temperature = temp_calibration(withconf_quantifier, train, val, temp_grid=[.5, 1., 1.5, 2., 5., 10., 100.], n_jobs=-1)
            withconf_quantifier.temperature = temperature
        withconf_quantifier.fit(*training.Xy)
        tr_time = time() - t_init
        # test
@ -155,13 +169,13 @@ if __name__ == '__main__':
    binary = {
        'datasets': qp.datasets.UCI_BINARY_DATASETS,
        'fetch_fn': qp.datasets.fetch_UCIBinaryDataset,
-        'sample_size': 100  # previous: 500
+        'sample_size': 500  # previous: small 100, big 500
    }
    multiclass = {
        'datasets': qp.datasets.UCI_MULTICLASS_DATASETS,
        'fetch_fn': qp.datasets.fetch_UCIMulticlassDataset,
-        'sample_size': 200 # previous: 1000
+        'sample_size': 1000 # previous: small 200, big 1000
    }
    result_dir = Path('./results')
--- a/BayesianKDEy/generate_results.py
+++ b/BayesianKDEy/generate_results.py
@ -80,7 +80,30 @@ def update_pickle_with_region(report, file, conf_name, conf_region_class, **kwar
 # methods = None  # show all methods
-methods = ['BayesianACC', 'BayesianKDEy', 'BaKDE-emcee', 'BaKDE-numpyro']
+methods = ['BayesianACC', #'BayesianKDEy',
           #'BaKDE-emcee',
           # 'BaKDE-numpyro',
           # 'BaKDE-numpyro-T2',
           # 'BaKDE-numpyro-T10',
           # 'BaKDE-numpyro-T*',
           # 'BaKDE-Ait-numpyro',
           'BaKDE-Ait-numpyro-T*',
           # 'BaKDE-Ait-numpyro-T*ILR',
           'BootstrapACC',
           'BootstrapHDy',
           'BootstrapKDEy'
           ]
 def nicer(name:str):
    replacements = {
        'Bayesian': 'Ba',
        'Bootstrap': 'Bo',
        'numpyro': 'ro',
        'emcee': 'emc',
    }
    for k, v in replacements.items():
        name = name.replace(k,v)
    return name
 for setup in ['multiclass']:
    path = f'./results/{setup}/*.pkl'
@ -93,7 +116,7 @@ for setup in ['multiclass']:
        report = pickle.load(open(file, 'rb'))
        results = report['results']
        n_samples = len(results['ae'])
-        table['method'].extend([method.replace('Bayesian','Ba').replace('Bootstrap', 'Bo')] * n_samples)
+        table['method'].extend([nicer(method)] * n_samples)
        table['dataset'].extend([dataset] * n_samples)
        table['ae'].extend(results['ae'])
        table['rae'].extend(results['rae'])
@ -101,28 +124,29 @@ for setup in ['multiclass']:
        # table['a-CI'].extend(results['amplitude'])
        update_pickle_with_region(report, file, conf_name='CI', conf_region_class=ConfidenceIntervals, bonferroni_correction=True)
-        update_pickle_with_region(report, file, conf_name='CE', conf_region_class=ConfidenceEllipseSimplex)
+        # update_pickle_with_region(report, file, conf_name='CE', conf_region_class=ConfidenceEllipseSimplex)
-        update_pickle_with_region(report, file, conf_name='CLR', conf_region_class=ConfidenceEllipseCLR)
+        # update_pickle_with_region(report, file, conf_name='CLR', conf_region_class=ConfidenceEllipseCLR)
-        update_pickle_with_region(report, file, conf_name='ILR', conf_region_class=ConfidenceEllipseILR)
+        # update_pickle_with_region(report, file, conf_name='ILR', conf_region_class=ConfidenceEllipseILR)
        table['c-CI'].extend(report['coverage-CI'])
        table['a-CI'].extend(report['amplitude-CI'])
        table['w-CI'].extend(report['winkler-CI'])
        table['amperr-CI'].extend(report['amperr-CI'])
-        table['c-CE'].extend(report['coverage-CE'])
+        # table['c-CE'].extend(report['coverage-CE'])
-        table['a-CE'].extend(report['amplitude-CE'])
+        # table['a-CE'].extend(report['amplitude-CE'])
-        table['amperr-CE'].extend(report['amperr-CE'])
+        # table['amperr-CE'].extend(report['amperr-CE'])
-        table['c-CLR'].extend(report['coverage-CLR'])
+        # table['c-CLR'].extend(report['coverage-CLR'])
-        table['a-CLR'].extend(report['amplitude-CLR'])
+        # table['a-CLR'].extend(report['amplitude-CLR'])
-        table['amperr-CLR'].extend(report['amperr-CLR'])
+        # table['amperr-CLR'].extend(report['amperr-CLR'])
-        table['c-ILR'].extend(report['coverage-ILR'])
+        # table['c-ILR'].extend(report['coverage-ILR'])
-        table['a-ILR'].extend(report['amplitude-ILR'])
+        # table['a-ILR'].extend(report['amplitude-ILR'])
-        table['amperr-ILR'].extend(report['amperr-ILR'])
+        # table['amperr-ILR'].extend(report['amperr-ILR'])
        table['aitch'].extend(qp.error.dist_aitchison(results['true-prevs'], results['point-estim']))
        table['SRE'].extend(qp.error.sre(results['true-prevs'], results['point-estim'], report['train-prev'], eps=0.001))
        # table['aitch-well'].extend(qp.error.dist_aitchison(results['true-prevs'], [ConfidenceEllipseILR(samples).mean_ for samples in results['samples']]))
        # table['aitch'].extend()
        # table['reg-score-ILR'].extend(
@ -145,16 +169,20 @@ for setup in ['multiclass']:
        tr_size[dataset] = len(data.training)
    # remove datasets with more than max_classes classes
-    # max_classes = 30
+    max_classes = 25
-    # min_train   = 1000
+    min_train   = 500
-    # for data_name, n in n_classes.items():
+    ignore_datasets = ['poker_hand', 'hcv']
-    #     if n > max_classes:
+    for data_name, n in n_classes.items():
-    #         df = df[df["dataset"] != data_name]
+        if n > max_classes:
-    # for data_name, n in tr_size.items():
+            df = df[df["dataset"] != data_name]
-    #     if n < min_train:
+    for data_name, n in tr_size.items():
-    #         df = df[df["dataset"] != data_name]
+        if n < min_train:
            df = df[df["dataset"] != data_name]
    for data_name, n in tr_size.items():
        if data_name in ignore_datasets:
            df = df[df["dataset"] != data_name]
-    for region in ['CI', 'CE', 'CLR', 'ILR']:
+    for region in ['CI']: #, 'CLR', 'ILR', 'CI']:
        if setup == 'binary' and region=='ILR':
            continue
        # pv = pd.pivot_table(
@ -162,11 +190,12 @@ for setup in ['multiclass']:
        # )
        pv = pd.pivot_table(
            df, index='dataset', columns='method', values=[
-                f'amperr-{region}',
+                # f'amperr-{region}',
                f'a-{region}',
                f'c-{region}',
-                #f'w-{region}',
+                # f'w-{region}',
                'ae',
                'SRE',
                # 'rae',
                # f'aitch',
                # f'aitch-well'
--- a/BayesianKDEy/single_experiment_debug.py
+++ b/BayesianKDEy/single_experiment_debug.py
@ -1,4 +1,5 @@
 import os
 import pickle
 import warnings
 from os.path import join
 from pathlib import Path
@ -27,7 +28,7 @@ from scipy.stats import dirichlet
 from collections import defaultdict
 from time import time
 from sklearn.base import clone, BaseEstimator
-
+from temperature_calibration import temp_calibration
 def methods():
    """
@ -50,7 +51,8 @@ def methods():
    # for T in [10, 20, 50, 100., 500]:
    #     yield f'BaKDE-CLR-T{T}', KDEyCLR(LR()), kdey_hyper_clr, lambda hyper: BayesianKDEy(kernel='aitchison', explore='ilr', mcmc_seed=0, temperature=T, num_warmup=3000, num_samples=1000, step_size=.1, **hyper),
-    yield f'BaKDE-emcee', KDEyML(LR()), kdey_hyper, lambda hyper: BayesianKDEy(mcmc_seed=0, num_warmup=100, num_samples=100, step_size=.1, **hyper),
+    # yield f'BaKDE-emcee', KDEyML(LR()), kdey_hyper, lambda hyper: BayesianKDEy(mcmc_seed=0, num_warmup=100, num_samples=100, step_size=.1, **hyper),
    yield f'BaKDE-numpyro', KDEyML(LR()), kdey_hyper, lambda hyper: BayesianKDEy(mcmc_seed=0, engine='numpyro', **hyper)
@ -68,8 +70,8 @@ if __name__ == '__main__':
    setup = multiclass
    # data_name = 'isolet'
-    data_name = 'academic-success'
+    # data_name = 'academic-success'
-    # data_name = 'abalone'
+    data_name = 'poker_hand'
    qp.environ['SAMPLE_SIZE'] = setup['sample_size']
    print(f'dataset={data_name}')
@ -86,6 +88,11 @@ if __name__ == '__main__':
              f'coverage={report["results"]["coverage"].mean():.5f}, '
              f'amplitude={report["results"]["amplitude"].mean():.5f}, ')
        # best_hyperparams = pickle.load(open(hyper_path, 'rb'))
        # method = withconf_constructor(best_hyperparams)
        #
        # train, val = data.training.split_stratified(train_prop=0.6, random_state=0)
        # temp_calibration(method, train, val, amplitude_threshold=0.1475, temp_grid=[2., 10., 100., 1000.])#temp_grid=[1., 1.25, 1.5, 1.75, 2.])
--- a/BayesianKDEy/temperature_bandwidth_corr.py
+++ b/BayesianKDEy/temperature_bandwidth_corr.py
@ -0,0 +1,129 @@
 import os.path
 import pickle
 from collections import defaultdict
 from pathlib import Path
 import pandas as pd
 import quapy as qp
 from BayesianKDEy._bayeisan_kdey import BayesianKDEy
 from BayesianKDEy.full_experiments import experiment_path
 from quapy.protocol import UPP
 import numpy as np
 from tqdm import tqdm
 from joblib import Parallel, delayed
 from itertools import product
 # is there a correspondence between smaller bandwidths and overconfident likelihoods? if so, a high temperature
 # after calibration might be correlated; this script aims at analyzing this trend
 datasets = qp.datasets.UCI_MULTICLASS_DATASETS
 def show(results, values):
    df_res = pd.DataFrame(results)
    df_mean = (
        df_res
        .groupby(['bandwidth', 'temperature'], as_index=False)
        .mean(numeric_only=True)
    )
    pivot = df_mean.pivot(
        index='bandwidth',
        columns='temperature',
        values=values
    )
    import matplotlib.pyplot as plt
    plt.imshow(pivot, origin='lower', aspect='auto')
    plt.colorbar(label=values)
    plt.xticks(range(len(pivot.columns)), pivot.columns)
    plt.yticks(range(len(pivot.index)), pivot.index)
    plt.xlabel('Temperature')
    plt.ylabel('Bandwidth')
    plt.title(f'{values} heatmap')
    plt.savefig(f'./plotcorr/{values}.png')
    plt.cla()
    plt.clf()
 def run_experiment(
    bandwidth,
    temperature,
    train,
    test,
    dataset,
 ):
    qp.environ['SAMPLE_SIZE'] = 1000
    bakde = BayesianKDEy(
        engine='numpyro',
        bandwidth=bandwidth,
        temperature=temperature,
    )
    bakde.fit(*train.Xy)
    test_generator = UPP(test, repeats=20, random_state=0)
    rows = []
    for i, (sample, prev) in enumerate(
        tqdm(
            test_generator(),
            desc=f"bw={bandwidth}, T={temperature}",
            total=test_generator.total(),
            leave=False,
        )
    ):
        point_estimate, region = bakde.predict_conf(sample)
        rows.append({
            "bandwidth": bandwidth,
            "temperature": temperature,
            "dataset": dataset,
            "sample": i,
            "mae": qp.error.mae(prev, point_estimate),
            "cov": region.coverage(prev),
            "amp": region.montecarlo_proportion(n_trials=50_000),
        })
    return rows
 bandwidths = [0.001, 0.005, 0.01, 0.05, 0.1]
 temperatures = [0.5, 0.75, 1., 2., 5.]
 res_dir = './plotcorr/results'
 os.makedirs(res_dir, exist_ok=True)
 all_rows = []
 for i, dataset in enumerate(datasets):
    if dataset in ['letter', 'isolet']: continue
    res_path = f'{res_dir}/{dataset}.pkl'
    if os.path.exists(res_path):
        print(f'loading results from pickle {res_path}')
        results_data_rows = pickle.load(open(res_path, 'rb'))
    else:
        print(f'{dataset=} [complete={i}/{len(datasets)}]')
        data = qp.datasets.fetch_UCIMulticlassDataset(dataset)
        train, test = data.train_test
        jobs = list(product(bandwidths, temperatures))
        results_data_rows = Parallel(n_jobs=-1,backend="loky")(
            delayed(run_experiment)(bw, T, train, test, dataset) for bw, T in jobs
        )
        pickle.dump(results_data_rows, open(res_path, 'wb'), pickle.HIGHEST_PROTOCOL)
    all_rows.extend(results_data_rows)
 results = defaultdict(list)
 for rows in all_rows:
    for row in rows:
        for k, v in row.items():
            results[k].append(v)
 show(results, 'mae')
 show(results, 'cov')
 show(results, 'amp')
--- a/BayesianKDEy/temperature_calibration.py
+++ b/BayesianKDEy/temperature_calibration.py
@ -0,0 +1,114 @@
 from build.lib.quapy.data import LabelledCollection
 from quapy.method.confidence import WithConfidenceABC
 from quapy.protocol import UPP
 import numpy as np
 from tqdm import tqdm
 import quapy as qp
 from joblib import Parallel, delayed
 import copy
 def temp_calibration(method:WithConfidenceABC,
                     train:LabelledCollection,
                     val:LabelledCollection,
                     temp_grid=[1, 1.5, 2],
                     num_samples=100,
                     nominal_coverage=0.95,
                     amplitude_threshold='auto',
                     random_state=0,
                     n_jobs=1,
                     verbose=True):
    assert (amplitude_threshold == 'auto' or (isinstance(amplitude_threshold, float)) and amplitude_threshold < 1.), \
        f'wrong value for {amplitude_threshold=}, it must either be "auto" or a float < 1.0.'
    if amplitude_threshold=='auto':
        n_classes = train.n_classes
        amplitude_threshold = .1/np.log(n_classes+1)
    if isinstance(amplitude_threshold, float) and amplitude_threshold > 0.1:
        print(f'warning: the {amplitude_threshold=} is too large; this may lead to uninformative regions')
    method.fit(*train.Xy)
    label_shift_prot = UPP(val, repeats=num_samples, random_state=random_state)
    # results = []
    # temp_grid = sorted(temp_grid)
    # for temp in temp_grid:
    #     method.temperature = temp
    #     coverage = 0
    #     amplitudes = []
    #     errs = []
    #     pbar = tqdm(enumerate(label_shift_prot()), total=label_shift_prot.total(), disable=not verbose)
    #     for i, (sample, prev) in pbar:
    #         point_estim, conf_region = method.predict_conf(sample)
    #         if prev in conf_region:
    #             coverage += 1
    #         amplitudes.append(conf_region.montecarlo_proportion(n_trials=50_000))
    #         errs.append(qp.error.mae(prev, point_estim))
    #         if verbose:
    #             pbar.set_description(
    #                 f'temperature={temp:.2f}, '
    #                 f'coverage={coverage/(i+1):.4f}, '
    #                 f'amplitude={np.mean(amplitudes):.4f},'
    #                 f'mae={np.mean(errs):.4f}'
    #             )
    #
    #     mean_coverage = coverage / label_shift_prot.total()
    #     mean_amplitude = np.mean(amplitudes)
    #
    #     if mean_amplitude < amplitude_threshold:
    #         results.append((temp, mean_coverage, mean_amplitude))
    #     else:
    #         break
    def evaluate_temperature(temp):
        local_method = copy.deepcopy(method)
        local_method.temperature = temp
        coverage = 0
        amplitudes = []
        errs = []
        for i, (sample, prev) in enumerate(label_shift_prot()):
            point_estim, conf_region = local_method.predict_conf(sample)
            if prev in conf_region:
                coverage += 1
            amplitudes.append(conf_region.montecarlo_proportion(n_trials=50_000))
            errs.append(qp.error.mae(prev, point_estim))
        mean_coverage = coverage / label_shift_prot.total()
        mean_amplitude = np.mean(amplitudes)
        return temp, mean_coverage, mean_amplitude
    temp_grid = sorted(temp_grid)
    raw_results = Parallel(n_jobs=n_jobs, backend="loky")(
        delayed(evaluate_temperature)(temp)
        for temp in tqdm(temp_grid, disable=not verbose)
    )
    results = [
        (temp, cov, amp)
        for temp, cov, amp in raw_results
        if amp < amplitude_threshold
    ]
    chosen_temperature = 1.
    if len(results) > 0:
        chosen_temperature = min(results, key=lambda x: abs(x[1]-nominal_coverage))[0]
    print(f'chosen_temperature={chosen_temperature:.2f}')
    return chosen_temperature
--- a/quapy/data/datasets.py
+++ b/quapy/data/datasets.py
@ -738,14 +738,29 @@ def fetch_UCIMulticlassLabelledCollection(dataset_name, data_home=None, min_clas
        print(f'Loading UCI Muticlass {dataset_name} ({fullname})')
    file = join(data_home, 'uci_multiclass', dataset_name+'.pkl')
-    
+
    def dummify_categorical_features(df_features, dataset_id):
        CATEGORICAL_FEATURES = {
            158: ["S1", "C1", "S2", "C2", "S3", "C3", "S4", "C4", "S5", "C5"],  # poker_hand
        }
        categorical = CATEGORICAL_FEATURES.get(dataset_id, [])
        X = df_features.copy()
        if categorical:
            X[categorical] = X[categorical].astype("category")
            X = pd.get_dummies(X, columns=categorical, drop_first=True)
        return X
    def download(id, name):
        df = fetch_ucirepo(id=id)
-        df.data.features = pd.get_dummies(df.data.features, drop_first=True)
+        X_df = dummify_categorical_features(df.data.features, id)
-        X, y = df.data.features.to_numpy(dtype=np.float64), df.data.targets.to_numpy().squeeze()
+        X = X_df.to_numpy(dtype=np.float64)
        y = df.data.targets.to_numpy().squeeze()
-        assert y.ndim == 1, 'more than one y'
+        assert y.ndim == 1, f'error: the dataset {id=} {name=} has more than one target variable'
        classes = np.sort(np.unique(y))
        y = np.searchsorted(classes, y)
--- a/quapy/error.py
+++ b/quapy/error.py
@ -139,7 +139,8 @@ def sre(prevs_true, prevs_hat, prevs_train, eps=0.):
     :math:`\\mathcal{Y}` are the classes of interest
    :param prevs_true: array-like, true prevalence values
    :param prevs_hat: array-like, estimated prevalence values
-    :param prevs_train: array-like, training prevalence values
+    :param prevs_train: array-like with the training prevalence values, or a single prevalence vector when
        all the prevs_true and prevs_hat are computed on the same training set.
    :param eps: float, for smoothing the prevalence values. It is 0 by default (no smoothing), meaning the
        training prevalence is expected to be >0 everywhere.
    :return: the squared ratio error
@ -149,6 +150,8 @@ def sre(prevs_true, prevs_hat, prevs_train, eps=0.):
    prevs_train = np.asarray(prevs_train)
    assert prevs_true.shape == prevs_hat.shape, f'wrong shape {prevs_true.shape=} vs {prevs_hat.shape=}'
    assert prevs_true.shape[-1]==prevs_train.shape[-1], 'wrong shape for training prevalence'
    if prevs_true.ndim == 2 and prevs_train.ndim == 1:
        prevs_train = np.tile(prevs_train, reps=(prevs_true.shape[0], 1))
    if eps>0:
        prevs_true = smooth(prevs_true, eps)
        prevs_hat = smooth(prevs_hat, eps)
@ -157,12 +160,13 @@ def sre(prevs_true, prevs_hat, prevs_train, eps=0.):
    N = prevs_true.shape[-1]
    w = prevs_true / prevs_train
    w_hat = prevs_hat / prevs_train
-    return (1./N) * (w - w_hat)**2.
+    return (1./N) * np.sum((w - w_hat)**2., axis=-1)
 def msre(prevs_true, prevs_hat, prevs_train, eps=0.):
    """
-    Returns the mean across all experiments. See :function:`sre`.
+    Returns the mean :function:`sre` across all experiments.
    :param prevs_true: array-like, true prevalence values of shape (n_experiments, n_classes,) or (n_classes,)
    :param prevs_hat: array-like, estimated prevalence values of shape equal to prevs_true
    :param prevs_train: array-like, training prevalence values of  (n_experiments, n_classes,) or (n_classes,)
--- a/quapy/method/confidence.py
+++ b/quapy/method/confidence.py
@ -139,7 +139,7 @@ class WithConfidenceABC(ABC):
    """
    Abstract class for confidence regions.
    """
-    METHODS = ['intervals', 'ellipse', 'ellipse-clr']
+    REGION_TYPE = ['intervals', 'ellipse', 'ellipse-clr', 'ellipse-ilr']
    @abstractmethod
    def predict_conf(self, instances, confidence_level=0.95) -> (np.ndarray, ConfidenceRegionABC):
@ -185,6 +185,8 @@ class WithConfidenceABC(ABC):
            region = ConfidenceEllipseSimplex(prev_estims, confidence_level=confidence_level)
        elif method == 'ellipse-clr':
            region = ConfidenceEllipseCLR(prev_estims, confidence_level=confidence_level)
        elif method == 'ellipse-ilr':
            region = ConfidenceEllipseILR(prev_estims, confidence_level=confidence_level)
        if region is None:
            raise NotImplementedError(f'unknown method {method}')