From ae9503a43b7e505f3278ffecea75483ba159cc72 Mon Sep 17 00:00:00 2001
From: Alejandro Moreo
Date: Fri, 9 Jan 2026 17:20:05 +0100
Subject: [PATCH] calibrating temperature and coming back to Aitchison kernel

---
 BayesianKDEy/TODO.txt                      |  22 +++-
 BayesianKDEy/_bayeisan_kdey.py             |   2 +-
 BayesianKDEy/full_experiments.py           |  24 +++-
 BayesianKDEy/generate_results.py           |  79 +++++++++----
 BayesianKDEy/single_experiment_debug.py    |  15 ++-
 BayesianKDEy/temperature_bandwidth_corr.py | 129 +++++++++++++++++++++
 BayesianKDEy/temperature_calibration.py    | 114 ++++++++++++++++++
 quapy/data/datasets.py                     |  23 +++-
 quapy/error.py                             |  10 +-
 quapy/method/confidence.py                 |   4 +-
 10 files changed, 376 insertions(+), 46 deletions(-)
 create mode 100644 BayesianKDEy/temperature_bandwidth_corr.py
 create mode 100644 BayesianKDEy/temperature_calibration.py

diff --git a/BayesianKDEy/TODO.txt b/BayesianKDEy/TODO.txt
index 9aea717..c75a2e1 100644
--- a/BayesianKDEy/TODO.txt
+++ b/BayesianKDEy/TODO.txt
@@ -1,7 +1,23 @@
-- Add other methods that natively provide uncertainty quantification methods?
+- Add other methods that natively provide uncertainty quantification? (e.g., Ratio estimator, Card & Smith)
 - MPIW (Mean Prediction Interval Width): is the average of the amplitudes (w/o aggregating coverage whatsoever)
 - Implement Interval Score or Winkler Score
 - analyze across shift
-- add Bayesian EM
-- optimize also C and class_weight?
+- add Bayesian EM:
+  - https://github.com/ChangkunYe/MAPLS/blob/main/label_shift/mapls.py
+  - take this opportunity to add RLLS: https://github.com/ChangkunYe/MAPLS/blob/main/label_shift/rlls.py
+- add CIFAR10 and MNIST? Maybe also consider previously tested types of shift (tweak-one-out, etc.) from the RLLS paper:
+  - https://github.com/Angie-Liu/labelshift/tree/master
+  - https://github.com/Angie-Liu/labelshift/blob/master/cifar10_for_labelshift.py
+  - Note: MNIST is downloadable from https://archive.ics.uci.edu/dataset/683/mnist+database+of+handwritten+digits
+- consider prior knowledge in experiments:
+  - One scenario in which our prior is uninformative (i.e., uniform)
+  - One scenario in which our prior is wrong (e.g., alpha-prior = (3,2,1), protocol-prior = (1,1,5))
+  - One scenario in which our prior is very good (e.g., alpha-prior = (3,2,1), protocol-prior = (3,2,1))
+  - Do all my baseline methods come with the option to inform a prior?
+- consider different bandwidths within the Bayesian approach?
+- how to improve the coverage (or how to increase the amplitude)?
+  - Added temperature calibration; it improves things.
+  - Is temperature calibration not simply equivalent to using a larger bandwidth in the kernels?
+- consider W as a measure of quantification error (the current w-CI is the Winkler score...)
+- optimize also C and class_weight? [I don't think so, but could be done easily now]

diff --git a/BayesianKDEy/_bayeisan_kdey.py b/BayesianKDEy/_bayeisan_kdey.py
index 69b6523..9a4f171 100644
--- a/BayesianKDEy/_bayeisan_kdey.py
+++ b/BayesianKDEy/_bayeisan_kdey.py
@@ -66,7 +66,7 @@ class BayesianKDEy(AggregativeSoftQuantifier, KDEBase, WithConfidenceABC):
             raise ValueError(f'parameter {num_samples=} must be a positive integer')
         assert explore in ['simplex', 'clr', 'ilr'], \
             f'unexpected value for param {explore=}; valid ones are "simplex", "clr", and "ilr"'
-        assert temperature>0., f'temperature must be >0'
+        # assert temperature>0., f'temperature must be >0'
         assert engine in ['rw-mh', 'emcee', 'numpyro']
 
         super().__init__(classifier, fit_classifier, val_split)

diff --git a/BayesianKDEy/full_experiments.py b/BayesianKDEy/full_experiments.py
index 4b0afa2..47c4155 100644
--- a/BayesianKDEy/full_experiments.py
+++ b/BayesianKDEy/full_experiments.py
@@ -9,6 +9,7 @@ from sklearn.model_selection import GridSearchCV, StratifiedKFold
 from copy import deepcopy as cp
 import quapy as qp
 from BayesianKDEy._bayeisan_kdey import BayesianKDEy
+from BayesianKDEy.temperature_calibration import temp_calibration
 from build.lib.quapy.data import LabelledCollection
 from quapy.method.aggregative import DistributionMatchingY as DMy, AggregativeQuantifier, EMQ
 from quapy.method.base import BinaryQuantifier, BaseQuantifier
@@ -72,14 +73,22 @@ def methods():
     # yield 'BayesianHDy', DMy(LR()), hdy_hyper, lambda hyper: PQ(LR(), stan_seed=0, **hyper), only_binary
 
     # # yield 'BootstrapKDEy', KDEyML(LR()), kdey_hyper, lambda hyper: AggregativeBootstrap(KDEyML(LR(), **hyper), n_test_samples=1000, random_state=0, verbose=True), multiclass_method
-    # yield 'BayesianKDEy', KDEyML(LR()), kdey_hyper, lambda hyper: BayesianKDEy(mcmc_seed=0, **hyper), multiclass_method
+    yield 'BayesianKDEy', KDEyML(LR()), kdey_hyper, lambda hyper: BayesianKDEy(mcmc_seed=0, **hyper), multiclass_method
     # yield 'BayesianKDEy*', KDEyCLR(LR()), kdey_hyper_clr, lambda hyper: BayesianKDEy(kernel='aitchison', mcmc_seed=0, **hyper), multiclass_method
     # yield 'BayKDEy*CLR', KDEyCLR(LR()), kdey_hyper_clr, lambda hyper: BayesianKDEy(kernel='aitchison', mcmc_seed=0, explore='clr', step_size=.15, **hyper), multiclass_method
     # yield 'BayKDEy*CLR2', KDEyCLR(LR()), kdey_hyper_clr, lambda hyper: BayesianKDEy(kernel='aitchison', mcmc_seed=0, explore='clr', step_size=.05, **hyper), multiclass_method
     # yield 'BayKDEy*ILR', KDEyCLR(LR()), kdey_hyper_clr, lambda hyper: BayesianKDEy(kernel='aitchison', mcmc_seed=0, explore='ilr', step_size=.15, **hyper), only_multiclass
     # yield 'BayKDEy*ILR2', KDEyILR(LR()), kdey_hyper_clr, lambda hyper: BayesianKDEy(kernel='ilr', mcmc_seed=0, explore='ilr', step_size=.1, **hyper), only_multiclass
     yield f'BaKDE-emcee', KDEyML(LR()), kdey_hyper, lambda hyper: BayesianKDEy(mcmc_seed=0, num_warmup=100, num_samples=100, step_size=.1, engine='emcee', **hyper), multiclass_method
-    yield f'BaKDE-numpyro', KDEyML(LR()), kdey_hyper, lambda hyper: BayesianKDEy(mcmc_seed=0, step_size=.1, engine='numpyro', **hyper), multiclass_method
+    yield f'BaKDE-numpyro', KDEyML(LR()), kdey_hyper, lambda hyper: BayesianKDEy(mcmc_seed=0, engine='numpyro', **hyper), multiclass_method
+    yield f'BaKDE-numpyro-T2', KDEyML(LR()), kdey_hyper, lambda hyper: BayesianKDEy(mcmc_seed=0, engine='numpyro', temperature=2., **hyper), multiclass_method
+    yield f'BaKDE-numpyro-T*', KDEyML(LR()), kdey_hyper, lambda hyper: BayesianKDEy(mcmc_seed=0, engine='numpyro', temperature=None, **hyper), multiclass_method
+
yield f'BaKDE-Ait-numpyro', KDEyCLR(LR()), kdey_hyper_clr, lambda hyper: BayesianKDEy(kernel='aitchison', mcmc_seed=0, engine='numpyro', **hyper), multiclass_method
+    yield f'BaKDE-Ait-numpyro-T*', KDEyCLR(LR()), kdey_hyper_clr, lambda hyper: BayesianKDEy(kernel='aitchison', mcmc_seed=0, engine='numpyro', temperature=None, **hyper), multiclass_method
+    yield f'BaKDE-Ait-numpyro-T*ILR', KDEyCLR(LR()), kdey_hyper_clr, lambda hyper: BayesianKDEy(kernel='aitchison', mcmc_seed=0, engine='numpyro', temperature=None, region='ellipse-ilr', **hyper), multiclass_method
+    yield f'BaKDE-numpyro-T10', KDEyML(LR()), kdey_hyper, lambda hyper: BayesianKDEy(mcmc_seed=0, engine='numpyro', temperature=10., **hyper), multiclass_method
+    yield f'BaKDE-numpyro*CLR', KDEyCLR(LR()), kdey_hyper_clr, lambda hyper: BayesianKDEy(kernel='aitchison', mcmc_seed=0, engine='numpyro', **hyper), multiclass_method
+    yield f'BaKDE-numpyro*ILR', KDEyILR(LR()), kdey_hyper_clr, lambda hyper: BayesianKDEy(kernel='ilr', mcmc_seed=0, engine='numpyro', **hyper), multiclass_method
 
 
 def model_selection(train: LabelledCollection, point_quantifier: AggregativeQuantifier, grid: dict):
@@ -114,7 +123,12 @@ def experiment(dataset: Dataset, point_quantifier: AggregativeQuantifier, method
     )
 
     t_init = time()
-    withconf_quantifier = withconf_constructor(best_hyperparams).fit(*training.Xy)
+    withconf_quantifier = withconf_constructor(best_hyperparams)
+    if hasattr(withconf_quantifier, 'temperature') and withconf_quantifier.temperature is None:
+        train, val = training.split_stratified(train_prop=0.6, random_state=0)
+        temperature = temp_calibration(withconf_quantifier, train, val, temp_grid=[.5, 1., 1.5, 2., 5., 10., 100.], n_jobs=-1)
+        withconf_quantifier.temperature = temperature
+    withconf_quantifier.fit(*training.Xy)
     tr_time = time() - t_init
 
     # test
@@ -155,13 +169,13 @@ if __name__ == '__main__':
 
     binary = {
         'datasets': qp.datasets.UCI_BINARY_DATASETS,
         'fetch_fn': qp.datasets.fetch_UCIBinaryDataset,
-        'sample_size': 100  # previous: 500
+        'sample_size': 500  # small setting: 100, big setting: 500
     }
 
     multiclass = {
         'datasets': qp.datasets.UCI_MULTICLASS_DATASETS,
         'fetch_fn': qp.datasets.fetch_UCIMulticlassDataset,
-        'sample_size': 200  # previous: 1000
+        'sample_size': 1000  # small setting: 200, big setting: 1000
     }
 
     result_dir = Path('./results')

diff --git a/BayesianKDEy/generate_results.py b/BayesianKDEy/generate_results.py
index 5604af4..862832a 100644
--- a/BayesianKDEy/generate_results.py
+++ b/BayesianKDEy/generate_results.py
@@ -80,7 +80,30 @@ def update_pickle_with_region(report, file, conf_name, conf_region_class, **kwar
 
 
 # methods = None  # show all methods
-methods = ['BayesianACC', 'BayesianKDEy', 'BaKDE-emcee', 'BaKDE-numpyro']
+methods = ['BayesianACC',
+           # 'BayesianKDEy',
+           # 'BaKDE-emcee',
+           # 'BaKDE-numpyro',
+           # 'BaKDE-numpyro-T2',
+           # 'BaKDE-numpyro-T10',
+           # 'BaKDE-numpyro-T*',
+           # 'BaKDE-Ait-numpyro',
+           'BaKDE-Ait-numpyro-T*',
+           # 'BaKDE-Ait-numpyro-T*ILR',
+           'BootstrapACC',
+           'BootstrapHDy',
+           'BootstrapKDEy'
+           ]
+
+
+def nicer(name: str):
+    replacements = {
+        'Bayesian': 'Ba',
+        'Bootstrap': 'Bo',
+        'numpyro': 'ro',
+        'emcee': 'emc',
+    }
+    for k, v in replacements.items():
+        name = name.replace(k, v)
+    return name
+
 
 for setup in ['multiclass']:
     path = f'./results/{setup}/*.pkl'
@@ -93,7 +116,7 @@ for setup in ['multiclass']:
         report = pickle.load(open(file, 'rb'))
         results = report['results']
         n_samples = len(results['ae'])
-        table['method'].extend([method.replace('Bayesian','Ba').replace('Bootstrap', 'Bo')] * n_samples)
+
table['method'].extend([nicer(method)] * n_samples)
         table['dataset'].extend([dataset] * n_samples)
         table['ae'].extend(results['ae'])
         table['rae'].extend(results['rae'])
@@ -101,28 +124,29 @@ for setup in ['multiclass']:
         # table['a-CI'].extend(results['amplitude'])
 
         update_pickle_with_region(report, file, conf_name='CI', conf_region_class=ConfidenceIntervals, bonferroni_correction=True)
-        update_pickle_with_region(report, file, conf_name='CE', conf_region_class=ConfidenceEllipseSimplex)
-        update_pickle_with_region(report, file, conf_name='CLR', conf_region_class=ConfidenceEllipseCLR)
-        update_pickle_with_region(report, file, conf_name='ILR', conf_region_class=ConfidenceEllipseILR)
+        # update_pickle_with_region(report, file, conf_name='CE', conf_region_class=ConfidenceEllipseSimplex)
+        # update_pickle_with_region(report, file, conf_name='CLR', conf_region_class=ConfidenceEllipseCLR)
+        # update_pickle_with_region(report, file, conf_name='ILR', conf_region_class=ConfidenceEllipseILR)
 
         table['c-CI'].extend(report['coverage-CI'])
         table['a-CI'].extend(report['amplitude-CI'])
         table['w-CI'].extend(report['winkler-CI'])
         table['amperr-CI'].extend(report['amperr-CI'])
 
-        table['c-CE'].extend(report['coverage-CE'])
-        table['a-CE'].extend(report['amplitude-CE'])
-        table['amperr-CE'].extend(report['amperr-CE'])
+        # table['c-CE'].extend(report['coverage-CE'])
+        # table['a-CE'].extend(report['amplitude-CE'])
+        # table['amperr-CE'].extend(report['amperr-CE'])
 
-        table['c-CLR'].extend(report['coverage-CLR'])
-        table['a-CLR'].extend(report['amplitude-CLR'])
-        table['amperr-CLR'].extend(report['amperr-CLR'])
+        # table['c-CLR'].extend(report['coverage-CLR'])
+        # table['a-CLR'].extend(report['amplitude-CLR'])
+        # table['amperr-CLR'].extend(report['amperr-CLR'])
 
-        table['c-ILR'].extend(report['coverage-ILR'])
-        table['a-ILR'].extend(report['amplitude-ILR'])
-        table['amperr-ILR'].extend(report['amperr-ILR'])
+        # table['c-ILR'].extend(report['coverage-ILR'])
+        # table['a-ILR'].extend(report['amplitude-ILR'])
+        # table['amperr-ILR'].extend(report['amperr-ILR'])
 
         table['aitch'].extend(qp.error.dist_aitchison(results['true-prevs'], results['point-estim']))
+        table['SRE'].extend(qp.error.sre(results['true-prevs'], results['point-estim'], report['train-prev'], eps=0.001))
         # table['aitch-well'].extend(qp.error.dist_aitchison(results['true-prevs'], [ConfidenceEllipseILR(samples).mean_ for samples in results['samples']]))
         # table['aitch'].extend()
         # table['reg-score-ILR'].extend(
@@ -145,16 +169,20 @@ for setup in ['multiclass']:
             tr_size[dataset] = len(data.training)
 
     # remove datasets with more than max_classes classes
-    # max_classes = 30
-    # min_train = 1000
-    # for data_name, n in n_classes.items():
-    #     if n > max_classes:
-    #         df = df[df["dataset"] != data_name]
-    # for data_name, n in tr_size.items():
-    #     if n < min_train:
-    #         df = df[df["dataset"] != data_name]
+    max_classes = 25
+    min_train = 500
+    ignore_datasets = ['poker_hand', 'hcv']
+    for data_name, n in n_classes.items():
+        if n > max_classes:
+            df = df[df["dataset"] != data_name]
+    for data_name, n in tr_size.items():
+        if n < min_train:
+            df = df[df["dataset"] != data_name]
+    for data_name in ignore_datasets:
+        df = df[df["dataset"] != data_name]
 
-    for region in ['CI', 'CE', 'CLR', 'ILR']:
+    for region in ['CI']:  # previously: ['CI', 'CE', 'CLR', 'ILR']
         if setup == 'binary' and region=='ILR':
             continue
         # pv = pd.pivot_table(
         # )
         pv = pd.pivot_table(
             df, index='dataset', columns='method',
values=[ - f'amperr-{region}', + # f'amperr-{region}', f'a-{region}', f'c-{region}', - #f'w-{region}', + # f'w-{region}', 'ae', + 'SRE', # 'rae', # f'aitch', # f'aitch-well' diff --git a/BayesianKDEy/single_experiment_debug.py b/BayesianKDEy/single_experiment_debug.py index 7056fc7..18a8f8f 100644 --- a/BayesianKDEy/single_experiment_debug.py +++ b/BayesianKDEy/single_experiment_debug.py @@ -1,4 +1,5 @@ import os +import pickle import warnings from os.path import join from pathlib import Path @@ -27,7 +28,7 @@ from scipy.stats import dirichlet from collections import defaultdict from time import time from sklearn.base import clone, BaseEstimator - +from temperature_calibration import temp_calibration def methods(): """ @@ -50,7 +51,8 @@ def methods(): # for T in [10, 20, 50, 100., 500]: # yield f'BaKDE-CLR-T{T}', KDEyCLR(LR()), kdey_hyper_clr, lambda hyper: BayesianKDEy(kernel='aitchison', explore='ilr', mcmc_seed=0, temperature=T, num_warmup=3000, num_samples=1000, step_size=.1, **hyper), - yield f'BaKDE-emcee', KDEyML(LR()), kdey_hyper, lambda hyper: BayesianKDEy(mcmc_seed=0, num_warmup=100, num_samples=100, step_size=.1, **hyper), + # yield f'BaKDE-emcee', KDEyML(LR()), kdey_hyper, lambda hyper: BayesianKDEy(mcmc_seed=0, num_warmup=100, num_samples=100, step_size=.1, **hyper), + yield f'BaKDE-numpyro', KDEyML(LR()), kdey_hyper, lambda hyper: BayesianKDEy(mcmc_seed=0, engine='numpyro', **hyper) @@ -68,8 +70,8 @@ if __name__ == '__main__': setup = multiclass # data_name = 'isolet' - data_name = 'academic-success' - # data_name = 'abalone' + # data_name = 'academic-success' + data_name = 'poker_hand' qp.environ['SAMPLE_SIZE'] = setup['sample_size'] print(f'dataset={data_name}') @@ -86,6 +88,11 @@ if __name__ == '__main__': f'coverage={report["results"]["coverage"].mean():.5f}, ' f'amplitude={report["results"]["amplitude"].mean():.5f}, ') + # best_hyperparams = pickle.load(open(hyper_path, 'rb')) + # method = withconf_constructor(best_hyperparams) + # + # train, val = data.training.split_stratified(train_prop=0.6, random_state=0) + # temp_calibration(method, train, val, amplitude_threshold=0.1475, temp_grid=[2., 10., 100., 1000.])#temp_grid=[1., 1.25, 1.5, 1.75, 2.]) diff --git a/BayesianKDEy/temperature_bandwidth_corr.py b/BayesianKDEy/temperature_bandwidth_corr.py new file mode 100644 index 0000000..562649f --- /dev/null +++ b/BayesianKDEy/temperature_bandwidth_corr.py @@ -0,0 +1,129 @@ +import os.path +import pickle +from collections import defaultdict +from pathlib import Path +import pandas as pd +import quapy as qp +from BayesianKDEy._bayeisan_kdey import BayesianKDEy +from BayesianKDEy.full_experiments import experiment_path +from quapy.protocol import UPP +import numpy as np +from tqdm import tqdm +from joblib import Parallel, delayed +from itertools import product + +# is there a correspondence between smaller bandwidths and overconfident likelihoods? 
if so, higher calibrated
+# temperatures should correlate with smaller bandwidths; this script analyzes this trend
+
+datasets = qp.datasets.UCI_MULTICLASS_DATASETS
+
+
+def show(results, values):
+    df_res = pd.DataFrame(results)
+    df_mean = (
+        df_res
+        .groupby(['bandwidth', 'temperature'], as_index=False)
+        .mean(numeric_only=True)
+    )
+    pivot = df_mean.pivot(
+        index='bandwidth',
+        columns='temperature',
+        values=values
+    )
+    import matplotlib.pyplot as plt
+
+    plt.imshow(pivot, origin='lower', aspect='auto')
+    plt.colorbar(label=values)
+
+    plt.xticks(range(len(pivot.columns)), pivot.columns)
+    plt.yticks(range(len(pivot.index)), pivot.index)
+
+    plt.xlabel('Temperature')
+    plt.ylabel('Bandwidth')
+    plt.title(f'{values} heatmap')
+
+    plt.savefig(f'./plotcorr/{values}.png')
+    plt.cla()
+    plt.clf()
+
+
+def run_experiment(
+        bandwidth,
+        temperature,
+        train,
+        test,
+        dataset,
+):
+    qp.environ['SAMPLE_SIZE'] = 1000
+    bakde = BayesianKDEy(
+        engine='numpyro',
+        bandwidth=bandwidth,
+        temperature=temperature,
+    )
+    bakde.fit(*train.Xy)
+
+    test_generator = UPP(test, repeats=20, random_state=0)
+
+    rows = []
+
+    for i, (sample, prev) in enumerate(
+            tqdm(
+                test_generator(),
+                desc=f"bw={bandwidth}, T={temperature}",
+                total=test_generator.total(),
+                leave=False,
+            )
+    ):
+        point_estimate, region = bakde.predict_conf(sample)
+
+        rows.append({
+            "bandwidth": bandwidth,
+            "temperature": temperature,
+            "dataset": dataset,
+            "sample": i,
+            "mae": qp.error.mae(prev, point_estimate),
+            "cov": region.coverage(prev),
+            "amp": region.montecarlo_proportion(n_trials=50_000),
+        })
+
+    return rows
+
+
+bandwidths = [0.001, 0.005, 0.01, 0.05, 0.1]
+temperatures = [0.5, 0.75, 1., 2., 5.]
+
+res_dir = './plotcorr/results'
+os.makedirs(res_dir, exist_ok=True)
+
+all_rows = []
+for i, dataset in enumerate(datasets):
+    if dataset in ['letter', 'isolet']: continue
+    res_path = f'{res_dir}/{dataset}.pkl'
+    if os.path.exists(res_path):
+        print(f'loading results from pickle {res_path}')
+        results_data_rows = pickle.load(open(res_path, 'rb'))
+    else:
+        print(f'{dataset=} [complete={i}/{len(datasets)}]')
+        data = qp.datasets.fetch_UCIMulticlassDataset(dataset)
+        train, test = data.train_test
+        jobs = list(product(bandwidths, temperatures))
+        results_data_rows = Parallel(n_jobs=-1, backend="loky")(
+            delayed(run_experiment)(bw, T, train, test, dataset) for bw, T in jobs
+        )
+        pickle.dump(results_data_rows, open(res_path, 'wb'), pickle.HIGHEST_PROTOCOL)
+    all_rows.extend(results_data_rows)
+
+results = defaultdict(list)
+for rows in all_rows:
+    for row in rows:
+        for k, v in row.items():
+            results[k].append(v)
+
+show(results, 'mae')
+show(results, 'cov')
+show(results, 'amp')

diff --git a/BayesianKDEy/temperature_calibration.py b/BayesianKDEy/temperature_calibration.py
new file mode 100644
index 0000000..6153c05
--- /dev/null
+++ b/BayesianKDEy/temperature_calibration.py
@@ -0,0 +1,114 @@
+from quapy.data import LabelledCollection
+from quapy.method.confidence import WithConfidenceABC
+from quapy.protocol import UPP
+import numpy as np
+from tqdm import tqdm
+import quapy as qp
+from joblib import Parallel, delayed
+import copy
+
+
+def temp_calibration(method: WithConfidenceABC,
+                     train: LabelledCollection,
+                     val: LabelledCollection,
+                     temp_grid=[1, 1.5, 2],
+                     num_samples=100,
+                     nominal_coverage=0.95,
+                     amplitude_threshold='auto',
+                     random_state=0,
+                     n_jobs=1,
+                     verbose=True):
+    """
+    Calibrates the temperature of a quantifier implementing WithConfidenceABC: the method is fit on train and
+    evaluated on val under artificial label shift (UPP); among the temperatures in temp_grid whose mean region
+    amplitude remains below amplitude_threshold, the one with empirical coverage closest to nominal_coverage
+    is returned (1.0 if no temperature qualifies).
+    """
+    assert amplitude_threshold == 'auto' or (isinstance(amplitude_threshold, float) and amplitude_threshold < 1.), \
f'wrong value for {amplitude_threshold=}, it must either be "auto" or a float < 1.0.' + + if amplitude_threshold=='auto': + n_classes = train.n_classes + amplitude_threshold = .1/np.log(n_classes+1) + + if isinstance(amplitude_threshold, float) and amplitude_threshold > 0.1: + print(f'warning: the {amplitude_threshold=} is too large; this may lead to uninformative regions') + + + method.fit(*train.Xy) + label_shift_prot = UPP(val, repeats=num_samples, random_state=random_state) + + # results = [] + # temp_grid = sorted(temp_grid) + # for temp in temp_grid: + # method.temperature = temp + # coverage = 0 + # amplitudes = [] + # errs = [] + # pbar = tqdm(enumerate(label_shift_prot()), total=label_shift_prot.total(), disable=not verbose) + # for i, (sample, prev) in pbar: + # point_estim, conf_region = method.predict_conf(sample) + # if prev in conf_region: + # coverage += 1 + # amplitudes.append(conf_region.montecarlo_proportion(n_trials=50_000)) + # errs.append(qp.error.mae(prev, point_estim)) + # if verbose: + # pbar.set_description( + # f'temperature={temp:.2f}, ' + # f'coverage={coverage/(i+1):.4f}, ' + # f'amplitude={np.mean(amplitudes):.4f},' + # f'mae={np.mean(errs):.4f}' + # ) + # + # mean_coverage = coverage / label_shift_prot.total() + # mean_amplitude = np.mean(amplitudes) + # + # if mean_amplitude < amplitude_threshold: + # results.append((temp, mean_coverage, mean_amplitude)) + # else: + # break + + def evaluate_temperature(temp): + local_method = copy.deepcopy(method) + local_method.temperature = temp + + coverage = 0 + amplitudes = [] + errs = [] + + for i, (sample, prev) in enumerate(label_shift_prot()): + point_estim, conf_region = local_method.predict_conf(sample) + + if prev in conf_region: + coverage += 1 + + amplitudes.append(conf_region.montecarlo_proportion(n_trials=50_000)) + errs.append(qp.error.mae(prev, point_estim)) + + mean_coverage = coverage / label_shift_prot.total() + mean_amplitude = np.mean(amplitudes) + + return temp, mean_coverage, mean_amplitude + + temp_grid = sorted(temp_grid) + + raw_results = Parallel(n_jobs=n_jobs, backend="loky")( + delayed(evaluate_temperature)(temp) + for temp in tqdm(temp_grid, disable=not verbose) + ) + results = [ + (temp, cov, amp) + for temp, cov, amp in raw_results + if amp < amplitude_threshold + ] + + chosen_temperature = 1. 
+ if len(results) > 0: + chosen_temperature = min(results, key=lambda x: abs(x[1]-nominal_coverage))[0] + + print(f'chosen_temperature={chosen_temperature:.2f}') + + return chosen_temperature + + + + + + diff --git a/quapy/data/datasets.py b/quapy/data/datasets.py index 801b968..7dc81ec 100644 --- a/quapy/data/datasets.py +++ b/quapy/data/datasets.py @@ -738,14 +738,29 @@ def fetch_UCIMulticlassLabelledCollection(dataset_name, data_home=None, min_clas print(f'Loading UCI Muticlass {dataset_name} ({fullname})') file = join(data_home, 'uci_multiclass', dataset_name+'.pkl') - + + def dummify_categorical_features(df_features, dataset_id): + CATEGORICAL_FEATURES = { + 158: ["S1", "C1", "S2", "C2", "S3", "C3", "S4", "C4", "S5", "C5"], # poker_hand + } + + categorical = CATEGORICAL_FEATURES.get(dataset_id, []) + + X = df_features.copy() + if categorical: + X[categorical] = X[categorical].astype("category") + X = pd.get_dummies(X, columns=categorical, drop_first=True) + + return X + def download(id, name): df = fetch_ucirepo(id=id) - df.data.features = pd.get_dummies(df.data.features, drop_first=True) - X, y = df.data.features.to_numpy(dtype=np.float64), df.data.targets.to_numpy().squeeze() + X_df = dummify_categorical_features(df.data.features, id) + X = X_df.to_numpy(dtype=np.float64) + y = df.data.targets.to_numpy().squeeze() - assert y.ndim == 1, 'more than one y' + assert y.ndim == 1, f'error: the dataset {id=} {name=} has more than one target variable' classes = np.sort(np.unique(y)) y = np.searchsorted(classes, y) diff --git a/quapy/error.py b/quapy/error.py index 2407eb0..fb3459f 100644 --- a/quapy/error.py +++ b/quapy/error.py @@ -139,7 +139,8 @@ def sre(prevs_true, prevs_hat, prevs_train, eps=0.): :math:`\\mathcal{Y}` are the classes of interest :param prevs_true: array-like, true prevalence values :param prevs_hat: array-like, estimated prevalence values - :param prevs_train: array-like, training prevalence values + :param prevs_train: array-like with the training prevalence values, or a single prevalence vector when + all the prevs_true and prevs_hat are computed on the same training set. :param eps: float, for smoothing the prevalence values. It is 0 by default (no smoothing), meaning the training prevalence is expected to be >0 everywhere. :return: the squared ratio error @@ -149,6 +150,8 @@ def sre(prevs_true, prevs_hat, prevs_train, eps=0.): prevs_train = np.asarray(prevs_train) assert prevs_true.shape == prevs_hat.shape, f'wrong shape {prevs_true.shape=} vs {prevs_hat.shape=}' assert prevs_true.shape[-1]==prevs_train.shape[-1], 'wrong shape for training prevalence' + if prevs_true.ndim == 2 and prevs_train.ndim == 1: + prevs_train = np.tile(prevs_train, reps=(prevs_true.shape[0], 1)) if eps>0: prevs_true = smooth(prevs_true, eps) prevs_hat = smooth(prevs_hat, eps) @@ -157,12 +160,13 @@ def sre(prevs_true, prevs_hat, prevs_train, eps=0.): N = prevs_true.shape[-1] w = prevs_true / prevs_train w_hat = prevs_hat / prevs_train - return (1./N) * (w - w_hat)**2. + return (1./N) * np.sum((w - w_hat)**2., axis=-1) def msre(prevs_true, prevs_hat, prevs_train, eps=0.): """ - Returns the mean across all experiments. See :function:`sre`. + Returns the mean :function:`sre` across all experiments. 
+ :param prevs_true: array-like, true prevalence values of shape (n_experiments, n_classes,) or (n_classes,) :param prevs_hat: array-like, estimated prevalence values of shape equal to prevs_true :param prevs_train: array-like, training prevalence values of (n_experiments, n_classes,) or (n_classes,) diff --git a/quapy/method/confidence.py b/quapy/method/confidence.py index 914b814..cc0673a 100644 --- a/quapy/method/confidence.py +++ b/quapy/method/confidence.py @@ -139,7 +139,7 @@ class WithConfidenceABC(ABC): """ Abstract class for confidence regions. """ - METHODS = ['intervals', 'ellipse', 'ellipse-clr'] + REGION_TYPE = ['intervals', 'ellipse', 'ellipse-clr', 'ellipse-ilr'] @abstractmethod def predict_conf(self, instances, confidence_level=0.95) -> (np.ndarray, ConfidenceRegionABC): @@ -185,6 +185,8 @@ class WithConfidenceABC(ABC): region = ConfidenceEllipseSimplex(prev_estims, confidence_level=confidence_level) elif method == 'ellipse-clr': region = ConfidenceEllipseCLR(prev_estims, confidence_level=confidence_level) + elif method == 'ellipse-ilr': + region = ConfidenceEllipseILR(prev_estims, confidence_level=confidence_level) if region is None: raise NotImplementedError(f'unknown method {method}')
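Note (not part of the diff): this patch does not show how BayesianKDEy applies `temperature` internally. The sketch below assumes the standard tempered-likelihood construction, in which the log-likelihood is divided by T before MCMC, so that T > 1 flattens the posterior and widens the credible regions, which is what the TODO item on coverage vs. amplitude is after. All names here are illustrative, not taken from the codebase.

    import numpy as np

    # Minimal sketch of likelihood tempering (assumption: the log-likelihood
    # is divided by T; the actual BayesianKDEy internals are not in this patch).
    theta = np.linspace(0.01, 0.99, 99)           # candidate prevalences (binary case)
    loglik = -0.5 * ((theta - 0.3) / 0.05) ** 2   # toy, sharply peaked log-likelihood

    for T in [1., 2., 10.]:
        post = np.exp(loglik / T)                 # tempered, unnormalized posterior
        post /= post.sum()
        mean = (theta * post).sum()
        std = np.sqrt(((theta - mean) ** 2 * post).sum())
        print(f'T={T:5.1f} -> posterior std={std:.4f}')   # spread grows with T

This also bears on the TODO question about bandwidth: for a single Gaussian kernel, raising the density to the power 1/T rescales the exponent exactly like inflating the bandwidth by sqrt(T) (up to normalization), but for a KDE mixture the two are not equivalent, since the power of a sum is not the sum of the powers.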
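Note (toy illustration of the selection rule in temp_calibration, using only the logic shown in this patch): among the temperatures whose mean region amplitude stays below the threshold, the function picks the one whose empirical coverage is closest to the nominal level; over-wide settings are discarded even if they cover more. The numbers below are made up.

    # Tuples are (temperature, mean coverage, mean amplitude), as returned
    # by evaluate_temperature for each value in the grid.
    raw_results = [(0.5, 0.81, 0.02), (1.0, 0.90, 0.05),
                   (2.0, 0.96, 0.09), (5.0, 0.99, 0.20)]
    nominal_coverage = 0.95
    amplitude_threshold = 0.10

    # keep only temperatures that still yield informative (narrow enough) regions
    results = [(t, c, a) for t, c, a in raw_results if a < amplitude_threshold]
    # among those, pick the coverage closest to the nominal one
    chosen_temperature = min(results, key=lambda x: abs(x[1] - nominal_coverage))[0]
    print(chosen_temperature)  # 2.0; T=5.0 is discarded as too wide despite higher coverage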
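Note (worked example of the fixed sre/msre in quapy/error.py): with this change, a single training prevalence vector is tiled across all experiments, and the squared ratio differences are summed over classes (axis=-1), so sre now returns one score per experiment rather than one per class.

    import numpy as np
    import quapy as qp

    prevs_true = np.asarray([[0.2, 0.3, 0.5],
                             [0.1, 0.6, 0.3]])
    prevs_hat = np.asarray([[0.25, 0.25, 0.5],
                            [0.2, 0.5, 0.3]])
    prevs_train = np.asarray([0.4, 0.4, 0.2])  # shared by both experiments, gets tiled

    print(qp.error.sre(prevs_true, prevs_hat, prevs_train))   # one score per experiment
    print(qp.error.msre(prevs_true, prevs_hat, prevs_train))  # their mean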
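Note (minimal check of the poker_hand dummification in quapy/data/datasets.py): for dataset id 158, the suit/rank columns are declared categorical and one-hot encoded with the first level dropped, while the remaining columns pass through unchanged. The toy frame below is illustrative, not the real UCI data.

    import pandas as pd

    df = pd.DataFrame({'S1': [1, 2, 3], 'C1': [5, 7, 5], 'other': [0.1, 0.2, 0.3]})
    df[['S1', 'C1']] = df[['S1', 'C1']].astype('category')
    X = pd.get_dummies(df, columns=['S1', 'C1'], drop_first=True)
    print(X.columns.tolist())  # ['other', 'S1_2', 'S1_3', 'C1_7']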