improved plots

This commit is contained in:
Alejandro Moreo Fernandez 2026-01-12 15:51:40 +01:00
parent 17c17ffd0f
commit 4b7fc77e90
7 changed files with 251 additions and 63 deletions

View File

@ -64,32 +64,32 @@ def methods():
only_binary = 'only_binary'
only_multiclass = 'only_multiclass'
# yield 'BootstrapACC', ACC(LR()), acc_hyper, lambda hyper: AggregativeBootstrap(ACC(LR()), n_test_samples=1000, random_state=0), multiclass_method
yield 'BootstrapACC', ACC(LR()), acc_hyper, lambda hyper: AggregativeBootstrap(ACC(LR()), n_test_samples=1000, random_state=0), multiclass_method
yield 'BayesianACC', ACC(LR()), acc_hyper, lambda hyper: BayesianCC(LR(), mcmc_seed=0), multiclass_method
# yield 'BootstrapEMQ', EMQ(LR(), on_calib_error='backup', val_split=5), emq_hyper, lambda hyper: AggregativeBootstrap(EMQ(LR(), on_calib_error='backup', calib=hyper['calib'], val_split=5), n_test_samples=1000, random_state=0), multiclass_method
yield 'BootstrapEMQ', EMQ(LR(), on_calib_error='backup', val_split=5), emq_hyper, lambda hyper: AggregativeBootstrap(EMQ(LR(), on_calib_error='backup', calib=hyper['calib'], val_split=5), n_test_samples=1000, random_state=0), multiclass_method
# yield 'BootstrapHDy', DMy(LR()), hdy_hyper, lambda hyper: AggregativeBootstrap(DMy(LR(), **hyper), n_test_samples=1000, random_state=0), multiclass_method
yield 'BootstrapHDy', DMy(LR()), hdy_hyper, lambda hyper: AggregativeBootstrap(DMy(LR(), **hyper), n_test_samples=1000, random_state=0), multiclass_method
# yield 'BayesianHDy', DMy(LR()), hdy_hyper, lambda hyper: PQ(LR(), stan_seed=0, **hyper), only_binary
#
# yield 'BootstrapKDEy', KDEyML(LR()), kdey_hyper, lambda hyper: AggregativeBootstrap(KDEyML(LR(), **hyper), n_test_samples=1000, random_state=0, verbose=True), multiclass_method
yield 'BayesianKDEy', KDEyML(LR()), kdey_hyper, lambda hyper: BayesianKDEy(mcmc_seed=0, **hyper), multiclass_method
yield 'BootstrapKDEy', KDEyML(LR()), kdey_hyper, lambda hyper: AggregativeBootstrap(KDEyML(LR(), **hyper), n_test_samples=1000, random_state=0, verbose=True), multiclass_method
# yield 'BayesianKDEy', KDEyML(LR()), kdey_hyper, lambda hyper: BayesianKDEy(mcmc_seed=0, **hyper), multiclass_method
# yield 'BayesianKDEy*', KDEyCLR(LR()), kdey_hyper_clr, lambda hyper: BayesianKDEy(kernel='aitchison', mcmc_seed=0, **hyper), multiclass_method
# yield 'BayKDEy*CLR', KDEyCLR(LR()), kdey_hyper_clr, lambda hyper: BayesianKDEy(kernel='aitchison', mcmc_seed=0, explore='clr', step_size=.15, **hyper), multiclass_method
# yield 'BayKDEy*CLR2', KDEyCLR(LR()), kdey_hyper_clr, lambda hyper: BayesianKDEy(kernel='aitchison', mcmc_seed=0, explore='clr', step_size=.05, **hyper), multiclass_method
# yield 'BayKDEy*ILR', KDEyCLR(LR()), kdey_hyper_clr, lambda hyper: BayesianKDEy(kernel='aitchison', mcmc_seed=0, explore='ilr', step_size=.15, **hyper), only_multiclass
# yield 'BayKDEy*ILR2', KDEyILR(LR()), kdey_hyper_clr, lambda hyper: BayesianKDEy(kernel='ilr', mcmc_seed=0, explore='ilr', step_size=.1, **hyper), only_multiclass
yield f'BaKDE-emcee', KDEyML(LR()), kdey_hyper, lambda hyper: BayesianKDEy(mcmc_seed=0, num_warmup=100, num_samples=100, step_size=.1, engine='emcee', **hyper), multiclass_method
yield f'BaKDE-numpyro', KDEyML(LR()), kdey_hyper, lambda hyper: BayesianKDEy( mcmc_seed=0, engine='numpyro', **hyper), multiclass_method
yield f'BaKDE-numpyro-T2', KDEyML(LR()), kdey_hyper, lambda hyper: BayesianKDEy(mcmc_seed=0, engine='numpyro', temperature=2., **hyper), multiclass_method
yield f'BaKDE-numpyro-T*', KDEyML(LR()), kdey_hyper, lambda hyper: BayesianKDEy(mcmc_seed=0, engine='numpyro', temperature=None, **hyper), multiclass_method
yield f'BaKDE-Ait-numpyro', KDEyCLR(LR()), kdey_hyper_clr, lambda hyper: BayesianKDEy(kernel='aitchison', mcmc_seed=0, engine='numpyro', **hyper), multiclass_method
yield f'BaKDE-Ait-numpyro-T*', KDEyCLR(LR()), kdey_hyper_clr, lambda hyper: BayesianKDEy(kernel='aitchison', mcmc_seed=0, engine='numpyro', temperature=None, **hyper), multiclass_method
# yield f'BaKDE-emcee', KDEyML(LR()), kdey_hyper, lambda hyper: BayesianKDEy(mcmc_seed=0, num_warmup=100, num_samples=100, step_size=.1, engine='emcee', **hyper), multiclass_method
# yield f'BaKDE-numpyro', KDEyML(LR()), kdey_hyper, lambda hyper: BayesianKDEy( mcmc_seed=0, engine='numpyro', **hyper), multiclass_method
# yield f'BaKDE-numpyro-T2', KDEyML(LR()), kdey_hyper, lambda hyper: BayesianKDEy(mcmc_seed=0, engine='numpyro', temperature=2., **hyper), multiclass_method
# yield f'BaKDE-numpyro-T*', KDEyML(LR()), kdey_hyper, lambda hyper: BayesianKDEy(mcmc_seed=0, engine='numpyro', temperature=None, **hyper), multiclass_method
# yield f'BaKDE-Ait-numpyro', KDEyCLR(LR()), kdey_hyper_clr, lambda hyper: BayesianKDEy(kernel='aitchison', mcmc_seed=0, engine='numpyro', **hyper), multiclass_method
# yield f'BaKDE-Ait-numpyro-T*', KDEyCLR(LR()), kdey_hyper_clr, lambda hyper: BayesianKDEy(kernel='aitchison', mcmc_seed=0, engine='numpyro', temperature=None, **hyper), multiclass_method
yield f'BaKDE-Ait-numpyro-T*-U', KDEyCLR(LR()), kdey_hyper_clr, lambda hyper: BayesianKDEy(kernel='aitchison', mcmc_seed=0, engine='numpyro', temperature=None, prior='uniform', **hyper), multiclass_method
yield f'BaKDE-Ait-numpyro-T*ILR', KDEyCLR(LR()), kdey_hyper_clr, lambda hyper: BayesianKDEy(kernel='aitchison', mcmc_seed=0, engine='numpyro', temperature=None, region='ellipse-ilr', **hyper), multiclass_method
yield f'BaKDE-numpyro-T10', KDEyML(LR()), kdey_hyper, lambda hyper: BayesianKDEy(mcmc_seed=0, engine='numpyro', temperature=10., **hyper), multiclass_method
yield f'BaKDE-numpyro*CLR', KDEyCLR(LR()), kdey_hyper_clr, lambda hyper: BayesianKDEy(kernel='aitchison', mcmc_seed=0, engine='numpyro', **hyper), multiclass_method
yield f'BaKDE-numpyro*ILR', KDEyILR(LR()), kdey_hyper_clr, lambda hyper: BayesianKDEy(kernel='ilr', mcmc_seed=0, engine='numpyro', **hyper), multiclass_method
# yield f'BaKDE-Ait-numpyro-T*ILR', KDEyCLR(LR()), kdey_hyper_clr, lambda hyper: BayesianKDEy(kernel='aitchison', mcmc_seed=0, engine='numpyro', temperature=None, region='ellipse-ilr', **hyper), multiclass_method
# yield f'BaKDE-numpyro-T10', KDEyML(LR()), kdey_hyper, lambda hyper: BayesianKDEy(mcmc_seed=0, engine='numpyro', temperature=10., **hyper), multiclass_method
# yield f'BaKDE-numpyro*CLR', KDEyCLR(LR()), kdey_hyper_clr, lambda hyper: BayesianKDEy(kernel='aitchison', mcmc_seed=0, engine='numpyro', **hyper), multiclass_method
# yield f'BaKDE-numpyro*ILR', KDEyILR(LR()), kdey_hyper_clr, lambda hyper: BayesianKDEy(kernel='ilr', mcmc_seed=0, engine='numpyro', **hyper), multiclass_method
def model_selection(train: LabelledCollection, point_quantifier: AggregativeQuantifier, grid: dict):
@ -165,18 +165,26 @@ def experiment_path(dir:Path, dataset_name:str, method_name:str):
return dir/f'{dataset_name}__{method_name}.pkl'
def fetch_UCI_binary(data_name):
    # Thin wrapper so the fetch function can be imported by name elsewhere
    # (e.g. from plotting scripts) instead of referencing qp.datasets directly.
    return qp.datasets.fetch_UCIBinaryDataset(data_name)
def fetch_UCI_multiclass(data_name):
    # Thin wrapper fixing min_class_support=0.01 so every caller (experiments
    # and plotting scripts) filters rare classes consistently.
    return qp.datasets.fetch_UCIMulticlassDataset(data_name, min_class_support=0.01)
if __name__ == '__main__':
binary = {
'datasets': qp.datasets.UCI_BINARY_DATASETS,
'fetch_fn': qp.datasets.fetch_UCIBinaryDataset,
'sample_size': 500 # previous: small 100, big 500
'fetch_fn': fetch_UCI_binary,
'sample_size': 500
}
multiclass = {
'datasets': qp.datasets.UCI_MULTICLASS_DATASETS,
'fetch_fn': qp.datasets.fetch_UCIMulticlassDataset,
'sample_size': 1000 # previous: small 200, big 1000
'fetch_fn': fetch_UCI_multiclass,
'sample_size': 1000
}
result_dir = Path('./results')

View File

@ -7,6 +7,7 @@ import pandas as pd
from glob import glob
from pathlib import Path
import quapy as qp
from BayesianKDEy.full_experiments import fetch_UCI_multiclass, fetch_UCI_binary
from error import dist_aitchison
from quapy.method.confidence import ConfidenceIntervals
from quapy.method.confidence import ConfidenceEllipseSimplex, ConfidenceEllipseCLR, ConfidenceEllipseILR, ConfidenceIntervals, ConfidenceRegionABC
@ -87,11 +88,12 @@ methods = ['BayesianACC', #'BayesianKDEy',
# 'BaKDE-numpyro-T10',
# 'BaKDE-numpyro-T*',
# 'BaKDE-Ait-numpyro',
'BaKDE-Ait-numpyro-T*',
# 'BaKDE-Ait-numpyro-T*ILR',
# 'BaKDE-Ait-numpyro-T*',
'BaKDE-Ait-numpyro-T*-U',
'BootstrapACC',
'BootstrapHDy',
'BootstrapKDEy'
'BootstrapKDEy',
'BootstrapEMQ'
]
def nicer(name:str):
@ -161,8 +163,8 @@ for setup in ['multiclass']:
tr_size = {}
for dataset in df['dataset'].unique():
fetch_fn = {
'binary': qp.datasets.fetch_UCIBinaryDataset,
'multiclass': qp.datasets.fetch_UCIMulticlassDataset
'binary': fetch_UCI_binary,
'multiclass': fetch_UCI_multiclass
}[setup]
data = fetch_fn(dataset)
n_classes[dataset] = data.n_classes

View File

@ -36,7 +36,10 @@ def get_region_colormap(name="blue", alpha=0.40):
def plot_prev_points(samples=None,
show_samples=True,
true_prev=None,
point_estim=None, train_prev=None, show_mean=True, show_legend=True,
point_estim=None,
train_prev=None,
show_mean=True,
show_legend=True,
region=None,
region_resolution=1000,
confine_region_in_simplex=False,
@ -100,9 +103,7 @@ def plot_prev_points(samples=None,
else:
in_simplex = np.full(shape=(region_resolution, region_resolution), fill_value=True, dtype=bool)
# --- Colormap: 0 → white, 1 → semi-transparent red ---
# iterar sobre todas las regiones
# iterate over regions
for (rname, rfun) in region_list:
mask = np.zeros_like(in_simplex, dtype=float)
valid_pts = pts_bary[in_simplex]
@ -127,7 +128,7 @@ def plot_prev_points(samples=None,
else:
raise ValueError(f'show_mean should either be a boolean (if True, then samples must be provided) or '
f'the mean point itself')
if train_prev is not None:
if true_prev is not None:
ax.scatter(*cartesian(true_prev), s=10, alpha=1, label='true-prev', edgecolors='black')
if point_estim is not None:
ax.scatter(*cartesian(point_estim), s=10, alpha=1, label='KDEy-estim', edgecolors='black')
@ -210,17 +211,112 @@ def plot_prev_points_matplot(points):
ax.axis('off')
plt.show()
# -------- new function
def cartesian(p):
    """
    Map barycentric coordinates on the 3-class simplex to 2D cartesian
    coordinates of the equilateral-triangle plot.

    Vertices: class 0 -> (0, 0), class 1 -> (1, 0), class 2 -> (0.5, sqrt(3)/2).

    :param p: array of shape (3,) or (n, 3) with barycentric coordinates
    :return: tuple (x, y) of arrays with the cartesian coordinates
    """
    # note: previous version computed an unused `dim = p.shape[-1]`; removed
    p = np.atleast_2d(p)
    x = p[:, 1] + p[:, 2] * 0.5
    y = p[:, 2] * np.sqrt(3) / 2
    return x, y
def barycentric_from_xy(x, y):
    """
    Inverse of :func:`cartesian`: maps 2D cartesian coordinates of the
    triangle plot back to barycentric coordinates (p1, p2, p3).

    :param x: cartesian x coordinate(s)
    :param y: cartesian y coordinate(s)
    :return: array with the barycentric coordinates stacked along the last axis
    """
    third = 2 * y / np.sqrt(3)        # p3 determined by the point's height
    second = x - 0.5 * third          # p2 after removing p3's x-offset
    first = 1 - second - third        # coordinates must sum to 1
    return np.stack([first, second, third], axis=-1)
def plot_regions(ax, region_layers, resolution, confine):
    """
    Rasterize and paint confidence-region layers onto a simplex plot.

    :param ax: matplotlib axes to draw on
    :param region_layers: iterable of dicts with key "fn" (membership function
        over barycentric points) and optional "color", "alpha", "label"
    :param resolution: number of grid points per axis
    :param confine: if True, evaluate the region functions only inside the simplex
    """
    grid_xs = np.linspace(-0.2, 1.2, resolution)
    grid_ys = np.linspace(-0.2, np.sqrt(3) / 2 + 0.2, resolution)
    mesh_x, mesh_y = np.meshgrid(grid_xs, grid_ys)
    bary = barycentric_from_xy(mesh_x, mesh_y)

    # optionally restrict evaluation to grid points lying inside the simplex
    if confine:
        inside = np.all(bary >= 0, axis=-1)
    else:
        inside = np.ones(mesh_x.shape, dtype=bool)

    for layer in region_layers:
        membership = np.zeros_like(inside, dtype=float)
        candidate_pts = bary[inside]
        membership[inside] = np.array([float(layer["fn"](pt)) for pt in candidate_pts])
        ax.pcolormesh(
            grid_xs, grid_ys, membership,
            shading="auto",
            cmap=get_region_colormap(layer.get("color", "blue")),
            alpha=layer.get("alpha", 0.3),
            label=layer.get("label", None),
        )
def plot_points(ax, point_layers):
    """
    Scatter-plot layers of prevalence points on a simplex plot.

    :param ax: matplotlib axes to draw on
    :param point_layers: iterable of dicts with key "points" (barycentric
        coordinates) and optional "label" and "style" (kwargs for ax.scatter)
    """
    for entry in point_layers:
        coords = cartesian(entry["points"])
        scatter_kwargs = entry.get("style", {})
        ax.scatter(*coords, label=entry.get("label", None), **scatter_kwargs)
def plot_simplex(
    point_layers=None,
    region_layers=None,
    region_resolution=1000,
    confine_region_in_simplex=False,
    show_legend=True,
    save_path=None,
):
    """
    Draw a ternary (3-class simplex) plot composed of optional point layers
    and confidence-region layers.

    :param point_layers: layers of prevalence points (see :func:`plot_points`)
    :param region_layers: layers of confidence regions (see :func:`plot_regions`)
    :param region_resolution: grid resolution used to rasterize the regions
    :param confine_region_in_simplex: if True, clip region evaluation to the simplex
    :param show_legend: whether to display a legend next to the plot
    :param save_path: if given, save the figure to this path; otherwise show it
    """
    fig, ax = plt.subplots(figsize=(6, 6))

    if region_layers:
        plot_regions(ax, region_layers, region_resolution, confine_region_in_simplex)
    if point_layers:
        plot_points(ax, point_layers)

    # draw the triangular boundary of the simplex
    vertices = np.array([[0, 0], [1, 0], [0.5, np.sqrt(3) / 2], [0, 0]])
    ax.plot(vertices[:, 0], vertices[:, 1], color="black")

    # annotate the three vertices with their class labels
    ax.text(-0.05, -0.05, "Y=1", ha="right", va="top")
    ax.text(1.05, -0.05, "Y=2", ha="left", va="top")
    ax.text(0.5, np.sqrt(3) / 2 + 0.05, "Y=3", ha="center", va="bottom")

    ax.set_aspect("equal")
    ax.axis("off")
    if show_legend:
        ax.legend(loc="center left", bbox_to_anchor=(1.05, 0.5))
    plt.tight_layout()

    if save_path:
        plt.savefig(save_path)
    else:
        plt.show()
if __name__ == '__main__':
np.random.seed(1)
n = 1000
# alpha = [3,5,10]
alpha = [10,1,1]
prevs = np.random.dirichlet(alpha, size=n)
# n = 1000
# alpha = [1,1,1]
# prevs = np.random.dirichlet(alpha, size=n)
def regions():
confs = [0.99, 0.95, 0.90]
yield 'CI', [(f'{int(c*100)}%', CI(prevs, confidence_level=c).coverage) for c in confs]
# def regions():
# confs = [0.99, 0.95, 0.90]
# yield 'CI', [(f'{int(c*100)}%', CI(prevs, confidence_level=c).coverage) for c in confs]
# yield 'CI-b', [(f'{int(c * 100)}%', CI(prevs, confidence_level=c, bonferroni_correction=True).coverage) for c in confs]
# yield 'CE', [(f'{int(c*100)}%', CE(prevs, confidence_level=c).coverage) for c in confs]
# yield 'CLR', [(f'{int(c*100)}%', CLR(prevs, confidence_level=c).coverage) for c in confs]
@ -234,25 +330,89 @@ if __name__ == '__main__':
# save_path=f'./plots/simplex_{crname}_alpha{alpha_str}_res{resolution}.png',
# )
def regions():
confs = [0.99, 0.95, 0.90]
yield 'CI', [(f'{int(c*100)}%', CI(prevs, confidence_level=c).coverage) for c in confs]
# def regions():
# confs = [0.99, 0.95, 0.90]
# yield 'CI', [(f'{int(c*100)}%', CI(prevs, confidence_level=c).coverage) for c in confs]
# yield 'CI-b', [(f'{int(c * 100)}%', CI(prevs, confidence_level=c, bonferroni_correction=True).coverage) for c in confs]
# yield 'CE', [(f'{int(c*100)}%', CE(prevs, confidence_level=c).coverage) for c in confs]
# yield 'CLR', [(f'{int(c*100)}%', CLR(prevs, confidence_level=c).coverage) for c in confs]
# yield 'ILR', [(f'{int(c*100)}%', ILR(prevs, confidence_level=c).coverage) for c in confs]
resolution = 1000
alpha_str = ','.join([f'{str(i)}' for i in alpha])
region = ILR(prevs, confidence_level=.99)
p = np.asarray([0.1, 0.8, 0.1])
plot_prev_points(prevs, show_samples=False,
show_mean=region.mean_,
# show_mean=prevs.mean(axis=0),
show_legend=False, region=[('', region.coverage)], region_resolution=resolution,
color='blue',
true_prev=p,
train_prev=region.closest_point_in_region(p),
save_path=f'./plots3/simplex_ilr.png',
)
# resolution = 100
# alpha_str = ','.join([f'{str(i)}' for i in alpha])
# region = CI(prevs, confidence_level=.95, bonferroni_correction=True)
# p = None # np.asarray([0.1, 0.8, 0.1])
# plot_prev_points(prevs,
# show_samples=True,
# show_mean=None,
# # show_mean=prevs.mean(axis=0),
# show_legend=False,
# # region=[('', region.coverage)],
# # region_resolution=resolution,
# color='blue',
# true_prev=p,
# # train_prev=region.closest_point_in_region(p),
# save_path=f'./plots/prior_test/uniform.png',
# )
plt.rcParams.update({
'font.size': 10,
'axes.titlesize': 12,
'axes.labelsize': 10,
'xtick.labelsize': 8,
'ytick.labelsize': 8,
'legend.fontsize': 9,
})
n = 1000
train_style = {"color": "blue", "alpha": 0.5, "s":15, 'linewidth':0.5, 'edgecolors':None}
test_style = {"color": "red", "alpha": 0.5, "s": 15, 'linewidth': 0.5, 'edgecolors': None}
# train_prevs = np.random.dirichlet(alpha=[1, 1, 1], size=n)
# test_prevs = np.random.dirichlet(alpha=[1, 1, 1], size=n)
# plot_simplex(
# point_layers=[
# {"points": train_prevs, "label": "train", "style": train_style},
# {"points": test_prevs, "label": "test", "style": test_style},
# ],
# save_path=f'./plots/prior_test/uniform.png'
# )
alpha = [40, 10, 10]
train_prevs = np.random.dirichlet(alpha=alpha, size=n)
test_prevs = np.random.dirichlet(alpha=alpha, size=n)
plot_simplex(
point_layers=[
{"points": train_prevs, "label": "train", "style": train_style},
{"points": test_prevs, "label": "test", "style": test_style},
],
save_path=f'./plots/prior_test/informative.png'
)
# train_prevs = np.random.dirichlet(alpha=[8, 1, 1], size=n)
# test_prevs = np.random.dirichlet(alpha=[1, 8, 1], size=n)
# plot_simplex(
# point_layers=[
# {"points": train_prevs, "label": "train", "style": train_style},
# {"points": test_prevs, "label": "test", "style": test_style},
# ],
# save_path=f'./plots/prior_test/wrong.png'
# )
p = 0.6
K = 3
alpha = [p] + [(1. - p) / (K - 1)] * (K - 1)
alpha = np.array(alpha)
for c in [100, 500, 1_000]:
alpha_c = alpha * c
train_prevs = np.random.dirichlet(alpha=alpha_c, size=n)
test_prevs = np.random.dirichlet(alpha=alpha_c[::-1], size=n)
plot_simplex(
point_layers=[
{"points": train_prevs, "label": "train", "style": train_style},
{"points": test_prevs, "label": "test", "style": test_style},
],
save_path=f'./plots/prior_test/concentration_{c}.png'
)

View File

@ -0,0 +1,11 @@
# Print Dirichlet concentration vectors for several total-count scales:
# one class carries probability p, the remaining n-1 classes share the rest
# uniformly; scaling by c controls how concentrated the Dirichlet is.
import numpy as np

n = 3
p = 0.5
alpha = np.array([p] + [(1. - p) / (n - 1)] * (n - 1))
for c in [1_000, 5_000, 10_000]:
    print(alpha * c)

View File

@ -294,7 +294,7 @@ The datasets correspond to a part of the datasets that can be retrieved from the
* containing at least 1,000 instances
* can be imported using the Python API.
Some statistics about these datasets are displayed below :
Some statistics about these datasets (after applying default filters) are displayed below:
| **Dataset** | **classes** | **instances** | **features** | **prevs** | **type** |
|:------------|:-----------:|:-------------:|:------------:|:----------|:--------:|

View File

@ -663,8 +663,8 @@ def fetch_UCIMulticlassLabelledCollection(dataset_name, data_home=None, min_clas
:param dataset_name: a dataset name
:param data_home: specify the quapy home directory where the dataset will be dumped (leave empty to use the default
~/quay_data/ directory)
:param min_class_support: minimum number of istances per class. Classes with fewer instances
are discarded (deafult is 100)
:param min_class_support: integer or float, the minimum number or proportion of instances per class.
Classes with fewer instances are discarded (default is 100).
:param standardize: indicates whether the covariates should be standardized or not (default is True).
:param verbose: set to True (default is False) to get information (stats) about the dataset
:return: a :class:`quapy.data.base.LabelledCollection` instance
@ -673,7 +673,12 @@ def fetch_UCIMulticlassLabelledCollection(dataset_name, data_home=None, min_clas
f'Name {dataset_name} does not match any known dataset from the ' \
f'UCI Machine Learning datasets repository (multiclass). ' \
f'Valid ones are {UCI_MULTICLASS_DATASETS}'
assert (min_class_support is None or
((isinstance(min_class_support, int) and min_class_support>=0) or
(isinstance(min_class_support, float) and 0. <= min_class_support < 1.))), \
f'invalid value for {min_class_support=}; expected non negative integer or float in [0,1)'
if data_home is None:
data_home = get_quapy_home()
@ -766,12 +771,14 @@ def fetch_UCIMulticlassLabelledCollection(dataset_name, data_home=None, min_clas
y = np.searchsorted(classes, y)
return LabelledCollection(X, y)
def filter_classes(data: LabelledCollection, min_ipc):
if min_ipc is None:
min_ipc = 0
def filter_classes(data: LabelledCollection, min_class_support):
if min_class_support is None or min_class_support == 0.:
return data
if isinstance(min_class_support, float):
min_class_support = int(len(data) * min_class_support)
classes = data.classes_
# restrict classes to only those with at least min_ipc instances
classes = classes[data.counts() >= min_ipc]
# restrict classes to only those with at least min_class_support instances
classes = classes[data.counts() >= min_class_support]
# filter X and y keeping only datapoints belonging to valid classes
filter_idx = np.isin(data.y, classes)
X, y = data.X[filter_idx], data.y[filter_idx]

View File

@ -282,7 +282,7 @@ def l1_norm(prevalences: ArrayLike) -> np.ndarray:
"""
n_classes = prevalences.shape[-1]
accum = prevalences.sum(axis=-1, keepdims=True)
prevalences = np.true_divide(prevalences, accum, where=accum > 0)
prevalences = np.true_divide(prevalences, accum, where=accum > 0, out=None)
allzeros = accum.flatten() == 0
if any(allzeros):
if prevalences.ndim == 1: