From 4b7fc77e90855a5b43710c73790736280532cc97 Mon Sep 17 00:00:00 2001 From: Alejandro Moreo Date: Mon, 12 Jan 2026 15:51:40 +0100 Subject: [PATCH] improved plots --- BayesianKDEy/full_experiments.py | 46 ++++--- BayesianKDEy/generate_results.py | 12 +- BayesianKDEy/plot_simplex.py | 218 +++++++++++++++++++++++++++---- BayesianKDEy/prior_effect.py | 11 ++ docs/source/manuals/datasets.md | 2 +- quapy/data/datasets.py | 23 ++-- quapy/functional.py | 2 +- 7 files changed, 251 insertions(+), 63 deletions(-) create mode 100644 BayesianKDEy/prior_effect.py diff --git a/BayesianKDEy/full_experiments.py b/BayesianKDEy/full_experiments.py index 3ca28f1..c2ac1c8 100644 --- a/BayesianKDEy/full_experiments.py +++ b/BayesianKDEy/full_experiments.py @@ -64,32 +64,32 @@ def methods(): only_binary = 'only_binary' only_multiclass = 'only_multiclass' - # yield 'BootstrapACC', ACC(LR()), acc_hyper, lambda hyper: AggregativeBootstrap(ACC(LR()), n_test_samples=1000, random_state=0), multiclass_method + yield 'BootstrapACC', ACC(LR()), acc_hyper, lambda hyper: AggregativeBootstrap(ACC(LR()), n_test_samples=1000, random_state=0), multiclass_method yield 'BayesianACC', ACC(LR()), acc_hyper, lambda hyper: BayesianCC(LR(), mcmc_seed=0), multiclass_method - # yield 'BootstrapEMQ', EMQ(LR(), on_calib_error='backup', val_split=5), emq_hyper, lambda hyper: AggregativeBootstrap(EMQ(LR(), on_calib_error='backup', calib=hyper['calib'], val_split=5), n_test_samples=1000, random_state=0), multiclass_method + yield 'BootstrapEMQ', EMQ(LR(), on_calib_error='backup', val_split=5), emq_hyper, lambda hyper: AggregativeBootstrap(EMQ(LR(), on_calib_error='backup', calib=hyper['calib'], val_split=5), n_test_samples=1000, random_state=0), multiclass_method - # yield 'BootstrapHDy', DMy(LR()), hdy_hyper, lambda hyper: AggregativeBootstrap(DMy(LR(), **hyper), n_test_samples=1000, random_state=0), multiclass_method + yield 'BootstrapHDy', DMy(LR()), hdy_hyper, lambda hyper: AggregativeBootstrap(DMy(LR(), 
**hyper), n_test_samples=1000, random_state=0), multiclass_method # yield 'BayesianHDy', DMy(LR()), hdy_hyper, lambda hyper: PQ(LR(), stan_seed=0, **hyper), only_binary # - # yield 'BootstrapKDEy', KDEyML(LR()), kdey_hyper, lambda hyper: AggregativeBootstrap(KDEyML(LR(), **hyper), n_test_samples=1000, random_state=0, verbose=True), multiclass_method - yield 'BayesianKDEy', KDEyML(LR()), kdey_hyper, lambda hyper: BayesianKDEy(mcmc_seed=0, **hyper), multiclass_method + yield 'BootstrapKDEy', KDEyML(LR()), kdey_hyper, lambda hyper: AggregativeBootstrap(KDEyML(LR(), **hyper), n_test_samples=1000, random_state=0, verbose=True), multiclass_method + # yield 'BayesianKDEy', KDEyML(LR()), kdey_hyper, lambda hyper: BayesianKDEy(mcmc_seed=0, **hyper), multiclass_method # yield 'BayesianKDEy*', KDEyCLR(LR()), kdey_hyper_clr, lambda hyper: BayesianKDEy(kernel='aitchison', mcmc_seed=0, **hyper), multiclass_method # yield 'BayKDEy*CLR', KDEyCLR(LR()), kdey_hyper_clr, lambda hyper: BayesianKDEy(kernel='aitchison', mcmc_seed=0, explore='clr', step_size=.15, **hyper), multiclass_method # yield 'BayKDEy*CLR2', KDEyCLR(LR()), kdey_hyper_clr, lambda hyper: BayesianKDEy(kernel='aitchison', mcmc_seed=0, explore='clr', step_size=.05, **hyper), multiclass_method # yield 'BayKDEy*ILR', KDEyCLR(LR()), kdey_hyper_clr, lambda hyper: BayesianKDEy(kernel='aitchison', mcmc_seed=0, explore='ilr', step_size=.15, **hyper), only_multiclass # yield 'BayKDEy*ILR2', KDEyILR(LR()), kdey_hyper_clr, lambda hyper: BayesianKDEy(kernel='ilr', mcmc_seed=0, explore='ilr', step_size=.1, **hyper), only_multiclass - yield f'BaKDE-emcee', KDEyML(LR()), kdey_hyper, lambda hyper: BayesianKDEy(mcmc_seed=0, num_warmup=100, num_samples=100, step_size=.1, engine='emcee', **hyper), multiclass_method - yield f'BaKDE-numpyro', KDEyML(LR()), kdey_hyper, lambda hyper: BayesianKDEy( mcmc_seed=0, engine='numpyro', **hyper), multiclass_method - yield f'BaKDE-numpyro-T2', KDEyML(LR()), kdey_hyper, lambda hyper: 
BayesianKDEy(mcmc_seed=0, engine='numpyro', temperature=2., **hyper), multiclass_method - yield f'BaKDE-numpyro-T*', KDEyML(LR()), kdey_hyper, lambda hyper: BayesianKDEy(mcmc_seed=0, engine='numpyro', temperature=None, **hyper), multiclass_method - yield f'BaKDE-Ait-numpyro', KDEyCLR(LR()), kdey_hyper_clr, lambda hyper: BayesianKDEy(kernel='aitchison', mcmc_seed=0, engine='numpyro', **hyper), multiclass_method - yield f'BaKDE-Ait-numpyro-T*', KDEyCLR(LR()), kdey_hyper_clr, lambda hyper: BayesianKDEy(kernel='aitchison', mcmc_seed=0, engine='numpyro', temperature=None, **hyper), multiclass_method + # yield f'BaKDE-emcee', KDEyML(LR()), kdey_hyper, lambda hyper: BayesianKDEy(mcmc_seed=0, num_warmup=100, num_samples=100, step_size=.1, engine='emcee', **hyper), multiclass_method + # yield f'BaKDE-numpyro', KDEyML(LR()), kdey_hyper, lambda hyper: BayesianKDEy( mcmc_seed=0, engine='numpyro', **hyper), multiclass_method + # yield f'BaKDE-numpyro-T2', KDEyML(LR()), kdey_hyper, lambda hyper: BayesianKDEy(mcmc_seed=0, engine='numpyro', temperature=2., **hyper), multiclass_method + # yield f'BaKDE-numpyro-T*', KDEyML(LR()), kdey_hyper, lambda hyper: BayesianKDEy(mcmc_seed=0, engine='numpyro', temperature=None, **hyper), multiclass_method + # yield f'BaKDE-Ait-numpyro', KDEyCLR(LR()), kdey_hyper_clr, lambda hyper: BayesianKDEy(kernel='aitchison', mcmc_seed=0, engine='numpyro', **hyper), multiclass_method + # yield f'BaKDE-Ait-numpyro-T*', KDEyCLR(LR()), kdey_hyper_clr, lambda hyper: BayesianKDEy(kernel='aitchison', mcmc_seed=0, engine='numpyro', temperature=None, **hyper), multiclass_method yield f'BaKDE-Ait-numpyro-T*-U', KDEyCLR(LR()), kdey_hyper_clr, lambda hyper: BayesianKDEy(kernel='aitchison', mcmc_seed=0, engine='numpyro', temperature=None, prior='uniform', **hyper), multiclass_method - yield f'BaKDE-Ait-numpyro-T*ILR', KDEyCLR(LR()), kdey_hyper_clr, lambda hyper: BayesianKDEy(kernel='aitchison', mcmc_seed=0, engine='numpyro', temperature=None, region='ellipse-ilr', 
**hyper), multiclass_method - yield f'BaKDE-numpyro-T10', KDEyML(LR()), kdey_hyper, lambda hyper: BayesianKDEy(mcmc_seed=0, engine='numpyro', temperature=10., **hyper), multiclass_method - yield f'BaKDE-numpyro*CLR', KDEyCLR(LR()), kdey_hyper_clr, lambda hyper: BayesianKDEy(kernel='aitchison', mcmc_seed=0, engine='numpyro', **hyper), multiclass_method - yield f'BaKDE-numpyro*ILR', KDEyILR(LR()), kdey_hyper_clr, lambda hyper: BayesianKDEy(kernel='ilr', mcmc_seed=0, engine='numpyro', **hyper), multiclass_method + # yield f'BaKDE-Ait-numpyro-T*ILR', KDEyCLR(LR()), kdey_hyper_clr, lambda hyper: BayesianKDEy(kernel='aitchison', mcmc_seed=0, engine='numpyro', temperature=None, region='ellipse-ilr', **hyper), multiclass_method + # yield f'BaKDE-numpyro-T10', KDEyML(LR()), kdey_hyper, lambda hyper: BayesianKDEy(mcmc_seed=0, engine='numpyro', temperature=10., **hyper), multiclass_method + # yield f'BaKDE-numpyro*CLR', KDEyCLR(LR()), kdey_hyper_clr, lambda hyper: BayesianKDEy(kernel='aitchison', mcmc_seed=0, engine='numpyro', **hyper), multiclass_method + # yield f'BaKDE-numpyro*ILR', KDEyILR(LR()), kdey_hyper_clr, lambda hyper: BayesianKDEy(kernel='ilr', mcmc_seed=0, engine='numpyro', **hyper), multiclass_method def model_selection(train: LabelledCollection, point_quantifier: AggregativeQuantifier, grid: dict): @@ -165,18 +165,26 @@ def experiment_path(dir:Path, dataset_name:str, method_name:str): return dir/f'{dataset_name}__{method_name}.pkl' +def fetch_UCI_binary(data_name): + return qp.datasets.fetch_UCIBinaryDataset(data_name) + + +def fetch_UCI_multiclass(data_name): + return qp.datasets.fetch_UCIMulticlassDataset(data_name, min_class_support=0.01) + + if __name__ == '__main__': binary = { 'datasets': qp.datasets.UCI_BINARY_DATASETS, - 'fetch_fn': qp.datasets.fetch_UCIBinaryDataset, - 'sample_size': 500 # previous: small 100, big 500 + 'fetch_fn': fetch_UCI_binary, + 'sample_size': 500 } multiclass = { 'datasets': qp.datasets.UCI_MULTICLASS_DATASETS, - 'fetch_fn': 
qp.datasets.fetch_UCIMulticlassDataset, - 'sample_size': 1000 # previous: small 200, big 1000 + 'fetch_fn': fetch_UCI_multiclass, + 'sample_size': 1000 } result_dir = Path('./results') diff --git a/BayesianKDEy/generate_results.py b/BayesianKDEy/generate_results.py index 862832a..0e9b90c 100644 --- a/BayesianKDEy/generate_results.py +++ b/BayesianKDEy/generate_results.py @@ -7,6 +7,7 @@ import pandas as pd from glob import glob from pathlib import Path import quapy as qp +from BayesianKDEy.full_experiments import fetch_UCI_multiclass, fetch_UCI_binary from error import dist_aitchison from quapy.method.confidence import ConfidenceIntervals from quapy.method.confidence import ConfidenceEllipseSimplex, ConfidenceEllipseCLR, ConfidenceEllipseILR, ConfidenceIntervals, ConfidenceRegionABC @@ -87,11 +88,12 @@ methods = ['BayesianACC', #'BayesianKDEy', # 'BaKDE-numpyro-T10', # 'BaKDE-numpyro-T*', # 'BaKDE-Ait-numpyro', - 'BaKDE-Ait-numpyro-T*', - # 'BaKDE-Ait-numpyro-T*ILR', + # 'BaKDE-Ait-numpyro-T*', + 'BaKDE-Ait-numpyro-T*-U', 'BootstrapACC', 'BootstrapHDy', - 'BootstrapKDEy' + 'BootstrapKDEy', + 'BootstrapEMQ' ] def nicer(name:str): @@ -161,8 +163,8 @@ for setup in ['multiclass']: tr_size = {} for dataset in df['dataset'].unique(): fetch_fn = { - 'binary': qp.datasets.fetch_UCIBinaryDataset, - 'multiclass': qp.datasets.fetch_UCIMulticlassDataset + 'binary': fetch_UCI_binary, + 'multiclass': fetch_UCI_multiclass }[setup] data = fetch_fn(dataset) n_classes[dataset] = data.n_classes diff --git a/BayesianKDEy/plot_simplex.py b/BayesianKDEy/plot_simplex.py index d0128ba..ec752be 100644 --- a/BayesianKDEy/plot_simplex.py +++ b/BayesianKDEy/plot_simplex.py @@ -36,7 +36,10 @@ def get_region_colormap(name="blue", alpha=0.40): def plot_prev_points(samples=None, show_samples=True, true_prev=None, - point_estim=None, train_prev=None, show_mean=True, show_legend=True, + point_estim=None, + train_prev=None, + show_mean=True, + show_legend=True, region=None, region_resolution=1000, 
confine_region_in_simplex=False, @@ -100,9 +103,7 @@ def plot_prev_points(samples=None, else: in_simplex = np.full(shape=(region_resolution, region_resolution), fill_value=True, dtype=bool) - # --- Colormap 0 → blanco, 1 → rojo semitransparente --- - - # iterar sobre todas las regiones + # iterate over regions for (rname, rfun) in region_list: mask = np.zeros_like(in_simplex, dtype=float) valid_pts = pts_bary[in_simplex] @@ -127,7 +128,7 @@ def plot_prev_points(samples=None, else: raise ValueError(f'show_mean should either be a boolean (if True, then samples must be provided) or ' f'the mean point itself') - if train_prev is not None: + if true_prev is not None: ax.scatter(*cartesian(true_prev), s=10, alpha=1, label='true-prev', edgecolors='black') if point_estim is not None: ax.scatter(*cartesian(point_estim), s=10, alpha=1, label='KDEy-estim', edgecolors='black') @@ -210,17 +211,112 @@ def plot_prev_points_matplot(points): ax.axis('off') plt.show() +# -------- new function + +def cartesian(p): + dim = p.shape[-1] + p = np.atleast_2d(p) + x = p[:, 1] + p[:, 2] * 0.5 + y = p[:, 2] * np.sqrt(3) / 2 + return x, y + + +def barycentric_from_xy(x, y): + """ + Given cartesian (x,y) in simplex returns baricentric coordinates (p1,p2,p3). 
+ """ + p3 = 2 * y / np.sqrt(3) + p2 = x - 0.5 * p3 + p1 = 1 - p2 - p3 + return np.stack([p1, p2, p3], axis=-1) + + +def plot_regions(ax, region_layers, resolution, confine): + xs = np.linspace(-0.2, 1.2, resolution) + ys = np.linspace(-0.2, np.sqrt(3)/2 + 0.2, resolution) + grid_x, grid_y = np.meshgrid(xs, ys) + + pts_bary = barycentric_from_xy(grid_x, grid_y) + + if confine: + mask_simplex = np.all(pts_bary >= 0, axis=-1) + else: + mask_simplex = np.ones(grid_x.shape, dtype=bool) + + for region in region_layers: + mask = np.zeros_like(mask_simplex, dtype=float) + valid_pts = pts_bary[mask_simplex] + mask_vals = np.array([float(region["fn"](p)) for p in valid_pts]) + mask[mask_simplex] = mask_vals + + ax.pcolormesh( + xs, ys, mask, + shading="auto", + cmap=get_region_colormap(region.get("color", "blue")), + alpha=region.get("alpha", 0.3), + label=region.get("label", None), + ) + + +def plot_points(ax, point_layers): + for layer in point_layers: + pts = layer["points"] + style = layer.get("style", {}) + ax.scatter( + *cartesian(pts), + label=layer.get("label", None), + **style + ) + + +def plot_simplex( + point_layers=None, + region_layers=None, + region_resolution=1000, + confine_region_in_simplex=False, + show_legend=True, + save_path=None, +): + fig, ax = plt.subplots(figsize=(6, 6)) + + if region_layers: + plot_regions(ax, region_layers, region_resolution, confine_region_in_simplex) + + if point_layers: + plot_points(ax, point_layers) + + # simplex edges + triangle = np.array([[0,0],[1,0],[0.5,np.sqrt(3)/2],[0,0]]) + ax.plot(triangle[:,0], triangle[:,1], color="black") + + # labels + ax.text(-0.05, -0.05, "Y=1", ha="right", va="top") + ax.text(1.05, -0.05, "Y=2", ha="left", va="top") + ax.text(0.5, np.sqrt(3)/2 + 0.05, "Y=3", ha="center", va="bottom") + + ax.set_aspect("equal") + ax.axis("off") + + if show_legend: + ax.legend(loc="center left", bbox_to_anchor=(1.05, 0.5)) + + plt.tight_layout() + if save_path: + plt.savefig(save_path) + else: + plt.show() + + 
if __name__ == '__main__': np.random.seed(1) - n = 1000 - # alpha = [3,5,10] - alpha = [10,1,1] - prevs = np.random.dirichlet(alpha, size=n) + # n = 1000 + # alpha = [1,1,1] + # prevs = np.random.dirichlet(alpha, size=n) - def regions(): - confs = [0.99, 0.95, 0.90] - yield 'CI', [(f'{int(c*100)}%', CI(prevs, confidence_level=c).coverage) for c in confs] + # def regions(): + # confs = [0.99, 0.95, 0.90] + # yield 'CI', [(f'{int(c*100)}%', CI(prevs, confidence_level=c).coverage) for c in confs] # yield 'CI-b', [(f'{int(c * 100)}%', CI(prevs, confidence_level=c, bonferroni_correction=True).coverage) for c in confs] # yield 'CE', [(f'{int(c*100)}%', CE(prevs, confidence_level=c).coverage) for c in confs] # yield 'CLR', [(f'{int(c*100)}%', CLR(prevs, confidence_level=c).coverage) for c in confs] @@ -234,25 +330,89 @@ if __name__ == '__main__': # save_path=f'./plots/simplex_{crname}_alpha{alpha_str}_res{resolution}.png', # ) - - def regions(): - confs = [0.99, 0.95, 0.90] - yield 'CI', [(f'{int(c*100)}%', CI(prevs, confidence_level=c).coverage) for c in confs] + # def regions(): + # confs = [0.99, 0.95, 0.90] + # yield 'CI', [(f'{int(c*100)}%', CI(prevs, confidence_level=c).coverage) for c in confs] # yield 'CI-b', [(f'{int(c * 100)}%', CI(prevs, confidence_level=c, bonferroni_correction=True).coverage) for c in confs] # yield 'CE', [(f'{int(c*100)}%', CE(prevs, confidence_level=c).coverage) for c in confs] # yield 'CLR', [(f'{int(c*100)}%', CLR(prevs, confidence_level=c).coverage) for c in confs] # yield 'ILR', [(f'{int(c*100)}%', ILR(prevs, confidence_level=c).coverage) for c in confs] - resolution = 1000 - alpha_str = ','.join([f'{str(i)}' for i in alpha]) - region = ILR(prevs, confidence_level=.99) - p = np.asarray([0.1, 0.8, 0.1]) - plot_prev_points(prevs, show_samples=False, - show_mean=region.mean_, - # show_mean=prevs.mean(axis=0), - show_legend=False, region=[('', region.coverage)], region_resolution=resolution, - color='blue', - true_prev=p, - 
train_prev=region.closest_point_in_region(p), - save_path=f'./plots3/simplex_ilr.png', - ) + # resolution = 100 + # alpha_str = ','.join([f'{str(i)}' for i in alpha]) + # region = CI(prevs, confidence_level=.95, bonferroni_correction=True) + # p = None # np.asarray([0.1, 0.8, 0.1]) + # plot_prev_points(prevs, + # show_samples=True, + # show_mean=None, + # # show_mean=prevs.mean(axis=0), + # show_legend=False, + # # region=[('', region.coverage)], + # # region_resolution=resolution, + # color='blue', + # true_prev=p, + # # train_prev=region.closest_point_in_region(p), + # save_path=f'./plots/prior_test/uniform.png', + # ) + + plt.rcParams.update({ + 'font.size': 10, + 'axes.titlesize': 12, + 'axes.labelsize': 10, + 'xtick.labelsize': 8, + 'ytick.labelsize': 8, + 'legend.fontsize': 9, + }) + + n = 1000 + train_style = {"color": "blue", "alpha": 0.5, "s":15, 'linewidth':0.5, 'edgecolors':None} + test_style = {"color": "red", "alpha": 0.5, "s": 15, 'linewidth': 0.5, 'edgecolors': None} + + # train_prevs = np.random.dirichlet(alpha=[1, 1, 1], size=n) + # test_prevs = np.random.dirichlet(alpha=[1, 1, 1], size=n) + # plot_simplex( + # point_layers=[ + # {"points": train_prevs, "label": "train", "style": train_style}, + # {"points": test_prevs, "label": "test", "style": test_style}, + # ], + # save_path=f'./plots/prior_test/uniform.png' + # ) + + alpha = [40, 10, 10] + train_prevs = np.random.dirichlet(alpha=alpha, size=n) + test_prevs = np.random.dirichlet(alpha=alpha, size=n) + plot_simplex( + point_layers=[ + {"points": train_prevs, "label": "train", "style": train_style}, + {"points": test_prevs, "label": "test", "style": test_style}, + ], + save_path=f'./plots/prior_test/informative.png' + ) + + # train_prevs = np.random.dirichlet(alpha=[8, 1, 1], size=n) + # test_prevs = np.random.dirichlet(alpha=[1, 8, 1], size=n) + # plot_simplex( + # point_layers=[ + # {"points": train_prevs, "label": "train", "style": train_style}, + # {"points": test_prevs, "label": "test", 
"style": test_style}, + # ], + # save_path=f'./plots/prior_test/wrong.png' + # ) + + p = 0.6 + + K = 3 + alpha = [p] + [(1. - p) / (K - 1)] * (K - 1) + alpha = np.array(alpha) + + for c in [100, 500, 1_000]: + alpha_c = alpha * c + train_prevs = np.random.dirichlet(alpha=alpha_c, size=n) + test_prevs = np.random.dirichlet(alpha=alpha_c[::-1], size=n) + plot_simplex( + point_layers=[ + {"points": train_prevs, "label": "train", "style": train_style}, + {"points": test_prevs, "label": "test", "style": test_style}, + ], + save_path=f'./plots/prior_test/concentration_{c}.png' + ) diff --git a/BayesianKDEy/prior_effect.py b/BayesianKDEy/prior_effect.py new file mode 100644 index 0000000..6e0462b --- /dev/null +++ b/BayesianKDEy/prior_effect.py @@ -0,0 +1,11 @@ +import numpy as np + +n = 3 + +p = 0.5 + +alpha = [p] + [(1.-p)/(n-1)]*(n-1) +alpha = np.array(alpha) + +for c in [1_000, 5_000, 10_000]: + print(alpha*c) \ No newline at end of file diff --git a/docs/source/manuals/datasets.md b/docs/source/manuals/datasets.md index b7d8827..0fe72ed 100644 --- a/docs/source/manuals/datasets.md +++ b/docs/source/manuals/datasets.md @@ -294,7 +294,7 @@ The datasets correspond to a part of the datasets that can be retrieved from the * containing at least 1,000 instances * can be imported using the Python API. 
-Some statistics about these datasets are displayed below : +Some statistics about these datasets (after applying default filters) are displayed below : | **Dataset** | **classes** | **instances** | **features** | **prevs** | **type** | |:------------|:-----------:|:-------------:|:------------:|:----------|:--------:| diff --git a/quapy/data/datasets.py b/quapy/data/datasets.py index 7dc81ec..f7c8be4 100644 --- a/quapy/data/datasets.py +++ b/quapy/data/datasets.py @@ -663,8 +663,8 @@ def fetch_UCIMulticlassLabelledCollection(dataset_name, data_home=None, min_clas :param dataset_name: a dataset name :param data_home: specify the quapy home directory where the dataset will be dumped (leave empty to use the default ~/quay_data/ directory) - :param min_class_support: minimum number of istances per class. Classes with fewer instances - are discarded (deafult is 100) + :param min_class_support: integer or float, the minimum number or proportion of instances per class. + Classes with fewer instances are discarded (default is 100). :param standardize: indicates whether the covariates should be standardized or not (default is True). :param verbose: set to True (default is False) to get information (stats) about the dataset :return: a :class:`quapy.data.base.LabelledCollection` instance @@ -673,7 +673,12 @@ def fetch_UCIMulticlassLabelledCollection(dataset_name, data_home=None, min_clas f'Name {dataset_name} does not match any known dataset from the ' \ f'UCI Machine Learning datasets repository (multiclass). ' \ f'Valid ones are {UCI_MULTICLASS_DATASETS}' - + + assert (min_class_support is None or + ((isinstance(min_class_support, int) and min_class_support>=0) or + (isinstance(min_class_support, float) and 0. 
<= min_class_support < 1.))), \ + f'invalid value for {min_class_support=}; expected non negative integer or float in [0,1)' + if data_home is None: data_home = get_quapy_home() @@ -766,12 +771,14 @@ def fetch_UCIMulticlassLabelledCollection(dataset_name, data_home=None, min_clas y = np.searchsorted(classes, y) return LabelledCollection(X, y) - def filter_classes(data: LabelledCollection, min_ipc): - if min_ipc is None: - min_ipc = 0 + def filter_classes(data: LabelledCollection, min_class_support): + if min_class_support is None or min_class_support == 0.: + return data + if isinstance(min_class_support, float): + min_class_support = int(len(data) * min_class_support) classes = data.classes_ - # restrict classes to only those with at least min_ipc instances - classes = classes[data.counts() >= min_ipc] + # restrict classes to only those with at least min_class_support instances + classes = classes[data.counts() >= min_class_support] # filter X and y keeping only datapoints belonging to valid classes filter_idx = np.isin(data.y, classes) X, y = data.X[filter_idx], data.y[filter_idx] diff --git a/quapy/functional.py b/quapy/functional.py index 29fe137..9f265d8 100644 --- a/quapy/functional.py +++ b/quapy/functional.py @@ -282,7 +282,7 @@ def l1_norm(prevalences: ArrayLike) -> np.ndarray: """ n_classes = prevalences.shape[-1] accum = prevalences.sum(axis=-1, keepdims=True) - prevalences = np.true_divide(prevalences, accum, where=accum > 0) + prevalences = np.true_divide(prevalences, accum, where=accum > 0, out=None) allzeros = accum.flatten() == 0 if any(allzeros): if prevalences.ndim == 1: