Compare commits

...

25 Commits

Author SHA1 Message Date
Alejandro Moreo Fernandez bb0950fad5 code used to generate plots 2023-11-13 12:07:59 +01:00
Alejandro Moreo Fernandez 2e992a0b9a choosing plots for paper 2023-11-10 14:22:43 +01:00
Alejandro Moreo Fernandez 29db15ae25 added DMx and DMy, with a classmethod that returns HDx and HDy respectively 2023-11-09 18:13:54 +01:00
Alejandro Moreo Fernandez daca2bd1cb added MedianEstimator quantifier 2023-11-09 14:20:41 +01:00
Alejandro Moreo Fernandez 66ad7295df fix in DistributionMatchingX 2023-11-08 18:11:45 +01:00
Alejandro Moreo Fernandez c3cf0e2d49 adding DistributionMatchingX, the covariate-specific equivalent counterpart of DistributionMatching 2023-11-08 16:13:48 +01:00
Alejandro Moreo Fernandez 76cf784844 added HDx and an example comparing HDy vs HDx 2023-11-08 15:34:17 +01:00
Alejandro Moreo Fernandez 8a6579428b implementing the 'total' function of IFCB protocols 2023-11-08 11:31:33 +01:00
Alejandro Moreo Fernandez f18bce5f80 added dataset IFCB plankton 2023-11-08 11:07:47 +01:00
Alejandro Moreo Fernandez cc5ab8ad70 Merge branch 'lorenzovolpi-cv_len_fix' into devel 2023-11-08 10:00:44 +01:00
Alejandro Moreo Fernandez 3d4ffcea62 merging cross-val fix 2023-11-08 10:00:25 +01:00
Lorenzo Volpi 5c7fbb2554 cross_val_predict fix added 2023-11-06 02:00:06 +01:00
Lorenzo Volpi 13fe531e12 fix added for cross_val_predict 2023-11-06 01:58:36 +01:00
Lorenzo Volpi 51c3d54aa5 fix added for len of a LabelledCollection 2023-11-06 01:53:52 +01:00
Alejandro Moreo Fernandez 34c60e0870 Merge branch 'AICGijon-uci_multiclass' 2023-10-18 17:51:37 +02:00
Alejandro Moreo Fernandez ea71559722 revised 2023-10-18 17:50:46 +02:00
pglez82 ffab2131a8 fixing requests 2023-10-18 14:12:40 +02:00
pglez82 a9f10f77f4 fixing mistakes 2023-10-17 18:44:28 +02:00
pglez82 239549eb4d fixing mistakes 2023-10-17 18:44:02 +02:00
pglez82 72fd21471d fixing mistakes 2023-10-17 18:43:33 +02:00
pglez82 d7192430e4 uci multiclass datasets 2023-10-17 18:24:33 +02:00
Alejandro Moreo Fernandez 5b90656bd1 Update README.md 2023-06-25 13:31:50 +02:00
Alejandro Moreo Fernandez fd51cd14be Update README.md 2023-06-25 13:31:33 +02:00
Alejandro Moreo Fernandez 94ca8dec81 Add files via upload 2023-06-25 13:29:38 +02:00
Alejandro Moreo Fernandez ab070b5cc3 Update README.md 2023-06-25 13:28:48 +02:00
23 changed files with 1080 additions and 91 deletions

View File

@ -0,0 +1,73 @@
import itertools
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
palette = itertools.cycle(sns.color_palette())
def setframe():
fig.spines['top'].set_visible(False)
fig.spines['left'].set_visible(False)
fig.get_yaxis().set_ticks([])
fig.spines['right'].set_visible(False)
# fig.axis('off')
nbins = 50
figsize = (5, 2)
ymax = 0.2
negatives = np.random.normal(loc = 0.3, scale=0.2, size=20000)
negatives = np.asarray([x for x in negatives if 0 <= x <= 1])
plt.figure(figsize=figsize)
plt.xlim(0, 1)
plt.ylim(0, ymax)
fig = sns.histplot(data=negatives, binrange=(0,1), bins=nbins, stat='probability', color=next(palette))
plt.title('Negative distribution')
fig.set(yticklabels=[])
fig.set(ylabel=None)
setframe()
# fig.get_figure().savefig('plots_cacm/negatives.pdf')
# plt.clf()
# -------------------------------------------------------------
positives1 = np.random.normal(loc = 0.75, scale=0.06, size=20000)
positives2 = np.random.normal(loc = 0.65, scale=0.1, size=1)
positives = np.concatenate([positives1, positives2])
np.random.shuffle(positives)
positives = np.asarray([x for x in positives if 0 <= x <= 1])
# plt.figure(figsize=figsize)
plt.xlim(0, 1)
plt.ylim(0, ymax)
fig = sns.histplot(data=positives, binrange=(0,1), bins=nbins, stat='probability', color=next(palette))
plt.title('')
fig.set(yticklabels=[])
fig.set(ylabel=None)
setframe()
fig.get_figure().savefig('plots_cacm/training.pdf')
# -------------------------------------------------------------
prev = 0.2
test = np.concatenate([
negatives[:int(len(negatives)*(1-prev))],
positives[:int(len(positives)*(prev))],
])
plt.figure(figsize=figsize)
plt.xlim(0, 1)
plt.ylim(0, ymax)
fig = sns.histplot(data=test, binrange=(0,1), bins=nbins, stat='probability', color=next(palette))
plt.title('')
fig.set(yticklabels=[])
fig.set(ylabel=None)
setframe()
fig.get_figure().savefig('plots_cacm/test.pdf')

View File

@ -0,0 +1,86 @@
from copy import deepcopy
import numpy as np
from sklearn.linear_model import LogisticRegression
import quapy as qp
from method.non_aggregative import DMx
from protocol import APP
from quapy.method.aggregative import CC, ACC, DMy
from sklearn.svm import LinearSVC
qp.environ['SAMPLE_SIZE'] = 100
DATASETS = qp.datasets.UCI_DATASETS[10:]
def fit_eval_task(args):
model_name, model, train, test = args
with qp.util.temp_seed(0):
model = deepcopy(model)
model.fit(train)
true_prev, estim_prev = qp.evaluation.prediction(model, APP(test, repeats=100, random_state=0))
return model_name, true_prev, estim_prev
def gen_data():
def base_classifier():
return LogisticRegression()
#return LinearSVC(class_weight='balanced')
def models():
yield 'CC', CC(base_classifier())
yield 'ACC', ACC(base_classifier())
yield 'HDy', DMy(base_classifier(), val_split=10, nbins=10, n_jobs=-1)
yield 'HDx', DMx(nbins=10, n_jobs=-1)
# train, test = qp.datasets.fetch_reviews('kindle', tfidf=True, min_df=10).train_test
method_names, true_prevs, estim_prevs, tr_prevs = [], [], [], []
for dataset_name in DATASETS:
train, test = qp.datasets.fetch_UCIDataset(dataset_name).train_test
print(dataset_name, train.X.shape)
outs = qp.util.parallel(
fit_eval_task,
((method_name, model, train, test) for method_name, model in models()),
seed=0,
n_jobs=-1
)
for method_name, true_prev, estim_prev in outs:
method_names.append(method_name)
true_prevs.append(true_prev)
estim_prevs.append(estim_prev)
tr_prevs.append(train.prevalence())
return method_names, true_prevs, estim_prevs, tr_prevs
method_names, true_prevs, estim_prevs, tr_prevs = qp.util.pickled_resource('../quick_experiment/pickled_plot_data.pkl', gen_data)
def remove_dataset(dataset_order, num_methods=4):
sel_names, sel_true, sel_estim, sel_tr = [],[],[],[]
for i, (name, true, estim, tr) in enumerate(zip(method_names, true_prevs, estim_prevs, tr_prevs)):
dataset_pos = i//num_methods
if dataset_pos not in dataset_order:
sel_names.append(name)
sel_true.append(true)
sel_estim.append(estim)
sel_tr.append(tr)
return np.asarray(sel_names), np.asarray(sel_true), np.asarray(sel_estim), np.asarray(sel_tr)
print(DATASETS)
selected = 10
for i in [selected]:
print(i, DATASETS[i])
all_ = set(range(len(DATASETS)))
remove_index = sorted(all_ - {i})
sel_names, sel_true, sel_estim, sel_tr = remove_dataset(dataset_order=remove_index, num_methods=4)
p=sel_tr[0][1]
sel_names = ['CC$_{'+str(p)+'}$' if x=='CC' else x for x in sel_names]
# qp.plot.binary_diagonal(sel_names, sel_true, sel_estim, train_prev=sel_tr[0], show_std=False, savepath=f'./plots/bin_diag_{i}.png')
qp.plot.error_by_drift(sel_names, sel_true, sel_estim, sel_tr, n_bins=10, savepath=f'./plots/err_drift_{i}.png', show_std=True, show_density=False, title="")
# qp.plot.binary_bias_global(method_names, true_prevs, estim_prevs, savepath='./plots/bin_bias.png')
# qp.plot.binary_bias_bins(method_names, true_prevs, estim_prevs, nbins=3, savepath='./plots/bin_bias_bin.png')

View File

@ -0,0 +1,62 @@
import math
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_predict
from sklearn.neighbors import KernelDensity
import matplotlib.pyplot as plt
import numpy as np
from data import LabelledCollection
scale = 100
import quapy as qp
negatives = np.random.normal(loc = 0.2, scale=0.2, size=20000)
negatives = np.asarray([x for x in negatives if 0 <= x <= 1])
positives = np.random.normal(loc = 0.75, scale=0.05, size=20000)
positives = np.asarray([x for x in positives if 0 <= x <= 1])
prev = 0.1
test = np.concatenate([
negatives[:int(len(negatives)*(1-prev))],
positives[:int(len(positives)*(prev))],
])
nbins = 30
plt.rcParams.update({'font.size': 7})
fig = plt.figure()
positions = np.asarray([2,1,0])
colors = ['r', 'g', 'b']
ax = fig.add_subplot(111, projection='3d')
ax.set_box_aspect((3, 1, 0.8))
for post, c, z in zip([test, positives, negatives], colors, positions):
hist, bins = np.histogram(post, bins=np.linspace(0,1, nbins+1), density=True)
xs = (bins[:-1] + bins[1:])/2
ax.bar(xs, hist, width=1 / nbins, zs=z, zdir='y', color=c, ec=c, alpha=0.6)
ax.yaxis.set_ticks(positions)
ax.yaxis.set_ticklabels([' '*20+'Test distribution', ' '*20+'Positive distribution', ' '*20+'Negative distribution'])
# ax.xaxis.set_ticks([])
# ax.xaxis.set_ticklabels([], minor=True)
ax.zaxis.set_ticks([])
ax.zaxis.set_ticklabels([], minor=True)
#plt.figure(figsize=(10,6))
#plt.show()
plt.savefig('./histograms3d_CACM2023.pdf')

View File

@ -0,0 +1,59 @@
from sklearn.decomposition import TruncatedSVD
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.model_selection import GridSearchCV
import quapy as qp
from data import LabelledCollection
from method.non_aggregative import DMx
from protocol import APP
from quapy.method.aggregative import CC, DMy, ACC
from sklearn.svm import LinearSVC
import numpy as np
from tqdm import tqdm
qp.environ['SAMPLE_SIZE'] = 500
def cls():
return LogisticRegressionCV(n_jobs=-1,Cs=10)
def gen_methods():
yield CC(cls()), 'CC$_{10' + '\%}$'
yield ACC(cls()), 'ACC'
yield DMy(cls(), val_split=10, nbins=10, n_jobs=-1), 'HDy'
yield DMx(nbins=10, n_jobs=-1), 'HDx'
def gen_data():
train, test = qp.datasets.fetch_reviews('imdb', tfidf=True, min_df=5).train_test
method_data = []
training_prevalence = 0.1
training_size = 5000
# because the problem is binary, it suffices to specify the negative prevalence; the positive one is constrained
train_sample = train.sampling(training_size, 1-training_prevalence, random_state=0)
for model, method_name in tqdm(gen_methods(), total=4):
with qp.util.temp_seed(1):
if method_name == 'HDx':
X, y = train_sample.Xy
svd = TruncatedSVD(n_components=5, random_state=0)
Xred = svd.fit_transform(X)
train_sample_dense = LabelledCollection(Xred, y)
X, y = test.Xy
test_dense = LabelledCollection(svd.transform(X), y)
model.fit(train_sample_dense)
true_prev, estim_prev = qp.evaluation.prediction(model, APP(test_dense, repeats=100, random_state=0))
else:
model.fit(train_sample)
true_prev, estim_prev = qp.evaluation.prediction(model, APP(test, repeats=100, random_state=0))
method_data.append((method_name, true_prev, estim_prev, train_sample.prevalence()))
return zip(*method_data)
method_names, true_prevs, estim_prevs, tr_prevs = gen_data()
qp.plot.binary_diagonal(method_names, true_prevs, estim_prevs, savepath='./plots_cacm/bin_diag_4methods.pdf')
qp.plot.error_by_drift(method_names, true_prevs, estim_prevs, tr_prevs, n_bins=10, savepath='./plots_cacm/err_drift_4methods.pdf', title='', show_density=False, show_std=True)

View File

@ -0,0 +1,40 @@
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.model_selection import GridSearchCV
import quapy as qp
from protocol import APP
from quapy.method.aggregative import CC
from sklearn.svm import LinearSVC
import numpy as np
from tqdm import tqdm
qp.environ['SAMPLE_SIZE'] = 500
def gen_data():
train, test = qp.datasets.fetch_reviews('imdb', tfidf=True, min_df=5).train_test
method_data = []
for training_prevalence in tqdm(np.linspace(0.1, 0.9, 9), total=9):
training_size = 5000
# because the problem is binary, it suffices to specify the negative prevalence; the positive one is constrained
train_sample = train.sampling(training_size, 1-training_prevalence)
# cls = GridSearchCV(LinearSVC(), param_grid={'C': np.logspace(-2,2,5), 'class_weight':[None, 'balanced']}, n_jobs=-1)
# cls = GridSearchCV(LogisticRegression(), param_grid={'C': np.logspace(-2, 2, 5), 'class_weight': [None, 'balanced']}, n_jobs=-1)
# cls.fit(*train_sample.Xy)
model = CC(LogisticRegressionCV(n_jobs=-1,Cs=10))
model.fit(train_sample)
true_prev, estim_prev = qp.evaluation.prediction(model, APP(test, repeats=100, random_state=0))
method_name = 'CC$_{'+f'{int(100*training_prevalence)}' + '\%}$'
method_data.append((method_name, true_prev, estim_prev, train_sample.prevalence()))
return zip(*method_data)
method_names, true_prevs, estim_prevs, tr_prevs = gen_data()
qp.plot.binary_diagonal(method_names, true_prevs, estim_prevs, savepath='./plots_cacm/bin_diag_cc.pdf')
# qp.plot.error_by_drift(method_names, true_prevs, estim_prevs, tr_prevs, n_bins=10, savepath='./plots_cacm/err_drift_cc.pdf', title='', show_density=False)

View File

@ -111,3 +111,7 @@ are provided:
* [SVMperf](https://github.com/HLT-ISTI/QuaPy/wiki/ExplicitLossMinimization)
* [Model Selection](https://github.com/HLT-ISTI/QuaPy/wiki/Model-Selection)
* [Plotting](https://github.com/HLT-ISTI/QuaPy/wiki/Plotting)
## Acknowledgments:
<img src="SoBigData.png" alt="SoBigData++" width="250"/>

BIN
SoBigData.png Normal file (new binary file, 128 KiB; not shown)

View File

@ -0,0 +1,74 @@
from sklearn.linear_model import LogisticRegression
from time import time
import pandas as pd
from tqdm import tqdm
import quapy as qp
from quapy.protocol import APP
from quapy.method.aggregative import HDy
from quapy.method.non_aggregative import DMx
"""
This example is meant to experimentally compare HDy and HDx.
The implementations of these methods adhere to their original design; in particular, this means that
the number of bins is not a hyperparameter, but something that the method explores internally (returning the
median of the estimates as the final prevalence prediction), and that the prevalence is not sought through any
numerical optimization procedure, but through a simple linear search between 0 and 1, stepping by 0.01.
See `González-Castro, Alaiz-Rodríguez, Alegre (2013) <https://www.sciencedirect.com/science/article/pii/S0020025512004069>`_ for further details.
"""
qp.environ['SAMPLE_SIZE']=100
df = pd.DataFrame(columns=['method', 'dataset', 'MAE', 'MRAE', 'tr-time', 'te-time'])
for dataset_name in tqdm(qp.datasets.UCI_DATASETS, total=len(qp.datasets.UCI_DATASETS)):
if dataset_name in ['acute.a', 'acute.b', 'balance.2', 'iris.1']: continue
collection = qp.datasets.fetch_UCILabelledCollection(dataset_name, verbose=False)
train, test = collection.split_stratified()
# HDy............................................
tinit = time()
hdy = HDy(LogisticRegression()).fit(train)
t_hdy_train = time()-tinit
tinit = time()
hdy_report = qp.evaluation.evaluation_report(hdy, APP(test), error_metrics=['mae', 'mrae']).mean()
t_hdy_test = time() - tinit
df.loc[len(df)] = ['HDy', dataset_name, hdy_report['mae'], hdy_report['mrae'], t_hdy_train, t_hdy_test]
# HDx............................................
tinit = time()
hdx = DMx.HDx(n_jobs=-1).fit(train)
t_hdx_train = time() - tinit
tinit = time()
hdx_report = qp.evaluation.evaluation_report(hdx, APP(test), error_metrics=['mae', 'mrae']).mean()
t_hdx_test = time() - tinit
df.loc[len(df)] = ['HDx', dataset_name, hdx_report['mae'], hdx_report['mrae'], t_hdx_train, t_hdx_test]
# evaluation reports
print('\n'*3)
print('='*80)
print('Comparison in terms of performance')
print('='*80)
pv = df.pivot_table(index='dataset', columns='method', values=['MAE', 'MRAE'])
print(pv)
print('\nAveraged values:')
print(pv.mean())
print('\n'*3)
print('='*80)
print('Comparison in terms of efficiency')
print('='*80)
pv = df.pivot_table(index='dataset', columns='method', values=['tr-time', 'te-time'])
print(pv)
print('\nAveraged values:')
print(pv.mean())

View File

@ -0,0 +1,28 @@
import quapy as qp
from sklearn.linear_model import LogisticRegression
from quapy.evaluation import evaluation_report
def newLR():
return LogisticRegression(n_jobs=-1)
quantifiers = [
('CC', qp.method.aggregative.CC(newLR())),
('ACC', qp.method.aggregative.ACC(newLR())),
('PCC', qp.method.aggregative.PCC(newLR())),
('PACC', qp.method.aggregative.PACC(newLR())),
('HDy', qp.method.aggregative.DMy(newLR())),
('EMQ', qp.method.aggregative.EMQ(newLR()))
]
for quant_name, quantifier in quantifiers:
print("Experiment with "+quant_name)
train, test_gen = qp.datasets.fetch_IFCB()
quantifier.fit(train)
report = evaluation_report(quantifier, protocol=test_gen, error_metrics=['mae'], verbose=True)
print(report.mean())

View File

@ -1,6 +1,6 @@
import quapy as qp
from quapy.protocol import APP
from quapy.method.aggregative import DistributionMatching
from quapy.method.aggregative import DMy
from sklearn.linear_model import LogisticRegression
import numpy as np
@ -8,7 +8,7 @@ import numpy as np
In this example, we show how to perform model selection on a DistributionMatching quantifier.
"""
model = DistributionMatching(LogisticRegression())
model = DMy(LogisticRegression())
qp.environ['SAMPLE_SIZE'] = 100
qp.environ['N_JOBS'] = -1

View File

@ -1,13 +1,18 @@
Change Log 0.1.8
----------------
- Added HDx and DistributionMatchingX to non-aggregative quantifiers (see also the new example "comparing_HDy_HDx.py")
- New UCI multiclass datasets added (thanks to Pablo González). The 5 UCI multiclass datasets are those matching
the following criteria:
- >1000 instances
- >2 classes
- classification datasets
- Python API available
- New IFCB (plankton) dataset added. See fetch_IFCB.
- Added new evaluation measures NAE, NRAE
- Added new meta method "MedianEstimator"; an ensemble of binary base quantifiers that receives as input a dictionary
of hyperparameters that it explores exhaustively, fitting and generating predictions for each combination of
hyperparameters, and that returns, as the prevalence estimate, the median across all predictions.
Change Log 0.1.7
----------------
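As a quick illustration of the new MedianEstimator described in the 0.1.8 entries above, the following minimal sketch (the dataset and the hyperparameter grid are merely illustrative) fits one DMy quantifier per value of nbins and returns the median of their prevalence estimates:

import quapy as qp
from sklearn.linear_model import LogisticRegression
from quapy.method.aggregative import DMy
from quapy.method.meta import MedianEstimator

train, test = qp.datasets.fetch_UCIDataset('ionosphere').train_test
base = DMy(LogisticRegression())
# one DMy is fitted per value of nbins; the estimates are aggregated via the median
median_q = MedianEstimator(base, param_grid={'nbins': list(range(2, 11))}, random_state=0, n_jobs=-1)
median_q.fit(train)
print(median_q.quantify(test.X))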

View File

@ -11,7 +11,7 @@ from . import util
from . import model_selection
from . import classification
__version__ = '0.1.7'
__version__ = '0.1.8'
environ = {
'SAMPLE_SIZE': None,

quapy/data/_ifcb.py Normal file (51 lines)
View File

@ -0,0 +1,51 @@
import os
import pandas as pd
from quapy.protocol import AbstractProtocol
class IFCBTrainSamplesFromDir(AbstractProtocol):
def __init__(self, path_dir:str, classes: list):
self.path_dir = path_dir
self.classes = classes
self.samples = []
for filename in os.listdir(path_dir):
if filename.endswith('.csv'):
self.samples.append(filename)
def __call__(self):
for sample in self.samples:
s = pd.read_csv(os.path.join(self.path_dir,sample))
# the first column contains the class label; the remaining columns are the features
X = s.iloc[:, 1:].to_numpy()
y = s.iloc[:, 0].to_numpy()
yield X, y
def total(self):
"""
Returns the total number of samples that the protocol generates.
:return: The number of training samples to generate.
"""
return len(self.samples)
class IFCBTestSamples(AbstractProtocol):
def __init__(self, path_dir:str, test_prevalences_path: str):
self.path_dir = path_dir
self.test_prevalences = pd.read_csv(os.path.join(path_dir, test_prevalences_path))
def __call__(self):
for _, test_sample in self.test_prevalences.iterrows():
#Load the sample from disk
X = pd.read_csv(os.path.join(self.path_dir,test_sample['sample']+'.csv')).to_numpy()
prevalences = test_sample.iloc[1:].to_numpy().astype(float)
yield X, prevalences
def total(self):
"""
Returns the total number of samples that the protocol generates.
:return: The number of test samples to generate.
"""
return len(self.test_prevalences.index)
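Both protocols are consumed by iterating over the generator they return; a minimal sketch follows (the paths and class names are placeholders; fetch_IFCB, shown later in this diff, sets them up automatically):

from quapy.data._ifcb import IFCBTrainSamplesFromDir, IFCBTestSamples

train_gen = IFCBTrainSamplesFromDir(path_dir='IFCB/train', classes=['classA', 'classB'])
print(train_gen.total(), 'training samples')
for X, y in train_gen():  # each training sample is labelled example by example
    print(X.shape, y.shape)

test_gen = IFCBTestSamples(path_dir='IFCB/test', test_prevalences_path='../test_prevalences.csv')
for X, prevalences in test_gen():  # each test sample is labelled by a prevalence vector
    print(X.shape, prevalences)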

View File

@ -6,8 +6,7 @@ import os
import zipfile
from os.path import join
import pandas as pd
import scipy
import quapy
from ucimlrepo import fetch_ucirepo
from quapy.data.base import Dataset, LabelledCollection
from quapy.data.preprocessing import text2tfidf, reduce_columns
from quapy.data.reader import *
@ -45,6 +44,12 @@ UCI_DATASETS = ['acute.a', 'acute.b',
'wine-q-red', 'wine-q-white',
'yeast']
UCI_MULTICLASS_DATASETS = ['dry-bean',
'wine-quality',
'academic-success',
'digits',
'letter']
LEQUA2022_TASKS = ['T1A', 'T1B', 'T2A', 'T2B']
_TXA_SAMPLE_SIZE = 250
@ -364,7 +369,8 @@ def fetch_UCILabelledCollection(dataset_name, data_home=None, verbose=False) ->
elif verbose:
print('no file description available')
print(f'Loading {dataset_name} ({fullname})')
if verbose:
print(f'Loading {dataset_name} ({fullname})')
if identifier == 'acute':
df = pd.read_csv(data_path, header=None, encoding='utf-16', sep='\t')
@ -545,7 +551,111 @@ def fetch_UCILabelledCollection(dataset_name, data_home=None, verbose=False) ->
y = binarize(y, pos_class='NUC')
data = LabelledCollection(X, y)
data.stats()
if verbose:
data.stats()
return data
def fetch_UCIMulticlassDataset(dataset_name, data_home=None, test_split=0.3, verbose=False) -> Dataset:
"""
Loads a UCI multiclass dataset as an instance of :class:`quapy.data.base.Dataset`.
The list of available datasets is taken from https://archive.ics.uci.edu/, following these criteria:
- It has more than 1000 instances
- It is suited for classification
- It has more than two classes
- It is available for Python import (requires ucimlrepo package)
>>> import quapy as qp
>>> dataset = qp.datasets.fetch_UCIMulticlassDataset("dry-bean")
>>> train, test = dataset.train_test
>>> ...
The list of valid dataset names can be accessed in `quapy.data.datasets.UCI_MULTICLASS_DATASETS`
The datasets are downloaded only once and pickled into disk, saving time for consecutive calls.
:param dataset_name: a dataset name
:param data_home: specify the quapy home directory where collections will be dumped (leave empty to use the default
~/quapy_data/ directory)
:param test_split: proportion of documents to be included in the test set. The rest makes up the training set
:param verbose: set to True (default is False) to get information (stats) about the dataset
:return: a :class:`quapy.data.base.Dataset` instance
"""
data = fetch_UCIMulticlassLabelledCollection(dataset_name, data_home, verbose)
return Dataset(*data.split_stratified(1 - test_split, random_state=0))
def fetch_UCIMulticlassLabelledCollection(dataset_name, data_home=None, verbose=False) -> LabelledCollection:
"""
Loads a UCI multiclass collection as an instance of :class:`quapy.data.base.LabelledCollection`.
The list of available datasets is taken from https://archive.ics.uci.edu/, following these criteria:
- It has more than 1000 instances
- It is suited for classification
- It has more than two classes
- It is available for Python import (requires ucimlrepo package)
>>> import quapy as qp
>>> collection = qp.datasets.fetch_UCIMulticlassLabelledCollection("dry-bean")
>>> X, y = collection.Xy
>>> ...
The list of valid dataset names can be accessed in `quapy.data.datasets.UCI_MULTICLASS_DATASETS`
The datasets are downloaded only once and pickled into disk, saving time for consecutive calls.
:param dataset_name: a dataset name
:param data_home: specify the quapy home directory where the dataset will be dumped (leave empty to use the default
~/quapy_data/ directory)
:param verbose: set to True (default is False) to get information (stats) about the dataset
:return: a :class:`quapy.data.base.LabelledCollection` instance
"""
assert dataset_name in UCI_MULTICLASS_DATASETS, \
f'Name {dataset_name} does not match any known dataset from the ' \
f'UCI Machine Learning datasets repository (multiclass). ' \
f'Valid ones are {UCI_MULTICLASS_DATASETS}'
if data_home is None:
data_home = get_quapy_home()
identifiers = {
"dry-bean": 602,
"wine-quality": 186,
"academic-success": 697,
"digits": 80,
"letter": 59
}
full_names = {
"dry-bean": "Dry Bean Dataset",
"wine-quality": "Wine Quality",
"academic-success": "Predict students' dropout and academic success",
"digits": "Optical Recognition of Handwritten Digits",
"letter": "Letter Recognition"
}
identifier = identifiers[dataset_name]
fullname = full_names[dataset_name]
if verbose:
print(f'Loading UCI Multiclass {dataset_name} ({fullname})')
file = join(data_home, 'uci_multiclass', dataset_name+'.pkl')
def download(id):
data = fetch_ucirepo(id=id)
X, y = data['data']['features'].to_numpy(), data['data']['targets'].to_numpy().squeeze()
classes = np.sort(np.unique(y))
y = np.searchsorted(classes, y)
return LabelledCollection(X, y)
data = pickled_resource(file, download, identifier)
if verbose:
data.stats()
return data
@ -624,12 +734,38 @@ def fetch_lequa2022(task, data_home=None):
return train, val_gen, test_gen
def fetch_IFCB(data_home=None):
def fetch_IFCB(single_sample_train=True, data_home=None):
"""
Loads the `IFCB dataset <https://zenodo.org/records/10036244>`_ for quantification. For more
information on this dataset, check the Zenodo site.
This dataset is based on the data publicly available at <https://github.com/hsosik/WHOI-Plankton>.
The scripts used for the processing are available at <https://github.com/pglez82/IFCB_Zenodo>.
In essence, this is the IFCB dataset with precomputed features, for testing quantification algorithms.
The datasets are downloaded only once, and stored for fast reuse.
:param single_sample_train: boolean. If True (default), it returns the train dataset as an instance of
:class:`quapy.data.base.LabelledCollection` (all examples together).
If False, a generator of training samples will be returned.
Each example in the training set has an individual class label.
:param data_home: specify the quapy home directory where collections will be dumped (leave empty to use the default
~/quapy_data/ directory)
:return: a tuple `(train, test_gen)` where `train` is an instance of
:class:`quapy.data.base.LabelledCollection` if `single_sample_train` is True, or of
:class:`quapy.data._ifcb.IFCBTrainSamplesFromDir` otherwise (i.e., a sampling protocol that
returns a series of samples labelled example by example);
`test_gen` is an instance of :class:`quapy.data._ifcb.IFCBTestSamples`,
i.e., a sampling protocol that returns a series of samples labelled by prevalence.
"""
from quapy.data._ifcb import IFCBTrainSamplesFromDir, IFCBTestSamples
if data_home is None:
data_home = get_quapy_home()
URL_TRAINDEV=f'https://zenodo.org/records/10036244/files/IFCB.train.zip'
URL_TRAIN=f'https://zenodo.org/records/10036244/files/IFCB.train.zip'
URL_TEST=f'https://zenodo.org/records/10036244/files/IFCB.test.zip'
URL_TEST_PREV=f'https://zenodo.org/records/10036244/files/IFCB.test_prevalences.zip'
@ -637,13 +773,43 @@ def fetch_IFCB(data_home=None):
os.makedirs(ifcb_dir, exist_ok=True)
def download_unzip_and_remove(unzipped_path, url):
tmp_path = join(ifcb_dir, 'tmp.zip')
tmp_path = join(ifcb_dir, 'ifcb_tmp.zip')
download_file_if_not_exists(url, tmp_path)
with zipfile.ZipFile(tmp_path) as file:
file.extractall(unzipped_path)
os.remove(tmp_path)
if not os.path.exists(join(ifcb_dir, task)):
download_unzip_and_remove(ifcb_dir, URL_TRAINDEV)
if not os.path.exists(os.path.join(ifcb_dir,'train')):
download_unzip_and_remove(ifcb_dir, URL_TRAIN)
if not os.path.exists(os.path.join(ifcb_dir,'test')):
download_unzip_and_remove(ifcb_dir, URL_TEST)
download_unzip_and_remove(ifcb_dir, URL_TEST_PREV)
if not os.path.exists(os.path.join(ifcb_dir,'test_prevalences.csv')):
download_unzip_and_remove(ifcb_dir, URL_TEST_PREV)
# Load test prevalences and classes
test_true_prev_path = join(ifcb_dir, 'test_prevalences.csv')
test_true_prev = pd.read_csv(test_true_prev_path)
classes = test_true_prev.columns[1:]
#Load train samples
train_samples_path = join(ifcb_dir,'train')
train_gen = IFCBTrainSamplesFromDir(path_dir=train_samples_path, classes=classes)
#Load test samples
test_samples_path = join(ifcb_dir,'test')
test_gen = IFCBTestSamples(path_dir=test_samples_path, test_prevalences_path=test_true_prev_path)
# if the user requests it, join all the training samples into a single LabelledCollection
if single_sample_train:
X = []
y = []
for X_, y_ in train_gen():
X.append(X_)
y.append(y_)
X = np.vstack(X)
y = np.concatenate(y)
train = LabelledCollection(X,y, classes=classes)
return train, test_gen
else:
return train_gen, test_gen
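A short sketch of the two loading modes of fetch_IFCB (the dataset is downloaded from Zenodo on the first call):

import quapy as qp

# default: all training examples are joined into a single LabelledCollection
train, test_gen = qp.datasets.fetch_IFCB()
train.stats()

# alternative: a protocol that yields the training samples one by one
train_gen, test_gen = qp.datasets.fetch_IFCB(single_sample_train=False)
print(train_gen.total(), 'training samples')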

View File

@ -1,5 +1,7 @@
import itertools
from collections import defaultdict
from typing import Union, Callable
import scipy
import numpy as np
@ -64,7 +66,7 @@ def prevalence_from_probabilities(posteriors, binarize: bool = False):
return prevalences
def HellingerDistance(P, Q):
def HellingerDistance(P, Q) -> float:
"""
Computes the Hellinger Distance (HD) between (discretized) distributions `P` and `Q`.
The HD for two discrete distributions of `k` bins is defined as:
@ -276,3 +278,70 @@ def check_prevalence_vector(p, raise_exception=False, toleranze=1e-08):
return False
return True
def get_divergence(divergence: Union[str, Callable]):
if isinstance(divergence, str):
if divergence=='HD':
return HellingerDistance
elif divergence=='topsoe':
return TopsoeDistance
else:
raise ValueError(f'unknown divergence {divergence}')
elif callable(divergence):
return divergence
else:
raise ValueError(f'argument "divergence" not understood; use a str or a callable function')
def argmin_prevalence(loss, n_classes, method='optim_minimize'):
if method == 'optim_minimize':
return optim_minimize(loss, n_classes)
elif method == 'linear_search':
return linear_search(loss, n_classes)
elif method == 'ternary_search':
raise NotImplementedError()
else:
raise NotImplementedError()
def optim_minimize(loss, n_classes):
"""
Searches for the optimal prevalence values, i.e., an `n_classes`-dimensional vector of the (`n_classes`-1)-simplex
that yields the smallest loss. This optimization is carried out by means of a constrained search using scipy's
SLSQP routine.
:param loss: (callable) the function to minimize
:param n_classes: (int) the number of classes, i.e., the dimensionality of the prevalence vector
:return: (ndarray) the best prevalence vector found
"""
from scipy import optimize
# the initial point is set as the uniform distribution
uniform_distribution = np.full(fill_value=1 / n_classes, shape=(n_classes,))
# solutions are bounded to those contained in the unit-simplex
bounds = tuple((0, 1) for _ in range(n_classes)) # values in [0,1]
constraints = ({'type': 'eq', 'fun': lambda x: 1 - sum(x)}) # values summing up to 1
r = optimize.minimize(loss, x0=uniform_distribution, method='SLSQP', bounds=bounds, constraints=constraints)
return r.x
def linear_search(loss, n_classes):
"""
Performs a linear search for the best prevalence value in binary problems. The search is carried out by exploring
the range [0,1] stepping by 0.01. This search is inefficient, and is added only for completeness (some of the
early methods in the quantification literature used it, e.g., HDy). A more powerful alternative is `optim_minimize`.
:param loss: (callable) the function to minimize
:param n_classes: (int) the number of classes, i.e., the dimensionality of the prevalence vector
:return: (ndarray) the best prevalence vector found
"""
assert n_classes==2, 'linear search is only available for binary problems'
prev_selected, min_score = None, None
for prev in prevalence_linspace(n_prevalences=100, repeats=1, smooth_limits_epsilon=0.0):
score = loss(np.asarray([1 - prev, prev]))
if min_score is None or score < min_score:
prev_selected, min_score = prev, score
return np.asarray([1 - prev_selected, prev_selected])
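As a toy illustration of the two search strategies added above (the target prevalence vector below is made up; any loss minimized at the true prevalence would do):

import numpy as np
from quapy.functional import optim_minimize, linear_search

target = np.asarray([0.2, 0.8])
loss = lambda prev: np.linalg.norm(prev - target)  # minimized exactly at prev == target

print(optim_minimize(loss, n_classes=2))  # ~[0.2, 0.8], found by SLSQP on the simplex
print(linear_search(loss, n_classes=2))   # ~[0.2, 0.8], found by a 0.01-step scan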

View File

@ -1,7 +1,7 @@
from . import aggregative
from . import base
from . import meta
from . import aggregative
from . import non_aggregative
from . import meta
AGGREGATIVE_METHODS = {
aggregative.CC,

View File

@ -9,6 +9,7 @@ from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_predict
import quapy as qp
import quapy.functional as F
from functional import get_divergence
from quapy.classification.calibration import NBVSCalibration, BCTSCalibration, TSCalibration, VSCalibration
from quapy.classification.svmperf import SVMperf
from quapy.data import LabelledCollection
@ -530,7 +531,7 @@ class HDy(AggregativeProbabilisticQuantifier, BinaryQuantifier):
"""
`Hellinger Distance y <https://www.sciencedirect.com/science/article/pii/S0020025512004069>`_ (HDy).
HDy is a probabilistic method for training binary quantifiers, that models quantification as the problem of
minimizing the divergence (in terms of the Hellinger Distance) between two cumulative distributions of posterior
minimizing the divergence (in terms of the Hellinger Distance) between two distributions of posterior
probabilities returned by the classifier. One of the distributions is generated from the unlabelled examples and
the other is generated from a validation set. This latter distribution is defined as a mixture of the
class-conditional distributions of the posterior probabilities returned for the positive and negative validation
@ -567,10 +568,11 @@ class HDy(AggregativeProbabilisticQuantifier, BinaryQuantifier):
self.Pxy0 = Px[validation.labels == self.classifier.classes_[0]]
# pre-compute the histogram for positive and negative examples
self.bins = np.linspace(10, 110, 11, dtype=int) # [10, 20, 30, ..., 100, 110]
self.Pxy1_density = {bins: np.histogram(self.Pxy1, bins=bins, range=(0, 1), density=True)[0] for bins in
self.bins}
self.Pxy0_density = {bins: np.histogram(self.Pxy0, bins=bins, range=(0, 1), density=True)[0] for bins in
self.bins}
def hist(P, bins):
h = np.histogram(P, bins=bins, range=(0, 1), density=True)[0]
return h / h.sum()
self.Pxy1_density = {bins: hist(self.Pxy1, bins) for bins in self.bins}
self.Pxy0_density = {bins: hist(self.Pxy0, bins) for bins in self.bins}
return self
def aggregate(self, classif_posteriors):
@ -590,6 +592,9 @@ class HDy(AggregativeProbabilisticQuantifier, BinaryQuantifier):
Px_test, _ = np.histogram(Px, bins=bins, range=(0, 1), density=True)
# the authors proposed to search for the prevalence yielding the best matching as a linear search
# at small steps (modern implementations resort to an optimization procedure,
# see class DMy)
prev_selected, min_dist = None, None
for prev in F.prevalence_linspace(n_prevalences=100, repeats=1, smooth_limits_epsilon=0.0):
Px_train = prev * Pxy1_density + (1 - prev) * Pxy0_density
@ -602,20 +607,6 @@ class HDy(AggregativeProbabilisticQuantifier, BinaryQuantifier):
return np.asarray([1 - class1_prev, class1_prev])
def _get_divergence(divergence: Union[str, Callable]):
if isinstance(divergence, str):
if divergence=='HD':
return F.HellingerDistance
elif divergence=='topsoe':
return F.TopsoeDistance
else:
raise ValueError(f'unknown divergence {divergence}')
elif callable(divergence):
return divergence
else:
raise ValueError(f'argument "divergence" not understood; use a str or a callable function')
class DyS(AggregativeProbabilisticQuantifier, BinaryQuantifier):
"""
`DyS framework <https://ojs.aaai.org/index.php/AAAI/article/view/4376>`_ (DyS).
@ -673,7 +664,7 @@ class DyS(AggregativeProbabilisticQuantifier, BinaryQuantifier):
Px = classif_posteriors[:, 1] # takes only the P(y=+1|x)
Px_test = np.histogram(Px, bins=self.n_bins, range=(0, 1), density=True)[0]
divergence = _get_divergence(self.divergence)
divergence = get_divergence(self.divergence)
def distribution_distance(prev):
Px_train = prev * self.Pxy1_density + (1 - prev) * self.Pxy0_density
@ -722,10 +713,11 @@ class SMM(AggregativeProbabilisticQuantifier, BinaryQuantifier):
return np.asarray([1 - class1_prev, class1_prev])
class DistributionMatching(AggregativeProbabilisticQuantifier):
class DMy(AggregativeProbabilisticQuantifier):
"""
Generic Distribution Matching quantifier for binary or multiclass quantification.
This implementation takes the number of bins, the divergence, and the possibility to work on CDF as hyperparameters.
Generic Distribution Matching quantifier for binary or multiclass quantification based on the space of posterior
probabilities. This implementation takes the number of bins, the divergence, and the possibility to work on CDF
as hyperparameters.
:param classifier: a `sklearn`'s Estimator that generates a probabilistic classifier
:param val_split: indicates the proportion of data to be used as a stratified held-out validation set to model the
@ -738,18 +730,28 @@ class DistributionMatching(AggregativeProbabilisticQuantifier):
:param divergence: a string representing a divergence measure (currently, "HD" and "topsoe" are implemented)
or a callable function taking two ndarrays of the same dimension as input (default "HD", meaning Hellinger
Distance)
:param cdf: whether or not to use CDF instead of PDF (default False)
:param cdf: whether to use CDF instead of PDF (default False)
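:param search: the strategy used for searching the prevalence that minimizes the divergence; set to
'optim_minimize' (default) or 'linear_search'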
:param n_jobs: number of parallel workers (default None)
"""
def __init__(self, classifier, val_split=0.4, nbins=8, divergence: Union[str, Callable]='HD', cdf=False, n_jobs=None):
def __init__(self, classifier, val_split=0.4, nbins=8, divergence: Union[str, Callable]='HD',
cdf=False, search='optim_minimize', n_jobs=None):
self.classifier = classifier
self.val_split = val_split
self.nbins = nbins
self.divergence = divergence
self.cdf = cdf
self.search = search
self.n_jobs = n_jobs
@classmethod
def HDy(cls, classifier, val_split=0.4, n_jobs=None):
from quapy.method.meta import MedianEstimator
hdy = DMy(classifier=classifier, val_split=val_split, search='linear_search', divergence='HD')
hdy = MedianEstimator(hdy, param_grid={'nbins': np.linspace(10, 110, 11).astype(int)}, n_jobs=n_jobs)
return hdy
def __get_distributions(self, posteriors):
histograms = []
post_dims = posteriors.shape[1]
@ -770,8 +772,8 @@ class DistributionMatching(AggregativeProbabilisticQuantifier):
"""
Trains the classifier (if requested) and generates the validation distributions out of the training data.
The validation distributions have shape `(n, ch, nbins)`, with `n` the number of classes, `ch` the number of
channels, and `nbins` the number of bins. In particular, let `V` be the validation distributions; `di=V[i]`
are the distributions obtained from training data labelled with class `i`; `dij = di[j]` is the discrete
channels, and `nbins` the number of bins. In particular, let `V` be the validation distributions; then `di=V[i]`
are the distributions obtained from training data labelled with class `i`; while `dij = di[j]` is the discrete
distribution of posterior probabilities `P(Y=j|X=x)` for training data labelled with class `i`, and `dij[k]`
is the fraction of instances with a value in the `k`-th bin.
@ -803,26 +805,20 @@ class DistributionMatching(AggregativeProbabilisticQuantifier):
`n` channels (proper distributions of binned posterior probabilities), on which the divergence is computed
independently. The matching is computed as an average of the divergence across all channels.
:param instances: instances in the sample
:param posteriors: posterior probabilities of the instances in the sample
:return: a vector of class prevalence estimates
"""
test_distribution = self.__get_distributions(posteriors)
divergence = _get_divergence(self.divergence)
divergence = get_divergence(self.divergence)
n_classes, n_channels, nbins = self.validation_distribution.shape
def match(prev):
def loss(prev):
prev = np.expand_dims(prev, axis=0)
mixture_distribution = (prev @ self.validation_distribution.reshape(n_classes,-1)).reshape(n_channels, -1)
divs = [divergence(test_distribution[ch], mixture_distribution[ch]) for ch in range(n_channels)]
return np.mean(divs)
# the initial point is set as the uniform distribution
uniform_distribution = np.full(fill_value=1 / n_classes, shape=(n_classes,))
return F.argmin_prevalence(loss, n_classes, method=self.search)
# solutions are bounded to those contained in the unit-simplex
bounds = tuple((0, 1) for x in range(n_classes)) # values in [0,1]
constraints = ({'type': 'eq', 'fun': lambda x: 1 - sum(x)}) # values summing up to 1
r = optimize.minimize(match, x0=uniform_distribution, method='SLSQP', bounds=bounds, constraints=constraints)
return r.x
def newELM(svmperf_base=None, loss='01', C=1):
@ -1224,17 +1220,6 @@ class MS2(MS):
return np.median(tprs), np.median(fprs)
ClassifyAndCount = CC
AdjustedClassifyAndCount = ACC
ProbabilisticClassifyAndCount = PCC
ProbabilisticAdjustedClassifyAndCount = PACC
ExpectationMaximizationQuantifier = EMQ
SLD = EMQ
HellingerDistanceY = HDy
MedianSweep = MS
MedianSweep2 = MS2
class OneVsAllAggregative(OneVsAllGeneric, AggregativeQuantifier):
"""
Allows any binary quantifier to perform quantification on single-label datasets.
@ -1292,3 +1277,18 @@ class OneVsAllAggregative(OneVsAllGeneric, AggregativeQuantifier):
# the estimation for the positive class prevalence
return self.dict_binary_quantifiers[c].aggregate(classif_predictions[:, c])[1]
#---------------------------------------------------------------
# aliases
#---------------------------------------------------------------
ClassifyAndCount = CC
AdjustedClassifyAndCount = ACC
ProbabilisticClassifyAndCount = PCC
ProbabilisticAdjustedClassifyAndCount = PACC
ExpectationMaximizationQuantifier = EMQ
DistributionMatchingY = DMy
SLD = EMQ
HellingerDistanceY = HDy
MedianSweep = MS
MedianSweep2 = MS2
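To make the renaming concrete, a minimal sketch of the new DMy class and its HDy classmethod (which reproduces the original HDy design via a MedianEstimator over nbins); the dataset choice is illustrative:

import quapy as qp
from sklearn.linear_model import LogisticRegression
from quapy.method.aggregative import DMy

train, test = qp.datasets.fetch_UCIDataset('ionosphere').train_test

# generic distribution matching on posteriors; prevalence sought via SLSQP by default
dm = DMy(LogisticRegression(), nbins=8, divergence='HD', search='optim_minimize').fit(train)
print(dm.quantify(test.X))

# the original HDy design: linear search + median across nbins in [10, 20, ..., 110]
hdy = DMy.HDy(LogisticRegression()).fit(train)
print(hdy.quantify(test.X))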

View File

@ -1,3 +1,4 @@
import itertools
from copy import deepcopy
from typing import Union
import numpy as np
@ -10,13 +11,14 @@ import quapy as qp
from quapy import functional as F
from quapy.data import LabelledCollection
from quapy.model_selection import GridSearchQ
from quapy.method.base import BaseQuantifier, BinaryQuantifier
from quapy.method.aggregative import CC, ACC, PACC, HDy, EMQ
try:
from . import neural
except ModuleNotFoundError:
neural = None
from .base import BaseQuantifier
from quapy.method.aggregative import CC, ACC, PACC, HDy, EMQ
if neural:
QuaNet = neural.QuaNetTrainer
@ -24,6 +26,67 @@ else:
QuaNet = "QuaNet is not available due to missing torch package"
class MedianEstimator(BinaryQuantifier):
"""
This method is a meta-quantifier that returns, as the estimated class prevalence values, the median of the
estimates returned by differently (hyper)parameterized base quantifiers.
The median of prevalence vectors is only guaranteed to be a valid prevalence vector for n=2 classes,
i.e., in cases of binary quantification.
:param base_quantifier: the base, binary quantifier
:param param_grid: the grid of parameters over which the median will be computed
:param random_state: a seed to be set before fitting any base quantifier (default None)
:param n_jobs: number of parallel workers
"""
def __init__(self, base_quantifier: BinaryQuantifier, param_grid: dict, random_state=None, n_jobs=None):
self.base_quantifier = base_quantifier
self.param_grid = param_grid
self.random_state = random_state
self.n_jobs = qp._get_njobs(n_jobs)
def get_params(self, deep=True):
return self.base_quantifier.get_params(deep)
def set_params(self, **params):
self.base_quantifier.set_params(**params)
def _delayed_fit(self, args):
with qp.util.temp_seed(self.random_state):
params, training = args
model = deepcopy(self.base_quantifier)
model.set_params(**params)
model.fit(training)
return model
def fit(self, training: LabelledCollection):
self._check_binary(training, self.__class__.__name__)
params_keys = list(self.param_grid.keys())
params_values = list(self.param_grid.values())
hyper = [dict({k: val[i] for i, k in enumerate(params_keys)}) for val in itertools.product(*params_values)]
self.models = qp.util.parallel(
self._delayed_fit,
((params, training) for params in hyper),
seed=qp.environ.get('_R_SEED', None),
n_jobs=self.n_jobs
)
return self
def _delayed_predict(self, args):
model, instances = args
return model.quantify(instances)
def quantify(self, instances):
prev_preds = qp.util.parallel(
self._delayed_predict,
((model, instances) for model in self.models),
seed=qp.environ.get('_R_SEED', None),
n_jobs=self.n_jobs
)
prev_preds = np.asarray(prev_preds)
return np.median(prev_preds, axis=0)
class Ensemble(BaseQuantifier):
VALID_POLICIES = {'ave', 'ptr', 'ds'} | qp.error.QUANTIFICATION_ERROR_NAMES

View File

@ -1,5 +1,10 @@
from typing import Union, Callable
import numpy as np
from functional import get_divergence
from quapy.data import LabelledCollection
from .base import BaseQuantifier
from quapy.method.base import BaseQuantifier, BinaryQuantifier
import quapy.functional as F
class MaximumLikelihoodPrevalenceEstimation(BaseQuantifier):
@ -33,3 +38,126 @@ class MaximumLikelihoodPrevalenceEstimation(BaseQuantifier):
"""
return self.estimated_prevalence
class DMx(BaseQuantifier):
"""
Generic Distribution Matching quantifier for binary or multiclass quantification based on the space of covariates.
This implementation takes the number of bins, the divergence, and the possibility to work on CDF as hyperparameters.
:param nbins: number of bins used to discretize the distributions (default 8)
:param divergence: a string representing a divergence measure (currently, "HD" and "topsoe" are implemented)
or a callable function taking two ndarrays of the same dimension as input (default "HD", meaning Hellinger
Distance)
:param cdf: whether to use CDF instead of PDF (default False)
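:param search: the strategy used for searching the prevalence that minimizes the divergence; set to
'optim_minimize' (default) or 'linear_search'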
:param n_jobs: number of parallel workers (default None)
"""
def __init__(self, nbins=8, divergence: Union[str, Callable]='HD', cdf=False, search='optim_minimize', n_jobs=None):
self.nbins = nbins
self.divergence = divergence
self.cdf = cdf
self.search = search
self.n_jobs = n_jobs
@classmethod
def HDx(cls, n_jobs=None):
"""
`Hellinger Distance x <https://www.sciencedirect.com/science/article/pii/S0020025512004069>`_ (HDx).
HDx is a method for training binary quantifiers that models quantification as the problem of
minimizing the average divergence (in terms of the Hellinger Distance) across the feature-specific normalized
histograms of two representations, one for the unlabelled examples, and another generated from the training
examples as a mixture model of the class-specific representations. The parameters of the mixture thus represent
the estimates of the class prevalence values.
The method computes the matching for every nbins in [10, 20, ..., 110] and reports the median of the estimates.
The best prevalence is sought via linear search, from 0 to 1, stepping by 0.01.
:param n_jobs: number of parallel workers
:return: an instance of this class set up to mimic the behaviour of HDx as originally proposed by
González-Castro, Alaiz-Rodríguez, Alegre (2013)
"""
from quapy.method.meta import MedianEstimator
dmx = DMx(divergence='HD', cdf=False, search='linear_search')
nbins = {'nbins': np.linspace(10, 110, 11, dtype=int)}
hdx = MedianEstimator(base_quantifier=dmx, param_grid=nbins, n_jobs=n_jobs)
return hdx
def __get_distributions(self, X):
histograms = []
for feat_idx in range(self.nfeats):
feature = X[:, feat_idx]
feat_range = self.feat_ranges[feat_idx]
hist = np.histogram(feature, bins=self.nbins, range=feat_range)[0]
norm_hist = hist / hist.sum()
histograms.append(norm_hist)
distributions = np.vstack(histograms)
if self.cdf:
distributions = np.cumsum(distributions, axis=1)
return distributions
def fit(self, data: LabelledCollection):
"""
Generates the validation distributions out of the training data (covariates).
The validation distributions have shape `(n, nfeats, nbins)`, with `n` the number of classes, `nfeats`
the number of features, and `nbins` the number of bins.
In particular, let `V` be the validation distributions; then `di=V[i]` are the distributions obtained from
training data labelled with class `i`; while `dij = di[j]` is the discrete distribution for feature j in
training data labelled with class `i`, and `dij[k]` is the fraction of instances with a value in the `k`-th bin.
:param data: the training set
"""
X, y = data.Xy
self.nfeats = X.shape[1]
self.feat_ranges = _get_features_range(X)
self.validation_distribution = np.asarray(
[self.__get_distributions(X[y==cat]) for cat in range(data.n_classes)]
)
return self
def quantify(self, instances):
"""
Searches for the mixture model parameter (the sought prevalence values) that yields a validation distribution
(the mixture) that best matches the test distribution, in terms of the divergence measure of choice.
The matching is computed as the average dissimilarity (in terms of the dissimilarity measure of choice)
between all feature-specific discrete distributions.
:param instances: instances in the sample
:return: a vector of class prevalence estimates
"""
assert instances.shape[1] == self.nfeats, f'wrong shape; expected {self.nfeats}, found {instances.shape[1]}'
test_distribution = self.__get_distributions(instances)
divergence = get_divergence(self.divergence)
n_classes, n_feats, nbins = self.validation_distribution.shape
def loss(prev):
prev = np.expand_dims(prev, axis=0)
mixture_distribution = (prev @ self.validation_distribution.reshape(n_classes,-1)).reshape(n_feats, -1)
divs = [divergence(test_distribution[feat], mixture_distribution[feat]) for feat in range(n_feats)]
return np.mean(divs)
return F.argmin_prevalence(loss, n_classes, method=self.search)
def _get_features_range(X):
feat_ranges = []
ncols = X.shape[1]
for col_idx in range(ncols):
feature = X[:,col_idx]
feat_ranges.append((np.min(feature), np.max(feature)))
return feat_ranges
#---------------------------------------------------------------
# aliases
#---------------------------------------------------------------
DistributionMatchingX = DMx
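And the analogous sketch for the covariate-based counterpart (DMx computes one histogram per feature, so dense features are assumed; the dataset choice is again illustrative):

import quapy as qp
from quapy.method.non_aggregative import DMx

train, test = qp.datasets.fetch_UCIDataset('ionosphere').train_test

# generic distribution matching on the covariates (no classifier involved)
dmx = DMx(nbins=8, divergence='HD').fit(train)
print(dmx.quantify(test.X))

# HDx as originally proposed: linear search + median across nbins in [10, 20, ..., 110]
hdx = DMx.HDx(n_jobs=-1).fit(train)
print(hdx.quantify(test.X))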

View File

@ -223,7 +223,7 @@ def cross_val_predict(quantifier: BaseQuantifier, data: LabelledCollection, nfol
for train, test in data.kFCV(nfolds=nfolds, random_state=random_state):
quantifier.fit(train)
fold_prev = quantifier.quantify(test.X)
rel_size = len(test.X)/len(data)
rel_size = 1. * len(test) / len(data)
total_prev += fold_prev*rel_size
return total_prev

View File

@ -72,7 +72,7 @@ def binary_diagonal(method_names, true_prevs, estim_prevs, pos_class=1, title=No
train_prev = train_prev[pos_class]
ax.scatter(train_prev, train_prev, c='c', label='tr-prev', linewidth=2, edgecolor='k', s=100, zorder=3)
ax.set(xlabel='true prevalence', ylabel='estimated prevalence', title=title)
ax.set(xlabel='true frequency', ylabel='estimated frequency', title=title)
ax.set_ylim(0, 1)
ax.set_xlim(0, 1)
@ -216,9 +216,10 @@ def error_by_drift(method_names, true_prevs, estim_prevs, tr_prevs,
show_density=True,
show_legend=True,
logscale=False,
title=f'Quantification error as a function of distribution shift',
title=f'Quantification error as a function of label shift',
vlines=None,
method_order=None,
fontsize=18,
savepath=None):
"""
Plots the error (along the x-axis, as measured in terms of `error_name`) as a function of the train-test shift
@ -247,6 +248,8 @@ def error_by_drift(method_names, true_prevs, estim_prevs, tr_prevs,
:param savepath: path where to save the plot. If not indicated (as default), the plot is shown.
"""
plt.rcParams['font.size'] = fontsize
fig, ax = plt.subplots()
ax.grid()
@ -261,7 +264,7 @@ def error_by_drift(method_names, true_prevs, estim_prevs, tr_prevs,
if method_order is None:
method_order = method_names
_set_colors(ax, n_methods=len(method_order))
# _set_colors(ax, n_methods=len(method_order))
bins = np.linspace(0, 1, n_bins+1)
binwidth = 1 / n_bins
@ -291,6 +294,9 @@ def error_by_drift(method_names, true_prevs, estim_prevs, tr_prevs,
ys = np.asarray(ys)
ystds = np.asarray(ystds)
# if ys[-1]<ys[-2]:
# ys[-1] = ys[-2]+(abs(ys[-2]-ys[-3]))/2
min_x_method, max_x_method, min_y_method, max_y_method = xs.min(), xs.max(), ys.min(), ys.max()
min_x = min_x_method if min_x is None or min_x_method < min_x else min_x
max_x = max_x_method if max_x is None or max_x_method > max_x else max_x
@ -313,8 +319,8 @@ def error_by_drift(method_names, true_prevs, estim_prevs, tr_prevs,
ax2.spines['right'].set_color('g')
ax2.tick_params(axis='y', colors='g')
ax.set(xlabel=f'Distribution shift between training set and test sample',
ylabel=f'{error_name.upper()} (true distribution, predicted distribution)',
ax.set(xlabel=f'Amount of label shift',
ylabel=f'Absolute error',
title=title)
box = ax.get_position()
ax.set_position([box.x0, box.y0, box.width * 0.8, box.height])
@ -329,10 +335,11 @@ def error_by_drift(method_names, true_prevs, estim_prevs, tr_prevs,
if show_legend:
fig.legend(loc='lower center',
bbox_to_anchor=(1, 0.5),
ncol=(len(method_names)+1)//2)
ax.legend(loc='center right', bbox_to_anchor=(1.31, 0.5))
# fig.legend(loc='lower center',
# bbox_to_anchor=(1, 0.5),
# ncol=(len(method_names)+1)//2)
_save_or_show(savepath)

View File

@ -236,7 +236,7 @@ class APP(AbstractStochasticSeededProtocol, OnLabelledCollectionProtocol):
raise RuntimeError(
f"Abort: the number of samples that will be generated by {self.__class__.__name__} ({n}) "
f"exceeds the maximum number of allowed samples ({sanity_check = }). Set 'sanity_check' to "
f"None for bypassing this check, or to a higher number.")
f"None, or to a higher number, for bypassing this check.")
self.collator = OnLabelledCollectionProtocol.get_collator(return_type)

View File

@ -1,14 +1,17 @@
import numpy
import numpy as np
import pytest
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
import quapy as qp
from quapy.model_selection import GridSearchQ
from quapy.method.base import BinaryQuantifier
from quapy.data import Dataset, LabelledCollection
from quapy.method import AGGREGATIVE_METHODS, NON_AGGREGATIVE_METHODS
from quapy.method.aggregative import ACC, PACC, HDy
from quapy.method.meta import Ensemble
from quapy.protocol import APP
from quapy.method.aggregative import DMy
from quapy.method.meta import MedianEstimator
datasets = [pytest.param(qp.datasets.fetch_twitter('hcr', pickle=True), id='hcr'),
pytest.param(qp.datasets.fetch_UCIDataset('ionosphere'), id='ionosphere')]
@ -36,7 +39,7 @@ def test_aggregative_methods(dataset: Dataset, aggregative_method, learner):
true_prevalences = dataset.test.prevalence()
error = qp.error.mae(true_prevalences, estim_prevalences)
assert type(error) == numpy.float64
assert type(error) == np.float64
@pytest.mark.parametrize('dataset', datasets)
@ -55,7 +58,7 @@ def test_non_aggregative_methods(dataset: Dataset, non_aggregative_method):
true_prevalences = dataset.test.prevalence()
error = qp.error.mae(true_prevalences, estim_prevalences)
assert type(error) == numpy.float64
assert type(error) == np.float64
@pytest.mark.parametrize('base_method', AGGREGATIVE_METHODS)
@ -80,7 +83,7 @@ def test_ensemble_method(base_method, learner, dataset: Dataset, policy):
true_prevalences = dataset.test.prevalence()
error = qp.error.mae(true_prevalences, estim_prevalences)
assert type(error) == numpy.float64
assert type(error) == np.float64
def test_quanet_method():
@ -119,7 +122,7 @@ def test_quanet_method():
true_prevalences = dataset.test.prevalence()
error = qp.error.mae(true_prevalences, estim_prevalences)
assert type(error) == numpy.float64
assert type(error) == np.float64
def test_str_label_names():
@ -130,32 +133,103 @@ def test_str_label_names():
dataset.test.sampling(1000, 0.25, 0.75))
qp.data.preprocessing.text2tfidf(dataset, min_df=5, inplace=True)
numpy.random.seed(0)
np.random.seed(0)
model.fit(dataset.training)
int_estim_prevalences = model.quantify(dataset.test.instances)
true_prevalences = dataset.test.prevalence()
error = qp.error.mae(true_prevalences, int_estim_prevalences)
assert type(error) == numpy.float64
assert type(error) == np.float64
dataset_str = Dataset(LabelledCollection(dataset.training.instances,
['one' if label == 1 else 'zero' for label in dataset.training.labels]),
LabelledCollection(dataset.test.instances,
['one' if label == 1 else 'zero' for label in dataset.test.labels]))
assert all(dataset_str.training.classes_ == dataset_str.test.classes_), 'wrong indexation'
numpy.random.seed(0)
np.random.seed(0)
model.fit(dataset_str.training)
str_estim_prevalences = model.quantify(dataset_str.test.instances)
true_prevalences = dataset_str.test.prevalence()
error = qp.error.mae(true_prevalences, str_estim_prevalences)
assert type(error) == numpy.float64
assert type(error) == np.float64
print(true_prevalences)
print(int_estim_prevalences)
print(str_estim_prevalences)
numpy.testing.assert_almost_equal(int_estim_prevalences[1],
np.testing.assert_almost_equal(int_estim_prevalences[1],
str_estim_prevalences[list(model.classes_).index('one')])
# helper
def __fit_test(quantifier, train, test):
quantifier.fit(train)
test_samples = APP(test)
true_prevs, estim_prevs = qp.evaluation.prediction(quantifier, test_samples)
return qp.error.mae(true_prevs, estim_prevs), estim_prevs
def test_median_meta():
"""
This test compares the performance of the MedianEstimator against computing the median of the predictions
of a differently parameterized quantifier. We use the DMy base quantifier, and the median is
computed across different values of nbins
"""
qp.environ['SAMPLE_SIZE'] = 100
# grid of values
nbins_grid = list(range(2, 11))
dataset = 'kindle'
train, test = qp.datasets.fetch_reviews(dataset, tfidf=True, min_df=10).train_test
prevs = []
errors = []
for nbins in nbins_grid:
with qp.util.temp_seed(0):
q = DMy(LogisticRegression(), nbins=nbins)
mae, estim_prevs = __fit_test(q, train, test)
prevs.append(estim_prevs)
errors.append(mae)
print(f'{dataset} DistributionMatching(nbins={nbins}) got MAE {mae:.4f}')
prevs = np.asarray(prevs)
mae = np.mean(errors)
print(f'\tMAE={mae:.4f}')
q = DMy(LogisticRegression())
q = MedianEstimator(q, param_grid={'nbins': nbins_grid}, random_state=0, n_jobs=-1)
median_mae, prev = __fit_test(q, train, test)
print(f'\tMAE={median_mae:.4f}')
np.testing.assert_almost_equal(np.median(prevs, axis=0), prev)
assert median_mae < mae, 'the median-based quantifier provided a higher error...'
def test_median_meta_modsel():
"""
This test checks the median-meta quantifier with model selection
"""
qp.environ['SAMPLE_SIZE'] = 100
dataset = 'kindle'
train, test = qp.datasets.fetch_reviews(dataset, tfidf=True, min_df=10).train_test
train, val = train.split_stratified(random_state=0)
nbins_grid = [2, 4, 5, 10, 15]
q = DMy(LogisticRegression())
q = MedianEstimator(q, param_grid={'nbins': nbins_grid}, random_state=0, n_jobs=-1)
median_mae, _ = __fit_test(q, train, test)
print(f'\tMAE={median_mae:.4f}')
q = DMy(LogisticRegression())
lr_params = {'classifier__C': np.logspace(-1, 1, 3)}
q = MedianEstimator(q, param_grid={'nbins': nbins_grid}, random_state=0, n_jobs=-1)
q = GridSearchQ(q, param_grid=lr_params, protocol=APP(val), n_jobs=-1)
optimized_median_ave, _ = __fit_test(q, train, test)
print(f'\tMAE={optimized_median_ave:.4f}')
assert optimized_median_ave < median_mae, "the optimized method yielded worse performance..."