From 561b672200575de50fef9d25b046ad8bee44479d Mon Sep 17 00:00:00 2001 From: Alejandro Moreo Date: Tue, 16 Apr 2024 15:12:22 +0200 Subject: [PATCH] updated unit tests --- examples/model_selection.py | 13 +- quapy/data/base.py | 2 +- quapy/data/datasets.py | 8 +- quapy/functional.py | 15 ++ quapy/method/__init__.py | 1 + quapy/method/_threshold_optim.py | 2 +- quapy/method/aggregative.py | 19 ++- quapy/method/base.py | 4 +- quapy/tests/test_base.py | 5 - quapy/tests/test_datasets.py | 61 -------- quapy/tests/test_hierarchy.py | 65 --------- quapy/tests/test_methods.py | 234 ------------------------------ quapy/tests/test_modsel.py | 74 ++++------ quapy/tests/test_replicability.py | 6 +- 14 files changed, 82 insertions(+), 427 deletions(-) delete mode 100644 quapy/tests/test_base.py delete mode 100644 quapy/tests/test_datasets.py delete mode 100644 quapy/tests/test_hierarchy.py delete mode 100644 quapy/tests/test_methods.py diff --git a/examples/model_selection.py b/examples/model_selection.py index 08fbe34..130b542 100644 --- a/examples/model_selection.py +++ b/examples/model_selection.py @@ -12,12 +12,11 @@ from time import time In this example, we show how to perform model selection on a DistributionMatching quantifier. """ -model = KDEyML(LogisticRegression()) +model = DMy(LogisticRegression()) qp.environ['SAMPLE_SIZE'] = 100 qp.environ['N_JOBS'] = -1 -# training, test = qp.datasets.fetch_reviews('imdb', tfidf=True, min_df=5).train_test training, test = qp.datasets.fetch_UCIMulticlassDataset('letter').train_test with qp.util.temp_seed(0): @@ -34,19 +33,21 @@ with qp.util.temp_seed(0): # We will explore a classification-dependent hyper-parameter (e.g., the 'C' # hyper-parameter of LogisticRegression) and a quantification-dependent hyper-parameter - # (e.g., the number of bins in a DistributionMatching quantifier. + # (e.g., the number of bins in a DistributionMatching quantifier). # Classifier-dependent hyper-parameters have to be marked with a prefix "classifier__" # in order to let the quantifier know this hyper-parameter belongs to its underlying # classifier. + # We consider 7 values for the classifier and 7 values for the quantifier. + # QuaPy is optimized so that only 7 classifiers are trained, and then reused to test the + # different configurations of the quantifier. In other words, quapy avoids to train + # the classifier 7x7 times. param_grid = { 'classifier__C': np.logspace(-3,3,7), - 'classifier__class_weight': ['balanced', None], - 'bandwidth': np.linspace(0.01, 0.2, 20), + 'nbins': [2, 3, 4, 5, 10, 15, 20] } tinit = time() - # model = OLD_GridSearchQ( model = qp.model_selection.GridSearchQ( model=model, param_grid=param_grid, diff --git a/quapy/data/base.py b/quapy/data/base.py index 2629084..e52230e 100644 --- a/quapy/data/base.py +++ b/quapy/data/base.py @@ -123,7 +123,7 @@ class LabelledCollection: if len(prevs) == self.n_classes - 1: prevs = prevs + (1 - sum(prevs),) assert len(prevs) == self.n_classes, 'unexpected number of prevalences' - assert sum(prevs) == 1, f'prevalences ({prevs}) wrong range (sum={sum(prevs)})' + assert np.isclose(sum(prevs), 1), f'prevalences ({prevs}) wrong range (sum={sum(prevs)})' # Decide how many instances should be taken for each class in order to satisfy the requested prevalence # accurately, and the number of instances in the sample (exactly). 
If int(size * prevs[i]) (which is diff --git a/quapy/data/datasets.py b/quapy/data/datasets.py index 5b9806f..0f732e8 100644 --- a/quapy/data/datasets.py +++ b/quapy/data/datasets.py @@ -50,7 +50,9 @@ UCI_MULTICLASS_DATASETS = ['dry-bean', 'digits', 'letter'] -LEQUA2022_TASKS = ['T1A', 'T1B', 'T2A', 'T2B'] +LEQUA2022_VECTOR_TASKS = ['T1A', 'T1B'] +LEQUA2022_TEXT_TASKS = ['T2A', 'T2B'] +LEQUA2022_TASKS = LEQUA2022_VECTOR_TASKS + LEQUA2022_TEXT_TASKS _TXA_SAMPLE_SIZE = 250 _TXB_SAMPLE_SIZE = 1000 @@ -209,7 +211,7 @@ def fetch_UCIBinaryDataset(dataset_name, data_home=None, test_split=0.3, verbose :return: a :class:`quapy.data.base.Dataset` instance """ data = fetch_UCIBinaryLabelledCollection(dataset_name, data_home, verbose) - return Dataset(*data.split_stratified(1 - test_split, random_state=0)) + return Dataset(*data.split_stratified(1 - test_split, random_state=0), name=dataset_name) def fetch_UCIBinaryLabelledCollection(dataset_name, data_home=None, verbose=False) -> LabelledCollection: @@ -583,7 +585,7 @@ def fetch_UCIMulticlassDataset(dataset_name, data_home=None, test_split=0.3, ver :return: a :class:`quapy.data.base.Dataset` instance """ data = fetch_UCIMulticlassLabelledCollection(dataset_name, data_home, verbose) - return Dataset(*data.split_stratified(1 - test_split, random_state=0)) + return Dataset(*data.split_stratified(1 - test_split, random_state=0), name=dataset_name) def fetch_UCIMulticlassLabelledCollection(dataset_name, data_home=None, verbose=False) -> LabelledCollection: diff --git a/quapy/functional.py b/quapy/functional.py index 856534a..fa17a5c 100644 --- a/quapy/functional.py +++ b/quapy/functional.py @@ -189,6 +189,19 @@ def check_prevalence_vector(prevalences: ArrayLike, raise_exception: bool=False, return valid +def uniform_prevalence(n_classes): + """ + Returns a vector representing the uniform distribution for `n_classes` + + :param n_classes: number of classes + :return: np.ndarray with all values 1/n_classes + """ + assert isinstance(n_classes, int) and n_classes>0, \ + (f'param {n_classes} not understood; must be a positive integer representing the ' + f'number of classes ') + return np.full(shape=n_classes, fill_value=1./n_classes) + + def normalize_prevalence(prevalences: ArrayLike, method='l1'): """ Normalizes a vector or matrix of prevalence values. The normalization consists of applying a L1 normalization in @@ -606,3 +619,5 @@ def solve_adjustment( raise ValueError(f"Solver {solver} not known.") else: raise ValueError(f'unknown {solver=}') + + diff --git a/quapy/method/__init__.py b/quapy/method/__init__.py index e0d5c1f..e1d6309 100644 --- a/quapy/method/__init__.py +++ b/quapy/method/__init__.py @@ -3,6 +3,7 @@ from . import aggregative from . import non_aggregative from . import meta + AGGREGATIVE_METHODS = { aggregative.CC, aggregative.ACC, diff --git a/quapy/method/_threshold_optim.py b/quapy/method/_threshold_optim.py index 6a38fdb..a9d2723 100644 --- a/quapy/method/_threshold_optim.py +++ b/quapy/method/_threshold_optim.py @@ -27,7 +27,7 @@ class ThresholdOptimization(BinaryAggregativeQuantifier): :class:`quapy.data.base.LabelledCollection` (the split itself). 
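A minimal sketch (not part of the patch) of why the assertion in quapy/data/base.py above now uses np.isclose: an exact floating-point comparison can reject perfectly valid prevalence vectors.

import numpy as np

# ten classes at prevalence 0.1 each: the mathematical sum is 1, the floating-point sum is not
prevs = [0.1] * 10
print(sum(prevs))                 # 0.9999999999999999
print(sum(prevs) == 1)            # False -> the old exact check would reject this vector
print(np.isclose(sum(prevs), 1))  # True  -> the patched check accepts it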
""" - def __init__(self, classifier: BaseEstimator, val_split=5, n_jobs=None): + def __init__(self, classifier: BaseEstimator, val_split=None, n_jobs=None): self.classifier = classifier self.val_split = val_split self.n_jobs = qp._get_njobs(n_jobs) diff --git a/quapy/method/aggregative.py b/quapy/method/aggregative.py index 5a7812d..3470726 100644 --- a/quapy/method/aggregative.py +++ b/quapy/method/aggregative.py @@ -82,6 +82,13 @@ class AggregativeQuantifier(BaseQuantifier, ABC): :param data: a :class:`quapy.data.base.LabelledCollection` consisting of the training data :param fit_classifier: whether to train the learner (default is True). Set to False if the learner has been trained outside the quantifier. + :param val_split: specifies the data used for generating classifier predictions. This specification + can be made as float in (0, 1) indicating the proportion of stratified held-out validation set to + be extracted from the training set; or as an integer (default 5), indicating that the predictions + are to be generated in a `k`-fold cross-validation manner (with this integer indicating the value + for `k`); or as a collection defining the specific set of data to use for validation. + Alternatively, this set can be specified at fit time by indicating the exact set of data + on which the predictions are to be generated. :return: self """ self._check_init_parameters() @@ -111,6 +118,12 @@ class AggregativeQuantifier(BaseQuantifier, ABC): if fit_classifier: self._check_non_empty_classes(data) + if predict_on is None: + if not fit_classifier: + predict_on = data + if isinstance(self.val_split, LabelledCollection) and self.val_split!=predict_on: + raise ValueError(f'{fit_classifier=} but a LabelledCollection was provided as val_split ' + f'in __init__ that is not the same as the LabelledCollection provided in fit.') if predict_on is None: predict_on = self.val_split @@ -467,7 +480,7 @@ class ACC(AggregativeCrispQuantifier): if self.method not in ACC.METHODS: raise ValueError(f"unknown method; valid ones are {ACC.METHODS}") if self.norm not in ACC.NORMALIZATIONS: - raise ValueError(f"unknown clipping; valid ones are {ACC.NORMALIZATIONS}") + raise ValueError(f"unknown normalization; valid ones are {ACC.NORMALIZATIONS}") def aggregation_fit(self, classif_predictions: LabelledCollection, data: LabelledCollection): """ @@ -577,8 +590,8 @@ class PACC(AggregativeSoftQuantifier): raise ValueError(f"unknown solver; valid ones are {ACC.SOLVERS}") if self.method not in ACC.METHODS: raise ValueError(f"unknown method; valid ones are {ACC.METHODS}") - if self.clipping not in ACC.NORMALIZATIONS: - raise ValueError(f"unknown clipping; valid ones are {ACC.NORMALIZATIONS}") + if self.norm not in ACC.NORMALIZATIONS: + raise ValueError(f"unknown normalization; valid ones are {ACC.NORMALIZATIONS}") def aggregation_fit(self, classif_predictions: LabelledCollection, data: LabelledCollection): """ diff --git a/quapy/method/base.py b/quapy/method/base.py index f34acf6..58cd6f1 100644 --- a/quapy/method/base.py +++ b/quapy/method/base.py @@ -54,7 +54,7 @@ class OneVsAll: pass -def newOneVsAll(binary_quantifier, n_jobs=None): +def newOneVsAll(binary_quantifier: BaseQuantifier, n_jobs=None): assert isinstance(binary_quantifier, BaseQuantifier), \ f'{binary_quantifier} does not seem to be a Quantifier' if isinstance(binary_quantifier, qp.method.aggregative.AggregativeQuantifier): @@ -69,7 +69,7 @@ class OneVsAllGeneric(OneVsAll, BaseQuantifier): quantifier for each class, and then l1-normalizes the outputs so that 
the class prevelence values sum up to 1. """ - def __init__(self, binary_quantifier, n_jobs=None): + def __init__(self, binary_quantifier: BaseQuantifier, n_jobs=None): assert isinstance(binary_quantifier, BaseQuantifier), \ f'{binary_quantifier} does not seem to be a Quantifier' if isinstance(binary_quantifier, qp.method.aggregative.AggregativeQuantifier): diff --git a/quapy/tests/test_base.py b/quapy/tests/test_base.py deleted file mode 100644 index 4fd9faa..0000000 --- a/quapy/tests/test_base.py +++ /dev/null @@ -1,5 +0,0 @@ -import pytest - -def test_import(): - import quapy as qp - assert qp.__version__ is not None diff --git a/quapy/tests/test_datasets.py b/quapy/tests/test_datasets.py deleted file mode 100644 index 4ed5aa9..0000000 --- a/quapy/tests/test_datasets.py +++ /dev/null @@ -1,61 +0,0 @@ -import pytest - -from quapy.data.datasets import REVIEWS_SENTIMENT_DATASETS, TWITTER_SENTIMENT_DATASETS_TEST, \ - TWITTER_SENTIMENT_DATASETS_TRAIN, UCI_BINARY_DATASETS, LEQUA2022_TASKS, UCI_MULTICLASS_DATASETS,\ - fetch_reviews, fetch_twitter, fetch_UCIBinaryDataset, fetch_lequa2022, fetch_UCIMulticlassLabelledCollection - - -@pytest.mark.parametrize('dataset_name', REVIEWS_SENTIMENT_DATASETS) -def test_fetch_reviews(dataset_name): - dataset = fetch_reviews(dataset_name) - print(f'Dataset {dataset_name}') - print('Training set stats') - dataset.training.stats() - print('Test set stats') - dataset.test.stats() - - -@pytest.mark.parametrize('dataset_name', TWITTER_SENTIMENT_DATASETS_TEST + TWITTER_SENTIMENT_DATASETS_TRAIN) -def test_fetch_twitter(dataset_name): - try: - dataset = fetch_twitter(dataset_name) - except ValueError as ve: - if dataset_name == 'semeval' and ve.args[0].startswith( - 'dataset "semeval" can only be used for model selection.'): - dataset = fetch_twitter(dataset_name, for_model_selection=True) - print(f'Dataset {dataset_name}') - print('Training set stats') - dataset.training.stats() - print('Test set stats') - - -@pytest.mark.parametrize('dataset_name', UCI_BINARY_DATASETS) -def test_fetch_UCIDataset(dataset_name): - try: - dataset = fetch_UCIBinaryDataset(dataset_name) - except FileNotFoundError as fnfe: - if dataset_name == 'pageblocks.5' and fnfe.args[0].find( - 'If this is the first time you attempt to load this dataset') > 0: - print('The pageblocks.5 dataset requires some hand processing to be usable, skipping this test.') - return - print(f'Dataset {dataset_name}') - print('Training set stats') - dataset.training.stats() - print('Test set stats') - - -@pytest.mark.parametrize('dataset_name', UCI_MULTICLASS_DATASETS) -def test_fetch_UCIMultiDataset(dataset_name): - dataset = fetch_UCIMulticlassLabelledCollection(dataset_name) - print(f'Dataset {dataset_name}') - print('Training set stats') - dataset.stats() - print('Test set stats') - - -@pytest.mark.parametrize('dataset_name', LEQUA2022_TASKS) -def test_fetch_lequa2022(dataset_name): - train, gen_val, gen_test = fetch_lequa2022(dataset_name) - print(train.stats()) - print('Val:', gen_val.total()) - print('Test:', gen_test.total()) diff --git a/quapy/tests/test_hierarchy.py b/quapy/tests/test_hierarchy.py deleted file mode 100644 index 0797729..0000000 --- a/quapy/tests/test_hierarchy.py +++ /dev/null @@ -1,65 +0,0 @@ -import unittest -from sklearn.linear_model import LogisticRegression - -from quapy.method import AGGREGATIVE_METHODS, BINARY_METHODS -from quapy.method.aggregative import * -import inspect - - -class HierarchyTestCase(unittest.TestCase): - - def test_aggregative(self): - lr = LogisticRegression() 
- for m in AGGREGATIVE_METHODS: - self.assertEqual(isinstance(m(lr), AggregativeQuantifier), True) - - def test_inspect_aggregative(self): - - import quapy.method.aggregative as aggregative - - members = inspect.getmembers(aggregative) - classes = set([cls for name, cls in members if inspect.isclass(cls)]) - quantifiers = [cls for cls in classes if issubclass(cls, BaseQuantifier)] - quantifiers = [cls for cls in quantifiers if issubclass(cls, AggregativeQuantifier)] - quantifiers = [cls for cls in quantifiers if not inspect.isabstract(cls) ] - - for cls in quantifiers: - self.assertIn(cls, AGGREGATIVE_METHODS) - - def test_binary(self): - lr = LogisticRegression() - for m in BINARY_METHODS: - self.assertEqual(isinstance(m(lr), BinaryQuantifier), True) - - def test_inspect_binary(self): - - import quapy.method.base as base - import quapy.method.aggregative as aggregative - import quapy.method.non_aggregative as non_aggregative - import quapy.method.meta as meta - - members = inspect.getmembers(base) - members+= inspect.getmembers(aggregative) - members += inspect.getmembers(non_aggregative) - members += inspect.getmembers(meta) - classes = set([cls for name, cls in members if inspect.isclass(cls)]) - quantifiers = [cls for cls in classes if issubclass(cls, BaseQuantifier)] - quantifiers = [cls for cls in quantifiers if issubclass(cls, BinaryQuantifier)] - quantifiers = [cls for cls in quantifiers if not inspect.isabstract(cls) ] - - for cls in quantifiers: - self.assertIn(cls, BINARY_METHODS) - - def test_probabilistic(self): - lr = LogisticRegression() - for m in [CC(lr), ACC(lr)]: - self.assertEqual(isinstance(m, AggregativeCrispQuantifier), True) - self.assertEqual(isinstance(m, AggregativeSoftQuantifier), False) - for m in [PCC(lr), PACC(lr)]: - self.assertEqual(isinstance(m, AggregativeCrispQuantifier), False) - self.assertEqual(isinstance(m, AggregativeSoftQuantifier), True) - - -if __name__ == '__main__': - unittest.main() - diff --git a/quapy/tests/test_methods.py b/quapy/tests/test_methods.py deleted file mode 100644 index 3fbe991..0000000 --- a/quapy/tests/test_methods.py +++ /dev/null @@ -1,234 +0,0 @@ -import numpy as np -import pytest -from sklearn.linear_model import LogisticRegression -from sklearn.svm import LinearSVC - -import method.aggregative -import quapy as qp -from quapy.model_selection import GridSearchQ -from quapy.method.base import BinaryQuantifier -from quapy.data import Dataset, LabelledCollection -from quapy.method import AGGREGATIVE_METHODS, NON_AGGREGATIVE_METHODS -from quapy.method.meta import Ensemble -from quapy.protocol import APP -from quapy.method.aggregative import DMy -from quapy.method.meta import MedianEstimator - -# datasets = [pytest.param(qp.datasets.fetch_twitter('hcr', pickle=True), id='hcr'), -# pytest.param(qp.datasets.fetch_UCIDataset('ionosphere'), id='ionosphere')] - -tinydatasets = [pytest.param(qp.datasets.fetch_twitter('hcr', pickle=True).reduce(), id='tiny_hcr'), - pytest.param(qp.datasets.fetch_UCIBinaryDataset('ionosphere').reduce(), id='tiny_ionosphere')] - -learners = [LogisticRegression, LinearSVC] - - -@pytest.mark.parametrize('dataset', tinydatasets) -@pytest.mark.parametrize('aggregative_method', AGGREGATIVE_METHODS) -@pytest.mark.parametrize('learner', learners) -def test_aggregative_methods(dataset: Dataset, aggregative_method, learner): - model = aggregative_method(learner()) - - if isinstance(model, BinaryQuantifier) and not dataset.binary: - print(f'skipping the test of binary model {type(model)} on non-binary dataset 
{dataset}') - return - - model.fit(dataset.training) - - estim_prevalences = model.quantify(dataset.test.instances) - - true_prevalences = dataset.test.prevalence() - error = qp.error.mae(true_prevalences, estim_prevalences) - - assert type(error) == np.float64 - - -@pytest.mark.parametrize('dataset', tinydatasets) -@pytest.mark.parametrize('non_aggregative_method', NON_AGGREGATIVE_METHODS) -def test_non_aggregative_methods(dataset: Dataset, non_aggregative_method): - model = non_aggregative_method() - - if isinstance(model, BinaryQuantifier) and not dataset.binary: - print(f'skipping the test of binary model {model} on non-binary dataset {dataset}') - return - - model.fit(dataset.training) - - estim_prevalences = model.quantify(dataset.test.instances) - - true_prevalences = dataset.test.prevalence() - error = qp.error.mae(true_prevalences, estim_prevalences) - - assert type(error) == np.float64 - - -@pytest.mark.parametrize('base_method', [method.aggregative.ACC, method.aggregative.PACC]) -@pytest.mark.parametrize('learner', [LogisticRegression]) -@pytest.mark.parametrize('dataset', tinydatasets) -@pytest.mark.parametrize('policy', Ensemble.VALID_POLICIES) -def test_ensemble_method(base_method, learner, dataset: Dataset, policy): - - qp.environ['SAMPLE_SIZE'] = 20 - - base_quantifier=base_method(learner()) - - if not dataset.binary and policy=='ds': - print(f'skipping the test of binary policy ds on non-binary dataset {dataset}') - return - - model = Ensemble(quantifier=base_quantifier, size=3, policy=policy, n_jobs=-1) - - model.fit(dataset.training) - - estim_prevalences = model.quantify(dataset.test.instances) - - true_prevalences = dataset.test.prevalence() - error = qp.error.mae(true_prevalences, estim_prevalences) - - assert type(error) == np.float64 - - -def test_quanet_method(): - try: - import quapy.classification.neural - except ModuleNotFoundError: - print('skipping QuaNet test due to missing torch package') - return - - qp.environ['SAMPLE_SIZE'] = 100 - - # load the kindle dataset as text, and convert words to numerical indexes - dataset = qp.datasets.fetch_reviews('kindle', pickle=True).reduce(200, 200) - qp.data.preprocessing.index(dataset, min_df=5, inplace=True) - - from quapy.classification.neural import CNNnet - cnn = CNNnet(dataset.vocabulary_size, dataset.n_classes) - - from quapy.classification.neural import NeuralClassifierTrainer - learner = NeuralClassifierTrainer(cnn, device='cuda') - - from quapy.method.meta import QuaNet - model = QuaNet(learner, device='cuda') - - if isinstance(model, BinaryQuantifier) and not dataset.binary: - print(f'skipping the test of binary model {model} on non-binary dataset {dataset}') - return - - model.fit(dataset.training) - - estim_prevalences = model.quantify(dataset.test.instances) - - true_prevalences = dataset.test.prevalence() - error = qp.error.mae(true_prevalences, estim_prevalences) - - assert type(error) == np.float64 - - -def test_str_label_names(): - model = qp.method.aggregative.CC(LogisticRegression()) - - dataset = qp.datasets.fetch_reviews('imdb', pickle=True) - dataset = Dataset(dataset.training.sampling(1000, *dataset.training.prevalence()), - dataset.test.sampling(1000, 0.25, 0.75)) - qp.data.preprocessing.text2tfidf(dataset, min_df=5, inplace=True) - - np.random.seed(0) - model.fit(dataset.training) - - int_estim_prevalences = model.quantify(dataset.test.instances) - true_prevalences = dataset.test.prevalence() - - error = qp.error.mae(true_prevalences, int_estim_prevalences) - assert type(error) == np.float64 - - 
dataset_str = Dataset(LabelledCollection(dataset.training.instances, - ['one' if label == 1 else 'zero' for label in dataset.training.labels]), - LabelledCollection(dataset.test.instances, - ['one' if label == 1 else 'zero' for label in dataset.test.labels])) - assert all(dataset_str.training.classes_ == dataset_str.test.classes_), 'wrong indexation' - np.random.seed(0) - model.fit(dataset_str.training) - - str_estim_prevalences = model.quantify(dataset_str.test.instances) - true_prevalences = dataset_str.test.prevalence() - - error = qp.error.mae(true_prevalences, str_estim_prevalences) - assert type(error) == np.float64 - - print(true_prevalences) - print(int_estim_prevalences) - print(str_estim_prevalences) - - np.testing.assert_almost_equal(int_estim_prevalences[1], - str_estim_prevalences[list(model.classes_).index('one')]) - -# helper -def __fit_test(quantifier, train, test): - quantifier.fit(train) - test_samples = APP(test) - true_prevs, estim_prevs = qp.evaluation.prediction(quantifier, test_samples) - return qp.error.mae(true_prevs, estim_prevs), estim_prevs - - -def test_median_meta(): - """ - This test compares the performance of the MedianQuantifier with respect to computing the median of the predictions - of a differently parameterized quantifier. We use the DistributionMatching base quantifier and the median is - computed across different values of nbins - """ - - qp.environ['SAMPLE_SIZE'] = 100 - - # grid of values - nbins_grid = list(range(2, 11)) - - dataset = 'kindle' - train, test = qp.datasets.fetch_reviews(dataset, tfidf=True, min_df=10).train_test - prevs = [] - errors = [] - for nbins in nbins_grid: - with qp.util.temp_seed(0): - q = DMy(LogisticRegression(), nbins=nbins) - mae, estim_prevs = __fit_test(q, train, test) - prevs.append(estim_prevs) - errors.append(mae) - print(f'{dataset} DistributionMatching(nbins={nbins}) got MAE {mae:.4f}') - prevs = np.asarray(prevs) - mae = np.mean(errors) - print(f'\tMAE={mae:.4f}') - - q = DMy(LogisticRegression()) - q = MedianEstimator(q, param_grid={'nbins': nbins_grid}, random_state=0, n_jobs=-1) - median_mae, prev = __fit_test(q, train, test) - print(f'\tMAE={median_mae:.4f}') - - np.testing.assert_almost_equal(np.median(prevs, axis=0), prev) - assert median_mae < mae, 'the median-based quantifier provided a higher error...' - - -def test_median_meta_modsel(): - """ - This test checks the median-meta quantifier with model selection - """ - - qp.environ['SAMPLE_SIZE'] = 100 - - dataset = 'kindle' - train, test = qp.datasets.fetch_reviews(dataset, tfidf=True, min_df=10).train_test - train, val = train.split_stratified(random_state=0) - - nbins_grid = [2, 4, 5, 10, 15] - - q = DMy(LogisticRegression()) - q = MedianEstimator(q, param_grid={'nbins': nbins_grid}, random_state=0, n_jobs=-1) - median_mae, _ = __fit_test(q, train, test) - print(f'\tMAE={median_mae:.4f}') - - q = DMy(LogisticRegression()) - lr_params = {'classifier__C': np.logspace(-1, 1, 3)} - q = MedianEstimator(q, param_grid={'nbins': nbins_grid}, random_state=0, n_jobs=-1) - q = GridSearchQ(q, param_grid=lr_params, protocol=APP(val), n_jobs=-1) - optimized_median_ave, _ = __fit_test(q, train, test) - print(f'\tMAE={optimized_median_ave:.4f}') - - assert optimized_median_ave < median_mae, "the optimized method yielded worse performance..." 
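The __fit_test helper above condenses the evaluation pattern these (now removed) tests relied on; here is a standalone sketch of that pattern with an arbitrary quantifier and dataset, assuming nothing beyond what the deleted code already used.

import quapy as qp
from sklearn.linear_model import LogisticRegression
from quapy.method.aggregative import DMy
from quapy.protocol import APP

qp.environ['SAMPLE_SIZE'] = 100

train, test = qp.datasets.fetch_UCIBinaryDataset('ionosphere').reduce().train_test
quantifier = DMy(LogisticRegression(), nbins=8).fit(train)

# draw artificial-prevalence samples from the test set and score the quantifier
true_prevs, estim_prevs = qp.evaluation.prediction(quantifier, APP(test))
print('MAE =', qp.error.mae(true_prevs, estim_prevs))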
\ No newline at end of file diff --git a/quapy/tests/test_modsel.py b/quapy/tests/test_modsel.py index 75cfaaf..fe416c7 100644 --- a/quapy/tests/test_modsel.py +++ b/quapy/tests/test_modsel.py @@ -2,9 +2,9 @@ import unittest import numpy as np from sklearn.linear_model import LogisticRegression -from sklearn.svm import SVC import quapy as qp +import util from quapy.method.aggregative import PACC from quapy.model_selection import GridSearchQ from quapy.protocol import APP @@ -14,13 +14,16 @@ import time class ModselTestCase(unittest.TestCase): def test_modsel(self): + """ + Checks whether a model selection exploration takes a good hyperparameter + """ q = PACC(LogisticRegression(random_state=1, max_iter=5000)) - data = qp.datasets.fetch_reviews('imdb', tfidf=True, min_df=10) + data = qp.datasets.fetch_reviews('imdb', tfidf=True, min_df=10).reduce() training, validation = data.training.split_stratified(0.7, random_state=1) - param_grid = {'classifier__C': np.logspace(-3,3,7)} + param_grid = {'classifier__C': [0.000001, 10.]} app = APP(validation, sample_size=100, random_state=1) q = GridSearchQ( q, param_grid, protocol=app, error='mae', refit=True, timeout=-1, verbose=True @@ -32,54 +35,40 @@ class ModselTestCase(unittest.TestCase): self.assertEqual(q.best_model().get_params()['classifier__C'], 10.0) def test_modsel_parallel(self): + """ + Checks whether a parallelized model selection actually is faster than a sequential exploration but + obtains the same optimal parameters + """ q = PACC(LogisticRegression(random_state=1, max_iter=5000)) - data = qp.datasets.fetch_reviews('imdb', tfidf=True, min_df=10) + data = qp.datasets.fetch_reviews('imdb', tfidf=True, min_df=10).reduce(n_train=500) training, validation = data.training.split_stratified(0.7, random_state=1) - # test = data.test param_grid = {'classifier__C': np.logspace(-3,3,7)} app = APP(validation, sample_size=100, random_state=1) - q = GridSearchQ( + + print('starting model selection in sequential exploration') + tinit = time.time() + modsel = GridSearchQ( + q, param_grid, protocol=app, error='mae', refit=True, timeout=-1, n_jobs=1, verbose=True + ).fit(training) + tend_seq = time.time()-tinit + best_c_seq = modsel.best_params_['classifier__C'] + print(f'[done] took {tend_seq:.2f}s best C = {best_c_seq}') + + print('starting model selection in parallel exploration') + tinit = time.time() + modsel = GridSearchQ( q, param_grid, protocol=app, error='mae', refit=True, timeout=-1, n_jobs=-1, verbose=True ).fit(training) - print('best params', q.best_params_) - print('best score', q.best_score_) + tend_par = time.time() - tinit + best_c_par = modsel.best_params_['classifier__C'] + print(f'[done] took {tend_par:.2f}s best C = {best_c_par}') - self.assertEqual(q.best_params_['classifier__C'], 10.0) - self.assertEqual(q.best_model().get_params()['classifier__C'], 10.0) + self.assertEqual(best_c_seq, best_c_par) + self.assertLess(tend_par, tend_seq) - def test_modsel_parallel_speedup(self): - class SlowLR(LogisticRegression): - def fit(self, X, y, sample_weight=None): - time.sleep(1) - return super(SlowLR, self).fit(X, y, sample_weight) - - q = PACC(SlowLR(random_state=1, max_iter=5000)) - - data = qp.datasets.fetch_reviews('imdb', tfidf=True, min_df=10) - training, validation = data.training.split_stratified(0.7, random_state=1) - - param_grid = {'classifier__C': np.logspace(-3, 3, 7)} - app = APP(validation, sample_size=100, random_state=1) - - tinit = time.time() - GridSearchQ( - q, param_grid, protocol=app, error='mae', refit=False, 
timeout=-1, n_jobs=1, verbose=True - ).fit(training) - tend_nooptim = time.time()-tinit - - tinit = time.time() - GridSearchQ( - q, param_grid, protocol=app, error='mae', refit=False, timeout=-1, n_jobs=-1, verbose=True - ).fit(training) - tend_optim = time.time() - tinit - - print(f'parallel training took {tend_optim:.4f}s') - print(f'sequential training took {tend_nooptim:.4f}s') - - self.assertEqual(tend_optim < (0.5*tend_nooptim), True) def test_modsel_timeout(self): @@ -91,11 +80,10 @@ class ModselTestCase(unittest.TestCase): q = PACC(SlowLR()) - data = qp.datasets.fetch_reviews('imdb', tfidf=True, min_df=10) + data = qp.datasets.fetch_reviews('imdb', tfidf=True, min_df=10).reduce() training, validation = data.training.split_stratified(0.7, random_state=1) - # test = data.test - param_grid = {'classifier__C': np.logspace(-3,3,7)} + param_grid = {'classifier__C': np.logspace(-1,1,3)} app = APP(validation, sample_size=100, random_state=1) print('Expecting TimeoutError to be raised') diff --git a/quapy/tests/test_replicability.py b/quapy/tests/test_replicability.py index 8633fc4..434d44b 100644 --- a/quapy/tests/test_replicability.py +++ b/quapy/tests/test_replicability.py @@ -8,7 +8,7 @@ from quapy.method.aggregative import PACC import quapy.functional as F -class MyTestCase(unittest.TestCase): +class TestReplicability(unittest.TestCase): def test_prediction_replicability(self): @@ -26,7 +26,7 @@ class MyTestCase(unittest.TestCase): prev2 = pacc.fit(dataset.training).quantify(dataset.test.X) str_prev2 = strprev(prev2, prec=5) - self.assertEqual(str_prev1, str_prev2) # add assertion here + self.assertEqual(str_prev1, str_prev2) def test_samping_replicability(self): @@ -78,7 +78,7 @@ class MyTestCase(unittest.TestCase): def test_parallel_replicability(self): - train, test = qp.datasets.fetch_UCIMulticlassDataset('dry-bean').train_test + train, test = qp.datasets.fetch_UCIMulticlassDataset('dry-bean').reduce().train_test test = test.sampling(500, *[0.1, 0.0, 0.1, 0.1, 0.2, 0.5, 0.0])
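A minimal sketch of the sampling replicability these tests build on; the dataset, sample size and prevalence vector are borrowed from the dry-bean example above, and temp_seed is just one way (the one used elsewhere in the patch) of fixing the random state.

import quapy as qp

train, test = qp.datasets.fetch_UCIMulticlassDataset('dry-bean').reduce().train_test

with qp.util.temp_seed(0):
    sample1 = test.sampling(500, *[0.1, 0.0, 0.1, 0.1, 0.2, 0.5, 0.0])
with qp.util.temp_seed(0):
    sample2 = test.sampling(500, *[0.1, 0.0, 0.1, 0.1, 0.2, 0.5, 0.0])

# seeding the global RNG identically yields the very same sample
assert (sample1.X == sample2.X).all()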