From 265fcc2d92f225abf39f2946b8aa764502016994 Mon Sep 17 00:00:00 2001 From: Alejandro Moreo Date: Sun, 13 Jul 2025 14:27:14 +0200 Subject: [PATCH] tests passed; working on examples --- TODO.txt | 4 +++ examples/1.model_selection.py | 6 ++++ examples/2.custom_quantifier.py | 10 +++--- examples/4.lequa2022_experiments.py | 7 ++-- examples/4b.lequa2024_experiments.py | 3 +- examples/5.explicit_loss_minimization.py | 4 +-- quapy/data/datasets.py | 46 +++++++++++++++++++----- quapy/method/aggregative.py | 2 ++ quapy/model_selection.py | 2 +- quapy/tests/test_datasets.py | 39 ++++++++++++-------- quapy/tests/test_evaluation.py | 10 +++--- quapy/tests/test_methods.py | 7 ++-- quapy/tests/test_modsel.py | 10 +++--- quapy/tests/test_protocols.py | 2 +- quapy/tests/test_replicability.py | 17 ++++----- 15 files changed, 113 insertions(+), 56 deletions(-) diff --git a/TODO.txt b/TODO.txt index ef49173..0e3b6af 100644 --- a/TODO.txt +++ b/TODO.txt @@ -1,3 +1,7 @@ +Adapt examples; remaining: example 4-onwards + +Add 'platt' to calib options in EMQ? + Allow n_prevpoints in APP to be specified by a user-defined grid? Update READMEs, wiki, & examples for new fit-predict interface diff --git a/examples/1.model_selection.py b/examples/1.model_selection.py index 94225df..47a7620 100644 --- a/examples/1.model_selection.py +++ b/examples/1.model_selection.py @@ -23,6 +23,12 @@ print(f'running model selection with N_JOBS={qp.environ["N_JOBS"]}; ' training, test = qp.datasets.fetch_UCIMulticlassDataset('letter').train_test +# evaluation in terms of MAE with default hyperparameters +model.fit(*training.Xy) +mae_score = qp.evaluation.evaluate(model, protocol=UPP(test), error_metric='mae') +print(f'MAE (non optimized)={mae_score:.5f}') + + with qp.util.temp_seed(0): # The model will be returned by the fit method of GridSearchQ. diff --git a/examples/2.custom_quantifier.py b/examples/2.custom_quantifier.py index 09fa71f..ac6f7b5 100644 --- a/examples/2.custom_quantifier.py +++ b/examples/2.custom_quantifier.py @@ -31,8 +31,7 @@ class MyQuantifier(BaseQuantifier): self.alpha = alpha self.classifier = classifier - # in general, we would need to implement the method fit(self, data: LabelledCollection, fit_classifier=True, - # val_split=None); this would amount to: + # in general, we would need to implement the method fit(self, X, y); this would amount to: def fit(self, X, y): n_classes = F.num_classes_from_labels(y) assert n_classes==2, \ @@ -61,8 +60,9 @@ class MyQuantifier(BaseQuantifier): class MyAggregativeSoftQuantifier(AggregativeSoftQuantifier, BinaryQuantifier): def __init__(self, classifier, alpha=0.5): - # aggregative quantifiers have an internal attribute called self.classifier - self.classifier = classifier + # aggregative quantifiers have an internal attribute called self.classifier, but this is defined + # within the super's init + super().__init__(classifier, fit_classifier=True, val_split=None) self.alpha = alpha # since this method is of type aggregative, we can simply implement the method aggregation_fit, which @@ -144,7 +144,7 @@ if __name__ == '__main__': evaluation took 4.66s [MAE = 0.0630] """ # Note that the first implementation is much slower, both in terms of grid-search optimization and in terms of - # evaluation. The reason why is that QuaPy is highly optimized for aggregative quantifiers (by far, the most + # evaluation. 
The reason why is that QuaPy is highly optimized for aggregative quantifiers (by far, the most
     # popular type of quantification methods), thus significantly speeding up model selection and test routines.
     # Furthermore, it is simpler to extend an aggregation type since QuaPy implements boilerplate functions for you.
diff --git a/examples/4.lequa2022_experiments.py b/examples/4.lequa2022_experiments.py
index f3eec55..152f072 100644
--- a/examples/4.lequa2022_experiments.py
+++ b/examples/4.lequa2022_experiments.py
@@ -15,7 +15,7 @@ https://lequa2022.github.io/index (the site of the competition)
 https://ceur-ws.org/Vol-3180/paper-146.pdf (the overview paper)
 """
 
-# there are 4 tasks (T1A, T1B, T2A, T2B)
+# there are 4 tasks (T1A, T1B, T2A, T2B), let us simply consider T1A (binary quantification, vector form)
 task = 'T1A'
 
 # set the sample size in the environment. The sample size is task-dependent and can be consulted by doing:
@@ -28,6 +28,7 @@ qp.environ['N_JOBS'] = -1
 # of SamplesFromDir, a protocol that simply iterates over pre-generated samples (those provided for the competition)
 # stored in a directory.
 training, val_generator, test_generator = fetch_lequa2022(task=task)
+Xtr, ytr = training.Xy
 
 # define the quantifier
 quantifier = EMQ(classifier=LogisticRegression())
@@ -36,10 +37,10 @@ quantifier = EMQ(classifier=LogisticRegression())
 param_grid = {
     'classifier__C': np.logspace(-3, 3, 7), # classifier-dependent: inverse of regularization strength
     'classifier__class_weight': ['balanced', None], # classifier-dependent: weights of each class
-    'recalib': ['bcts', 'platt', None] # quantifier-dependent: recalibration method (new in v0.1.7)
+    'calib': ['bcts', None] # quantifier-dependent: recalibration method (new in v0.1.7)
 }
 model_selection = GridSearchQ(quantifier, param_grid, protocol=val_generator, error='mrae', refit=False, verbose=True)
-quantifier = model_selection.fit(training)
+quantifier = model_selection.fit(Xtr, ytr)
 
 # evaluation
 report = evaluation_report(quantifier, protocol=test_generator, error_metrics=['mae', 'mrae', 'mkld'], verbose=True)
diff --git a/examples/4b.lequa2024_experiments.py b/examples/4b.lequa2024_experiments.py
index 38394e3..c5b6f92 100644
--- a/examples/4b.lequa2024_experiments.py
+++ b/examples/4b.lequa2024_experiments.py
@@ -27,6 +27,7 @@ qp.environ['N_JOBS'] = -1
 # of SamplesFromDir, a protocol that simply iterates over pre-generated samples (those provided for the competition)
 # stored in a directory.
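 # as a minimal illustrative sketch (not part of the original example, and assuming the standard QuaPy protocol
 # interface, in which iterating a protocol yields (sample, true_prevalence) pairs), the generators returned below
 # could also be consumed manually, e.g.:
 #
 #     for X_sample, true_prev in val_generator():
 #         estim_prev = quantifier.predict(X_sample)   # 'quantifier' is defined a few lines below
 #         print(qp.error.mae(true_prev, estim_prev))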
training, val_generator, test_generator = fetch_lequa2024(task=task) +Xtr, ytr = training.Xy # define the quantifier quantifier = KDEyML(classifier=LogisticRegression()) @@ -38,7 +39,7 @@ param_grid = { 'bandwidth': np.linspace(0.01, 0.2, 20) # quantifier-dependent: bandwidth of the kernel } model_selection = GridSearchQ(quantifier, param_grid, protocol=val_generator, error='mrae', refit=False, verbose=True) -quantifier = model_selection.fit(training) +quantifier = model_selection.fit(Xtr, ytr) # evaluation report = evaluation_report(quantifier, protocol=test_generator, error_metrics=['mae', 'mrae'], verbose=True) diff --git a/examples/5.explicit_loss_minimization.py b/examples/5.explicit_loss_minimization.py index f8f210d..aee318e 100644 --- a/examples/5.explicit_loss_minimization.py +++ b/examples/5.explicit_loss_minimization.py @@ -58,11 +58,11 @@ param_grid = { } print('starting model selection') model_selection = GridSearchQ(quantifier, param_grid, protocol=UPP(val), verbose=True, refit=False) -quantifier = model_selection.fit(train_modsel).best_model() +quantifier = model_selection.fit(*train_modsel.Xy).best_model() print('training on the whole training set') train, test = qp.datasets.fetch_twitter('hcr', for_model_selection=False, pickle=True).train_test -quantifier.fit(train) +quantifier.fit(*train.Xy) # evaluation mae = qp.evaluation.evaluate(quantifier, protocol=UPP(test), error_metric='mae') diff --git a/quapy/data/datasets.py b/quapy/data/datasets.py index b7fd81a..a6760a3 100644 --- a/quapy/data/datasets.py +++ b/quapy/data/datasets.py @@ -792,7 +792,7 @@ def _array_replace(arr, repl={"yes": 1, "no": 0}): def fetch_lequa2022(task, data_home=None): """ - Loads the official datasets provided for the `LeQua `_ competition. + Loads the official datasets provided for the `LeQua 2022 `_ competition. In brief, there are 4 tasks (T1A, T1B, T2A, T2B) having to do with text quantification problems. Tasks T1A and T1B provide documents in vector form, while T2A and T2B provide raw documents instead. Tasks T1A and T2A are binary sentiment quantification problems, while T2A and T2B are multiclass quantification @@ -812,7 +812,7 @@ def fetch_lequa2022(task, data_home=None): ~/quay_data/ directory) :return: a tuple `(train, val_gen, test_gen)` where `train` is an instance of :class:`quapy.data.base.LabelledCollection`, `val_gen` and `test_gen` are instances of - :class:`quapy.data._lequa2022.SamplesFromDir`, a subclass of :class:`quapy.protocol.AbstractProtocol`, + :class:`quapy.data._lequa.SamplesFromDir`, a subclass of :class:`quapy.protocol.AbstractProtocol`, that return a series of samples stored in a directory which are labelled by prevalence. """ @@ -834,7 +834,9 @@ def fetch_lequa2022(task, data_home=None): tmp_path = join(lequa_dir, task + '_tmp.zip') download_file_if_not_exists(url, tmp_path) with zipfile.ZipFile(tmp_path) as file: + print(f'Unzipping {tmp_path}...', end='') file.extractall(unzipped_path) + print(f'[done]') os.remove(tmp_path) if not os.path.exists(join(lequa_dir, task)): @@ -862,6 +864,35 @@ def fetch_lequa2022(task, data_home=None): def fetch_lequa2024(task, data_home=None, merge_T3=False): + """ + Loads the official datasets provided for the `LeQua 2024 `_ competition. + LeQua 2024 defines four tasks (T1, T2, T3, T4) related to the problem of quantification; + all tasks are affected by some type of dataset shift. Tasks T1 and T2 are akin to tasks T1A and T1B of LeQua 2022, + while T3 and T4 are new tasks introduced in LeQua 2024. 
+ + - Task T1 evaluates binary quantifiers under prior probability shift (akin to T1A of LeQua 2022). + - Task T2 evaluates single-label multi-class quantifiers (for n > 2 classes) under prior probability shift (akin to T1B of LeQua 2022). + - Task T3 evaluates ordinal quantifiers, where the classes are totally ordered. + - Task T4 also evaluates binary quantifiers, but under some mix of covariate shift and prior probability shift. + + For a broader discussion, we refer to the `online official documentation `_ + + The datasets are downloaded only once, and stored locally for future reuse. + + See `4b.lequa2024_experiments.py` provided in the example folder, which can serve as a guide on how to use these + datasets. + + :param task: a string representing the task name; valid ones are T1, T2, T3, and T4 + :param data_home: specify the quapy home directory where collections will be dumped (leave empty to use the default + ~/quapy_data/ directory) + :param merge_T3: bool, if False (default), returns a generator of training collections, corresponding to natural + groups of reviews; if True, returns one single :class:`quapy.data.base.LabelledCollection` representing the + entire training set, as a concatenation of all the training collections + :return: a tuple `(train, val_gen, test_gen)` where `train` is an instance of + :class:`quapy.data.base.LabelledCollection`, `val_gen` and `test_gen` are instances of + :class:`quapy.data._lequa.SamplesFromDir`, a subclass of :class:`quapy.protocol.AbstractProtocol`, + that return a series of samples stored in a directory which are labelled by prevalence. + """ from quapy.data._lequa import load_vector_documents_2024, SamplesFromDir, LabelledCollectionsFromDir @@ -904,11 +935,7 @@ def fetch_lequa2024(task, data_home=None, merge_T3=False): test_true_prev_path = join(lequa_dir, task, 'public', 'test_prevalences.txt') test_gen = SamplesFromDir(test_samples_path, test_true_prev_path, load_fn=load_fn) - if task != 'T3': - tr_path = join(lequa_dir, task, 'public', 'training_data.txt') - train = LabelledCollection.load(tr_path, loader_func=load_fn) - return train, val_gen, test_gen - else: + if task == 'T3': training_samples_path = join(lequa_dir, task, 'public', 'training_samples') training_true_prev_path = join(lequa_dir, task, 'public', 'training_prevalences.txt') train_gen = LabelledCollectionsFromDir(training_samples_path, training_true_prev_path, load_fn=load_fn) @@ -917,7 +944,10 @@ def fetch_lequa2024(task, data_home=None, merge_T3=False): return train, val_gen, test_gen else: return train_gen, val_gen, test_gen - + else: + tr_path = join(lequa_dir, task, 'public', 'training_data.txt') + train = LabelledCollection.load(tr_path, loader_func=load_fn) + return train, val_gen, test_gen def fetch_IFCB(single_sample_train=True, for_model_selection=False, data_home=None): diff --git a/quapy/method/aggregative.py b/quapy/method/aggregative.py index e890be9..f71dd93 100644 --- a/quapy/method/aggregative.py +++ b/quapy/method/aggregative.py @@ -784,6 +784,8 @@ class EMQ(AggregativeSoftQuantifier): def _fit_calibration(self, calibrator, P, y): n_classes = len(self.classes_) + print(y, 'Y') + print(y.dtype, 'DTYPE') if not np.issubdtype(y.dtype, np.number): y = np.searchsorted(self.classes_, y) diff --git a/quapy/model_selection.py b/quapy/model_selection.py index c8183f2..8d29877 100644 --- a/quapy/model_selection.py +++ b/quapy/model_selection.py @@ -372,7 +372,7 @@ def cross_val_predict(quantifier: BaseQuantifier, data: LabelledCollection, nfol total_prev = 
np.zeros(shape=data.n_classes)
 
     for train, test in data.kFCV(nfolds=nfolds, random_state=random_state):
-        quantifier.fit(train)
+        quantifier.fit(*train.Xy)
         fold_prev = quantifier.predict(test.X)
         rel_size = 1. * len(test) / len(data)
         total_prev += fold_prev*rel_size
diff --git a/quapy/tests/test_datasets.py b/quapy/tests/test_datasets.py
index a1910d5..cc09f16 100644
--- a/quapy/tests/test_datasets.py
+++ b/quapy/tests/test_datasets.py
@@ -52,18 +52,12 @@ class TestDatasets(unittest.TestCase):
 
     def test_UCIBinaryDataset(self):
         for dataset_name in UCI_BINARY_DATASETS:
-            try:
-                print(f'loading dataset {dataset_name}...', end='')
-                dataset = fetch_UCIBinaryDataset(dataset_name)
-                dataset.stats()
-                dataset.reduce()
-                print(f'[done]')
-                self._check_dataset(dataset)
-            except FileNotFoundError as fnfe:
-                if dataset_name == 'pageblocks.5' and fnfe.args[0].find(
-                        'If this is the first time you attempt to load this dataset') > 0:
-                    print('The pageblocks.5 dataset requires some hand processing to be usable; skipping this test.')
-                    continue
+            print(f'loading dataset {dataset_name}...', end='')
+            dataset = fetch_UCIBinaryDataset(dataset_name)
+            dataset.stats()
+            dataset.reduce()
+            print(f'[done]')
+            self._check_dataset(dataset)
 
     def test_UCIMultiDataset(self):
         for dataset_name in UCI_MULTICLASS_DATASETS:
@@ -83,7 +77,7 @@ class TestDatasets(unittest.TestCase):
             return
 
         for dataset_name in LEQUA2022_VECTOR_TASKS:
-            print(f'loading dataset {dataset_name}...', end='')
+            print(f'LeQua2022: loading dataset {dataset_name}...', end='')
             train, gen_val, gen_test = fetch_lequa2022(dataset_name)
             train.stats()
             n_classes = train.n_classes
@@ -94,7 +88,7 @@ class TestDatasets(unittest.TestCase):
             self._check_samples(gen_test, q, max_samples_test=5)
 
         for dataset_name in LEQUA2022_TEXT_TASKS:
-            print(f'loading dataset {dataset_name}...', end='')
+            print(f'LeQua2022: loading dataset {dataset_name}...', end='')
             train, gen_val, gen_test = fetch_lequa2022(dataset_name)
             train.stats()
             n_classes = train.n_classes
@@ -106,6 +100,23 @@ class TestDatasets(unittest.TestCase):
             self._check_samples(gen_val, q, max_samples_test=5, vectorizer=tfidf)
             self._check_samples(gen_test, q, max_samples_test=5, vectorizer=tfidf)
 
+    def test_lequa2024(self):
+        if os.environ.get('QUAPY_TESTS_OMIT_LARGE_DATASETS'):
+            print("omitting test_lequa2024 because QUAPY_TESTS_OMIT_LARGE_DATASETS is set")
+            return
+
+        for task in LEQUA2024_TASKS:
+            print(f'LeQua2024: loading task {task}...', end='')
+            train, gen_val, gen_test = fetch_lequa2024(task, merge_T3=True)
+            train.stats()
+            n_classes = train.n_classes
+            train = train.sampling(100, *F.uniform_prevalence(n_classes))
+            q = self.new_quantifier()
+            q.fit(*train.Xy)
+            self._check_samples(gen_val, q, max_samples_test=5)
+            self._check_samples(gen_test, q, max_samples_test=5)
+
+
     def test_IFCB(self):
         if os.environ.get('QUAPY_TESTS_OMIT_LARGE_DATASETS'):
             print("omitting test_IFCB because QUAPY_TESTS_OMIT_LARGE_DATASETS is set")
             return
diff --git a/quapy/tests/test_evaluation.py b/quapy/tests/test_evaluation.py
index 480faa1..05d661a 100644
--- a/quapy/tests/test_evaluation.py
+++ b/quapy/tests/test_evaluation.py
@@ -29,7 +29,7 @@ class EvalTestCase(unittest.TestCase):
                 time.sleep(1)
                 return super().predict_proba(X)
 
-        emq = EMQ(SlowLR()).fit(train)
+        emq = EMQ(SlowLR()).fit(*train.Xy)
 
         tinit = time()
         score = qp.evaluation.evaluate(emq, protocol, error_metric='mae', verbose=True, aggr_speedup='force')
@@ -44,11 +44,11 @@
             def predict(self, X):
                 return self.emq.predict(X)
 
-            def fit(self, data):
- self.emq.fit(data) + def fit(self, X, y): + self.emq.fit(X, y) return self - emq = NonAggregativeEMQ(SlowLR()).fit(train) + emq = NonAggregativeEMQ(SlowLR()).fit(*train.Xy) tinit = time() score = qp.evaluation.evaluate(emq, protocol, error_metric='mae', verbose=True) @@ -69,7 +69,7 @@ class EvalTestCase(unittest.TestCase): protocol = qp.protocol.APP(test, random_state=0) - q = PCC(LogisticRegression()).fit(train) + q = PCC(LogisticRegression()).fit(*train.Xy) single_errors = list(QUANTIFICATION_ERROR_SINGLE_NAMES) averaged_errors = ['m'+e for e in single_errors] diff --git a/quapy/tests/test_methods.py b/quapy/tests/test_methods.py index a156d8c..533bf1a 100644 --- a/quapy/tests/test_methods.py +++ b/quapy/tests/test_methods.py @@ -10,15 +10,17 @@ from quapy.method import AGGREGATIVE_METHODS, BINARY_METHODS, NON_AGGREGATIVE_ME from quapy.functional import check_prevalence_vector # a random selection of composed methods to test the qunfold integration +from quapy.method.composable import check_compatible_qunfold_version + from quapy.method.composable import ( ComposableQuantifier, LeastSquaresLoss, HellingerSurrogateLoss, ClassTransformer, HistogramTransformer, - CVClassifier, - check_compatible_qunfold_version + CVClassifier ) + COMPOSABLE_METHODS = [ ComposableQuantifier( # ACC LeastSquaresLoss(), @@ -70,7 +72,6 @@ class TestMethods(unittest.TestCase): self.assertTrue(check_prevalence_vector(estim_prevalences)) def test_ensembles(self): - qp.environ['SAMPLE_SIZE'] = 10 base_quantifier = ACC(LogisticRegression()) diff --git a/quapy/tests/test_modsel.py b/quapy/tests/test_modsel.py index 36b35ca..c13b665 100644 --- a/quapy/tests/test_modsel.py +++ b/quapy/tests/test_modsel.py @@ -26,7 +26,7 @@ class ModselTestCase(unittest.TestCase): app = APP(validation, sample_size=100, random_state=1) q = GridSearchQ( q, param_grid, protocol=app, error='mae', refit=False, timeout=-1, verbose=True, n_jobs=-1 - ).fit(training) + ).fit(*training.Xy) print('best params', q.best_params_) print('best score', q.best_score_) @@ -51,7 +51,7 @@ class ModselTestCase(unittest.TestCase): tinit = time.time() modsel = GridSearchQ( q, param_grid, protocol=app, error='mae', refit=False, timeout=-1, n_jobs=1, verbose=True - ).fit(training) + ).fit(*training.Xy) tend_seq = time.time()-tinit best_c_seq = modsel.best_params_['classifier__C'] print(f'[done] took {tend_seq:.2f}s best C = {best_c_seq}') @@ -60,7 +60,7 @@ class ModselTestCase(unittest.TestCase): tinit = time.time() modsel = GridSearchQ( q, param_grid, protocol=app, error='mae', refit=False, timeout=-1, n_jobs=-1, verbose=True - ).fit(training) + ).fit(*training.Xy) tend_par = time.time() - tinit best_c_par = modsel.best_params_['classifier__C'] print(f'[done] took {tend_par:.2f}s best C = {best_c_par}') @@ -90,7 +90,7 @@ class ModselTestCase(unittest.TestCase): q, param_grid, protocol=app, timeout=3, n_jobs=-1, verbose=True, raise_errors=True ) with self.assertRaises(TimeoutError): - modsel.fit(training) + modsel.fit(*training.Xy) print('Expecting ValueError to be raised') modsel = GridSearchQ( @@ -99,7 +99,7 @@ class ModselTestCase(unittest.TestCase): with self.assertRaises(ValueError): # this exception is not raised because of the timeout, but because no combination of hyperparams # succedded (in this case, a ValueError is raised, regardless of "raise_errors" - modsel.fit(training) + modsel.fit(*training.Xy) if __name__ == '__main__': diff --git a/quapy/tests/test_protocols.py b/quapy/tests/test_protocols.py index 87bd358..4850bd4 100644 --- 
a/quapy/tests/test_protocols.py +++ b/quapy/tests/test_protocols.py @@ -71,7 +71,7 @@ class TestProtocols(unittest.TestCase): # surprisingly enough, for some n_prevalences the test fails, notwithstanding # everything is correct. The problem is that in function APP.prevalence_grid() # there is sometimes one rounding error that gets cumulated and - # surpasses 1.0 (by a very small float value, 0.0000000000002 or sthe like) + # surpasses 1.0 (by a very small float value, 0.0000000000002 or the like) # so these tuples are mistakenly removed... I have tried with np.close, and # other workarounds, but eventually happens that there is some negative probability # in the sampling function... diff --git a/quapy/tests/test_replicability.py b/quapy/tests/test_replicability.py index db603b7..a174992 100644 --- a/quapy/tests/test_replicability.py +++ b/quapy/tests/test_replicability.py @@ -13,17 +13,18 @@ class TestReplicability(unittest.TestCase): def test_prediction_replicability(self): dataset = qp.datasets.fetch_UCIBinaryDataset('yeast') + train, test = dataset.train_test with qp.util.temp_seed(0): lr = LogisticRegression(random_state=0, max_iter=10000) pacc = PACC(lr) - prev = pacc.fit(dataset.training).predict(dataset.test.X) + prev = pacc.fit(*train.Xy).predict(test.X) str_prev1 = strprev(prev, prec=5) with qp.util.temp_seed(0): lr = LogisticRegression(random_state=0, max_iter=10000) pacc = PACC(lr) - prev2 = pacc.fit(dataset.training).predict(dataset.test.X) + prev2 = pacc.fit(*train.Xy).predict(test.X) str_prev2 = strprev(prev2, prec=5) self.assertEqual(str_prev1, str_prev2) @@ -83,18 +84,18 @@ class TestReplicability(unittest.TestCase): test = test.sampling(500, *[0.1, 0.0, 0.1, 0.1, 0.2, 0.5, 0.0]) with qp.util.temp_seed(10): - pacc = PACC(LogisticRegression(), val_split=2, n_jobs=2) - pacc.fit(train, val_split=0.5) + pacc = PACC(LogisticRegression(), val_split=.5, n_jobs=2) + pacc.fit(*train.Xy) prev1 = F.strprev(pacc.predict(test.instances)) with qp.util.temp_seed(0): - pacc = PACC(LogisticRegression(), val_split=2, n_jobs=2) - pacc.fit(train, val_split=0.5) + pacc = PACC(LogisticRegression(), val_split=.5, n_jobs=2) + pacc.fit(*train.Xy) prev2 = F.strprev(pacc.predict(test.instances)) with qp.util.temp_seed(0): - pacc = PACC(LogisticRegression(), val_split=2, n_jobs=2) - pacc.fit(train, val_split=0.5) + pacc = PACC(LogisticRegression(), val_split=.5, n_jobs=2) + pacc.fit(*train.Xy) prev3 = F.strprev(pacc.predict(test.instances)) print(prev1)