diff --git a/quapy/CHANGE_LOG.txt b/quapy/CHANGE_LOG.txt new file mode 100644 index 0000000..a372109 --- /dev/null +++ b/quapy/CHANGE_LOG.txt @@ -0,0 +1,34 @@ +# main changes in 0.1.7 + +- Protocols are now an abstraction, AbstractProtocol. There is a new class extending AbstractProtocol called + AbstractStochasticSeededProtocol, which implements a seeding policy to allow replicating the series of samplings. + There are some examples of protocols: APP, NPP, USimplexPP, and CovariateShiftPP (experimental). + The idea is to start the sampling by simply calling the __call__ method. + This change has a great impact on the framework, since many functions in qp.evaluation, qp.model_selection, + and the sampling functions in LabelledCollection make use of the old functions. + +- ACC, PACC, and Forman's threshold variants have been parallelized. + + +Things to fix: +- eval budget policy? +- clean functions like binary, aggregative, probabilistic, etc.; those should be resolved via isinstance() +- clean classes_ and n_classes from methods (maybe not from aggregative ones, but those have to be used only + internally and not imposed in any abstract class) +- optimize "qp.evaluation.prediction" for aggregative methods (pre-classification) +- update unit tests +- Policies should be able to set their output to "labelled_collection" or "instances_prevalence" or something similar. +- Policies should implement the "gen()" one, taking a reader function as an input, and maybe a folder path +- Review all documentation, redo the Sphinx doc, update Wikis... +- Resolve the OneVsAll thing (it is in base.py and in aggregative.py) +- Better handle the environment (e.g., with n_jobs) +- test cross_generate_predictions and cancel cross_generate_predictions_depr +- Add a proper log? +- test LoadSamplesFromDirectory (in protocols.py) +- improve plots? +- I have removed the distinction between "classify" and "posterior_probabilities" in the Aggregative quantifiers, + so that probabilistic classifiers actually return posterior probabilities, while non-probabilistic quantifiers + return crisp decisions instead. The idea was to unify the quantification function (i.e., now it is always + classify & aggregate, irrespective of the class). However, this has caused a problem with OneVsAll. This has to + be checked, since it is now unnecessarily complicated (it also has old references to .probabilistic, and all this + stuff). \ No newline at end of file diff --git a/quapy/__init__.py b/quapy/__init__.py index ad69ae9..2ef4c5c 100644 --- a/quapy/__init__.py +++ b/quapy/__init__.py @@ -2,13 +2,13 @@ from . import error from . import data from quapy.data import datasets from . import functional -from . import method +# from . import method from . import evaluation +from . import protocol from . import plot from . import util from . import model_selection from . 
import classification -from quapy.method.base import isprobabilistic, isaggregative __version__ = '0.1.7' @@ -21,5 +21,4 @@ environ = { 'SVMPERF_HOME': './svm_perf_quantification' } -def isbinary(x): - return x.binary \ No newline at end of file + diff --git a/quapy/data/base.py b/quapy/data/base.py index cfe2891..c555692 100644 --- a/quapy/data/base.py +++ b/quapy/data/base.py @@ -210,10 +210,12 @@ class LabelledCollection: :return: two instances of :class:`LabelledCollection`, the first one with `train_prop` elements, and the second one with `1-train_prop` elements """ - tr_docs, te_docs, tr_labels, te_labels = \ - train_test_split(self.instances, self.labels, train_size=train_prop, stratify=self.labels, - random_state=random_state) - return LabelledCollection(tr_docs, tr_labels), LabelledCollection(te_docs, te_labels) + tr_docs, te_docs, tr_labels, te_labels = train_test_split( + self.instances, self.labels, train_size=train_prop, stratify=self.labels, random_state=random_state + ) + training = LabelledCollection(tr_docs, tr_labels, classes_=self.classes_) + test = LabelledCollection(te_docs, te_labels, classes_=self.classes_) + return training, test def __add__(self, other): """ @@ -418,13 +420,3 @@ class Dataset: yield Dataset(train, test, name=f'fold {(i % nfolds) + 1}/{nfolds} (round={(i // nfolds) + 1})') -def isbinary(data): - """ - Returns True if `data` is either a binary :class:`Dataset` or a binary :class:`LabelledCollection` - - :param data: a :class:`Dataset` or a :class:`LabelledCollection` object - :return: True if labelled according to two classes - """ - if isinstance(data, Dataset) or isinstance(data, LabelledCollection): - return data.binary - return False diff --git a/quapy/evaluation.py b/quapy/evaluation.py new file mode 100644 index 0000000..0ea417d --- /dev/null +++ b/quapy/evaluation.py @@ -0,0 +1,102 @@ +from typing import Union, Callable, Iterable +import numpy as np +from tqdm import tqdm +import inspect +import quapy as qp +from quapy.protocol import AbstractProtocol, OnLabelledCollectionProtocol +from quapy.data import LabelledCollection +from quapy.method.base import BaseQuantifier +from quapy.util import temp_seed +import quapy.functional as F +import pandas as pd + + +def prediction(model: BaseQuantifier, protocol: AbstractProtocol, verbose=False): + sout = lambda x: print(x) if verbose else None + from method.aggregative import AggregativeQuantifier + if isinstance(model, AggregativeQuantifier) and isinstance(protocol, OnLabelledCollectionProtocol): + sout('speeding up the prediction for the aggregative quantifier') + pre_classified = model.classify(protocol.get_labelled_collection().instances) + return __prediction_helper(model.aggregate, protocol.on_preclassified_instances(pre_classified), verbose) + else: + sout(f'the method is not aggregative, or the protocol is not an instance of ' + f'{OnLabelledCollectionProtocol.__name__}, so no optimization can be carried out') + return __prediction_helper(model.quantify, protocol, verbose) + + +def __prediction_helper(quantification_fn, protocol: AbstractProtocol, verbose=False): + true_prevs, estim_prevs = [], [] + for sample in tqdm(protocol(), total=protocol.total()) if verbose else protocol(): + estim_prevs.append(quantification_fn(sample.instances)) + true_prevs.append(sample.prevalence()) + + true_prevs = np.asarray(true_prevs) + estim_prevs = np.asarray(estim_prevs) + + return true_prevs, estim_prevs + + +def evaluation_report(model: BaseQuantifier, + protocol: AbstractProtocol, + 
error_metrics:Iterable[Union[str,Callable]]='mae', + verbose=False): + + true_prevs, estim_prevs = prediction(model, protocol, verbose) + return _prevalence_report(true_prevs, estim_prevs, error_metrics) + + +def _prevalence_report(true_prevs, estim_prevs, error_metrics: Iterable[Union[str, Callable]] = 'mae'): + + if isinstance(error_metrics, str): + error_metrics = [error_metrics] + + error_funcs = [qp.error.from_name(e) if isinstance(e, str) else e for e in error_metrics] + assert all(hasattr(e, '__call__') for e in error_funcs), 'invalid error functions' + error_names = [e.__name__ for e in error_funcs] + + df = pd.DataFrame(columns=['true-prev', 'estim-prev'] + error_names) + for true_prev, estim_prev in zip(true_prevs, estim_prevs): + series = {'true-prev': true_prev, 'estim-prev': estim_prev} + for error_name, error_metric in zip(error_names, error_funcs): + score = error_metric(true_prev, estim_prev) + series[error_name] = score + df = df.append(series, ignore_index=True) + + return df + + +def evaluate(model: BaseQuantifier, protocol: AbstractProtocol, error_metric:Union[str, Callable], verbose=False): + if isinstance(error_metric, str): + error_metric = qp.error.from_name(error_metric) + true_prevs, estim_prevs = prediction(model, protocol, verbose) + return error_metric(true_prevs, estim_prevs) + + + +def _check_num_evals(n_classes, n_prevpoints=None, eval_budget=None, repeats=1, verbose=False): + if n_prevpoints is None and eval_budget is None: + raise ValueError('either n_prevpoints or eval_budget has to be specified') + elif n_prevpoints is None: + assert eval_budget > 0, 'eval_budget must be a positive integer' + n_prevpoints = F.get_nprevpoints_approximation(eval_budget, n_classes, repeats) + eval_computations = F.num_prevalence_combinations(n_prevpoints, n_classes, repeats) + if verbose: + print(f'setting n_prevpoints={n_prevpoints} so that the number of ' + f'evaluations ({eval_computations}) does not exceed the evaluation ' + f'budget ({eval_budget})') + elif eval_budget is None: + eval_computations = F.num_prevalence_combinations(n_prevpoints, n_classes, repeats) + if verbose: + print(f'{eval_computations} evaluations will be performed for each ' + f'combination of hyper-parameters') + else: + eval_computations = F.num_prevalence_combinations(n_prevpoints, n_classes, repeats) + if eval_computations > eval_budget: + n_prevpoints = F.get_nprevpoints_approximation(eval_budget, n_classes, repeats) + new_eval_computations = F.num_prevalence_combinations(n_prevpoints, n_classes, repeats) + if verbose: + print(f'the evaluation budget ({eval_budget}) would be exceeded with the requested ' f'n_prevpoints. Changing to n_prevpoints={n_prevpoints}. 
This will produce ' + f'{new_eval_computations} evaluation computations for each hyper-parameter combination.') + return n_prevpoints, eval_computations + diff --git a/quapy/method/aggregative.py b/quapy/method/aggregative.py index c0280a2..ea9cbc0 100644 --- a/quapy/method/aggregative.py +++ b/quapy/method/aggregative.py @@ -1,15 +1,13 @@ from abc import abstractmethod from copy import deepcopy from typing import Union - import numpy as np from joblib import Parallel, delayed from sklearn.base import BaseEstimator from sklearn.calibration import CalibratedClassifierCV from sklearn.metrics import confusion_matrix -from sklearn.model_selection import StratifiedKFold +from sklearn.model_selection import StratifiedKFold, cross_val_predict from tqdm import tqdm - import quapy as qp import quapy.functional as F from quapy.classification.svmperf import SVMperf @@ -61,7 +59,9 @@ class AggregativeQuantifier(BaseQuantifier): def classify(self, instances): """ - Provides the label predictions for the given instances. + Provides the label predictions for the given instances. The predictions should respect the format expected by + :meth:`aggregate`, i.e., posterior probabilities for probabilistic quantifiers, or crisp predictions for + non-probabilistic quantifiers :param instances: array-like :return: np.ndarray of shape `(n_instances,)` with label predictions @@ -118,16 +118,6 @@ class AggregativeQuantifier(BaseQuantifier): """ return self.learner.classes_ - @property - def aggregative(self): - """ - Returns True, indicating the quantifier is of type aggregative. - - :return: True - """ - - return True - class AggregativeProbabilisticQuantifier(AggregativeQuantifier): """ @@ -137,28 +127,25 @@ class AggregativeProbabilisticQuantifier(AggregativeQuantifier): probabilities. """ - def posterior_probabilities(self, instances): + def classify(self, instances): return self.learner.predict_proba(instances) - def predict_proba(self, instances): - return self.posterior_probabilities(instances) - - def quantify(self, instances): - classif_posteriors = self.posterior_probabilities(instances) - return self.aggregate(classif_posteriors) - def set_params(self, **parameters): if isinstance(self.learner, CalibratedClassifierCV): parameters = {'base_estimator__' + k: v for k, v in parameters.items()} self.learner.set_params(**parameters) - @property - def probabilistic(self): - return True - # Helper # ------------------------------------ +def _ensure_probabilistic(learner): + if not hasattr(learner, 'predict_proba'): + print(f'The learner {learner.__class__.__name__} does not seem to be probabilistic. ' + f'The learner will be calibrated.') + learner = CalibratedClassifierCV(learner, cv=5) + return learner + + def _training_helper(learner, data: LabelledCollection, fit_learner: bool = True, @@ -180,10 +167,7 @@ def _training_helper(learner, """ if fit_learner: if ensure_probabilistic: - if not hasattr(learner, 'predict_proba'): - print(f'The learner {learner.__class__.__name__} does not seem to be probabilistic. 
' - f'The learner will be calibrated.') - learner = CalibratedClassifierCV(learner, cv=5) + learner = _ensure_probabilistic(learner) if val_split is not None: if isinstance(val_split, float): if not (0 < val_split < 1): @@ -214,6 +198,89 @@ def _training_helper(learner, return learner, unused +def cross_generate_predictions( + data, + learner, + val_split, + probabilistic, + fit_learner, + n_jobs +): + + if isinstance(val_split, int): + assert fit_learner == True, \ + 'the parameters for the adjustment cannot be estimated with kFCV with fit_learner=False' + + if probabilistic: + learner = _ensure_probabilistic(learner) + predict = 'predict_proba' + else: + predict = 'predict' + y_pred = cross_val_predict(learner, *data.Xy, cv=val_split, n_jobs=n_jobs, method=predict) + y, class_count = data.labels, data.counts()  # ground-truth labels are aligned with cross_val_predict's output + + # fit the learner on all data + learner.fit(*data.Xy) + classes = data.classes_ + else: + learner, val_data = _training_helper( + learner, data, fit_learner, ensure_probabilistic=probabilistic, val_split=val_split + ) + y_pred = learner.predict_proba(val_data.instances) if probabilistic else learner.predict(val_data.instances) + y = val_data.labels + classes = val_data.classes_ + class_count = val_data.counts() + + return learner, y, y_pred, classes, class_count + + +def cross_generate_predictions_depr( + data, + learner, + val_split, + probabilistic, + fit_learner, + method_name='' +): + predict = learner.predict_proba if probabilistic else learner.predict + if isinstance(val_split, int): + assert fit_learner == True, \ + 'the parameters for the adjustment cannot be estimated with kFCV with fit_learner=False' + # kFCV estimation of parameters + y, y_ = [], [] + kfcv = StratifiedKFold(n_splits=val_split) + pbar = tqdm(kfcv.split(*data.Xy), total=val_split) + for k, (training_idx, validation_idx) in enumerate(pbar): + pbar.set_description(f'{method_name}\tfitting fold {k}') + training = data.sampling_from_index(training_idx) + validation = data.sampling_from_index(validation_idx) + learner, val_data = _training_helper( + learner, training, fit_learner, ensure_probabilistic=probabilistic, val_split=validation + ) + y_.append(predict(val_data.instances)) + y.append(val_data.labels) + + y = np.concatenate(y) + y_ = np.concatenate(y_) + class_count = data.counts() + + # fit the learner on all data + learner, _ = _training_helper( + learner, data, fit_learner, ensure_probabilistic=probabilistic, val_split=None + ) + classes = data.classes_ + + else: + learner, val_data = _training_helper( + learner, data, fit_learner, ensure_probabilistic=probabilistic, val_split=val_split + ) + y_ = predict(val_data.instances) + y = val_data.labels + classes = val_data.classes_ + class_count = val_data.counts() + + return learner, y, y_, classes, class_count + # Methods # ------------------------------------ class CC(AggregativeQuantifier): @@ -264,9 +331,10 @@ class ACC(AggregativeQuantifier): :class:`quapy.data.base.LabelledCollection` (the split itself). 
""" - def __init__(self, learner: BaseEstimator, val_split=0.4): + def __init__(self, learner: BaseEstimator, val_split=0.4, n_jobs=1): self.learner = learner self.val_split = val_split + self.n_jobs = n_jobs def fit(self, data: LabelledCollection, fit_learner=True, val_split: Union[float, int, LabelledCollection] = None): """ @@ -280,44 +348,33 @@ class ACC(AggregativeQuantifier): cross validation to estimate the parameters :return: self """ + if val_split is None: val_split = self.val_split - if isinstance(val_split, int): - assert fit_learner == True, \ - 'the parameters for the adjustment cannot be estimated with kFCV with fit_learner=False' - # kFCV estimation of parameters - y, y_ = [], [] - kfcv = StratifiedKFold(n_splits=val_split) - pbar = tqdm(kfcv.split(*data.Xy), total=val_split) - for k, (training_idx, validation_idx) in enumerate(pbar): - pbar.set_description(f'{self.__class__.__name__} fitting fold {k}') - training = data.sampling_from_index(training_idx) - validation = data.sampling_from_index(validation_idx) - learner, val_data = _training_helper(self.learner, training, fit_learner, val_split=validation) - y_.append(learner.predict(val_data.instances)) - y.append(val_data.labels) - y = np.concatenate(y) - y_ = np.concatenate(y_) - class_count = data.counts() - - # fit the learner on all data - self.learner, _ = _training_helper(self.learner, data, fit_learner, val_split=None) - - else: - self.learner, val_data = _training_helper(self.learner, data, fit_learner, val_split=val_split) - y_ = self.learner.predict(val_data.instances) - y = val_data.labels - class_count = val_data.counts() + self.learner, y, y_, classes, class_count = cross_generate_predictions( + data, self.learner, val_split, probabilistic=False, fit_learner=fit_learner, n_jobs=self.n_jobs + ) self.cc = CC(self.learner) - - # estimate the matrix with entry (i,j) being the estimate of P(yi|yj), that is, the probability that a - # document that belongs to yj ends up being classified as belonging to yi - self.Pte_cond_estim_ = confusion_matrix(y, y_).T / class_count + self.Pte_cond_estim_ = self.getPteCondEstim(data.classes_, y, y_) return self + @classmethod + def getPteCondEstim(cls, classes, y, y_): + # estimate the matrix with entry (i,j) being the estimate of P(yi|yj), that is, the probability that a + # document that belongs to yj ends up being classified as belonging to yi + conf = confusion_matrix(y, y_, labels=classes).T + conf = conf.astype(float) + class_counts = conf.sum(axis=0) + for i, _ in enumerate(classes): + if class_counts[i] == 0: + conf[i, i] = 1 + else: + conf[:, i] /= class_counts[i] + return conf + def classify(self, data): return self.cc.classify(data) @@ -380,9 +437,10 @@ class PACC(AggregativeProbabilisticQuantifier): :class:`quapy.data.base.LabelledCollection` (the split itself). 
""" - def __init__(self, learner: BaseEstimator, val_split=0.4): + def __init__(self, learner: BaseEstimator, val_split=0.4, n_jobs=1): self.learner = learner self.val_split = val_split + self.n_jobs = n_jobs def fit(self, data: LabelledCollection, fit_learner=True, val_split: Union[float, int, LabelledCollection] = None): """ @@ -396,52 +454,31 @@ class PACC(AggregativeProbabilisticQuantifier): to estimate the parameters :return: self """ + if val_split is None: val_split = self.val_split - if isinstance(val_split, int): - assert fit_learner == True, \ - 'the parameters for the adjustment cannot be estimated with kFCV with fit_learner=False' - # kFCV estimation of parameters - y, y_ = [], [] - kfcv = StratifiedKFold(n_splits=val_split) - pbar = tqdm(kfcv.split(*data.Xy), total=val_split) - for k, (training_idx, validation_idx) in enumerate(pbar): - pbar.set_description(f'{self.__class__.__name__} fitting fold {k}') - training = data.sampling_from_index(training_idx) - validation = data.sampling_from_index(validation_idx) - learner, val_data = _training_helper( - self.learner, training, fit_learner, ensure_probabilistic=True, val_split=validation) - y_.append(learner.predict_proba(val_data.instances)) - y.append(val_data.labels) - - y = np.concatenate(y) - y_ = np.vstack(y_) - - # fit the learner on all data - self.learner, _ = _training_helper(self.learner, data, fit_learner, ensure_probabilistic=True, - val_split=None) - classes = data.classes_ - - else: - self.learner, val_data = _training_helper( - self.learner, data, fit_learner, ensure_probabilistic=True, val_split=val_split) - y_ = self.learner.predict_proba(val_data.instances) - y = val_data.labels - classes = val_data.classes_ + self.learner, y, y_, classes, class_count = cross_generate_predictions( + data, self.learner, val_split, probabilistic=True, fit_learner=fit_learner, n_jobs=self.n_jobs + ) self.pcc = PCC(self.learner) + self.Pte_cond_estim_ = self.getPteCondEstim(classes, y, y_) + return self + + @classmethod + def getPteCondEstim(cls, classes, y, y_): # estimate the matrix with entry (i,j) being the estimate of P(yi|yj), that is, the probability that a # document that belongs to yj ends up being classified as belonging to yi n_classes = len(classes) - confusion = np.empty(shape=(n_classes, n_classes)) + confusion = np.eye(n_classes) for i, class_ in enumerate(classes): - confusion[i] = y_[y == class_].mean(axis=0) + idx = y == class_ + if idx.any(): + confusion[i] = y_[idx].mean(axis=0) - self.Pte_cond_estim_ = confusion.T - - return self + return confusion.T def aggregate(self, classif_posteriors): prevs_estim = self.pcc.aggregate(classif_posteriors) @@ -557,7 +594,7 @@ class HDy(AggregativeProbabilisticQuantifier, BinaryQuantifier): self._check_binary(data, self.__class__.__name__) self.learner, validation = _training_helper( self.learner, data, fit_learner, ensure_probabilistic=True, val_split=val_split) - Px = self.posterior_probabilities(validation.instances)[:, 1] # takes only the P(y=+1|x) + Px = self.classify(validation.instances)[:, 1] # takes only the P(y=+1|x) self.Pxy1 = Px[validation.labels == self.learner.classes_[1]] self.Pxy0 = Px[validation.labels == self.learner.classes_[0]] # pre-compute the histogram for positive and negative examples @@ -732,44 +769,24 @@ class ThresholdOptimization(AggregativeQuantifier, BinaryQuantifier): :class:`quapy.data.base.LabelledCollection` (the split itself). 
""" - def __init__(self, learner: BaseEstimator, val_split=0.4): + def __init__(self, learner: BaseEstimator, val_split=0.4, n_jobs=1): self.learner = learner self.val_split = val_split + self.n_jobs = n_jobs def fit(self, data: LabelledCollection, fit_learner=True, val_split: Union[float, int, LabelledCollection] = None): self._check_binary(data, "Threshold Optimization") if val_split is None: val_split = self.val_split - if isinstance(val_split, int): - assert fit_learner == True, \ - 'the parameters for the adjustment cannot be estimated with kFCV with fit_learner=False' - # kFCV estimation of parameters - y, probabilities = [], [] - kfcv = StratifiedKFold(n_splits=val_split) - pbar = tqdm(kfcv.split(*data.Xy), total=val_split) - for k, (training_idx, validation_idx) in enumerate(pbar): - pbar.set_description(f'{self.__class__.__name__} fitting fold {k}') - training = data.sampling_from_index(training_idx) - validation = data.sampling_from_index(validation_idx) - learner, val_data = _training_helper(self.learner, training, fit_learner, val_split=validation) - probabilities.append(learner.predict_proba(val_data.instances)) - y.append(val_data.labels) - y = np.concatenate(y) - probabilities = np.concatenate(probabilities) - - # fit the learner on all data - self.learner, _ = _training_helper(self.learner, data, fit_learner, val_split=None) - - else: - self.learner, val_data = _training_helper(self.learner, data, fit_learner, val_split=val_split) - probabilities = self.learner.predict_proba(val_data.instances) - y = val_data.labels + self.learner, y, y_, classes, class_count = cross_generate_predictions( + data, self.learner, val_split, probabilistic=True, fit_learner=fit_learner, n_jobs=self.n_jobs + ) self.cc = CC(self.learner) - self.tpr, self.fpr = self._optimize_threshold(y, probabilities) + self.tpr, self.fpr = self._optimize_threshold(y, y_) return self @@ -828,7 +845,7 @@ class ThresholdOptimization(AggregativeQuantifier, BinaryQuantifier): def _compute_tpr(self, TP, FP): if TP + FP == 0: - return 0 + return 1 return TP / (TP + FP) def _compute_fpr(self, FP, TN): @@ -1022,54 +1039,59 @@ class OneVsAll(AggregativeQuantifier): def classify(self, instances): """ - Returns a matrix of shape `(n,m,)` with `n` the number of instances and `m` the number of classes. The entry - `(i,j)` is a binary value indicating whether instance `i `belongs to class `j`. The binary classifications are - independent of each other, meaning that an instance can end up be attributed to 0, 1, or more classes. + If the base quantifier is not probabilistic, returns a matrix of shape `(n,m,)` with `n` the number of + instances and `m` the number of classes. The entry `(i,j)` is a binary value indicating whether instance + `i `belongs to class `j`. The binary classifications are independent of each other, meaning that an instance + can end up be attributed to 0, 1, or more classes. + If the base quantifier is probabilistic, returns a matrix of shape `(n,m,2)` with `n` the number of instances + and `m` the number of classes. The entry `(i,j,1)` (resp. `(i,j,0)`) is a value in [0,1] indicating the + posterior probability that instance `i` belongs (resp. does not belong) to class `j`. The posterior + probabilities are independent of each other, meaning that, in general, they do not sum up to one. 
:param instances: array-like :return: `np.ndarray` """ - classif_predictions_bin = self.__parallel(self._delayed_binary_classification, instances) - return classif_predictions_bin.T - - def posterior_probabilities(self, instances): - """ - Returns a matrix of shape `(n,m,2)` with `n` the number of instances and `m` the number of classes. The entry - `(i,j,1)` (resp. `(i,j,0)`) is a value in [0,1] indicating the posterior probability that instance `i` belongs - (resp. does not belong) to class `j`. - The posterior probabilities are independent of each other, meaning that, in general, they do not sum - up to one. - - :param instances: array-like - :return: `np.ndarray` - """ - - if not self.binary_quantifier.probabilistic: - raise NotImplementedError(f'{self.__class__.__name__} does not implement posterior_probabilities because ' - f'the base quantifier {self.binary_quantifier.__class__.__name__} is not ' - f'probabilistic') - posterior_predictions_bin = self.__parallel(self._delayed_binary_posteriors, instances) - return np.swapaxes(posterior_predictions_bin, 0, 1) - - def aggregate(self, classif_predictions_bin): - if self.probabilistic: - assert classif_predictions_bin.shape[1] == self.n_classes and classif_predictions_bin.shape[2] == 2, \ - 'param classif_predictions_bin does not seem to be a valid matrix (ndarray) of posterior ' \ - 'probabilities (2 dimensions) for each document (row) and class (columns)' + classif_predictions = self.__parallel(self._delayed_binary_classification, instances) + if isinstance(self.binary_quantifier, AggregativeProbabilisticQuantifier): + return np.swapaxes(classif_predictions, 0, 1) else: - assert set(np.unique(classif_predictions_bin)).issubset({0, 1}), \ - 'param classif_predictions_bin does not seem to be a valid matrix (ndarray) of binary ' \ - 'predictions for each document (row) and class (columns)' - prevalences = self.__parallel(self._delayed_binary_aggregate, classif_predictions_bin) + return classif_predictions.T + # + # def posterior_probabilities(self, instances): + # """ + # Returns a matrix of shape `(n,m,2)` with `n` the number of instances and `m` the number of classes. The entry + # `(i,j,1)` (resp. `(i,j,0)`) is a value in [0,1] indicating the posterior probability that instance `i` belongs + # (resp. does not belong) to class `j`. + # The posterior probabilities are independent of each other, meaning that, in general, they do not sum + # up to one. 
+ # + # :param instances: array-like + # :return: `np.ndarray` + # """ + # + # if not isinstance(self.binary_quantifier, AggregativeProbabilisticQuantifier): + # raise NotImplementedError(f'{self.__class__.__name__} does not implement posterior_probabilities because ' + # f'the base quantifier {self.binary_quantifier.__class__.__name__} is not ' + # f'probabilistic') + # posterior_predictions_bin = self.__parallel(self._delayed_binary_posteriors, instances) + # return np.swapaxes(posterior_predictions_bin, 0, 1) + + def aggregate(self, classif_predictions): + # if self.probabilistic: + # assert classif_predictions.shape[1] == self.n_classes and classif_predictions.shape[2] == 2, \ + # 'param classif_predictions_bin does not seem to be a valid matrix (ndarray) of posterior ' \ + # 'probabilities (2 dimensions) for each document (row) and class (columns)' + # else: + # assert set(np.unique(classif_predictions)).issubset({0, 1}), \ + # 'param classif_predictions_bin does not seem to be a valid matrix (ndarray) of binary ' \ + # 'predictions for each document (row) and class (columns)' + prevalences = self.__parallel(self._delayed_binary_aggregate, classif_predictions) return F.normalize_prevalence(prevalences) - def quantify(self, X): - if self.probabilistic: - predictions = self.posterior_probabilities(X) - else: - predictions = self.classify(X) - return self.aggregate(predictions) + # def quantify(self, X): + # predictions = self.classify(X) + # return self.aggregate(predictions) def __parallel(self, func, *args, **kwargs): return np.asarray( @@ -1093,9 +1115,6 @@ class OneVsAll(AggregativeQuantifier): def _delayed_binary_classification(self, c, X): return self.dict_binary_quantifiers[c].classify(X) - def _delayed_binary_posteriors(self, c, X): - return self.dict_binary_quantifiers[c].posterior_probabilities(X) - def _delayed_binary_aggregate(self, c, classif_predictions): # the estimation for the positive class prevalence return self.dict_binary_quantifiers[c].aggregate(classif_predictions[:, c])[1] @@ -1104,21 +1123,3 @@ class OneVsAll(AggregativeQuantifier): bindata = LabelledCollection(data.instances, data.labels == c, classes_=[False, True]) self.dict_binary_quantifiers[c].fit(bindata) - @property - def binary(self): - """ - Informs that the classifier is not binary - - :return: False - """ - return False - - @property - def probabilistic(self): - """ - Indicates if the classifier is probabilistic or not (depending on the nature of the base classifier). - - :return: boolean - """ - - return self.binary_quantifier.probabilistic diff --git a/quapy/method/base.py b/quapy/method/base.py index 4a4962a..55e18c7 100644 --- a/quapy/method/base.py +++ b/quapy/method/base.py @@ -51,56 +51,6 @@ class BaseQuantifier(metaclass=ABCMeta): """ ... - @property - @abstractmethod - def classes_(self): - """ - Class labels, in the same order in which class prevalence values are to be computed. - - :return: array-like - """ - ... - - @property - def n_classes(self): - """ - Returns the number of classes - - :return: integer - """ - return len(self.classes_) - - # these methods allows meta-learners to reimplement the decision based on their constituents, and not - # based on class structure - @property - def binary(self): - """ - Indicates whether the quantifier is binary or not. 
- - :return: False (to be overridden) - """ - return False - - @property - def aggregative(self): - """ - Indicates whether the quantifier is of type aggregative or not - - :return: False (to be overridden) - """ - - return False - - @property - def probabilistic(self): - """ - Indicates whether the quantifier is of type probabilistic or not - - :return: False (to be overridden) - """ - - return False - class BinaryQuantifier(BaseQuantifier): """ @@ -112,46 +62,8 @@ class BinaryQuantifier(BaseQuantifier): assert data.binary, f'{quantifier_name} works only on problems of binary classification. ' \ f'Use the class OneVsAll to enable {quantifier_name} work on single-label data.' - @property - def binary(self): - """ - Informs that the quantifier is binary - - :return: True - """ - return True -def isbinary(model:BaseQuantifier): - """ - Alias for property `binary` - - :param model: the model - :return: True if the model is binary, False otherwise - """ - return model.binary - - -def isaggregative(model:BaseQuantifier): - """ - Alias for property `aggregative` - - :param model: the model - :return: True if the model is aggregative, False otherwise - """ - - return model.aggregative - - -def isprobabilistic(model:BaseQuantifier): - """ - Alias for property `probabilistic` - - :param model: the model - :return: True if the model is probabilistic, False otherwise - """ - - return model.probabilistic # class OneVsAll: diff --git a/quapy/method/meta.py b/quapy/method/meta.py index 3504301..3e57652 100644 --- a/quapy/method/meta.py +++ b/quapy/method/meta.py @@ -234,19 +234,6 @@ class Ensemble(BaseQuantifier): order = np.argsort(dist) return _select_k(predictions, order, k=self.red_size) - @property - def classes_(self): - return self.base_quantifier.classes_ - - @property - def binary(self): - """ - Returns a boolean indicating whether the base quantifiers are binary or not - - :return: boolean - """ - return self.base_quantifier.binary - @property def aggregative(self): """ diff --git a/quapy/method/neural.py b/quapy/method/neural.py index bf1f375..0665634 100644 --- a/quapy/method/neural.py +++ b/quapy/method/neural.py @@ -191,7 +191,7 @@ class QuaNetTrainer(BaseQuantifier): label_predictions = np.argmax(posteriors, axis=-1) prevs_estim = [] for quantifier in self.quantifiers.values(): - predictions = posteriors if quantifier.probabilistic else label_predictions + predictions = posteriors if isinstance(quantifier, AggregativeProbabilisticQuantifier) else label_predictions prevs_estim.extend(quantifier.aggregate(predictions)) # there is no real need for adding static estims like the TPR or FPR from training since those are constant diff --git a/quapy/model_selection.py b/quapy/model_selection.py index eef811b..c1fa817 100644 --- a/quapy/model_selection.py +++ b/quapy/model_selection.py @@ -2,14 +2,12 @@ import itertools import signal from copy import deepcopy from typing import Union, Callable - -import numpy as np - +import evaluation import quapy as qp +from protocol import AbstractProtocol, OnLabelledCollectionProtocol from quapy.data.base import LabelledCollection -from quapy.evaluation import artificial_prevalence_prediction, natural_prevalence_prediction, gen_prevalence_prediction from quapy.method.aggregative import BaseQuantifier -import inspect +from time import time class GridSearchQ(BaseQuantifier): @@ -21,33 +19,11 @@ class GridSearchQ(BaseQuantifier): :param model: the quantifier to optimize :type model: BaseQuantifier :param param_grid: a dictionary with keys the parameter names and 
values the list of values to explore - :param sample_size: the size of the samples to extract from the validation set (ignored if protocol='gen') - :param protocol: either 'app' for the artificial prevalence protocol, 'npp' for the natural prevalence - protocol, or 'gen' for using a custom sampling generator function - :param n_prevpoints: if specified, indicates the number of equally distant points to extract from the interval - [0,1] in order to define the prevalences of the samples; e.g., if n_prevpoints=5, then the prevalences for - each class will be explored in [0.00, 0.25, 0.50, 0.75, 1.00]. If not specified, then eval_budget is requested. - Ignored if protocol!='app'. - :param n_repetitions: the number of repetitions for each combination of prevalences. This parameter is ignored - for the protocol='app' if eval_budget is set and is lower than the number of combinations that would be - generated using the value assigned to n_prevpoints (for the current number of classes and n_repetitions). - Ignored for protocol='npp' and protocol='gen' (use eval_budget for setting a maximum number of samples in - those cases). - :param eval_budget: if specified, sets a ceil on the number of evaluations to perform for each hyper-parameter - combination. For example, if protocol='app', there are 3 classes, n_repetitions=1 and eval_budget=20, then - n_prevpoints will be set to 5, since this will generate 15 different prevalences, i.e., [0, 0, 1], - [0, 0.25, 0.75], [0, 0.5, 0.5] ... [1, 0, 0], and since setting it to 6 would generate more than - 20. When protocol='gen', indicates the maximum number of samples to generate, but less samples will be - generated if the generator yields less samples. + :param protocol: :param error: an error function (callable) or a string indicating the name of an error function (valid ones are those in qp.error.QUANTIFICATION_ERROR :param refit: whether or not to refit the model on the whole labelled collection (training+validation) with the best chosen hyperparameter combination. Ignored if protocol='gen' - :param val_split: either a LabelledCollection on which to test the performance of the different settings, or - a float in [0,1] indicating the proportion of labelled data to extract from the training set, or a callable - returning a generator function each time it is invoked (only for protocol='gen'). - :param n_jobs: number of parallel jobs - :param random_seed: set the seed of the random generator to replicate experiments. Ignored if protocol='gen'. :param timeout: establishes a timer (in seconds) for each of the hyperparameters configurations being tested. Whenever a run takes longer than this timer, that configuration will be ignored. If all configurations end up being ignored, a TimeoutError exception is raised. If -1 (default) then no time bound is set. 
@@ -57,65 +33,27 @@ class GridSearchQ(BaseQuantifier): def __init__(self, model: BaseQuantifier, param_grid: dict, - sample_size: Union[int, None], - protocol='app', - n_prevpoints: int = None, - n_repetitions: int = 1, - eval_budget: int = None, + protocol: AbstractProtocol, error: Union[Callable, str] = qp.error.mae, refit=True, - val_split=0.4, - n_jobs=1, - random_seed=42, timeout=-1, + n_jobs=1, verbose=False): self.model = model self.param_grid = param_grid - self.sample_size = sample_size - self.protocol = protocol.lower() - self.n_prevpoints = n_prevpoints - self.n_repetitions = n_repetitions - self.eval_budget = eval_budget + self.protocol = protocol self.refit = refit - self.val_split = val_split - self.n_jobs = n_jobs - self.random_seed = random_seed self.timeout = timeout + self.n_jobs = n_jobs self.verbose = verbose self.__check_error(error) - assert self.protocol in {'app', 'npp', 'gen'}, \ - 'unknown protocol: valid ones are "app" or "npp" for the "artificial" or the "natural" prevalence ' \ - 'protocols. Use protocol="gen" when passing a generator function thorough val_split that yields a ' \ - 'sample (instances) and their prevalence (ndarray) at each iteration.' - assert self.eval_budget is None or isinstance(self.eval_budget, int) - if self.protocol in ['npp', 'gen']: - if self.protocol=='npp' and (self.eval_budget is None or self.eval_budget <= 0): - raise ValueError(f'when protocol="npp" the parameter eval_budget should be ' - f'indicated (and should be >0).') - if self.n_repetitions != 1: - print('[warning] n_repetitions has been set and will be ignored for the selected protocol') + assert isinstance(protocol, AbstractProtocol), 'unknown protocol' def _sout(self, msg): if self.verbose: print(f'[{self.__class__.__name__}]: {msg}') - def __check_training_validation(self, training, validation): - if isinstance(validation, LabelledCollection): - return training, validation - elif isinstance(validation, float): - assert 0. < validation < 1., 'validation proportion should be in (0,1)' - training, validation = training.split_stratified(train_prop=1 - validation) - return training, validation - elif self.protocol=='gen' and inspect.isgenerator(validation()): - return training, validation - else: - raise ValueError(f'"validation" must either be a LabelledCollection or a float in (0,1) indicating the' - f'proportion of training documents to extract (type found: {type(validation)}). 
' - f'Optionally, "validation" can be a callable function returning a generator that yields ' - f'the sample instances along with their true prevalence at each iteration by ' - f'setting protocol="gen".') - def __check_error(self, error): if error in qp.error.QUANTIFICATION_ERROR: self.error = error @@ -127,96 +65,86 @@ class GridSearchQ(BaseQuantifier): raise ValueError(f'unexpected error type; must either be a callable function or a str representing\n' f'the name of an error function in {qp.error.QUANTIFICATION_ERROR_NAMES}') - def __generate_predictions(self, model, val_split): - commons = { - 'n_repetitions': self.n_repetitions, - 'n_jobs': self.n_jobs, - 'random_seed': self.random_seed, - 'verbose': False - } - if self.protocol == 'app': - return artificial_prevalence_prediction( - model, val_split, self.sample_size, - n_prevpoints=self.n_prevpoints, - eval_budget=self.eval_budget, - **commons - ) - elif self.protocol == 'npp': - return natural_prevalence_prediction( - model, val_split, self.sample_size, - **commons) - elif self.protocol == 'gen': - return gen_prevalence_prediction(model, gen_fn=val_split, eval_budget=self.eval_budget) - else: - raise ValueError('unknown protocol') - - def fit(self, training: LabelledCollection, val_split: Union[LabelledCollection, float, Callable] = None): + def fit(self, training: LabelledCollection): """ Learning routine. Fits methods with all combinations of hyperparameters and selects the one minimizing the error metric. :param training: the training set on which to optimize the hyperparameters - :param val_split: either a LabelledCollection on which to test the performance of the different settings, or - a float in [0,1] indicating the proportion of labelled data to extract from the training set :return: self """ - if val_split is None: - val_split = self.val_split - training, val_split = self.__check_training_validation(training, val_split) - if self.protocol != 'gen': - assert isinstance(self.sample_size, int) and self.sample_size > 0, 'sample_size must be a positive integer' - params_keys = list(self.param_grid.keys()) params_values = list(self.param_grid.values()) - model = self.model - - if self.timeout > 0: - def handler(signum, frame): - self._sout('timeout reached') - raise TimeoutError() - - signal.signal(signal.SIGALRM, handler) + protocol = self.protocol + n_jobs = self.n_jobs self.param_scores_ = {} self.best_score_ = None - some_timeouts = False - for values in itertools.product(*params_values): - params = dict({k: values[i] for i, k in enumerate(params_keys)}) - if self.timeout > 0: - signal.alarm(self.timeout) + hyper = [dict({k: values[i] for i, k in enumerate(params_keys)}) for values in itertools.product(*params_values)] + scores = qp.util.parallel(self._delayed_eval, ((params, training) for params in hyper), n_jobs=n_jobs) - try: - # overrides default parameters with the parameters being explored at this iteration - model.set_params(**params) - model.fit(training) - true_prevalences, estim_prevalences = self.__generate_predictions(model, val_split) - score = self.error(true_prevalences, estim_prevalences) - - self._sout(f'checking hyperparams={params} got {self.error.__name__} score {score:.5f}') + for params, score, model in scores: + if score is not None: if self.best_score_ is None or score < self.best_score_: self.best_score_ = score self.best_params_ = params - self.best_model_ = deepcopy(model) + self.best_model_ = model self.param_scores_[str(params)] = score + else: + self.param_scores_[str(params)] = 'timeout' - if 
self.timeout > 0: - signal.alarm(0) - except TimeoutError: - print(f'timeout reached for config {params}') - some_timeouts = True - - if self.best_score_ is None and some_timeouts: + if self.best_score_ is None: raise TimeoutError('all jobs took more than the timeout time to end') self._sout(f'optimization finished: best params {self.best_params_} (score={self.best_score_:.5f})') if self.refit: - self._sout(f'refitting on the whole development set') - self.best_model_.fit(training + val_split) + if isinstance(protocol, OnLabelledCollectionProtocol): + self._sout(f'refitting on the whole development set') + self.best_model_.fit(training + protocol.get_labelled_collection()) + else: + raise RuntimeWarning(f'"refit" was requested, but the protocol does not ' + f'implement the {OnLabelledCollectionProtocol.__name__} interface') return self + def _delayed_eval(self, args): + params, training = args + + protocol = self.protocol + error = self.error + + if self.timeout > 0: + def handler(signum, frame): + raise TimeoutError() + + signal.signal(signal.SIGALRM, handler) + + tinit = time() + + if self.timeout > 0: + signal.alarm(self.timeout) + + try: + model = deepcopy(self.model) + # overrides default parameters with the parameters being explored at this iteration + model.set_params(**params) + model.fit(training) + score = evaluation.evaluate(model, protocol=protocol, error_metric=error) + + ttime = time()-tinit + self._sout(f'hyperparams={params}\t got {error.__name__} score {score:.5f} [took {ttime:.4f}s]') + + if self.timeout > 0: + signal.alarm(0) + except TimeoutError: + self._sout(f'timeout ({self.timeout}s) reached for config {params}') + score = None + + return params, score, model + + def quantify(self, instances): """Estimate class prevalence values using the best model found after calling the :meth:`fit` method. @@ -227,14 +155,6 @@ class GridSearchQ(BaseQuantifier): assert hasattr(self, 'best_model_'), 'quantify called before fit' return self.best_model().quantify(instances) - @property - def classes_(self): - """ - Classes on which the quantifier has been trained on. - :return: a ndarray of shape `(n_classes)` with the class identifiers - """ - return self.best_model().classes_ - def set_params(self, **parameters): """Sets the hyper-parameters to explore. @@ -260,3 +180,5 @@ class GridSearchQ(BaseQuantifier): if hasattr(self, 'best_model_'): return self.best_model_ raise ValueError('best_model called before fit') + + diff --git a/quapy/protocol.py b/quapy/protocol.py index 43bb0ef..70a98d9 100644 --- a/quapy/protocol.py +++ b/quapy/protocol.py @@ -1,12 +1,16 @@ +from copy import deepcopy + +import quapy as qp import numpy as np import itertools from collections.abc import Generator from contextlib import ExitStack from abc import ABCMeta, abstractmethod - from quapy.data import LabelledCollection import quapy.functional as F from tqdm import tqdm +from os.path import exists +from glob import glob # 0.1.7 @@ -61,6 +65,8 @@ class AbstractStochasticSeededProtocol(AbstractProtocol): the sequence will be different every time the protocol is called. """ + _random_seed = -1 # means "not set" + def __init__(self, seed=None): self.random_seed = seed @@ -93,13 +99,47 @@ class AbstractStochasticSeededProtocol(AbstractProtocol): def __call__(self): with ExitStack() as stack: + if self.random_seed == -1: + raise ValueError('The random seed has never been initialized. 
' 'Set it to None if replicability is not required.') if self.random_seed is not None: stack.enter_context(qp.util.temp_seed(self.random_seed)) for params in self.samples_parameters(): yield self.sample(params) -class APP(AbstractStochasticSeededProtocol): +class OnLabelledCollectionProtocol: + def get_labelled_collection(self): + return self.data + + def on_preclassified_instances(self, pre_classifications, in_place=False): + assert len(pre_classifications) == len(self.data), \ + f'error: the pre-classified data has different shape ' \ + f'(expected {len(self.data)}, found {len(pre_classifications)})' + if in_place: + self.data.instances = pre_classifications + return self + else: + new = deepcopy(self) + return new.on_preclassified_instances(pre_classifications, in_place=True) + + +class LoadSamplesFromDirectory(AbstractProtocol): + + def __init__(self, folder_path, loader_fn, classes=None, **loader_kwargs): + assert exists(folder_path), f'folder {folder_path} does not exist' + assert callable(loader_fn), f'the passed loader_fn does not seem to be callable' + self.folder_path = folder_path + self.loader_fn = loader_fn + self.classes = classes + self.loader_kwargs = loader_kwargs + + def __call__(self): + for file in sorted(glob(f'{self.folder_path}/*')): + yield LabelledCollection.load(file, loader_func=self.loader_fn, classes=self.classes, **self.loader_kwargs) + + +class APP(AbstractStochasticSeededProtocol, OnLabelledCollectionProtocol): """ Implementation of the artificial prevalence protocol (APP). The APP consists of exploring a grid of prevalence values containing `n_prevalences` points (e.g., @@ -123,7 +163,7 @@ class APP(AbstractStochasticSeededProtocol): self.n_prevalences = n_prevalences self.repeats = repeats - def prevalence_grid(self, dimensions): + def prevalence_grid(self): """ Generates vectors of prevalence values from an exhaustive grid of prevalence values. The number of prevalence values explored for each dimension depends on `n_prevalences`, so that, if, for example, @@ -134,14 +174,14 @@ class APP(AbstractStochasticSeededProtocol): to 1 - sum of the rest) is not returned (note that, quite obviously, in this case the vector does not sum up to 1). Note that this method is deterministic, i.e., there is no random sampling anywhere. 
- :param dimensions: the number of classes :return: a `np.ndarray` of shape `(n, dimensions)` if `return_constrained_dim=True` or of shape `(n, dimensions-1)` if `return_constrained_dim=False`, where `n` is the number of valid combinations found in the grid multiplied by `repeat` """ + dimensions = self.data.n_classes s = np.linspace(0., 1., self.n_prevalences, endpoint=True) s = [s] * (dimensions - 1) - prevs = [p for p in itertools.product(*s, repeat=1) if sum(p) <= 1] + prevs = [p for p in itertools.product(*s, repeat=1) if (sum(p) <= 1.0)] prevs = np.asarray(prevs).reshape(len(prevs), -1) if self.repeats > 1: prevs = np.repeat(prevs, self.repeats, axis=0) @@ -149,8 +189,8 @@ class APP(AbstractStochasticSeededProtocol): def samples_parameters(self): indexes = [] - for prevs in self.prevalence_grid(dimensions=self.data.n_classes): - index = data.sampling_index(self.sample_size, *prevs) + for prevs in self.prevalence_grid(): + index = self.data.sampling_index(self.sample_size, *prevs) indexes.append(index) return indexes @@ -161,7 +201,7 @@ class APP(AbstractStochasticSeededProtocol): return F.num_prevalence_combinations(self.n_prevalences, self.data.n_classes, self.repeats) -class NPP(AbstractStochasticSeededProtocol): +class NPP(AbstractStochasticSeededProtocol, OnLabelledCollectionProtocol): """ A generator of samples that implements the natural prevalence protocol (NPP). The NPP consists of drawing samples uniformly at random, therefore approximately preserving the natural prevalence of the collection. @@ -182,7 +222,7 @@ class NPP(AbstractStochasticSeededProtocol): def samples_parameters(self): indexes = [] for _ in range(self.repeats): - index = data.uniform_sampling_index(self.sample_size) + index = self.data.uniform_sampling_index(self.sample_size) indexes.append(index) return indexes @@ -193,8 +233,7 @@ class NPP(AbstractStochasticSeededProtocol): return self.repeats - -class USimplexPP(AbstractStochasticSeededProtocol): +class USimplexPP(AbstractStochasticSeededProtocol, OnLabelledCollectionProtocol): """ A variant of :class:`APP` that, instead of using a grid of equidistant prevalence values, relies on the Kraemer algorithm for sampling unit (k-1)-simplex uniformly at random, with @@ -218,8 +257,8 @@ class USimplexPP(AbstractStochasticSeededProtocol): def samples_parameters(self): indexes = [] - for prevs in F.uniform_simplex_sampling(n_classes=data.n_classes, size=self.repeats): - index = data.sampling_index(self.sample_size, *prevs) + for prevs in F.uniform_simplex_sampling(n_classes=self.data.n_classes, size=self.repeats): + index = self.data.sampling_index(self.sample_size, *prevs) indexes.append(index) return indexes @@ -230,7 +269,6 @@ class USimplexPP(AbstractStochasticSeededProtocol): return self.repeats - class CovariateShiftPP(AbstractStochasticSeededProtocol): """ Generates mixtures of two domains (A and B) at controlled rates, but preserving the original class prevalence. 
@@ -300,33 +338,3 @@ class CovariateShiftPP(AbstractStochasticSeededProtocol): return self.repeats * len(self.mixture_points) - - -if __name__=='__main__': - import numpy as np - import quapy as qp - - # domainA - y = [0]*25 + [1]*25 + [2]*25 + [3]*25 - X = ['A:'+str(i)+'-'+str(yi) for i, yi in enumerate(y)] - data = LabelledCollection(X, y, classes_=sorted(np.unique(y))) - - # domain B - y = [0]*25 + [1]*25 + [2]*25 + [3]*25 - X = ['B:'+str(i)+'-'+str(yi) for i, yi in enumerate(y)] - dataB = LabelledCollection(X, y, classes_=sorted(np.unique(y))) - - # p = APP(data, sample_size=10, n_prevalences=11, random_seed=42) - # p = NPP(data, sample_size=10, repeats=10, random_seed=42) - # p = NPP(data, sample_size=10, repeats=10) - # p = USimplexPP(data, sample_size=10, repeats=10) - p = CovariateShiftPP(data, dataB, sample_size=10, mixture_points=11, random_seed=1) - - for _ in range(2): - print('init generator', p.__class__.__name__) - for i in tqdm(p(), total=p.total()): - # print(i) - print(i.instances, i.labels, i.prevalence()) - - print('done') - diff --git a/quapy/tests/test_evaluation.py b/quapy/tests/test_evaluation.py new file mode 100644 index 0000000..de6603b --- /dev/null +++ b/quapy/tests/test_evaluation.py @@ -0,0 +1,57 @@ +import unittest +import quapy as qp +from sklearn.linear_model import LogisticRegression +from time import time +from method.aggregative import EMQ +from method.base import BaseQuantifier + + +class EvalTestCase(unittest.TestCase): + def test_eval_speedup(self): + + data = qp.datasets.fetch_reviews('hp', tfidf=True, min_df=10, pickle=True) + train, test = data.training, data.test + + protocol = qp.protocol.APP(test, sample_size=1000, n_prevalences=21, repeats=1, random_seed=1) + + class SlowLR(LogisticRegression): + def predict_proba(self, X): + import time + time.sleep(1) + return super().predict_proba(X) + + emq = EMQ(SlowLR()).fit(train) + + tinit = time() + score = qp.evaluation.evaluate(emq, protocol, error_metric='mae', verbose=True) + tend_optim = time()-tinit + print(f'evaluation (with optimization) took {tend_optim}s [MAE={score:.4f}]') + + class NonAggregativeEMQ(BaseQuantifier): + + def __init__(self, cls): + self.emq = EMQ(cls) + + def quantify(self, instances): + return self.emq.quantify(instances) + + def fit(self, data): + self.emq.fit(data) + return self + + def set_params(self, **parameters): pass + def get_params(self, deep=True): pass + + + emq = NonAggregativeEMQ(SlowLR()).fit(train) + + tinit = time() + score = qp.evaluation.evaluate(emq, protocol, error_metric='mae', verbose=True) + tend_no_optim = time() - tinit + print(f'evaluation (w/o optimization) took {tend_no_optim}s [MAE={score:.4f}]') + + self.assertEqual(tend_no_optim>tend_optim, True) + + +if __name__ == '__main__': + unittest.main() diff --git a/quapy/tests/test_hierarchy.py b/quapy/tests/test_hierarchy.py new file mode 100644 index 0000000..21af4b6 --- /dev/null +++ b/quapy/tests/test_hierarchy.py @@ -0,0 +1,32 @@ +import unittest + +from sklearn.linear_model import LogisticRegression + +import quapy as qp +from quapy.method.aggregative import * + + + +class HierarchyTestCase(unittest.TestCase): + + def test_aggregative(self): + lr = LogisticRegression() + for m in [CC(lr), PCC(lr), ACC(lr), PACC(lr)]: + self.assertEqual(isinstance(m, AggregativeQuantifier), True) + + def test_binary(self): + lr = LogisticRegression() + for m in [HDy(lr)]: + self.assertEqual(isinstance(m, BinaryQuantifier), True) + + def test_probabilistic(self): + lr = LogisticRegression() + for m in [CC(lr), 
ACC(lr)]: + self.assertEqual(isinstance(m, AggregativeProbabilisticQuantifier), False) + for m in [PCC(lr), PACC(lr)]: + self.assertEqual(isinstance(m, AggregativeProbabilisticQuantifier), True) + + + +if __name__ == '__main__': + unittest.main() diff --git a/quapy/tests/test_modsel.py b/quapy/tests/test_modsel.py new file mode 100644 index 0000000..637f831 --- /dev/null +++ b/quapy/tests/test_modsel.py @@ -0,0 +1,77 @@ +import unittest + +import numpy as np +from sklearn.linear_model import LogisticRegression +from sklearn.svm import SVC + +import quapy as qp +from method.aggregative import PACC +from model_selection import GridSearchQ +from protocol import APP + + +class ModselTestCase(unittest.TestCase): + + def test_modsel(self): + + q = PACC(LogisticRegression(random_state=1, max_iter=5000)) + + data = qp.datasets.fetch_reviews('imdb', tfidf=True, min_df=10) + training, validation = data.training.split_stratified(0.7, random_state=1) + # test = data.test + + param_grid = {'C': np.logspace(-3,3,7)} + app = APP(validation, sample_size=100, random_seed=1) + q = GridSearchQ( + q, param_grid, protocol=app, error='mae', refit=True, timeout=-1, verbose=True + ).fit(training) + print('best params', q.best_params_) + print('best score', q.best_score_) + + self.assertEqual(q.best_params_['C'], 10.0) + self.assertEqual(q.best_model().get_params()['C'], 10.0) + + def test_modsel_parallel(self): + + q = PACC(LogisticRegression(random_state=1, max_iter=5000)) + + data = qp.datasets.fetch_reviews('imdb', tfidf=True, min_df=10) + training, validation = data.training.split_stratified(0.7, random_state=1) + # test = data.test + + param_grid = {'C': np.logspace(-3,3,7)} + app = APP(validation, sample_size=100, random_seed=1) + q = GridSearchQ( + q, param_grid, protocol=app, error='mae', refit=True, timeout=-1, n_jobs=-1, verbose=True + ).fit(training) + print('best params', q.best_params_) + print('best score', q.best_score_) + + self.assertEqual(q.best_params_['C'], 10.0) + self.assertEqual(q.best_model().get_params()['C'], 10.0) + + def test_modsel_timeout(self): + + class SlowLR(LogisticRegression): + def fit(self, X, y, sample_weight=None): + import time + time.sleep(10) + super(SlowLR, self).fit(X, y, sample_weight) + + q = PACC(SlowLR()) + + data = qp.datasets.fetch_reviews('imdb', tfidf=True, min_df=10) + training, validation = data.training.split_stratified(0.7, random_state=1) + # test = data.test + + param_grid = {'C': np.logspace(-3,3,7)} + app = APP(validation, sample_size=100, random_seed=1) + q = GridSearchQ( + q, param_grid, protocol=app, error='mae', refit=True, timeout=3, n_jobs=-1, verbose=True + ) + with self.assertRaises(TimeoutError): + q.fit(training) + + +if __name__ == '__main__': + unittest.main() diff --git a/quapy/tests/test_protocols.py b/quapy/tests/test_protocols.py new file mode 100644 index 0000000..bf92ce5 --- /dev/null +++ b/quapy/tests/test_protocols.py @@ -0,0 +1,139 @@ +import unittest +import numpy as np +from data import LabelledCollection +from protocol import APP, NPP, USimplexPP, CovariateShiftPP, AbstractStochasticSeededProtocol + + +def mock_labelled_collection(prefix=''): + y = [0] * 250 + [1] * 250 + [2] * 250 + [3] * 250 + X = [prefix + str(i) + '-' + str(yi) for i, yi in enumerate(y)] + return LabelledCollection(X, y, classes_=sorted(np.unique(y))) + + +def samples_to_str(protocol): + samples_str = "" + for sample in protocol(): + samples_str += f'{sample.instances}\t{sample.labels}\t{sample.prevalence()}\n' + return samples_str + + +class 
TestProtocols(unittest.TestCase): + + def test_app_replicate(self): + data = mock_labelled_collection() + p = APP(data, sample_size=5, n_prevalences=11, random_seed=42) + + samples1 = samples_to_str(p) + samples2 = samples_to_str(p) + + self.assertEqual(samples1, samples2) + + def test_app_not_replicate(self): + data = mock_labelled_collection() + p = APP(data, sample_size=5, n_prevalences=11) + + samples1 = samples_to_str(p) + samples2 = samples_to_str(p) + + self.assertNotEqual(samples1, samples2) + + def test_app_number(self): + data = mock_labelled_collection() + p = APP(data, sample_size=100, n_prevalences=10, repeats=1) + + # surprisingly enough, for some values of n_prevalences the test fails, even though + # everything is correct. The problem is that in function APP.prevalence_grid() + # there is sometimes a rounding error that gets accumulated and + # surpasses 1.0 (by a very small float value, 0.0000000000002 or the like), + # so these tuples are mistakenly removed... I have tried with np.isclose and + # other workarounds, but it eventually happens that there is some negative probability + # in the sampling function... + + count = 0 + for _ in p(): + count += 1 + + self.assertEqual(count, p.total()) + + def test_npp_replicate(self): + data = mock_labelled_collection() + p = NPP(data, sample_size=5, repeats=5, random_seed=42) + + samples1 = samples_to_str(p) + samples2 = samples_to_str(p) + + self.assertEqual(samples1, samples2) + + def test_npp_not_replicate(self): + data = mock_labelled_collection() + p = NPP(data, sample_size=5, repeats=5) + + samples1 = samples_to_str(p) + samples2 = samples_to_str(p) + + self.assertNotEqual(samples1, samples2) + + def test_kraemer_replicate(self): + data = mock_labelled_collection() + p = USimplexPP(data, sample_size=5, repeats=10, random_seed=42) + + samples1 = samples_to_str(p) + samples2 = samples_to_str(p) + + self.assertEqual(samples1, samples2) + + def test_kraemer_not_replicate(self): + data = mock_labelled_collection() + p = USimplexPP(data, sample_size=5, repeats=10) + + samples1 = samples_to_str(p) + samples2 = samples_to_str(p) + + self.assertNotEqual(samples1, samples2) + + def test_covariate_shift_replicate(self): + dataA = mock_labelled_collection('domA') + dataB = mock_labelled_collection('domB') + p = CovariateShiftPP(dataA, dataB, sample_size=10, mixture_points=11, random_seed=1) + + samples1 = samples_to_str(p) + samples2 = samples_to_str(p) + + self.assertEqual(samples1, samples2) + + def test_covariate_shift_not_replicate(self): + dataA = mock_labelled_collection('domA') + dataB = mock_labelled_collection('domB') + p = CovariateShiftPP(dataA, dataB, sample_size=10, mixture_points=11) + + samples1 = samples_to_str(p) + samples2 = samples_to_str(p) + + self.assertNotEqual(samples1, samples2) + + def test_no_seed_init(self): + class NoSeedInit(AbstractStochasticSeededProtocol): + def __init__(self): + self.data = mock_labelled_collection() + + def samples_parameters(self): + # return a matrix containing sampling indexes in the rows + return np.random.randint(0, len(self.data), 10*10).reshape(10, 10) + + def sample(self, params): + index = np.unique(params) + return self.data.sampling_from_index(index) + + p = NoSeedInit() + + # this should raise a ValueError, since the class is said to be AbstractStochasticSeededProtocol but the + # random_seed has never been passed to super(NoSeedInit, self).__init__(random_seed) + with self.assertRaises(ValueError): + for sample in p(): + pass + print('done') + + + +if __name__ == '__main__': 
unittest.main()
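
Example usage (illustrative sketch, not part of the patch): the snippet below shows how the protocol-based evaluation API introduced by this changeset is meant to be used, i.e., a protocol is instantiated over a labelled test collection and then passed to qp.evaluation.evaluate. The dataset name, sample_size, and grid resolution are placeholders, and the import paths assume the installed quapy package layout.

import quapy as qp
from sklearn.linear_model import LogisticRegression
from quapy.method.aggregative import PACC
from quapy.protocol import APP

# load a labelled dataset with a training/test split (placeholder choice)
data = qp.datasets.fetch_reviews('hp', tfidf=True, min_df=10)
train, test = data.training, data.test

# train an aggregative quantifier on the training collection
quantifier = PACC(LogisticRegression()).fit(train)

# the APP protocol draws test samples whose prevalence values come from a grid of
# 21 equidistant points in [0,1]; random_seed makes the series of samples
# replicable across calls to the protocol
protocol = APP(test, sample_size=500, n_prevalences=21, random_seed=42)

# evaluate() iterates over the samples yielded by the protocol and reports the error;
# for aggregative quantifiers the instances are pre-classified only once
# (see qp.evaluation.prediction), which is the speed-up exercised in test_evaluation.py
mae = qp.evaluation.evaluate(quantifier, protocol=protocol, error_metric='mae')
print(f'MAE={mae:.4f}')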
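
Likewise, the following sketch mirrors the new test_modsel.py and illustrates the intended use of the refactored GridSearchQ, which now receives a protocol built over a held-out validation split instead of the former sample_size/n_prevpoints/eval_budget arguments. Dataset and hyper-parameter values are again placeholders.

import numpy as np
import quapy as qp
from sklearn.linear_model import LogisticRegression
from quapy.method.aggregative import PACC
from quapy.model_selection import GridSearchQ
from quapy.protocol import APP

data = qp.datasets.fetch_reviews('imdb', tfidf=True, min_df=10)
# hold out part of the training data; the protocol will sample from it
training, validation = data.training.split_stratified(0.7, random_state=1)

# each hyper-parameter combination is scored on the samples drawn by this protocol
app = APP(validation, sample_size=100, random_seed=1)

q = GridSearchQ(
    model=PACC(LogisticRegression(max_iter=5000)),
    param_grid={'C': np.logspace(-3, 3, 7)},  # forwarded to the quantifier via set_params
    protocol=app,
    error='mae',   # error to be minimized over the validation samples
    refit=True,    # refit the best configuration on training + validation
    n_jobs=-1,
    verbose=True
).fit(training)

print('best params:', q.best_params_)
print('best score:', q.best_score_)

# the selected (refitted) model is then used for quantifying new samples
estim_prevalence = q.quantify(data.test.instances)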