diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 17a6c39..fe752d8 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -28,7 +28,7 @@ jobs: - name: Install dependencies run: | python -m pip install --upgrade pip setuptools wheel - python -m pip install "qunfold @ git+https://github.com/mirkobunse/qunfold@v0.1.4" + python -m pip install "qunfold @ git+https://github.com/mirkobunse/qunfold@main" python -m pip install -e .[bayes,tests] - name: Test with unittest run: python -m unittest @@ -47,7 +47,7 @@ jobs: - name: Install dependencies run: | python -m pip install --upgrade pip setuptools wheel "jax[cpu]" - python -m pip install "qunfold @ git+https://github.com/mirkobunse/qunfold@v0.1.4" + python -m pip install "qunfold @ git+https://github.com/mirkobunse/qunfold@main" python -m pip install -e .[neural,docs] - name: Build documentation run: sphinx-build -M html docs/source docs/build diff --git a/CHANGE_LOG.txt b/CHANGE_LOG.txt index 29d9402..a701186 100644 --- a/CHANGE_LOG.txt +++ b/CHANGE_LOG.txt @@ -1,10 +1,36 @@ -Change Log 0.1.10 +Change Log 0.2.0 ----------------- +CLEAN TODO-FILE + +- Base code refactor: + - Removing coupling between LabelledCollection and quantification methods; the fit interface changes: + def fit(data:LabelledCollection): -> def fit(X, y): + - Adding function "predict" (function "quantify" is still present as an alias, for the nostalgic) + - Aggregative methods' behavior in terms of fit_classifier and how to treat the val_split is now + indicated exclusively at construction time, and it is no longer possible to indicate it at fit time. + This is because, in v<=0.1.9, one could create a method (e.g., ACC) and then indicate: + my_acc.fit(tr_data, fit_classifier=False, val_split=val_data) + in which case the first argument is unused, and this was ambiguous with + my_acc.fit(the_data, fit_classifier=False) + in which case the_data is to be used for validation purposes. However, the val_split could be set as a fraction + indicating that only part of the_data must be used for validation, and the rest wasted... it was certainly confusing. + - This change imposes a versioning constraint with qunfold, which now must be >= 0.1.6 +- EMQ has been modified so that the representation function "classify" now only provides posterior + probabilities and, if required, these are recalibrated (e.g., by "bcts") during the aggregation function. + - A new parameter "on_calib_error" is passed to the constructor, which indicates the policy to follow + in case the abstention package's calibration functions fail (which happens sometimes). Options include: + - 'raise': raises a RuntimeError (default) + - 'backup': reruns, silently skipping calibration + - Parameter "recalib" has been renamed "calib" - Added aggregative bootstrap for deriving confidence regions (confidence intervals, ellipses in the simplex, or ellipses in the CLR space). This method is efficient as it leverages the two-phases of the aggregative quantifiers. This method applies resampling only to the aggregation phase, thus avoiding to train many quantifiers, or - classify multiple times the instances of a sample. See the new example no. 15. + classify multiple times the instances of a sample. See: + - quapy/method/confidence.py (new) + - the new example no. 16.confidence_regions.py +- BayesianCC moved to confidence.py, where methods having to do with confidence intervals belong. +- Improved documentation of the qp.plot module.
Change Log 0.1.9 diff --git a/README.md b/README.md index 839060d..d4be8e7 100644 --- a/README.md +++ b/README.md @@ -13,7 +13,7 @@ for facilitating the analysis and interpretation of the experimental results. ### Last updates: -* Version 0.1.9 is released! major changes can be consulted [here](CHANGE_LOG.txt). +* Version 0.2.0 is released! Major changes can be consulted [here](CHANGE_LOG.txt). * The developer API documentation is available [here](https://hlt-isti.github.io/QuaPy/build/html/modules.html) ### Installation @@ -46,15 +46,15 @@ of the test set. ```python import quapy as qp -dataset = qp.datasets.fetch_UCIBinaryDataset("yeast") -training, test = dataset.train_test +training, test = qp.datasets.fetch_UCIBinaryDataset("yeast").train_test # create an "Adjusted Classify & Count" quantifier model = qp.method.aggregative.ACC() -model.fit(training) +Xtr, ytr = training.Xy +model.fit(Xtr, ytr) -estim_prevalence = model.quantify(test.X) -true_prevalence = test.prevalence() +estim_prevalence = model.predict(test.X) +true_prevalence = test.prevalence() error = qp.error.mae(true_prevalence, estim_prevalence) print(f'Mean Absolute Error (MAE)={error:.3f}') @@ -79,7 +79,8 @@ quantification methods based on structured output learning, HDy, QuaNet, quantif * 32 UCI Machine Learning datasets. * 11 Twitter quantification-by-sentiment datasets. * 3 product reviews quantification-by-sentiment datasets. - * 4 tasks from LeQua competition (_new in v0.1.7!_) + * 4 tasks from the LeQua 2022 competition and 4 tasks from the LeQua 2024 competition + * IFCB for plankton quantification * Native support for binary and single-label multiclass quantification scenarios. * Model selection functionality that minimizes quantification-oriented loss functions. * Visualization tools for analysing the experimental results. @@ -102,17 +103,21 @@ In case you want to contribute improvements to quapy, please generate pull reque The [developer API documentation](https://hlt-isti.github.io/QuaPy/build/html/modules.html) is available [here](https://hlt-isti.github.io/QuaPy/build/html/index.html). -Check out our [Wiki](https://github.com/HLT-ISTI/QuaPy/wiki), in which many examples +Check out the [Manuals](https://hlt-isti.github.io/QuaPy/manuals.html), in which many code examples are provided: -* [Datasets](https://github.com/HLT-ISTI/QuaPy/wiki/Datasets) -* [Evaluation](https://github.com/HLT-ISTI/QuaPy/wiki/Evaluation) -* [Protocols](https://github.com/HLT-ISTI/QuaPy/wiki/Protocols) -* [Methods](https://github.com/HLT-ISTI/QuaPy/wiki/Methods) -* [SVMperf](https://github.com/HLT-ISTI/QuaPy/wiki/ExplicitLossMinimization) -* [Model Selection](https://github.com/HLT-ISTI/QuaPy/wiki/Model-Selection) -* [Plotting](https://github.com/HLT-ISTI/QuaPy/wiki/Plotting) +* [Datasets](https://hlt-isti.github.io/QuaPy/manuals/datasets.html) +* [Evaluation](https://hlt-isti.github.io/QuaPy/manuals/evaluation.html) +* [Protocols](https://hlt-isti.github.io/QuaPy/manuals/protocols.html) +* [Methods](https://hlt-isti.github.io/QuaPy/manuals/methods.html) +* [SVMperf](https://hlt-isti.github.io/QuaPy/manuals/explicit-loss-minimization.html) +* [Model Selection](https://hlt-isti.github.io/QuaPy/manuals/model-selection.html) +* [Plotting](https://hlt-isti.github.io/QuaPy/manuals/plotting.html) ## Acknowledgments: SoBigData++ + +This work has been supported by the QuaDaSh project +_"Finanziato dall’Unione europea---Next Generation EU, +Missione 4 Componente 2 CUP B53D23026250001"_. 
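To make the interface changes summarized in the change log and README above concrete, here is a minimal sketch of the v0.2.0 fit/predict workflow, including EMQ's renamed `calib` parameter and the new `on_calib_error` policy (the dataset, classifier, and hyperparameter choices below are illustrative only):

```python
import quapy as qp
from sklearn.linear_model import LogisticRegression
from quapy.method.aggregative import ACC, EMQ

# any dataset would do; 'yeast' is the one used in the README quickstart
training, test = qp.datasets.fetch_UCIBinaryDataset("yeast").train_test
Xtr, ytr = training.Xy

# v0.2.0: fit receives (X, y) instead of a LabelledCollection
acc = ACC(LogisticRegression()).fit(Xtr, ytr)
estim_prev = acc.predict(test.X)   # new name
estim_prev = acc.quantify(test.X)  # still works: "quantify" is an alias of "predict"

# EMQ with recalibration: "recalib" is now "calib", and "on_calib_error"
# states what to do if the calibration routine fails at runtime
emq = EMQ(LogisticRegression(), calib='bcts', on_calib_error='backup').fit(Xtr, ytr)
```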
diff --git a/TODO.txt b/TODO.txt index a90db5f..de40ed9 100644 --- a/TODO.txt +++ b/TODO.txt @@ -1,3 +1,54 @@ +Adapt examples; remaining: example 4 onwards +not working: 15 (qunfold) + +Solve the warnings issue; right now there is a warning ignore in method/__init__.py: + +Add 'platt' to calib options in EMQ? + +Allow n_prevpoints in APP to be specified by a user-defined grid? + +Update READMEs, wiki, & examples for new fit-predict interface + +Add the fix suggested by Alexander: + +For a more general application, I would maybe first establish a per-class threshold value of plausible prevalence +based on the number of actual positives and the required sample size; e.g., for sample_size=100 and actual +positives [10, 100, 500] -> [0.1, 1.0, 1.0], meaning that class 0 can be sampled at most at 0.1 prevalence, while +the others can be sampled up to 1. prevalence. Then, when a prevalence value is requested, e.g., [0.33, 0.33, 0.33], +we may either clip each value and normalize (as you suggest for the extreme case, e.g., [0.1, 0.33, 0.33]/sum) or +scale each value by per-class thresholds, i.e., [0.33*0.1, 0.33*1, 0.33*1]/sum. +- This affects LabelledCollection +- This functionality should be accessible via sampling protocols and evaluation functions + +Solve the pre-trained classifier issues. An example is the coptic-codes script I did, which needed a mock_lr to +work for having access to classes_; think also of the case in which the precomputed outputs are already generated, +as in the unifying-problems code. + +To remove LabelledCollection from the methods: + +- The mess comes from the confusing semantics of fit in aggregative methods, which receives 3 parameters: + - data: LabelledCollection, which can be: + - the training set, if the classifier must be trained + - None, if the classifier must not be trained + - the validation set (which conflicts with val_split), if the classifier must not be trained + - fit_classifier: states whether the classifier must be trained or not, and this changes the semantics of the other parameters + - val_split: which can be: + - a number: the number of folds for kfcv, which implies fit_classifier=True and data=the whole training set + - a fraction in [0,1]: indicating the part used for validation; implies fit_classifier=True and data=train+val + - a labelled collection: the specific validation set; implies neither fit_classifier=True nor False +- The way to remove the methods' dependency on LabelledCollection should be as follows: + - The constructor states whether the classifier received as a parameter must be trained or is already trained; + that is, there is a fit_classifier=True or False.
+ - fit_classifier=True: + - data in fit is the whole training set, including the validation part and everything else + - val_split: + - int: the number of folds in kfcv + - a proportion in [0,1] + - fit_classifier=False: + + + +- [TODO] document confidence in manuals - [TODO] Test the return_type="index" in protocols and finish the "distributing_samples.py" example - [TODO] Add EDy (an implementation is available at quantificationlib) - [TODO] add ensemble methods SC-MQ, MC-SQ, MC-MQ diff --git a/docs/source/index.md b/docs/source/index.md index accb758..ad9ac15 100644 --- a/docs/source/index.md +++ b/docs/source/index.md @@ -32,8 +32,8 @@ dataset = qp.datasets.fetch_twitter('semeval16') model = qp.method.aggregative.ACC(LogisticRegression()) model.fit(dataset.training) -estim_prevalence = model.quantify(dataset.test.instances) -true_prevalence = dataset.test.prevalence() +estim_prevalence = model.predict(dataset.test.instances) +true_prevalence = dataset.test.prevalence() error = qp.error.mae(true_prevalence, estim_prevalence) diff --git a/docs/source/manuals/datasets.md b/docs/source/manuals/datasets.md index 38d2bed..b7d8827 100644 --- a/docs/source/manuals/datasets.md +++ b/docs/source/manuals/datasets.md @@ -340,10 +340,10 @@ and a set of test samples (for evaluation). QuaPy returns this data as a Labelle (training) and two generation protocols (for validation and test samples), as follows: ```python -training, val_generator, test_generator = fetch_lequa2022(task=task) +training, val_generator, test_generator = qp.datasets.fetch_lequa2022(task=task) ``` -See the `lequa2022_experiments.py` in the examples folder for further details on how to +See the `5a.lequa2022_experiments.py` in the examples folder for further details on how to carry out experiments using these datasets. The datasets are downloaded only once, and stored for fast reuse. @@ -365,6 +365,53 @@ Esuli, A., Moreo, A., Sebastiani, F., & Sperduti, G. (2022). A Detailed Overview of LeQua@ CLEF 2022: Learning to Quantify. ``` +## LeQua 2024 Datasets + +QuaPy also provides the datasets used for the [LeQua 2024 competition](https://lequa2024.github.io/). +In brief, there are 4 tasks: +* T1: binary quantification (by sentiment) +* T2: multiclass quantification (28 classes, merchandise products) +* T3: ordinal quantification (5-star sentiment ratings) +* T4: binary sentiment quantification under a combination of covariate shift and prior shift + +In all cases, the covariate space has 256 dimensions (extracted using the `ELECTRA-Small` model). + +Every task consists of a training set, a set of validation samples (for model selection) +and a set of test samples (for evaluation). QuaPy returns this data as a LabelledCollection +(training bag) and sampling generation protocols (for validation and test bags). +T3 also offers the possibility to obtain a series of training bags (in the form of a +sampling generation protocol) instead of one single training bag. Use it as follows: + +```python +training, val_generator, test_generator = qp.datasets.fetch_lequa2024(task=task) +``` + +See the `5b.lequa2024_experiments.py` in the examples folder for further details on how to +carry out experiments using these datasets. + +The datasets are downloaded only once, and stored for fast reuse. 
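As a quick end-to-end sketch of how these objects are meant to be combined (the task, method, and error metric below are illustrative; see `5b.lequa2024_experiments.py` for the canonical script):

```python
import quapy as qp
from sklearn.linear_model import LogisticRegression
from quapy.method.aggregative import EMQ

# T1 is binary; T2, T3, and T4 follow the same pattern
training, val_generator, test_generator = qp.datasets.fetch_lequa2024(task='T1')

model = EMQ(LogisticRegression())
model.fit(*training.Xy)

# the generators are sampling protocols, so they plug directly into the evaluation functions
mrae = qp.evaluation.evaluate(model, protocol=test_generator, error_metric='mrae')
print(f'MRAE = {mrae:.4f}')
```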
+ +Some statistics are summarized below: + +| Dataset | classes | train size | validation samples | test samples | docs by sample | type | |---------|:-------:|:-----------:|:------------------:|:------------:|:--------------:|:--------:| | T1 | 2 | 5000 | 1000 | 5000 | 250 | vector | | T2 | 28 | 20000 | 1000 | 5000 | 1000 | vector | | T3 | 5 | 100 samples | 1000 | 5000 | 200 | vector | | T4 | 2 | 5000 | 1000 | 5000 | 250 | vector | + +For further details on the datasets or the competition, we refer to +[the official site](https://lequa2024.github.io/data/) and +[the overview paper](http://nmis.isti.cnr.it/sebastiani/Publications/LQ2024.pdf). + +``` +Esuli, A., Moreo, A., Sebastiani, F., & Sperduti, G. (2024). +An Overview of LeQua 2024, the 2nd International Data Challenge on Learning to Quantify, +Proceedings of the 4th International Workshop on Learning to Quantify (LQ 2024), +ECML-PKDD 2024, Vilnius, Lithuania. +``` + + ## IFCB Plankton dataset IFCB is a dataset of plankton species in water samples hosted in `Zenodo `_. @@ -402,12 +449,20 @@ train, test_gen = qp.datasets.fetch_IFCB(for_model_selection=False, single_sampl # ... train and evaluation ``` +See also [P. González, A. Castaño, E.E. Peacock, J. Díez, J.J. Del Coz, H.M. Sosik, Automatic plankton quantification using deep features, Journal of Plankton Research 41(4), 449-463](https://par.nsf.gov/servlets/purl/10172325). + ## Adding Custom Datasets +It is straightforward to import your own datasets into QuaPy. +In what follows, we provide some code snippets for doing so; see also the example +[3.custom_collection.py](https://github.com/HLT-ISTI/QuaPy/blob/master/examples/3.custom_collection.py). + QuaPy provides data loaders for simple formats dealing with -text, following the format: +text; for example, use `qp.data.reader.from_text` for the following format: ``` class-id \t first document's pre-processed text \n @@ -415,13 +470,16 @@ class-id \t second document's pre-processed text \n ... ``` -and sparse representations of the form: +or `qp.data.reader.from_sparse` for sparse representations of the form: ``` {-1, 0, or +1} col(int):val(float) col(int):val(float) ... \n ... ``` +Both functions return a tuple `X, y` containing the instances and the corresponding +labels, respectively. + The code in charge in loading a LabelledCollection is: ```python @@ -430,12 +488,13 @@ def load(cls, path:str, loader_func:callable): return LabelledCollection(*loader_func(path)) ``` -indicating that any _loader_func_ (e.g., a user-defined one) which +indicating that any `loader_func` (e.g., `from_text`, `from_sparse`, `from_csv`, or a user-defined one) which returns valid arguments for initializing a _LabelledCollection_ object will allow -to load any collection. In particular, the _LabelledCollection_ receives as -arguments the instances (as an iterable) and the labels (as an iterable) and, -additionally, the number of classes can be specified (it would otherwise be -inferred from the labels, but that requires at least one positive example for +to load any collection. More specifically, the _LabelledCollection_ receives as +arguments the _instances_ (iterable) and the _labels_ (iterable) and, +optionally, the number of classes (it would be +inferred from the labels if not indicated, but this requires at least one +positive example for all classes to be present in the collection). 
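The following is a minimal sketch of the optional classes declaration (the keyword name `classes` is as in recent QuaPy versions; check the API documentation if in doubt):

```python
from quapy.data import LabelledCollection

X = ['positive text', 'negative text', 'another negative text']
y = [1, 0, 0]

# with the classes inferred from y, only {0, 1} would exist; declaring them
# explicitly allows class 2 to be part of the collection with zero prevalence
data = LabelledCollection(X, y, classes=[0, 1, 2])
print(data.prevalence())  # approx. [0.67, 0.33, 0.00]
```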
The same _loader_func_ can be passed to a Dataset, along with two @@ -448,20 +507,23 @@ import quapy as qp train_path = '../my_data/train.dat' test_path = '../my_data/test.dat' -def my_custom_loader(path): +def my_custom_loader(path, **custom_kwargs): with open(path, 'rb') as fin: ... return instances, labels -data = qp.data.Dataset.load(train_path, test_path, my_custom_loader) +data = qp.data.Dataset.load(train_path, test_path, my_custom_loader, **custom_kwargs) ``` ### Data Processing -QuaPy implements a number of preprocessing functions in the package _qp.data.preprocessing_, including: +QuaPy implements a number of preprocessing functions in the package `qp.data.preprocessing`, including: * _text2tfidf_: tfidf vectorization * _reduce_columns_: reducing the number of columns based on term frequency * _standardize_: transforms the column values into z-scores (i.e., subtract the mean and normalizes by the standard deviation, so that the column values have zero mean and unit variance). -* _index_: transforms textual tokens into lists of numeric ids) +* _index_: transforms textual tokens into lists of numeric ids + +These functions are applied to `Dataset` objects, and offer the possibility to apply the transformation +in place (thus modifying the original dataset), or to return a modified copy. \ No newline at end of file diff --git a/docs/source/manuals/evaluation.md b/docs/source/manuals/evaluation.md index e5404a3..aba7068 100644 --- a/docs/source/manuals/evaluation.md +++ b/docs/source/manuals/evaluation.md @@ -46,18 +46,18 @@ e.g.: ```python qp.environ['SAMPLE_SIZE'] = 100 # once for all -true_prev = np.asarray([0.5, 0.3, 0.2]) # let's assume 3 classes -estim_prev = np.asarray([0.1, 0.3, 0.6]) +true_prev = [0.5, 0.3, 0.2] # let's assume 3 classes +estim_prev = [0.1, 0.3, 0.6] error = qp.error.mrae(true_prev, estim_prev) print(f'mrae({true_prev}, {estim_prev}) = {error:.3f}') ``` will print: ``` -mrae([0.500, 0.300, 0.200], [0.100, 0.300, 0.600]) = 0.914 +mrae([0.5, 0.3, 0.2], [0.1, 0.3, 0.6]) = 0.914 ``` -Finally, it is possible to instantiate QuaPy's quantification +It is also possible to instantiate QuaPy's quantification error functions from strings using, e.g.: ```python @@ -85,7 +85,7 @@ print(f'MAE = {mae:.4f}') ``` It is often desirable to evaluate our system using more than one -single evaluatio measure. In this case, it is convenient to generate +single evaluation measure. In this case, it is convenient to generate a _report_. A report in QuaPy is a dataframe accounting for all the true prevalence values with their corresponding prevalence values as estimated by the quantifier, along with the error each has given @@ -104,7 +104,7 @@ report['estim-prev'] = report['estim-prev'].map(F.strprev) print(report) print('Averaged values:') -print(report.mean()) +print(report.mean(numeric_only=True)) ``` This will produce an output like: @@ -141,11 +141,14 @@ true_prevs, estim_prevs = qp.evaluation.prediction(quantifier, protocol=prot) All the evaluation functions implement specific optimizations for speeding-up the evaluation of aggregative quantifiers (i.e., of instances of _AggregativeQuantifier_). + The optimization comes down to generating classification predictions (either crisp or soft) only once for the entire test set, and then applying the sampling procedure to the predictions, instead of generating samples of instances and then computing the classification predictions every time. This is only possible when the protocol -is an instance of _OnLabelledCollectionProtocol_. 
The optimization is only +is an instance of _OnLabelledCollectionProtocol_. + +The optimization is only carried out when the number of classification predictions thus generated would be smaller than the number of predictions required for the entire protocol; e.g., if the original dataset contains 1M instances, but the protocol is such that it would @@ -156,4 +159,4 @@ precompute all the predictions irrespectively of the number of instances and num Finally, this can be deactivated by setting _aggr_speedup=False_. Note that this optimization is not only applied for the final evaluation, but also for the internal evaluations carried out during _model selection_. Since these are typically many, the heuristic can help reduce the -execution time a lot. \ No newline at end of file +execution time significantly. \ No newline at end of file diff --git a/docs/source/manuals/methods.md b/docs/source/manuals/methods.md index 1a9a2dc..47b7cad 100644 --- a/docs/source/manuals/methods.md +++ b/docs/source/manuals/methods.md @@ -1,7 +1,7 @@ # Quantification Methods Quantification methods can be categorized as belonging to -`aggregative` and `non-aggregative` groups. +`aggregative`, `non-aggregative`, and `meta-learning` groups. Most methods included in QuaPy at the moment are of type `aggregative` (though we plan to add many more methods in the near future), i.e., are methods characterized by the fact that @@ -12,21 +12,17 @@ Any quantifier in QuaPy shoud extend the class `BaseQuantifier`, and implement some abstract methods: ```python @abstractmethod - def fit(self, data: LabelledCollection): ... + def fit(self, X, y): ... @abstractmethod - def quantify(self, instances): ... + def predict(self, X): ... ``` The meaning of those functions should be familiar to those used to work with scikit-learn since the class structure of QuaPy is directly inspired by scikit-learn's _Estimators_. Functions -`fit` and `quantify` are used to train the model and to provide -class estimations (the reason why -scikit-learn' structure has not been adopted _as is_ in QuaPy responds to -the fact that scikit-learn's `predict` function is expected to return -one output for each input element --e.g., a predicted label for each -instance in a sample-- while in quantification the output for a sample -is one single array of class prevalences). +`fit` and `predict` (for which there is an alias `quantify`) +are used to train the model and to provide +class prevalence estimations. Quantifiers also extend from scikit-learn's `BaseEstimator`, in order to simplify the use of `set_params` and `get_params` used in [model selection](./model-selection). @@ -40,21 +36,26 @@ The methods that any `aggregative` quantifier must implement are: ```python @abstractmethod - def aggregation_fit(self, classif_predictions: LabelledCollection, data: LabelledCollection): + def aggregation_fit(self, classif_predictions, labels): @abstractmethod - def aggregate(self, classif_predictions:np.ndarray): ... + def aggregate(self, classif_predictions): ... ``` -These two functions replace the `fit` and `quantify` methods, since those -come with default implementations. The `fit` function is provided and amounts to: +The argument `classif_predictions` is whatever the method `classify` returns. +QuaPy comes with default implementations that cover most common cases, but you can +override `classify` in case your method requires further or different information to work. + +These two functions replace the `fit` and `predict` methods, which +come with default implementations. 
For instance, the `fit` function is +provided and amounts to: ```python -def fit(self, data: LabelledCollection, fit_classifier=True, val_split=None): - self._check_init_parameters() - classif_predictions = self.classifier_fit_predict(data, fit_classifier, predict_on=val_split) - self.aggregation_fit(classif_predictions, data) - return self + def fit(self, X, y): + self._check_init_parameters() + classif_predictions, labels = self.classifier_fit_predict(X, y) + self.aggregation_fit(classif_predictions, labels) + return self ``` Note that this function fits the classifier, and generates the predictions. This is assumed @@ -72,11 +73,11 @@ overriden (if needed) and allows the method to quickly raise any exception based found in the `__init__` arguments, thus avoiding to break after training the classifier and generating predictions. -Similarly, the function `quantify` is provided, and amounts to: +Similarly, the function `predict` (alias `quantify`) is provided, and amounts to: ```python -def quantify(self, instances): - classif_predictions = self.classify(instances) +def predict(self, X): + classif_predictions = self.classify(X) return self.aggregate(classif_predictions) ``` @@ -84,12 +85,14 @@ in which only the function `aggregate` is required to be overriden in most cases Aggregative quantifiers are expected to maintain a classifier (which is accessed through the `@property` `classifier`). This classifier is -given as input to the quantifier, and can be already fit -on external data (in which case, the `fit_learner` argument should -be set to False), or be fit by the quantifier's fit (default). +given as input to the quantifier, and will be trained by the quantifier's `fit` method (default). +Alternatively, the classifier can be already fit on external data; in this case, the `fit_classifier` +argument in the `__init__` should be set to False (see [4.using_pretrained_classifier.py](https://github.com/HLT-ISTI/QuaPy/blob/master/examples/4.using_pretrained_classifier.py) +for a full code example). -The above patterns (in training: fit the classifier, then fit the aggregation; -in test: classify, then aggregate) allows QuaPy to optimize many internal procedures. +The above patterns (in training: (i) fit the classifier, then (ii) fit the aggregation; +in test: (i) classify, then (ii) aggregate) allow QuaPy to optimize many internal procedures, +on the grounds that steps (i) are slower than steps (ii). In particular, the model selection routing takes advantage of this two-step process and generates classifiers only for the valid combinations of hyperparameters of the classifier, and then _clones_ these classifiers and explores the combinations @@ -124,6 +127,7 @@ import quapy.functional as F from sklearn.svm import LinearSVC training, test = qp.datasets.fetch_twitter('hcr', pickle=True).train_test +Xtr, ytr = training.Xy # instantiate a classifier learner, in this case a SVM svm = LinearSVC() @@ -131,8 +135,8 @@ svm = LinearSVC() # instantiate a Classify & Count with the SVM # (an alias is available in qp.method.aggregative.ClassifyAndCount) model = qp.method.aggregative.CC(svm) -model.fit(training) -estim_prevalence = model.quantify(test.instances) +model.fit(Xtr, ytr) +estim_prevalence = model.predict(test.instances) ``` The same code could be used to instantiate an ACC, by simply replacing @@ -153,26 +157,14 @@ predictions. 
This parameters can also be set with an integer, indicating that the parameters should be estimated by means of _k_-fold cross-validation, for which the integer indicates the number _k_ of folds (the default value is 5). Finally, `val_split` can be set to a -specific held-out validation set (i.e., an instance of `LabelledCollection`). - -The specification of `val_split` can be -postponed to the invokation of the fit method (if `val_split` was also -set in the constructor, the one specified at fit time would prevail), -e.g.: - -```python -model = qp.method.aggregative.ACC(svm) -# perform 5-fold cross validation for estimating ACC's parameters -# (overrides the default val_split=0.4 in the constructor) -model.fit(training, val_split=5) -``` +specific held-out validation set (i.e., a tuple `(X,y)`). The following code illustrates the case in which PCC is used: ```python model = qp.method.aggregative.PCC(svm) -model.fit(training) -estim_prevalence = model.quantify(test.instances) +model.fit(Xtr, ytr) +estim_prevalence = model.predict(Xte) print('classifier:', model.classifier) ``` In this case, QuaPy will print: @@ -185,11 +177,11 @@ is not a probabilistic classifier (i.e., it does not implement the `predict_proba` method) and so, the classifier will be converted to a probabilistic one through [calibration](https://scikit-learn.org/stable/modules/calibration.html). As a result, the classifier that is printed in the second line points -to a `CalibratedClassifier` instance. Note that calibration can only -be applied to hard classifiers when `fit_learner=True`; an exception +to a `CalibratedClassifierCV` instance. Note that calibration can only +be applied to hard classifiers if `fit_classifier=True`; an exception will be raised otherwise. -Lastly, everything we said aboud ACC and PCC +Lastly, everything we said about ACC and PCC applies to PACC as well. _New in v0.1.9_: quantifiers ACC and PACC now have three additional arguments: `method`, `solver` and `norm`: @@ -259,22 +251,28 @@ An example of use can be found below: import quapy as qp from sklearn.linear_model import LogisticRegression -dataset = qp.datasets.fetch_twitter('hcr', pickle=True) +train, test = qp.datasets.fetch_twitter('hcr', pickle=True).train_test model = qp.method.aggregative.EMQ(LogisticRegression()) -model.fit(dataset.training) -estim_prevalence = model.quantify(dataset.test.instances) +model.fit(*train.Xy) +estim_prevalence = model.predict(test.X) ``` -_New in v0.1.7_: EMQ now accepts two new parameters in the construction method, namely -`exact_train_prev` which allows to use the true training prevalence as the departing -prevalence estimation (default behaviour), or instead an approximation of it as +EMQ accepts additional parameters in the construction method: +* `exact_train_prev`: set to True for using the true training prevalence as the departing +prevalence estimation (default behaviour), or to False for using an approximation of it as suggested by [Alexandari et al. (2020)](http://proceedings.mlr.press/v119/alexandari20a.html) -(by setting `exact_train_prev=False`). -The other parameter is `recalib` which allows to indicate a calibration method, among those +* `calib`: allows one to indicate a calibration method, among those proposed by [Alexandari et al. (2020)](http://proceedings.mlr.press/v119/alexandari20a.html), -including the Bias-Corrected Temperature Scaling, Vector Scaling, etc. -See the API documentation for further details. 
+including Bias-Corrected Temperature Scaling +(`bcts`), Vector Scaling (`vs`), No-Bias Vector Scaling (`nbvs`), +or Temperature Scaling (`ts`); default is `None` (no calibration). +* `on_calib_error`: indicates the policy to follow in case the calibrator fails at runtime. + Options include `raise` (default), in which case a RuntimeError is raised; and `backup`, in which + case the calibrator is silently skipped. + +You can use the class method `EMQ_BCTS` to effortlessly instantiate EMQ with the best-performing +heuristics found by [Alexandari et al. (2020)](http://proceedings.mlr.press/v119/alexandari20a.html). See the API documentation for further details. ### Hellinger Distance y (HDy) @@ -289,16 +287,16 @@ This method works with a probabilistic classifier (hard classifiers can be used as well and will be calibrated) and requires a validation set to estimate parameter for the mixture model. Just like ACC and PACC, this quantifier receives a `val_split` argument -in the constructor (or in the fit method, in which case the previous -value is overridden) that can either be a float indicating the proportion +in the constructor that can either be a float indicating the proportion of training data to be taken as the validation set (in a random -stratified split), or a validation set (i.e., an instance of -`LabelledCollection`) itself. +stratified split), or the validation set itself (i.e., a tuple +`(X,y)`). HDy was proposed as a binary classifier and the implementation provided in QuaPy accepts only binary datasets. -The following code shows an example of use: +The following code shows an example of use: + ```python import quapy as qp from sklearn.linear_model import LogisticRegression @@ -308,11 +306,11 @@ dataset = qp.datasets.fetch_reviews('hp', pickle=True) qp.data.preprocessing.text2tfidf(dataset, min_df=5, inplace=True) model = qp.method.aggregative.HDy(LogisticRegression()) -model.fit(dataset.training) -estim_prevalence = model.quantify(dataset.test.instances) +model.fit(*dataset.training.Xy) +estim_prevalence = model.predict(dataset.test.X) ``` -_New in v0.1.7:_ QuaPy now provides an implementation of the generalized +QuaPy also provides an implementation of the generalized "Distribution Matching" approaches for multiclass, inspired by the framework of [Firat (2016)](https://arxiv.org/abs/1606.00868). One can instantiate a variant of HDy for multiclass quantification as follows: @@ -321,17 +319,22 @@ a variant of HDy for multiclass quantification as follows: mutliclassHDy = qp.method.aggregative.DMy(classifier=LogisticRegression(), divergence='HD', cdf=False) ``` -_New in v0.1.7:_ QuaPy now provides an implementation of the "DyS" +QuaPy also provides an implementation of the "DyS" framework proposed by [Maletzke et al (2020)](https://ojs.aaai.org/index.php/AAAI/article/view/4376) and the "SMM" method proposed by [Hassan et al (2019)](https://ieeexplore.ieee.org/document/9260028) (thanks to _Pablo González_ for the contributions!) ### Threshold Optimization methods -_New in v0.1.7:_ QuaPy now implements Forman's threshold optimization methods; +QuaPy implements Forman's threshold optimization methods; see, e.g., [(Forman 2006)](https://dl.acm.org/doi/abs/10.1145/1150402.1150423) and [(Forman 2008)](https://link.springer.com/article/10.1007/s10618-008-0097-y). -These include: T50, MAX, X, Median Sweep (MS), and its variant MS2. +These include: `T50`, `MAX`, `X`, Median Sweep (`MS`), and its variant `MS2`. 
+ +These methods are binary-only and implement different heuristics for +improving the stability of the denominator (`tpr-fpr`) of the ACC adjustment. +They are called "threshold" methods because these heuristics amount to +different choices of the underlying classifier's decision threshold. ### Explicit Loss Minimization @@ -411,19 +414,21 @@ qp.environ['SVMPERF_HOME'] = '../svm_perf_quantification' model = newOneVsAll(SVMQ(), n_jobs=-1) # run them on parallel model.fit(dataset.training) -estim_prevalence = model.quantify(dataset.test.instances) +estim_prevalence = model.predict(dataset.test.instances) ``` -Check the examples on [explicit_loss_minimization](https://github.com/HLT-ISTI/QuaPy/blob/devel/examples/5.explicit_loss_minimization.py) +Check the examples on [explicit loss minimization](https://github.com/HLT-ISTI/QuaPy/blob/devel/examples/17.explicit_loss_minimization.py) and on [one versus all quantification](https://github.com/HLT-ISTI/QuaPy/blob/devel/examples/10.one_vs_all.py) for more details. +**Note**, however, that the _one versus all_ approach is considered inappropriate under prior probability shift. ### Kernel Density Estimation methods (KDEy) -_New in v0.1.8_: QuaPy now provides implementations for the three variants +QuaPy provides implementations for the three variants of KDE-based methods proposed in -_[Moreo, A., González, P. and del Coz, J.J., 2023. +_[Moreo, A., González, P. and del Coz, J.J. Kernel Density Estimation for Multiclass Quantification. -arXiv preprint arXiv:2401.00490](https://arxiv.org/abs/2401.00490)_. +Machine Learning, Vol. 114 (92), 2025](https://link.springer.com/article/10.1007/s10994-024-06726-5)_ +(a [preprint](https://arxiv.org/abs/2401.00490) is available online). The variants differ in the divergence metric to be minimized: - KDEy-HD: minimizes the (squared) Hellinger Distance and solves the problem via a Monte Carlo approach @@ -434,30 +439,42 @@ These methods are specifically devised for multiclass problems (although they ca binary problems too). All KDE-based methods depend on the hyperparameter `bandwidth` of the kernel. Typical values -that can be explored in model selection range in [0.01, 0.25]. The methods' performance -vary smoothing with smooth variations of this hyperparameter. +that can be explored in model selection range in [0.01, 0.25]. Previous experiments reveal that the methods' performance +varies smoothly under small variations of this hyperparameter. ## Composable Methods -The [](quapy.method.composable) module allows the composition of quantification methods from loss functions and feature transformations. Any composed method solves a linear system of equations by minimizing the loss after transforming the data. Methods of this kind include ACC, PACC, HDx, HDy, and many other well-known methods, as well as an unlimited number of re-combinations of their building blocks. +The `quapy.method.composable` module integrates [qunfold](https://github.com/mirkobunse/qunfold), which allows the composition +of quantification methods from loss functions and feature transformations (thanks to Mirko Bunse for the integration!). + +Any composed method solves a linear system of equations by minimizing the loss after transforming the data. Methods of this kind include ACC, PACC, HDx, HDy, and many other well-known methods, as well as an unlimited number of re-combinations of their building blocks. 
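In symbols (the notation here is ours, introduced only for illustration): if $q$ denotes the average feature transformation of the test sample and $M$ the matrix whose columns are the class-wise average transformations estimated on training data, a composed method computes

$$\hat{p} = \operatorname*{arg\,min}_{p\,\in\,\Delta^{n-1}} \; \mathcal{L}(q, M p),$$

where $\mathcal{L}$ is the chosen loss (possibly regularized) and $\Delta^{n-1}$ is the probability simplex over the $n$ classes.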
### Installation ```sh pip install --upgrade pip setuptools wheel pip install "jax[cpu]" -pip install "qunfold @ git+https://github.com/mirkobunse/qunfold@v0.1.4" +pip install "qunfold @ git+https://github.com/mirkobunse/qunfold@v0.1.5" ``` +**Note:** since version 0.2.0, QuaPy is only compatible with qunfold >=0.1.5. ### Basics The composition of a method is implemented through the [](quapy.method.composable.ComposableQuantifier) class. Its documentation also features an example to get you started in composing your own methods. ```python +from quapy.method.composable import ( + ComposableQuantifier, + TikhonovRegularized, + LeastSquaresLoss, + ClassRepresentation, +) +from sklearn.ensemble import RandomForestClassifier + ComposableQuantifier( # ordinal ACC, as proposed by Bunse et al., 2022 - TikhonovRegularized(LeastSquaresLoss(), 0.01), - ClassTransformer(RandomForestClassifier(oob_score=True)) + TikhonovRegularized(LeastSquaresLoss(), 0.01), + ClassRepresentation(RandomForestClassifier(oob_score=True)) ) ``` @@ -484,16 +501,16 @@ You can use the [](quapy.method.composable.CombinedLoss) to create arbitrary, we ### Feature transformations -- [](quapy.method.composable.ClassTransformer) -- [](quapy.method.composable.DistanceTransformer) -- [](quapy.method.composable.HistogramTransformer) -- [](quapy.method.composable.EnergyKernelTransformer) -- [](quapy.method.composable.GaussianKernelTransformer) -- [](quapy.method.composable.LaplacianKernelTransformer) -- [](quapy.method.composable.GaussianRFFKernelTransformer) +- [](quapy.method.composable.ClassRepresentation) +- [](quapy.method.composable.DistanceRepresentation) +- [](quapy.method.composable.HistogramRepresentation) +- [](quapy.method.composable.EnergyKernelRepresentation) +- [](quapy.method.composable.GaussianKernelRepresentation) +- [](quapy.method.composable.LaplacianKernelRepresentation) +- [](quapy.method.composable.GaussianRFFKernelRepresentation) ```{hint} -The [](quapy.method.composable.ClassTransformer) requires the classifier to have a property `oob_score==True` and to produce a property `oob_decision_function` during fitting. In [scikit-learn](https://scikit-learn.org/), this requirement is fulfilled by any bagging classifier, such as random forests. Any other classifier needs to be cross-validated through the [](quapy.method.composable.CVClassifier). +The [](quapy.method.composable.ClassRepresentation) requires the classifier to have a property `oob_score==True` and to produce a property `oob_decision_function` during fitting. In [scikit-learn](https://scikit-learn.org/), this requirement is fulfilled by any bagging classifier, such as random forests. Any other classifier needs to be cross-validated through the [](quapy.method.composable.CVClassifier). 
``` @@ -528,10 +545,11 @@ from quapy.method.meta import Ensemble from sklearn.linear_model import LogisticRegression dataset = qp.datasets.fetch_UCIBinaryDataset('haberman') +train, test = dataset.train_test model = Ensemble(quantifier=ACC(LogisticRegression()), size=30, policy='ave', n_jobs=-1) -model.fit(dataset.training) -estim_prevalence = model.quantify(dataset.test.instances) +model.fit(*train.Xy) +estim_prevalence = model.predict(test.X) ``` Other aggregation policies implemented in QuaPy include: @@ -578,13 +596,13 @@ learner = NeuralClassifierTrainer(cnn, device='cuda') # train QuaNet model = QuaNet(learner, device='cuda') -model.fit(dataset.training) -estim_prevalence = model.quantify(dataset.test.instances) +model.fit(*dataset.training.Xy) +estim_prevalence = model.predict(dataset.test.X) ``` ## Confidence Regions for Class Prevalence Estimation -_(New in v0.1.10!)_ Some quantification methods go beyond providing a single point estimate of class prevalence values and also produce confidence regions, which characterize the uncertainty around the point estimate. In QuaPy, two such methods are currently implemented: +_(New in v0.2.0!)_ Some quantification methods go beyond providing a single point estimate of class prevalence values and also produce confidence regions, which characterize the uncertainty around the point estimate. In QuaPy, two such methods are currently implemented: * Aggregative Bootstrap: The Aggregative Bootstrap method extends any aggregative quantifier by generating confidence regions for class prevalence estimates through bootstrapping. Key features of this method include: @@ -592,9 +610,9 @@ _(New in v0.1.10!)_ Some quantification methods go beyond providing a single poi During training, bootstrap repetitions are performed only after training the classifier once. These repetitions are used to train multiple aggregation functions. During inference, bootstrap is applied over pre-classified test instances. * General Applicability: Aggregative Bootstrap can be applied to any aggregative quantifier. - For further information, check the [example](https://github.com/HLT-ISTI/QuaPy/tree/master/examples) provided. + For further information, check the [example](https://github.com/HLT-ISTI/QuaPy/tree/master/examples/16.confidence_regions.py) provided. -* BayesianCC: is a Bayesian variant of the Adjusted Classify & Count (ACC) quantifier (see more details in [Aggregative Quantifiers](#bayesiancc)). +* BayesianCC: a Bayesian variant of the Adjusted Classify & Count (ACC) quantifier; see more details in the [example](https://github.com/HLT-ISTI/QuaPy/tree/master/examples/14.bayesian_quantification.py) provided. Confidence regions are constructed around a point estimate, which is typically computed as the mean value of a set of samples. 
The confidence region can be instantiated in three ways: diff --git a/docs/source/manuals/model-selection.md b/docs/source/manuals/model-selection.md index 097f902..6470ebf 100644 --- a/docs/source/manuals/model-selection.md +++ b/docs/source/manuals/model-selection.md @@ -87,7 +87,7 @@ model = qp.model_selection.GridSearchQ( error='mae', # the error to optimize is the MAE (a quantification-oriented loss) refit=True, # retrain on the whole labelled set once done verbose=True # show information as the process goes on -).fit(training) +).fit(*training.Xy) print(f'model selection ended: best hyper-parameters={model.best_params_}') model = model.best_model_ @@ -133,7 +133,7 @@ learner = GridSearchCV( LogisticRegression(), param_grid={'C': np.logspace(-4, 5, 10), 'class_weight': ['balanced', None]}, cv=5) -model = DistributionMatching(learner).fit(dataset.train) +model = DistributionMatching(learner).fit(*dataset.train.Xy) ``` However, this is conceptually flawed, since the model should be diff --git a/docs/source/manuals/plotting.md b/docs/source/manuals/plotting.md index ec080da..67f9f16 100644 --- a/docs/source/manuals/plotting.md +++ b/docs/source/manuals/plotting.md @@ -2,6 +2,9 @@ The module _qp.plot_ implements some basic plotting functions that can help analyse the performance of a quantification method. +See the provided +[code example](https://github.com/HLT-ISTI/QuaPy/blob/master/examples/13.plotting.py) +for a complete illustration. All plotting functions receive as inputs the outcomes of some experiments and include, for each experiment, @@ -77,7 +80,7 @@ def gen_data(): method_names, true_prevs, estim_prevs, tr_prevs = [], [], [], [] for method_name, model in models(): - model.fit(train) + model.fit(*train.Xy) true_prev, estim_prev = qp.evaluation.prediction(model, APP(test, repeats=100, random_state=0)) method_names.append(method_name) @@ -171,7 +174,7 @@ def gen_data(): training_size = 5000 # since the problem is binary, it suffices to specify the negative prevalence, since the positive is constrained train_sample = train.sampling(training_size, 1-training_prevalence) - model.fit(train_sample) + model.fit(*train_sample.Xy) true_prev, estim_prev = qp.evaluation.prediction(model, APP(test, repeats=100, random_state=0)) method_name = 'CC$_{'+f'{int(100*training_prevalence)}' + '\%}$' method_data.append((method_name, true_prev, estim_prev, train_sample.prevalence())) diff --git a/docs/source/manuals/protocols.md b/docs/source/manuals/protocols.md index 1d6193e..17bc41a 100644 --- a/docs/source/manuals/protocols.md +++ b/docs/source/manuals/protocols.md @@ -1,7 +1,5 @@ # Protocols -_New in v0.1.7!_ - Quantification methods are expected to behave robustly in the presence of shift. For this reason, quantification methods need to be confronted with samples exhibiting widely varying amounts of shift. 
@@ -106,15 +104,16 @@ train, test = qp.datasets.fetch_reviews('imdb', tfidf=True, min_df=5).train_test # model selection train, val = train.split_stratified(train_prop=0.75) +Xtr, ytr = train.Xy quantifier = qp.model_selection.GridSearchQ( quantifier, param_grid={'classifier__C': np.logspace(-2, 2, 5)}, protocol=APP(val) # <- this is the protocol we use for generating validation samples -).fit(train) +).fit(Xtr, ytr) # default values are n_prevalences=21, repeats=10, random_state=0; this is equialent to: # val_app = APP(val, n_prevalences=21, repeats=10, random_state=0) -# quantifier = GridSearchQ(quantifier, param_grid, protocol=val_app).fit(train) +# quantifier = GridSearchQ(quantifier, param_grid, protocol=val_app).fit(Xtr, ytr) # evaluation with APP mae = qp.evaluation.evaluate(quantifier, protocol=APP(test), error_metric='mae') diff --git a/examples/0.basics.py b/examples/0.basics.py index aee7b5d..a891475 100644 --- a/examples/0.basics.py +++ b/examples/0.basics.py @@ -6,6 +6,7 @@ import numpy as np from sklearn.linear_model import LogisticRegression import quapy as qp +from quapy.method.aggregative import PACC # let's fetch some dataset to run one experiment # datasets are available in the "qp.data.datasets" module (there is a shortcut in qp.datasets) @@ -34,14 +35,14 @@ print(f'training prevalence = {F.strprev(train.prevalence())}') # let us train one quantifier, for example, PACC using a sklearn's Logistic Regressor as the underlying classifier classifier = LogisticRegression() -pacc = qp.method.aggregative.PACC(classifier) +pacc = PACC(classifier) print(f'training {pacc}') -pacc.fit(train) +pacc.fit(X, y) # let's now test our quantifier on the test data (of course, we should not use the test labels y at this point, only X) X_test = test.X -estim_prevalence = pacc.quantify(X_test) +estim_prevalence = pacc.predict(X_test) print(f'estimated test prevalence = {F.strprev(estim_prevalence)}') print(f'true test prevalence = {F.strprev(test.prevalence())}') diff --git a/examples/1.model_selection.py b/examples/1.model_selection.py index 61b7087..6c96671 100644 --- a/examples/1.model_selection.py +++ b/examples/1.model_selection.py @@ -12,15 +12,24 @@ In this example, we show how to perform model selection on a DistributionMatchin model = DMy() qp.environ['SAMPLE_SIZE'] = 100 +qp.environ['N_JOBS'] = -1 print(f'running model selection with N_JOBS={qp.environ["N_JOBS"]}; ' - f'to increase the number of jobs use:\n> N_JOBS=-1 python3 1.model_selection.py\n' + f'to increase/decrease the number of jobs use:\n' + f'> N_JOBS=-1 python3 1.model_selection.py\n' f'alternatively, you can set this variable within the script as:\n' f'import quapy as qp\n' f'qp.environ["N_JOBS"]=-1') training, test = qp.datasets.fetch_UCIMulticlassDataset('letter').train_test +# evaluation in terms of MAE with default hyperparameters +Xtr, ytr = training.Xy +model.fit(Xtr, ytr) +mae_score = qp.evaluation.evaluate(model, protocol=UPP(test), error_metric='mae') +print(f'MAE (non optimized)={mae_score:.5f}') + + with qp.util.temp_seed(0): # The model will be returned by the fit method of GridSearchQ. 
@@ -50,6 +59,7 @@ with qp.util.temp_seed(0): tinit = time() + Xtr, ytr = training.Xy model = qp.model_selection.GridSearchQ( model=model, param_grid=param_grid, @@ -58,7 +68,7 @@ with qp.util.temp_seed(0): refit=False, # retrain on the whole labelled set once done # raise_errors=False, verbose=True # show information as the process goes on - ).fit(training) + ).fit(Xtr, ytr) tend = time() diff --git a/examples/10.one_vs_all.py b/examples/10.one_vs_all.py index 3f5c4ac..ca70662 100644 --- a/examples/10.one_vs_all.py +++ b/examples/10.one_vs_all.py @@ -9,6 +9,11 @@ import numpy as np """ In this example, we will create a quantifier for tweet sentiment analysis considering three classes: negative, neutral, and positive. We will use a one-vs-all approach using a binary quantifier for demonstration purposes. + +Caveat: the one-vs-all approach is deemed inadequate under prior probability shift conditions. The reasons +are discussed in: +Donyavi, Z., Serapio, A., & Batista, G. (2023). MC-SQ: A highly accurate ensemble for multi-class quantification. +In: Proceedings of the 2023 SIAM International Conference on Data Mining (SDM), SIAM, pp. 622–630 """ qp.environ['SAMPLE_SIZE'] = 100 @@ -40,11 +45,11 @@ param_grid = { } print('starting model selection') model_selection = GridSearchQ(quantifier, param_grid, protocol=UPP(val), verbose=True, refit=False) -quantifier = model_selection.fit(train_modsel).best_model() +quantifier = model_selection.fit(*train_modsel.Xy).best_model() print('training on the whole training set') train, test = qp.datasets.fetch_twitter('hcr', for_model_selection=False, pickle=True).train_test -quantifier.fit(train) +quantifier.fit(*train.Xy) # evaluation mae = qp.evaluation.evaluate(quantifier, protocol=UPP(test), error_metric='mae') diff --git a/examples/11.comparing_HDy_HDx.py b/examples/11.comparing_HDy_HDx.py index 7d96b6a..a95b780 100644 --- a/examples/11.comparing_HDy_HDx.py +++ b/examples/11.comparing_HDy_HDx.py @@ -23,8 +23,9 @@ qp.environ['SAMPLE_SIZE']=100 df = pd.DataFrame(columns=['method', 'dataset', 'MAE', 'MRAE', 'tr-time', 'te-time']) +datasets = qp.datasets.UCI_BINARY_DATASETS -for dataset_name in tqdm(qp.datasets.UCI_BINARY_DATASETS, total=len(qp.datasets.UCI_BINARY_DATASETS)): +for dataset_name in tqdm(datasets, total=len(datasets), desc='datasets processed'): if dataset_name in ['acute.a', 'acute.b', 'balance.2', 'iris.1']: # these datasets tend to produce either too good or too bad results... continue @@ -32,23 +33,25 @@ for dataset_name in tqdm(qp.datasets.UCI_BINARY_DATASETS, total=len(qp.datasets. collection = qp.datasets.fetch_UCIBinaryLabelledCollection(dataset_name, verbose=False) train, test = collection.split_stratified() + Xtr, ytr = train.Xy + # HDy............................................ tinit = time() - hdy = HDy(LogisticRegression()).fit(train) + hdy = HDy(LogisticRegression()).fit(Xtr, ytr) t_hdy_train = time()-tinit tinit = time() - hdy_report = qp.evaluation.evaluation_report(hdy, APP(test), error_metrics=['mae', 'mrae']).mean() + hdy_report = qp.evaluation.evaluation_report(hdy, APP(test), error_metrics=['mae', 'mrae']).mean(numeric_only=True) t_hdy_test = time() - tinit df.loc[len(df)] = ['HDy', dataset_name, hdy_report['mae'], hdy_report['mrae'], t_hdy_train, t_hdy_test] # HDx............................................ 
tinit = time() - hdx = DMx.HDx(n_jobs=-1).fit(train) + hdx = DMx.HDx(n_jobs=-1).fit(Xtr, ytr) t_hdx_train = time() - tinit tinit = time() - hdx_report = qp.evaluation.evaluation_report(hdx, APP(test), error_metrics=['mae', 'mrae']).mean() + hdx_report = qp.evaluation.evaluation_report(hdx, APP(test), error_metrics=['mae', 'mrae']).mean(numeric_only=True) t_hdx_test = time() - tinit df.loc[len(df)] = ['HDx', dataset_name, hdx_report['mae'], hdx_report['mrae'], t_hdx_train, t_hdx_test] diff --git a/examples/12.custom_protocol.py b/examples/12.custom_protocol.py index 7824b3f..774a0ed 100644 --- a/examples/12.custom_protocol.py +++ b/examples/12.custom_protocol.py @@ -3,14 +3,13 @@ from sklearn.linear_model import LogisticRegression import quapy as qp from quapy.method.aggregative import PACC -from quapy.data import LabelledCollection from quapy.protocol import AbstractStochasticSeededProtocol import quapy.functional as F """ In this example, we create a custom protocol. -The protocol generates samples of a Gaussian mixture model with random mixture parameter (the sample prevalence). -Datapoints are univariate and we consider 2 classes only. +The protocol generates synthetic samples of a Gaussian mixture model with random mixture parameter +(the sample prevalence). Datapoints are univariate and we consider 2 classes only for simplicity. """ class GaussianMixProtocol(AbstractStochasticSeededProtocol): # We need to extend AbstractStochasticSeededProtocol if we want the samples to be replicable @@ -81,10 +80,9 @@ with qp.util.temp_seed(0): Xpos = np.random.normal(loc=mu_2, scale=std_2, size=100) X = np.concatenate([Xneg, Xpos]).reshape(-1,1) y = [0]*100 + [1]*100 - training = LabelledCollection(X, y) pacc = PACC(LogisticRegression()) - pacc.fit(training) + pacc.fit(X, y) mae = qp.evaluation.evaluate(pacc, protocol=gm, error_metric='mae', verbose=True) diff --git a/examples/13.plotting.py b/examples/13.plotting.py new file mode 100644 index 0000000..77230c8 --- /dev/null +++ b/examples/13.plotting.py @@ -0,0 +1,73 @@ +import quapy as qp +import numpy as np + +from quapy.protocol import APP +from quapy.method.aggregative import CC, ACC, PCC, PACC +from sklearn.svm import LinearSVC + +qp.environ['SAMPLE_SIZE'] = 500 + + +''' +In this example, we show how to create some plots for the analysis of experimental results. 
+ +The main functions are included in qp.plot but, before using them, we will generate some basic experimental data +''' + +def gen_data(): + # this function generates some experimental data to plot + + def base_classifier(): + return LinearSVC(class_weight='balanced') + + def datasets(): + # the plots can handle experiments in different datasets + yield qp.datasets.fetch_reviews('kindle', tfidf=True, min_df=5).train_test + # by uncommenting this line, the experiments will be carried out in more than one dataset + # yield qp.datasets.fetch_reviews('hp', tfidf=True, min_df=5).train_test + + def models(): + yield 'CC', CC(base_classifier()) + yield 'ACC', ACC(base_classifier()) + yield 'PCC', PCC(base_classifier()) + yield 'PACC', PACC(base_classifier()) + + # these are the main parameters we need to fill for generating the plots; + # note that each of these lists must have the same number of elements, since the ith entry of each list regards + # an independent experiment + method_names, true_prevs, estim_prevs, tr_prevs = [], [], [], [] + + for train, test in datasets(): + for method_name, model in models(): + model.fit(*train.Xy) + true_prev, estim_prev = qp.evaluation.prediction(model, APP(test, repeats=100, random_state=0)) + + # gather all the data for this experiment + method_names.append(method_name) + true_prevs.append(true_prev) + estim_prevs.append(estim_prev) + tr_prevs.append(train.prevalence()) + + return method_names, true_prevs, estim_prevs, tr_prevs + +# generate some experimental data +method_names, true_prevs, estim_prevs, tr_prevs = gen_data() +# if you want to play around with the different plots and parameters, you might prefer to generate the data only once, +# so you may prefer to replace the above line of code with this one, which pickles the experimental results for faster reuse +# method_names, true_prevs, estim_prevs, tr_prevs = qp.util.pickled_resource('./plots/data.pickle', gen_data) + +# if there is only one training prevalence, we can display it +only_train_prev = tr_prevs[0] if len(np.unique(tr_prevs, axis=0))==1 else None + +# diagonal plot (useful for analyzing the performance of quantifiers on binary data) +qp.plot.binary_diagonal(method_names, true_prevs, estim_prevs, + train_prev=only_train_prev, savepath='./plots/bin_diag.png') + +# bias plot (box plots displaying the bias of each method) +qp.plot.binary_bias_global(method_names, true_prevs, estim_prevs, savepath='./plots/bin_bias.png') + +# the error-by-drift plot shows the quantification error as a function of the amount of prior probability shift, and +# is preferable to diagonal plots for multiclass datasets +qp.plot.error_by_drift(method_names, true_prevs, estim_prevs, tr_prevs, + error_name='ae', n_bins=10, savepath='./plots/err_drift.png') + +# each function returns (fig, ax) objects from matplotlib; use them to customize the plots to your liking diff --git a/examples/13.bayesian_quantification.py b/examples/14.bayesian_quantification.py similarity index 97% rename from examples/13.bayesian_quantification.py rename to examples/14.bayesian_quantification.py index 69b9932..21a1be1 100644 --- a/examples/13.bayesian_quantification.py +++ b/examples/14.bayesian_quantification.py @@ -13,7 +13,7 @@ $ pip install quapy[bayesian] Running the script via: ``` -$ python examples/13.bayesian_quantification.py +$ python examples/14.bayesian_quantification.py ``` will produce a plot `bayesian_quantification.pdf`. 
@@ -122,18 +122,18 @@ def get_random_forest() -> RandomForestClassifier:
 def _get_estimate(estimator_class, training: LabelledCollection, test: np.ndarray) -> None:
     """Auxiliary method for running ACC and PACC."""
     estimator = estimator_class(get_random_forest())
-    estimator.fit(training)
-    return estimator.quantify(test)
+    estimator.fit(*training.Xy)
+    return estimator.predict(test)
 
 
 def train_and_plot_bayesian_quantification(ax: plt.Axes, training: LabelledCollection, test: LabelledCollection) -> None:
     """Fits Bayesian quantification and plots posterior mean as well as individual samples"""
     print('training model Bayesian CC...', end='')
     quantifier = BayesianCC(classifier=get_random_forest())
-    quantifier.fit(training)
+    quantifier.fit(*training.Xy)
 
     # Obtain mean prediction
-    mean_prediction = quantifier.quantify(test.X)
+    mean_prediction = quantifier.predict(test.X)
     mae = qp.error.mae(test.prevalence(), mean_prediction)
     x_ax = np.arange(training.n_classes)
     ax.plot(x_ax, mean_prediction, c="salmon", linewidth=2, linestyle=":", label="Bayesian")
diff --git a/examples/14.composable_methods.py b/examples/15.composable_methods.py
similarity index 87%
rename from examples/14.composable_methods.py
rename to examples/15.composable_methods.py
index 5ffcb94..e8340d4 100644
--- a/examples/14.composable_methods.py
+++ b/examples/15.composable_methods.py
@@ -1,6 +1,6 @@
 """
 This example illustrates the composition of quantification methods from
-arbitrary loss functions and feature transformations. It will extend the basic
+arbitrary loss functions and feature representations. It will extend the basic
 example on the usage of quapy with this composition.
 
 This example requires the installation of qunfold, the back-end of QuaPy's
@@ -8,7 +8,7 @@ composition module:
 
     pip install --upgrade pip setuptools wheel
     pip install "jax[cpu]"
-    pip install "qunfold @ git+https://github.com/mirkobunse/qunfold@v0.1.4"
+    pip install "qunfold @ git+https://github.com/mirkobunse/qunfold@v0.1.6"
 """
 
 import numpy as np
@@ -24,20 +24,20 @@ data = qp.data.preprocessing.text2tfidf(
 training, testing = data.train_test
 
 # We start by recovering PACC from its building blocks, a LeastSquaresLoss and
-# a probabilistic ClassTransformer. A 5-fold cross-validation is implemented
+# a probabilistic ClassRepresentation. A 5-fold cross-validation is implemented
 # through a CVClassifier.
 from quapy.method.composable import (
     ComposableQuantifier,
     LeastSquaresLoss,
-    ClassTransformer,
+    ClassRepresentation,
     CVClassifier,
 )
 from sklearn.linear_model import LogisticRegression
 
 pacc = ComposableQuantifier(
     LeastSquaresLoss(),
-    ClassTransformer(
+    ClassRepresentation(
         CVClassifier(LogisticRegression(random_state=0), 5),
         is_probabilistic = True
     ),
@@ -63,7 +63,7 @@ from quapy.method.composable import HellingerSurrogateLoss
 
 model = ComposableQuantifier(
     HellingerSurrogateLoss(), # the loss is different from before
-    ClassTransformer( # we use the same transformer
+    ClassRepresentation( # we use the same representation
         CVClassifier(LogisticRegression(random_state=0), 5),
         is_probabilistic = True
     ),
@@ -79,7 +79,7 @@ absolute_errors = qp.evaluation.evaluate(
 print(f"MAE = {np.mean(absolute_errors):.4f}+-{np.std(absolute_errors):.4f}")
 
 # In general, any composed method solves a linear system of equations by
-# minimizing the loss after transforming the data. Methods of this kind include
+# minimizing the loss after representing the data. Methods of this kind include
 # ACC, PACC, HDx, HDy, and many other well-known methods, as well as an
 # unlimited number of re-combinations of their building blocks.
@@ -93,18 +93,18 @@ from quapy.method.composable import CombinedLoss
 
 model = ComposableQuantifier(
     CombinedLoss(HellingerSurrogateLoss(), LeastSquaresLoss()),
-    ClassTransformer(
+    ClassRepresentation(
         CVClassifier(LogisticRegression(random_state=0), 5),
         is_probabilistic = True
     ),
 )
 
-from qunfold.quapy import QuaPyWrapper
-from qunfold import GenericMethod
+from quapy.method.composable import QUnfoldWrapper
+from qunfold import LinearMethod
 
-model = QuaPyWrapper(GenericMethod(
+model = QUnfoldWrapper(LinearMethod(
     CombinedLoss(HellingerSurrogateLoss(), LeastSquaresLoss()),
-    ClassTransformer(
+    ClassRepresentation(
         CVClassifier(LogisticRegression(random_state=0), 5),
         is_probabilistic = True
     ),
@@ -115,7 +115,7 @@ model = QuaPyWrapper(GenericMethod(
 
 param_grid = {
     "loss__weights": [ (w, 1-w) for w in [.1, .5, .9] ],
-    "transformer__classifier__estimator__C": [1e-1, 1e1],
+    "representation__classifier__estimator__C": [1e-1, 1e1],
 }
 
 grid_search = qp.model_selection.GridSearchQ(
diff --git a/examples/15.confidence_regions.py b/examples/16.confidence_regions.py
similarity index 95%
rename from examples/15.confidence_regions.py
rename to examples/16.confidence_regions.py
index b437592..27fbfbd 100644
--- a/examples/15.confidence_regions.py
+++ b/examples/16.confidence_regions.py
@@ -20,6 +20,7 @@ Let's see one example:
 # load some data
 data = qp.datasets.fetch_UCIMulticlassDataset('molecular')
 train, test = data.train_test
+Xtr, ytr = train.Xy
 
 # by simply wrapping an aggregative quantifier within the AggregativeBootstrap class, we can obtain confidence
 # intervals around the point estimate, in this case, at a 95% confidence level
@@ -27,7 +28,7 @@ pacc = AggregativeBootstrap(PACC(), n_test_samples=500, confidence_level=0.95)
 
 with qp.util.temp_seed(0):
 
     # we train the quantifier the usual way
-    pacc.fit(train)
+    pacc.fit(Xtr, ytr)
 
     # let us simulate some shift in the test data
     random_prevalence = F.uniform_prevalence_sampling(n_classes=test.n_classes)
@@ -51,7 +52,7 @@ with qp.util.temp_seed(0):
     print(f'point-estimate: {F.strprev(pred_prev)}')
     print(f'absolute error: {error:.3f}')
     print(f'Is the true value in the confidence region?: {conf_intervals.coverage(true_prev)==1}')
-    print(f'Proportion of simplex covered at {pacc.confidence_level*100:.1f}%: {conf_intervals.simplex_portion()*100:.2f}%')
+    print(f'Proportion of simplex covered at confidence level {pacc.confidence_level*100:.1f}%: {conf_intervals.simplex_portion()*100:.2f}%')
 
 """
 Final remarks:
diff --git a/examples/5.explicit_loss_minimization.py b/examples/17.explicit_loss_minimization.py
similarity index 93%
rename from examples/5.explicit_loss_minimization.py
rename to examples/17.explicit_loss_minimization.py
index f8f210d..b38728d 100644
--- a/examples/5.explicit_loss_minimization.py
+++ b/examples/17.explicit_loss_minimization.py
@@ -50,7 +50,7 @@ train_modsel, val = qp.datasets.fetch_twitter('hcr', for_model_selection=True, p
 model selection:
 We explore the classifier's loss and the classifier's C hyperparameters.
 Since our model is actually an instance of OneVsAllAggregative, we need to add the prefix "binary_quantifier", and
-since our binary quantifier is an instance of CC, we need to add the prefix "classifier".
+since our binary quantifier is an instance of CC (an aggregative quantifier), we need to add the prefix "classifier".
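+Hence, hyperparameters of the underlying classifier are reached via the nested prefix "binary_quantifier__classifier__",
+as in the param_grid below.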
""" param_grid = { 'binary_quantifier__classifier__loss': ['q', 'kld', 'mae'], # classifier-dependent hyperparameter @@ -58,11 +58,11 @@ param_grid = { } print('starting model selection') model_selection = GridSearchQ(quantifier, param_grid, protocol=UPP(val), verbose=True, refit=False) -quantifier = model_selection.fit(train_modsel).best_model() +quantifier = model_selection.fit(*train_modsel.Xy).best_model() print('training on the whole training set') train, test = qp.datasets.fetch_twitter('hcr', for_model_selection=False, pickle=True).train_test -quantifier.fit(train) +quantifier.fit(*train.Xy) # evaluation mae = qp.evaluation.evaluate(quantifier, protocol=UPP(test), error_metric='mae') diff --git a/examples/2.custom_quantifier.py b/examples/2.custom_quantifier.py index 9c89714..ac6f7b5 100644 --- a/examples/2.custom_quantifier.py +++ b/examples/2.custom_quantifier.py @@ -4,6 +4,7 @@ from quapy.method.base import BinaryQuantifier, BaseQuantifier from quapy.model_selection import GridSearchQ from quapy.method.aggregative import AggregativeSoftQuantifier from quapy.protocol import APP +import quapy.functional as F import numpy as np from sklearn.linear_model import LogisticRegression from time import time @@ -30,19 +31,19 @@ class MyQuantifier(BaseQuantifier): self.alpha = alpha self.classifier = classifier - # in general, we would need to implement the method fit(self, data: LabelledCollection, fit_classifier=True, - # val_split=None); this would amount to: - def fit(self, data: LabelledCollection): - assert data.n_classes==2, \ + # in general, we would need to implement the method fit(self, X, y); this would amount to: + def fit(self, X, y): + n_classes = F.num_classes_from_labels(y) + assert n_classes==2, \ 'this quantifier is only valid for binary problems [abort]' - self.classifier.fit(*data.Xy) + self.classifier.fit(X, y) return self # in general, we would need to implement the method quantify(self, instances); this would amount to: - def quantify(self, instances): + def predict(self, X): assert hasattr(self.classifier, 'predict_proba'), \ 'the underlying classifier is not probabilistic! [abort]' - posterior_probabilities = self.classifier.predict_proba(instances) + posterior_probabilities = self.classifier.predict_proba(X) positive_probabilities = posterior_probabilities[:, 1] crisp_decisions = positive_probabilities > self.alpha pos_prev = crisp_decisions.mean() @@ -57,9 +58,11 @@ class MyQuantifier(BaseQuantifier): # of the method, now adhering to the AggregativeSoftQuantifier: class MyAggregativeSoftQuantifier(AggregativeSoftQuantifier, BinaryQuantifier): + def __init__(self, classifier, alpha=0.5): - # aggregative quantifiers have an internal attribute called self.classifier - self.classifier = classifier + # aggregative quantifiers have an internal attribute called self.classifier, but this is defined + # within the super's init + super().__init__(classifier, fit_classifier=True, val_split=None) self.alpha = alpha # since this method is of type aggregative, we can simply implement the method aggregation_fit, which @@ -68,7 +71,7 @@ class MyAggregativeSoftQuantifier(AggregativeSoftQuantifier, BinaryQuantifier): # k-fold cross validation strategy). What remains ahead is to learn an aggregation function. In our case # this amounts to doing... nothing, since our method was pretty basic. BinaryQuantifier also add some # basic functionality for checking binary consistency. 
-    def aggregation_fit(self, classif_predictions: LabelledCollection, data: LabelledCollection):
+    def aggregation_fit(self, classif_predictions, labels):
         pass
 
     # since this method is of type aggregative, we can simply implement the method aggregate (i.e., we should
@@ -94,7 +97,7 @@ if __name__ == '__main__':
     train, test = qp.datasets.fetch_reviews('imdb', tfidf=True, min_df=5).train_test
     train, val = train.split_stratified(train_prop=0.75)  # let's create a validation set for optimizing hyperparams
 
-    def test_implementation(quantifier):
+    def try_implementation(quantifier):
         class_name = quantifier.__class__.__name__
         print(f'\ntesting implementation {class_name}...')
         # model selection
@@ -104,7 +107,7 @@
             'alpha': np.linspace(0, 1, 11),  # quantifier-dependent hyperparameter
             'classifier__C': np.logspace(-2, 2, 5)  # classifier-dependent hyperparameter
         }
-        gridsearch = GridSearchQ(quantifier, param_grid, protocol=APP(val), n_jobs=-1, verbose=False).fit(train)
+        gridsearch = GridSearchQ(quantifier, param_grid, protocol=APP(val), n_jobs=-1, verbose=True).fit(*train.Xy)
         t_modsel = time() - tinit
         print(f'\tmodel selection took {t_modsel:.2f}s', flush=True)
 
@@ -112,7 +115,7 @@
         optimized_model = gridsearch.best_model_
         mae = qp.evaluation.evaluate(
             optimized_model,
-            protocol=APP(test, repeats=5000, sanity_check=None),  # disable the check, we want to generate many tests!
+            protocol=APP(test, repeats=500, sanity_check=None),  # disable the check, we want to generate many tests!
             error_metric='mae',
             verbose=True)
 
@@ -121,11 +124,11 @@
     # define an instance of our custom quantifier and test it!
     quantifier = MyQuantifier(LogisticRegression(), alpha=0.5)
-    test_implementation(quantifier)
+    try_implementation(quantifier)
 
     # define an instance of our custom quantifier, with the second implementation, and test it!
     quantifier = MyAggregativeSoftQuantifier(LogisticRegression(), alpha=0.5)
-    test_implementation(quantifier)
+    try_implementation(quantifier)
 
     # the output should look like this:
     """
@@ -141,7 +144,7 @@
     evaluation took 4.66s [MAE = 0.0630]
     """
     # Note that the first implementation is much slower, both in terms of grid-search optimization and in terms of
-    # evaluation. The reason why is that QuaPy is highly optimized for aggregative quantifiers (by far, the most
+    # evaluation. The reason is that QuaPy is highly optimized for aggregative quantifiers (by far, the most
    # popular type of quantification methods), thus significantly speeding up model selection and test routines.
    # Furthermore, it is simpler to extend an aggregation type since QuaPy implements boilerplate functions for you.
diff --git a/examples/3.custom_collection.py b/examples/3.custom_collection.py
new file mode 100644
index 0000000..13baeef
--- /dev/null
+++ b/examples/3.custom_collection.py
@@ -0,0 +1,103 @@
+import quapy as qp
+from quapy.method.aggregative import PACC
+from quapy.data import LabelledCollection, Dataset
+from quapy.protocol import ArtificialPrevalenceProtocol
+import quapy.functional as F
+import os
+from os.path import join
+
+# While quapy comes with ready-to-use datasets for experimental purposes, you may prefer to run experiments using
+# your own data. Most of QuaPy's functionality relies on an internal class called LabelledCollection, for fast
+# indexing and sampling, and so this example provides guidance on how to convert your datasets into a
+# LabelledCollection so that all the functionality becomes available. This includes procedures for tuning the
+# hyperparameters of your methods, evaluating the performance using high-level sampling protocols, etc.
+
+# Let us assume that we have a binary sentiment dataset of opinions in natural language. We will use the "IMDb"
+# dataset of reviews, which can be downloaded as follows
+URL_TRAIN = 'https://zenodo.org/record/4117827/files/imdb_train.txt'
+URL_TEST = 'https://zenodo.org/record/4117827/files/imdb_test.txt'
+os.makedirs('./reviews', exist_ok=True)
+train_path = join('reviews', 'imdb_train.txt')
+test_path = join('reviews', 'imdb_test.txt')
+qp.util.download_file_if_not_exists(URL_TRAIN, train_path)
+qp.util.download_file_if_not_exists(URL_TEST, test_path)
+
+# these files contain 2 columns separated by a \t:
+# the first one is a binary value (0=negative, 1=positive), and the second one is the text.
+# All we need to do is implement a function returning the instances and the labels, as follows
+def my_data_loader(path):
+    with open(path, 'rt') as fin:
+        labels, texts = zip(*[line.split('\t') for line in fin.readlines()])
+        labels = list(map(int, labels))  # convert string numbers to int
+        return texts, labels
+
+# check that our function is working properly...
+train_texts, train_labels = my_data_loader(train_path)
+for i, (text, label) in enumerate(zip(train_texts, train_labels)):
+    print(f'#{i}: {label=}\t{text=}')
+    if i>=5:
+        print('...')
+        break
+
+# We can now instantiate a LabelledCollection simply as
+train_lc = LabelledCollection(instances=train_texts, labels=train_labels)
+print('my training collection:', train_lc)
+
+# We can also instantiate a LabelledCollection directly from the data loader function,
+# without having to load the data ourselves:
+train_lc = LabelledCollection.load(train_path, loader_func=my_data_loader)
+print('my training collection:', train_lc)
+
+# We can do the same for the test set or, instead, directly instantiate a Dataset object (this is, by and large,
+# simply a tuple with training and test LabelledCollections) as follows:
+my_data = Dataset.load(train_path, test_path, loader_func=my_data_loader)
+print('my dataset:', my_data)
+
+# However, since this is a textual dataset, we must vectorize it prior to training any quantification algorithm.
+# We can do this in several ways in quapy. For example, manually...
+# from sklearn.feature_extraction.text import TfidfVectorizer
+# tfidf = TfidfVectorizer(min_df=5)
+# Xtr = tfidf.fit_transform(my_data.training.instances)
+# Xte = tfidf.transform(my_data.test.instances)
+# ... or using some preprocessing functionality of quapy (recommended):
+my_data_tfidf = qp.data.preprocessing.text2tfidf(my_data, min_df=5)
+
+training, test = my_data_tfidf.train_test
+
+# Once you have loaded your training and test data, you have access to a series of quapy's utilities, e.g.:
+print(f'the training prevalence is {F.strprev(training.prevalence())}')
+print(f'the test prevalence is {F.strprev(test.prevalence())}')
+print('let us generate a small balanced training sample:')
+desired_size = 200
+desired_prevalence = [0.5, 0.5]
+small_training_balanced = training.sampling(desired_size, *desired_prevalence, shuffle=True, random_state=0)
+print(small_training_balanced)
+print(f'or generating train/val splits such as: {training.split_stratified(train_prop=0.7)}')
+
+# training
+print('let us train a simple quantifier...')
+Xtr, ytr = training.Xy
+quantifier = PACC()
+quantifier.fit(Xtr, ytr)  # or: quantifier.fit(*training.Xy)
+
+# test
+print("and use quapy's evaluation functions")
+evaluation_protocol = ArtificialPrevalenceProtocol(
+    data=test,
+    sample_size=200,
+    random_state=0
+)
+
+report = qp.evaluation.evaluation_report(quantifier, protocol=evaluation_protocol, error_metrics=['ae'])
+print(report)
+print(f'mean absolute error across {len(report)} experiments: {report.mean(numeric_only=True)}')
diff --git a/examples/4.using_pretrained_classifier.py b/examples/4.using_pretrained_classifier.py
new file mode 100644
index 0000000..5b5ead5
--- /dev/null
+++ b/examples/4.using_pretrained_classifier.py
@@ -0,0 +1,75 @@
+"""
+Aggregative quantifiers use an underlying classifier. Often, one has a pre-trained classifier available, and
+needs to use this classifier as the basis of a quantification system. In such cases, the classifier should not
+be retrained, but only used to issue classifier predictions for the quantifier.
+In this example, we show how to instantiate a quantifier with a pre-trained classifier.
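+Concretely, we wrap a HuggingFace sentiment-analysis pipeline in a scikit-learn-style classifier, and pass it
+to PACC with fit_classifier=False, so that fit() only learns the aggregation function.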
+"""
+from typing import List, Dict
+
+import quapy as qp
+from quapy.method.aggregative import PACC
+from sklearn.base import BaseEstimator, ClassifierMixin
+from transformers import pipeline
+import numpy as np
+import quapy.functional as F
+
+
+# A scikit-learn-style wrapper for a huggingface-based pre-trained transformer for binary sentiment classification
+class HFTextClassifier(BaseEstimator, ClassifierMixin):
+    def __init__(self, model_name='distilbert-base-uncased-finetuned-sst-2-english'):
+        self.pipe = pipeline("sentiment-analysis", model=model_name)
+        self.classes_ = np.asarray([0,1])
+
+    def fit(self, X, y=None):
+        return self
+
+    def _binary_decisions(self, transformer_output: List[Dict]):
+        return np.array([(1 if p['label']=='POSITIVE' else 0) for p in transformer_output], dtype=int)
+
+    def predict(self, X):
+        X = list(map(str, X))
+        preds = self.pipe(X, truncation=True)
+        return self._binary_decisions(preds)
+
+    def predict_proba(self, X):
+        X = list(map(str, X))
+        n_examples = len(X)
+        preds = self.pipe(X, truncation=True)
+        decisions = self._binary_decisions(preds)
+        scores = np.array([p['score'] for p in preds], dtype=float)
+        probas = np.zeros(shape=(len(X), 2), dtype=float)
+        probas[np.arange(n_examples), decisions] = scores
+        probas[np.arange(n_examples), 1-decisions] = 1-scores  # the complementary class gets the complementary score
+        return probas
+
+# load a sentiment dataset
+dataset = qp.datasets.fetch_reviews('imdb', tfidf=False)  # raw text
+train, test = dataset.training, dataset.test
+
+# instantiate a pre-trained classifier
+clf = HFTextClassifier()
+
+# Let us fit a quantifier based on our pre-trained classifier.
+# Note that, since the classifier is already fit, we will use the entire training set for
+# learning the aggregation function of the quantifier.
+# To do so, we only need to indicate "fit_classifier"=False, as follows:
+quantifier = PACC(clf, fit_classifier=False)  # Probabilistic Classify & Count using a pre-trained model
+
+print('training PACC...')
+quantifier.fit(*train.Xy)
+
+# let us simulate some shifted test data...
+new_prevalence = [0.75, 0.25]
+shifted_test = test.sampling(500, *new_prevalence, random_state=0)
+
+# and do some evaluation
+print('predicting with PACC...')
+estim_prevalence = quantifier.predict(shifted_test.X)
+
+print('Result:\n'+('='*20))
+print(f'training prevalence: {F.strprev(train.prevalence())}')
+print(f'(shifted) test prevalence: {F.strprev(shifted_test.prevalence())}')
+print(f'estimated prevalence: {F.strprev(estim_prevalence)}')
+
+absolute_error = qp.error.ae(new_prevalence, estim_prevalence)
+print(f'absolute error={absolute_error:.4f}')
\ No newline at end of file
diff --git a/examples/4.lequa2022_experiments.py b/examples/5a.lequa2022_experiments.py
similarity index 86%
rename from examples/4.lequa2022_experiments.py
rename to examples/5a.lequa2022_experiments.py
index f3eec55..40632d5 100644
--- a/examples/4.lequa2022_experiments.py
+++ b/examples/5a.lequa2022_experiments.py
@@ -15,7 +15,7 @@ https://lequa2022.github.io/index (the site of the competition)
 https://ceur-ws.org/Vol-3180/paper-146.pdf (the overview paper)
 """
 
-# there are 4 tasks (T1A, T1B, T2A, T2B)
+# there are 4 tasks (T1A, T1B, T2A, T2B), let us simply consider T1A (binary quantification, vector form)
 task = 'T1A'
 
 # set the sample size in the environment. The sample size is task-dependent and can be consulted by doing:
@@ -28,18 +28,19 @@ qp.environ['N_JOBS'] = -1
 # of SamplesFromDir, a protocol that simply iterates over pre-generated samples (those provided for the competition)
 # stored in a directory.
 training, val_generator, test_generator = fetch_lequa2022(task=task)
+Xtr, ytr = training.Xy
 
 # define the quantifier
-quantifier = EMQ(classifier=LogisticRegression())
+quantifier = EMQ(classifier=LogisticRegression(), val_split=5)
 
 # model selection
 param_grid = {
     'classifier__C': np.logspace(-3, 3, 7),         # classifier-dependent: inverse of regularization strength
     'classifier__class_weight': ['balanced', None], # classifier-dependent: weights of each class
-    'recalib': ['bcts', 'platt', None]              # quantifier-dependent: recalibration method (new in v0.1.7)
+    'calib': ['bcts', None]                         # quantifier-dependent: calibration method
 }
 model_selection = GridSearchQ(quantifier, param_grid, protocol=val_generator, error='mrae', refit=False, verbose=True)
-quantifier = model_selection.fit(training)
+quantifier = model_selection.fit(Xtr, ytr)
 
 # evaluation
 report = evaluation_report(quantifier, protocol=test_generator, error_metrics=['mae', 'mrae', 'mkld'], verbose=True)
@@ -50,4 +51,4 @@ report['estim-prev'] = report['estim-prev'].map(F.strprev)
 print(report)
 
 print('Averaged values:')
-print(report.mean())
+print(report.mean(numeric_only=True))
diff --git a/examples/4b.lequa2024_experiments.py b/examples/5b.lequa2024_experiments.py
similarity index 97%
rename from examples/4b.lequa2024_experiments.py
rename to examples/5b.lequa2024_experiments.py
index 38394e3..351fed1 100644
--- a/examples/4b.lequa2024_experiments.py
+++ b/examples/5b.lequa2024_experiments.py
@@ -1,6 +1,6 @@
+import quapy as qp
 import numpy as np
 from sklearn.linear_model import LogisticRegression
-import quapy as qp
 import quapy.functional as F
 from quapy.data.datasets import LEQUA2024_SAMPLE_SIZE, fetch_lequa2024
 from quapy.evaluation import evaluation_report
@@ -14,6 +14,7 @@ LeQua competition itself, check:
 https://lequa2024.github.io/index (the site of the competition)
 """
 
+
 # there are 4 tasks: T1 (binary), T2 (multiclass), T3 (ordinal), T4 (binary - covariate & prior shift)
 task = 'T2'
 
@@ -27,6 +28,7 @@ qp.environ['N_JOBS'] = -1
 # of SamplesFromDir, a protocol that simply iterates over pre-generated samples (those provided for the competition)
 # stored in a directory.
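 # Since each sample comes labelled with its true prevalence, these generators can be passed directly as the
 # protocol for model selection and for evaluation (as done below).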
 training, val_generator, test_generator = fetch_lequa2024(task=task)
+Xtr, ytr = training.Xy
 
 # define the quantifier
 quantifier = KDEyML(classifier=LogisticRegression())
@@ -37,8 +39,9 @@ param_grid = {
     'classifier__class_weight': ['balanced', None],  # classifier-dependent: weights of each class
     'bandwidth': np.linspace(0.01, 0.2, 20)          # quantifier-dependent: bandwidth of the kernel
 }
+
 model_selection = GridSearchQ(quantifier, param_grid, protocol=val_generator, error='mrae', refit=False, verbose=True)
-quantifier = model_selection.fit(training)
+quantifier = model_selection.fit(Xtr, ytr)
 
 # evaluation
 report = evaluation_report(quantifier, protocol=test_generator, error_metrics=['mae', 'mrae'], verbose=True)
diff --git a/examples/6.quanet_example.py b/examples/6.quanet_example.py
index 4be3132..bbcad5d 100644
--- a/examples/6.quanet_example.py
+++ b/examples/6.quanet_example.py
@@ -20,14 +20,13 @@ train, test = dataset.train_test
 # train the text classifier:
 cnn_module = CNNnet(dataset.vocabulary_size, dataset.training.n_classes)
 cnn_classifier = NeuralClassifierTrainer(cnn_module, device='cuda')
-cnn_classifier.fit(*dataset.training.Xy)
 
 # train QuaNet (alternatively, we can set fit_classifier=True and let QuaNet train the classifier)
 quantifier = QuaNet(cnn_classifier, device='cuda')
-quantifier.fit(train, fit_classifier=False)
+quantifier.fit(*train.Xy)
 
 # prediction and evaluation
-estim_prevalence = quantifier.quantify(test.instances)
+estim_prevalence = quantifier.predict(test.instances)
 mae = qp.error.mae(test.prevalence(), estim_prevalence)
 
 print(f'true prevalence: {F.strprev(test.prevalence())}')
diff --git a/examples/7.uci_experiments.py b/examples/7.uci_binary_experiments.py
similarity index 71%
rename from examples/7.uci_experiments.py
rename to examples/7.uci_binary_experiments.py
index b452feb..04e07ee 100644
--- a/examples/7.uci_experiments.py
+++ b/examples/7.uci_binary_experiments.py
@@ -1,4 +1,7 @@
 from copy import deepcopy
+from pathlib import Path
+
+import pandas as pd
 
 import quapy as qp
 from sklearn.calibration import CalibratedClassifierCV
@@ -15,6 +18,18 @@ import itertools
 import argparse
 import torch
 import shutil
+from glob import glob
+
+
+"""
+This example shows how to generate experiments for the UCI ML repository binary datasets following the protocol
+proposed in "Pérez-Gállego, P., Quevedo, J.R., and del Coz, J.J. Using ensembles for problems with characterizable
+changes in data distribution: A case study on quantification. Information Fusion 34 (2017), 87–100."
+
+This example covers the most important steps of the experimentation pipeline, namely, the training and optimization
+of the hyperparameters of different quantifiers, and the evaluation of these quantifiers based on standard
+prevalence sampling protocols aimed at simulating different levels of prior probability shift.
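+Results are stored as one pickle file per (dataset, method, run) configuration, and are summarized at the end of
+the script by the show_results function defined below.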
+""" N_JOBS = -1 @@ -28,10 +43,6 @@ def newLR(): return LogisticRegression(max_iter=1000, solver='lbfgs', n_jobs=-1) -def calibratedLR(): - return CalibratedClassifierCV(newLR()) - - __C_range = np.logspace(-3, 3, 7) lr_params = { 'classifier__C': __C_range, @@ -50,7 +61,7 @@ def quantification_models(): yield 'MAX', MAX(newLR()), lr_params yield 'MS', MS(newLR()), lr_params yield 'MS2', MS2(newLR()), lr_params - yield 'sldc', EMQ(newLR(), recalib='platt'), lr_params + yield 'sldc', EMQ(newLR()), lr_params yield 'svmmae', newSVMAE(), svmperf_params yield 'hdy', HDy(newLR()), lr_params @@ -74,6 +85,13 @@ def result_path(path, dataset_name, model_name, run, optim_loss): return os.path.join(path, f'{dataset_name}-{model_name}-run{run}-{optim_loss}.pkl') +def parse_result_path(path): + *dataset, method, run, metric = Path(path).name.split('-') + dataset = '-'.join(dataset) + run = int(run.replace('run','')) + return dataset, method, run, metric + + def is_already_computed(dataset_name, model_name, run, optim_loss): return os.path.exists(result_path(args.results, dataset_name, model_name, run, optim_loss)) @@ -98,8 +116,8 @@ def run(experiment): print(f'running dataset={dataset_name} model={model_name} loss={optim_loss} run={run+1}/5') # model selection (hyperparameter optimization for a quantification-oriented loss) train, test = data.train_test - train, val = train.split_stratified() if hyperparams is not None: + train, val = train.split_stratified() model_selection = qp.model_selection.GridSearchQ( deepcopy(model), param_grid=hyperparams, @@ -107,13 +125,13 @@ def run(experiment): error=optim_loss, refit=True, timeout=60*60, - verbose=True + verbose=False ) - model_selection.fit(train) + model_selection.fit(*train.Xy) model = model_selection.best_model() best_params = model_selection.best_params_ else: - model.fit(data.training) + model.fit(*train.Xy) best_params = {} # model evaluation @@ -121,19 +139,37 @@ def run(experiment): model, protocol=APP(test, n_prevalences=21, repeats=100) ) - test_true_prevalence = data.test.prevalence() + test_true_prevalence = test.prevalence() evaluate_experiment(true_prevalences, estim_prevalences) save_results(dataset_name, model_name, run, optim_loss, true_prevalences, estim_prevalences, - data.training.prevalence(), test_true_prevalence, + train.prevalence(), test_true_prevalence, best_params) +def show_results(result_folder): + result_data = [] + for file in glob(os.path.join(result_folder,'*.pkl')): + true_prevalences, estim_prevalences, *_ = pickle.load(open(file, 'rb')) + dataset, method, run, metric = parse_result_path(file) + mae = qp.error.mae(true_prevalences, estim_prevalences) + result_data.append({ + 'dataset': dataset, + 'method': method, + 'run': run, + metric: mae + }) + df = pd.DataFrame(result_data) + pd.set_option("display.max_columns", None) + pd.set_option("display.expand_frame_repr", False) + print(df.pivot_table(index='dataset', columns='method', values=metric)) + + if __name__ == '__main__': parser = argparse.ArgumentParser(description='Run experiments for Tweeter Sentiment Quantification') - parser.add_argument('results', metavar='RESULT_PATH', type=str, - help='path to the directory where to store the results') + parser.add_argument('--results', metavar='RESULT_PATH', type=str, + help='path to the directory where to store the results', default='./results/uci_binary') parser.add_argument('--svmperfpath', metavar='SVMPERF_PATH', type=str, default='../svm_perf_quantification', help='path to the directory with svmperf') 
parser.add_argument('--checkpointdir', metavar='PATH', type=str, default='./checkpoint', @@ -155,3 +191,5 @@ if __name__ == '__main__': qp.util.parallel(run, itertools.product(optim_losses, datasets, models), n_jobs=CUDA_N_JOBS) shutil.rmtree(args.checkpointdir, ignore_errors=True) + + show_results(args.results) diff --git a/examples/8.ucimulti_experiments.py b/examples/8.uci_multiclass_experiments.py similarity index 88% rename from examples/8.ucimulti_experiments.py rename to examples/8.uci_multiclass_experiments.py index e2a8d97..06f7ea7 100644 --- a/examples/8.ucimulti_experiments.py +++ b/examples/8.uci_multiclass_experiments.py @@ -1,4 +1,3 @@ -import pickle import os from time import time from collections import defaultdict @@ -7,11 +6,16 @@ import numpy as np from sklearn.linear_model import LogisticRegression import quapy as qp -from quapy.method.aggregative import PACC, EMQ +from quapy.method.aggregative import PACC, EMQ, KDEyML from quapy.model_selection import GridSearchQ from quapy.protocol import UPP from pathlib import Path +""" +This example is the analogous counterpart of example 7 but involving multiclass quantification problems +using datasets from the UCI ML repository. +""" + SEED = 1 @@ -31,7 +35,7 @@ def wrap_hyper(classifier_hyper_grid:dict): METHODS = [ ('PACC', PACC(newLR()), wrap_hyper(logreg_grid)), ('EMQ', EMQ(newLR()), wrap_hyper(logreg_grid)), - # ('KDEy-ML', KDEyML(newLR()), {**wrap_hyper(logreg_grid), **{'bandwidth': np.linspace(0.01, 0.2, 20)}}), + ('KDEy-ML', KDEyML(newLR()), {**wrap_hyper(logreg_grid), **{'bandwidth': np.linspace(0.01, 0.2, 20)}}), ] @@ -43,6 +47,7 @@ def show_results(result_path): pv = df.pivot_table(index='Dataset', columns="Method", values=["MAE", "MRAE", "t_train"], margins=True) print(pv) + def load_timings(result_path): import pandas as pd timings = defaultdict(lambda: {}) @@ -59,7 +64,7 @@ if __name__ == '__main__': qp.environ['N_JOBS'] = -1 n_bags_val = 250 n_bags_test = 1000 - result_dir = f'results/ucimulti' + result_dir = f'results/uci_multiclass' os.makedirs(result_dir, exist_ok=True) @@ -100,7 +105,7 @@ if __name__ == '__main__': t_init = time() try: - modsel.fit(train) + modsel.fit(*train.Xy) print(f'best params {modsel.best_params_}') print(f'best score {modsel.best_score_}') @@ -108,7 +113,8 @@ if __name__ == '__main__': quantifier = modsel.best_model() except: print('something went wrong... trying to fit the default model') - quantifier.fit(train) + quantifier.fit(*train.Xy) + timings[method_name][dataset] = time() - t_init diff --git a/examples/9.ifcb_experiments.py b/examples/9.ifcb_experiments.py index 8fb39d1..580be6b 100644 --- a/examples/9.ifcb_experiments.py +++ b/examples/9.ifcb_experiments.py @@ -6,6 +6,18 @@ from sklearn.linear_model import LogisticRegression from quapy.model_selection import GridSearchQ from quapy.evaluation import evaluation_report +""" +This example shows a complete experiment using the IFCB Plankton dataset; +see https://hlt-isti.github.io/QuaPy/manuals/datasets.html#ifcb-plankton-dataset + +Note that this dataset can be downloaded in two modes: for model selection or for evaluation. 
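+This is controlled by the for_model_selection argument of qp.datasets.fetch_IFCB.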
+
+See also:
+González, P., Castaño, A., Peacock, E.E., Díez, J., del Coz, J.J., and Sosik, H.M. Automatic plankton
+quantification using deep features. Journal of Plankton Research 41(4), 449-463.
+"""
+
 print('Quantifying the IFCB dataset with PACC\n')
 
@@ -30,7 +42,7 @@ mod_sel = GridSearchQ(
     n_jobs=-1,
     verbose=True,
     raise_errors=True
-).fit(train)
+).fit(*train.Xy)
 
 print(f'model selection chose hyperparameters: {mod_sel.best_params_}')
 quantifier = mod_sel.best_model_
@@ -42,7 +54,7 @@ print(f'\ttraining size={len(train)}, features={train.X.shape[1]}, classes={trai
 print(f'\ttest samples={test_gen.total()}')
 
 print('training on the whole dataset before test')
-quantifier.fit(train)
+quantifier.fit(*train.Xy)
 
 print('testing...')
 report = evaluation_report(quantifier, protocol=test_gen, error_metrics=['mae'], verbose=True)
diff --git a/prepare_svmperf.sh b/prepare_svmperf.sh
index b609f6c..3da8bfe 100755
--- a/prepare_svmperf.sh
+++ b/prepare_svmperf.sh
@@ -11,13 +11,5 @@ rm $FILE
 patch -s -p0 < svm-perf-quantification-ext.patch
 mv svm_perf svm_perf_quantification
 cd svm_perf_quantification
-make
-
-
-
-
-
-
-
-
+make CFLAGS="-O3 -Wall -Wno-unused-result -fcommon"
diff --git a/quapy/__init__.py b/quapy/__init__.py
index 44a23a4..d013f5b 100644
--- a/quapy/__init__.py
+++ b/quapy/__init__.py
@@ -1,5 +1,4 @@
 """QuaPy module for quantification"""
-from sklearn.linear_model import LogisticRegression
 from quapy.data import datasets
 from . import error
@@ -14,7 +13,13 @@ from . import model_selection
 from . import classification
 import os
 
-__version__ = '0.1.10'
+__version__ = '0.2.0'
+
+
+def _default_cls():
+    from sklearn.linear_model import LogisticRegression
+    return LogisticRegression()
+
 
 environ = {
     'SAMPLE_SIZE': None,
@@ -24,7 +29,7 @@ environ = {
     'PAD_INDEX': 1,
     'SVMPERF_HOME': './svm_perf_quantification',
     'N_JOBS': int(os.getenv('N_JOBS', 1)),
-    'DEFAULT_CLS': LogisticRegression(max_iter=3000)
+    'DEFAULT_CLS': _default_cls()
 }
 
@@ -68,3 +73,5 @@ def _get_classifier(classifier):
     if classifier is None:
         raise ValueError('neither classifier nor qp.environ["DEFAULT_CLS"] have been specified')
     return classifier
+
+
diff --git a/quapy/classification/svmperf.py b/quapy/classification/svmperf.py
index 6c85084..71f2ac3 100644
--- a/quapy/classification/svmperf.py
+++ b/quapy/classification/svmperf.py
@@ -33,27 +33,16 @@ class SVMperf(BaseEstimator, ClassifierMixin):
     valid_losses = {'01':0, 'f1':1, 'kld':12, 'nkld':13, 'q':22, 'qacc':23, 'qf1':24, 'qgm':25, 'mae':26, 'mrae':27}
 
     def __init__(self, svmperf_base, C=0.01, verbose=False, loss='01', host_folder=None):
-        assert exists(svmperf_base), f'path {svmperf_base} does not seem to point to a valid path'
+        assert exists(svmperf_base), \
+            (f'path {svmperf_base} does not seem to point to a valid path; '
+             f'did you install svm-perf? '
+             f'see instructions in https://hlt-isti.github.io/QuaPy/manuals/explicit-loss-minimization.html')
         self.svmperf_base = svmperf_base
         self.C = C
         self.verbose = verbose
         self.loss = loss
         self.host_folder = host_folder
 
-    # def set_params(self, **parameters):
-    #     """
-    #     Set the hyper-parameters for svm-perf.
Currently, only the `C` and `loss` parameters are supported - # - # :param parameters: a `**kwargs` dictionary `{'C': }` - # """ - # assert sorted(list(parameters.keys())) == ['C', 'loss'], \ - # 'currently, only the C and loss parameters are supported' - # self.C = parameters.get('C', self.C) - # self.loss = parameters.get('loss', self.loss) - # - # def get_params(self, deep=True): - # return {'C': self.C, 'loss': self.loss} - def fit(self, X, y): """ Trains the SVM for the multivariate performance loss diff --git a/quapy/data/base.py b/quapy/data/base.py index ceb7402..82c57db 100644 --- a/quapy/data/base.py +++ b/quapy/data/base.py @@ -9,6 +9,7 @@ from sklearn.model_selection import train_test_split, RepeatedStratifiedKFold from numpy.random import RandomState from quapy.functional import strprev from quapy.util import temp_seed +import quapy.functional as F class LabelledCollection: @@ -34,8 +35,7 @@ class LabelledCollection: self.labels = np.asarray(labels) n_docs = len(self) if classes is None: - self.classes_ = np.unique(self.labels) - self.classes_.sort() + self.classes_ = F.classes_from_labels(self.labels) else: self.classes_ = np.unique(np.asarray(classes)) self.classes_.sort() @@ -95,6 +95,15 @@ class LabelledCollection: """ return len(self.classes_) + @property + def n_instances(self): + """ + The number of instances + + :return: integer + """ + return len(self.labels) + @property def binary(self): """ @@ -232,11 +241,11 @@ class LabelledCollection: :return: two instances of :class:`LabelledCollection`, the first one with `train_prop` elements, and the second one with `1-train_prop` elements """ - tr_docs, te_docs, tr_labels, te_labels = train_test_split( + tr_X, te_X, tr_y, te_y = train_test_split( self.instances, self.labels, train_size=train_prop, stratify=self.labels, random_state=random_state ) - training = LabelledCollection(tr_docs, tr_labels, classes=self.classes_) - test = LabelledCollection(te_docs, te_labels, classes=self.classes_) + training = LabelledCollection(tr_X, tr_y, classes=self.classes_) + test = LabelledCollection(te_X, te_y, classes=self.classes_) return training, test def split_random(self, train_prop=0.6, random_state=None): @@ -318,6 +327,15 @@ class LabelledCollection: classes = np.unique(labels).sort() return LabelledCollection(instances, labels, classes=classes) + @property + def classes(self): + """ + Gets an array-like with the classes used in this collection + + :return: array-like + """ + return self.classes_ + @property def Xy(self): """ @@ -414,6 +432,11 @@ class LabelledCollection: test = self.sampling_from_index(test_index) yield train, test + def __repr__(self): + repr=f'<{self.n_instances} instances (dtype={type(self.instances[0])}), ' + repr+=f'n_classes={self.n_classes} {self.classes_}, prevalence={F.strprev(self.prevalence())}>' + return repr + class Dataset: """ @@ -567,4 +590,7 @@ class Dataset: *self.test.prevalence(), random_state = random_state ) - return self \ No newline at end of file + return self + + def __repr__(self): + return f'training={self.training}; test={self.test}' \ No newline at end of file diff --git a/quapy/data/datasets.py b/quapy/data/datasets.py index 8f4c29c..c08748f 100644 --- a/quapy/data/datasets.py +++ b/quapy/data/datasets.py @@ -114,7 +114,8 @@ def fetch_reviews(dataset_name, tfidf=False, min_df=None, data_home=None, pickle """ Loads a Reviews dataset as a Dataset instance, as used in `Esuli, A., Moreo, A., and Sebastiani, F. "A recurrent neural network for sentiment quantification." 
-    Proceedings of the 27th ACM International Conference on Information and Knowledge Management. 2018. `_.
+    Proceedings of the 27th ACM International Conference on Information and Knowledge Management. 2018.
+    `_.
 
     The list of valid dataset names can be accessed in `quapy.data.datasets.REVIEWS_SENTIMENT_DATASETS`
 
     :param dataset_name: the name of the dataset: valid ones are 'hp', 'kindle', 'imdb'
@@ -548,25 +549,20 @@
     """
     if name == "acute.a":
         X, y = data["X"], data["y"][:, 0]
-        # X, y = Xy[:, :-2], Xy[:, -2]
     elif name == "acute.b":
         X, y = data["X"], data["y"][:, 1]
-        # X, y = Xy[:, :-2], Xy[:, -1]
     elif name == "wine-q-red":
         X, y, color = data["X"], data["y"], data["color"]
-        # X, y, color = Xy[:, :-2], Xy[:, -2], Xy[:, -1]
         red_idx = color == "red"
         X, y = X[red_idx, :], y[red_idx]
         y = (y > 5).astype(int)
     elif name == "wine-q-white":
         X, y, color = data["X"], data["y"], data["color"]
-        # X, y, color = Xy[:, :-2], Xy[:, -2], Xy[:, -1]
         white_idx = color == "white"
         X, y = X[white_idx, :], y[white_idx]
         y = (y > 5).astype(int)
     else:
         X, y = data["X"], data["y"]
-        # X, y = Xy[:, :-1], Xy[:, -1]
 
     y = binarize(y, pos_class=pos_class[name])
 
@@ -797,7 +793,7 @@ def _array_replace(arr, repl={"yes": 1, "no": 0}):
 
 def fetch_lequa2022(task, data_home=None):
     """
-    Loads the official datasets provided for the `LeQua `_ competition.
+    Loads the official datasets provided for the `LeQua 2022 `_ competition.
     In brief, there are 4 tasks (T1A, T1B, T2A, T2B) having to do with text quantification problems. Tasks T1A and T1B
     provide documents in vector form, while T2A and T2B provide raw documents instead.
     Tasks T1A and T2A are binary sentiment quantification problems, while T1B and T2B are multiclass quantification
@@ -817,7 +813,7 @@
        ~/quapy_data/ directory)
     :return: a tuple `(train, val_gen, test_gen)` where `train` is an instance of
         :class:`quapy.data.base.LabelledCollection`, `val_gen` and `test_gen` are instances of
-        :class:`quapy.data._lequa2022.SamplesFromDir`, a subclass of :class:`quapy.protocol.AbstractProtocol`,
+        :class:`quapy.data._lequa.SamplesFromDir`, a subclass of :class:`quapy.protocol.AbstractProtocol`,
         that return a series of samples stored in a directory which are labelled by prevalence.
     """
 
@@ -839,7 +835,9 @@
         tmp_path = join(lequa_dir, task + '_tmp.zip')
         download_file_if_not_exists(url, tmp_path)
         with zipfile.ZipFile(tmp_path) as file:
+            print(f'Unzipping {tmp_path}...', end='')
             file.extractall(unzipped_path)
+            print('[done]')
         os.remove(tmp_path)
 
     if not os.path.exists(join(lequa_dir, task)):
@@ -867,6 +865,35 @@
 
 def fetch_lequa2024(task, data_home=None, merge_T3=False):
+    """
+    Loads the official datasets provided for the `LeQua 2024 `_ competition.
+    LeQua 2024 defines four tasks (T1, T2, T3, T4) related to the problem of quantification;
+    all tasks are affected by some type of dataset shift. Tasks T1 and T2 are akin to tasks T1A and T1B of LeQua 2022,
+    while T3 and T4 are new tasks introduced in LeQua 2024.
+
+    - Task T1 evaluates binary quantifiers under prior probability shift (akin to T1A of LeQua 2022).
+    - Task T2 evaluates single-label multi-class quantifiers (for n > 2 classes) under prior probability shift (akin to T1B of LeQua 2022).
+    - Task T3 evaluates ordinal quantifiers, where the classes are totally ordered.
+    - Task T4 also evaluates binary quantifiers, but under some mix of covariate shift and prior probability shift.
+
+    For a broader discussion, we refer to the `online official documentation `_
+
+    The datasets are downloaded only once, and stored locally for future reuse.
+
+    See `5b.lequa2024_experiments.py` provided in the example folder, which can serve as a guide on how to use these
+    datasets.
+
+    :param task: a string representing the task name; valid ones are T1, T2, T3, and T4
+    :param data_home: specify the quapy home directory where collections will be dumped (leave empty to use the default
+        ~/quapy_data/ directory)
+    :param merge_T3: bool, if False (default), returns a generator of training collections, corresponding to natural
+        groups of reviews; if True, returns one single :class:`quapy.data.base.LabelledCollection` representing the
+        entire training set, as a concatenation of all the training collections
+    :return: a tuple `(train, val_gen, test_gen)` where `train` is an instance of
+        :class:`quapy.data.base.LabelledCollection`, `val_gen` and `test_gen` are instances of
+        :class:`quapy.data._lequa.SamplesFromDir`, a subclass of :class:`quapy.protocol.AbstractProtocol`,
+        that return a series of samples stored in a directory which are labelled by prevalence.
+    """
 
     from quapy.data._lequa import load_vector_documents_2024, SamplesFromDir, LabelledCollectionsFromDir
 
@@ -909,11 +936,7 @@
     test_true_prev_path = join(lequa_dir, task, 'public', 'test_prevalences.txt')
     test_gen = SamplesFromDir(test_samples_path, test_true_prev_path, load_fn=load_fn)
 
-    if task != 'T3':
-        tr_path = join(lequa_dir, task, 'public', 'training_data.txt')
-        train = LabelledCollection.load(tr_path, loader_func=load_fn)
-        return train, val_gen, test_gen
-    else:
+    if task == 'T3':
         training_samples_path = join(lequa_dir, task, 'public', 'training_samples')
         training_true_prev_path = join(lequa_dir, task, 'public', 'training_prevalences.txt')
         train_gen = LabelledCollectionsFromDir(training_samples_path, training_true_prev_path, load_fn=load_fn)
@@ -922,7 +945,10 @@
             return train, val_gen, test_gen
         else:
             return train_gen, val_gen, test_gen
-
+    else:
+        tr_path = join(lequa_dir, task, 'public', 'training_data.txt')
+        train = LabelledCollection.load(tr_path, loader_func=load_fn)
+        return train, val_gen, test_gen
 
 
 def fetch_IFCB(single_sample_train=True, for_model_selection=False, data_home=None):
diff --git a/quapy/error.py b/quapy/error.py
index 201ab8f..eb42cd6 100644
--- a/quapy/error.py
+++ b/quapy/error.py
@@ -45,89 +45,95 @@ def acce(y_true, y_pred):
     return 1. - (y_true == y_pred).mean()
 
 
-def mae(prevs, prevs_hat):
+def mae(prevs_true, prevs_hat):
     """Computes the mean absolute error (see :meth:`quapy.error.ae`) across the sample pairs.
 
-    :param prevs: array-like of shape `(n_samples, n_classes,)` with the true prevalence values
+    :param prevs_true: array-like of shape `(n_samples, n_classes,)` with the true prevalence values
     :param prevs_hat: array-like of shape `(n_samples, n_classes,)` with the predicted
         prevalence values
     :return: mean absolute error
     """
-    return ae(prevs, prevs_hat).mean()
+    return ae(prevs_true, prevs_hat).mean()
 
 
-def ae(prevs, prevs_hat):
+def ae(prevs_true, prevs_hat):
     """Computes the absolute error between the two prevalence vectors.
Absolute error between two prevalence vectors :math:`p` and :math:`\\hat{p}` is computed as :math:`AE(p,\\hat{p})=\\frac{1}{|\\mathcal{Y}|}\\sum_{y\\in \\mathcal{Y}}|\\hat{p}(y)-p(y)|`, where :math:`\\mathcal{Y}` are the classes of interest. - :param prevs: array-like of shape `(n_classes,)` with the true prevalence values + :param prevs_true: array-like of shape `(n_classes,)` with the true prevalence values :param prevs_hat: array-like of shape `(n_classes,)` with the predicted prevalence values :return: absolute error """ - assert prevs.shape == prevs_hat.shape, f'wrong shape {prevs.shape} vs. {prevs_hat.shape}' - return abs(prevs_hat - prevs).mean(axis=-1) + prevs_true = np.asarray(prevs_true) + prevs_hat = np.asarray(prevs_hat) + assert prevs_true.shape == prevs_hat.shape, f'wrong shape {prevs_true.shape} vs. {prevs_hat.shape}' + return abs(prevs_hat - prevs_true).mean(axis=-1) -def nae(prevs, prevs_hat): +def nae(prevs_true, prevs_hat): """Computes the normalized absolute error between the two prevalence vectors. Normalized absolute error between two prevalence vectors :math:`p` and :math:`\\hat{p}` is computed as :math:`NAE(p,\\hat{p})=\\frac{AE(p,\\hat{p})}{z_{AE}}`, where :math:`z_{AE}=\\frac{2(1-\\min_{y\\in \\mathcal{Y}} p(y))}{|\\mathcal{Y}|}`, and :math:`\\mathcal{Y}` are the classes of interest. - :param prevs: array-like of shape `(n_classes,)` with the true prevalence values + :param prevs_true: array-like of shape `(n_classes,)` with the true prevalence values :param prevs_hat: array-like of shape `(n_classes,)` with the predicted prevalence values :return: normalized absolute error """ - assert prevs.shape == prevs_hat.shape, f'wrong shape {prevs.shape} vs. {prevs_hat.shape}' - return abs(prevs_hat - prevs).sum(axis=-1)/(2*(1-prevs.min(axis=-1))) + prevs_true = np.asarray(prevs_true) + prevs_hat = np.asarray(prevs_hat) + assert prevs_true.shape == prevs_hat.shape, f'wrong shape {prevs_true.shape} vs. {prevs_hat.shape}' + return abs(prevs_hat - prevs_true).sum(axis=-1)/(2 * (1 - prevs_true.min(axis=-1))) -def mnae(prevs, prevs_hat): +def mnae(prevs_true, prevs_hat): """Computes the mean normalized absolute error (see :meth:`quapy.error.nae`) across the sample pairs. - :param prevs: array-like of shape `(n_samples, n_classes,)` with the true prevalence values + :param prevs_true: array-like of shape `(n_samples, n_classes,)` with the true prevalence values :param prevs_hat: array-like of shape `(n_samples, n_classes,)` with the predicted prevalence values :return: mean normalized absolute error """ - return nae(prevs, prevs_hat).mean() + return nae(prevs_true, prevs_hat).mean() -def mse(prevs, prevs_hat): +def mse(prevs_true, prevs_hat): """Computes the mean squared error (see :meth:`quapy.error.se`) across the sample pairs. - :param prevs: array-like of shape `(n_samples, n_classes,)` with the + :param prevs_true: array-like of shape `(n_samples, n_classes,)` with the true prevalence values :param prevs_hat: array-like of shape `(n_samples, n_classes,)` with the predicted prevalence values :return: mean squared error """ - return se(prevs, prevs_hat).mean() + return se(prevs_true, prevs_hat).mean() -def se(prevs, prevs_hat): +def se(prevs_true, prevs_hat): """Computes the squared error between the two prevalence vectors. Squared error between two prevalence vectors :math:`p` and :math:`\\hat{p}` is computed as :math:`SE(p,\\hat{p})=\\frac{1}{|\\mathcal{Y}|}\\sum_{y\\in \\mathcal{Y}}(\\hat{p}(y)-p(y))^2`, where :math:`\\mathcal{Y}` are the classes of interest. 
- :param prevs: array-like of shape `(n_classes,)` with the true prevalence values + :param prevs_true: array-like of shape `(n_classes,)` with the true prevalence values :param prevs_hat: array-like of shape `(n_classes,)` with the predicted prevalence values :return: absolute error """ - return ((prevs_hat - prevs) ** 2).mean(axis=-1) + prevs_true = np.asarray(prevs_true) + prevs_hat = np.asarray(prevs_hat) + return ((prevs_hat - prevs_true) ** 2).mean(axis=-1) -def mkld(prevs, prevs_hat, eps=None): +def mkld(prevs_true, prevs_hat, eps=None): """Computes the mean Kullback-Leibler divergence (see :meth:`quapy.error.kld`) across the sample pairs. The distributions are smoothed using the `eps` factor (see :meth:`quapy.error.smooth`). - :param prevs: array-like of shape `(n_samples, n_classes,)` with the true + :param prevs_true: array-like of shape `(n_samples, n_classes,)` with the true prevalence values :param prevs_hat: array-like of shape `(n_samples, n_classes,)` with the predicted prevalence values @@ -137,10 +143,10 @@ def mkld(prevs, prevs_hat, eps=None): (which has thus to be set beforehand). :return: mean Kullback-Leibler distribution """ - return kld(prevs, prevs_hat, eps).mean() + return kld(prevs_true, prevs_hat, eps).mean() -def kld(prevs, prevs_hat, eps=None): +def kld(prevs_true, prevs_hat, eps=None): """Computes the Kullback-Leibler divergence between the two prevalence distributions. Kullback-Leibler divergence between two prevalence distributions :math:`p` and :math:`\\hat{p}` is computed as @@ -149,7 +155,7 @@ def kld(prevs, prevs_hat, eps=None): where :math:`\\mathcal{Y}` are the classes of interest. The distributions are smoothed using the `eps` factor (see :meth:`quapy.error.smooth`). - :param prevs: array-like of shape `(n_classes,)` with the true prevalence values + :param prevs_true: array-like of shape `(n_classes,)` with the true prevalence values :param prevs_hat: array-like of shape `(n_classes,)` with the predicted prevalence values :param eps: smoothing factor. KLD is not defined in cases in which the distributions contain zeros; `eps` is typically set to be :math:`\\frac{1}{2T}`, with :math:`T` the sample size. @@ -158,17 +164,17 @@ def kld(prevs, prevs_hat, eps=None): :return: Kullback-Leibler divergence between the two distributions """ eps = __check_eps(eps) - smooth_prevs = smooth(prevs, eps) + smooth_prevs = smooth(prevs_true, eps) smooth_prevs_hat = smooth(prevs_hat, eps) return (smooth_prevs*np.log(smooth_prevs/smooth_prevs_hat)).sum(axis=-1) -def mnkld(prevs, prevs_hat, eps=None): +def mnkld(prevs_true, prevs_hat, eps=None): """Computes the mean Normalized Kullback-Leibler divergence (see :meth:`quapy.error.nkld`) across the sample pairs. The distributions are smoothed using the `eps` factor (see :meth:`quapy.error.smooth`). - :param prevs: array-like of shape `(n_samples, n_classes,)` with the true prevalence values + :param prevs_true: array-like of shape `(n_samples, n_classes,)` with the true prevalence values :param prevs_hat: array-like of shape `(n_samples, n_classes,)` with the predicted prevalence values :param eps: smoothing factor. NKLD is not defined in cases in which the distributions contain @@ -177,10 +183,10 @@ def mnkld(prevs, prevs_hat, eps=None): (which has thus to be set beforehand). 
:return: mean Normalized Kullback-Leibler distribution """ - return nkld(prevs, prevs_hat, eps).mean() + return nkld(prevs_true, prevs_hat, eps).mean() -def nkld(prevs, prevs_hat, eps=None): +def nkld(prevs_true, prevs_hat, eps=None): """Computes the Normalized Kullback-Leibler divergence between the two prevalence distributions. Normalized Kullback-Leibler divergence between two prevalence distributions :math:`p` and :math:`\\hat{p}` is computed as @@ -189,7 +195,7 @@ def nkld(prevs, prevs_hat, eps=None): :math:`\\mathcal{Y}` are the classes of interest. The distributions are smoothed using the `eps` factor (see :meth:`quapy.error.smooth`). - :param prevs: array-like of shape `(n_classes,)` with the true prevalence values + :param prevs_true: array-like of shape `(n_classes,)` with the true prevalence values :param prevs_hat: array-like of shape `(n_classes,)` with the predicted prevalence values :param eps: smoothing factor. NKLD is not defined in cases in which the distributions contain zeros; `eps` is typically set to be :math:`\\frac{1}{2T}`, with :math:`T` the sample @@ -197,16 +203,16 @@ def nkld(prevs, prevs_hat, eps=None): `SAMPLE_SIZE` (which has thus to be set beforehand). :return: Normalized Kullback-Leibler divergence between the two distributions """ - ekld = np.exp(kld(prevs, prevs_hat, eps)) + ekld = np.exp(kld(prevs_true, prevs_hat, eps)) return 2. * ekld / (1 + ekld) - 1. -def mrae(prevs, prevs_hat, eps=None): +def mrae(prevs_true, prevs_hat, eps=None): """Computes the mean relative absolute error (see :meth:`quapy.error.rae`) across the sample pairs. The distributions are smoothed using the `eps` factor (see :meth:`quapy.error.smooth`). - :param prevs: array-like of shape `(n_samples, n_classes,)` with the true + :param prevs_true: array-like of shape `(n_samples, n_classes,)` with the true prevalence values :param prevs_hat: array-like of shape `(n_samples, n_classes,)` with the predicted prevalence values @@ -216,10 +222,10 @@ def mrae(prevs, prevs_hat, eps=None): the environment variable `SAMPLE_SIZE` (which has thus to be set beforehand). :return: mean relative absolute error """ - return rae(prevs, prevs_hat, eps).mean() + return rae(prevs_true, prevs_hat, eps).mean() -def rae(prevs, prevs_hat, eps=None): +def rae(prevs_true, prevs_hat, eps=None): """Computes the absolute relative error between the two prevalence vectors. Relative absolute error between two prevalence vectors :math:`p` and :math:`\\hat{p}` is computed as @@ -228,7 +234,7 @@ def rae(prevs, prevs_hat, eps=None): where :math:`\\mathcal{Y}` are the classes of interest. The distributions are smoothed using the `eps` factor (see :meth:`quapy.error.smooth`). - :param prevs: array-like of shape `(n_classes,)` with the true prevalence values + :param prevs_true: array-like of shape `(n_classes,)` with the true prevalence values :param prevs_hat: array-like of shape `(n_classes,)` with the predicted prevalence values :param eps: smoothing factor. 
`rae` is not defined in cases in which the true distribution contains zeros; `eps` is typically set to be :math:`\\frac{1}{2T}`, with :math:`T` the @@ -237,12 +243,12 @@ def rae(prevs, prevs_hat, eps=None): :return: relative absolute error """ eps = __check_eps(eps) - prevs = smooth(prevs, eps) + prevs_true = smooth(prevs_true, eps) prevs_hat = smooth(prevs_hat, eps) - return (abs(prevs - prevs_hat) / prevs).mean(axis=-1) + return (abs(prevs_true - prevs_hat) / prevs_true).mean(axis=-1) -def nrae(prevs, prevs_hat, eps=None): +def nrae(prevs_true, prevs_hat, eps=None): """Computes the normalized relative absolute error between the two prevalence vectors. Normalized relative absolute error between two prevalence vectors :math:`p` and :math:`\\hat{p}` is computed as @@ -252,7 +258,7 @@ def nrae(prevs, prevs_hat, eps=None): and :math:`\\mathcal{Y}` are the classes of interest. The distributions are smoothed using the `eps` factor (see :meth:`quapy.error.smooth`). - :param prevs: array-like of shape `(n_classes,)` with the true prevalence values + :param prevs_true: array-like of shape `(n_classes,)` with the true prevalence values :param prevs_hat: array-like of shape `(n_classes,)` with the predicted prevalence values :param eps: smoothing factor. `nrae` is not defined in cases in which the true distribution contains zeros; `eps` is typically set to be :math:`\\frac{1}{2T}`, with :math:`T` the @@ -261,18 +267,18 @@ :return: normalized relative absolute error """ eps = __check_eps(eps) - prevs = smooth(prevs, eps) + prevs_true = smooth(prevs_true, eps) prevs_hat = smooth(prevs_hat, eps) - min_p = prevs.min(axis=-1) - return (abs(prevs - prevs_hat) / prevs).sum(axis=-1)/(prevs.shape[-1]-1+(1-min_p)/min_p) + min_p = prevs_true.min(axis=-1) + return (abs(prevs_true - prevs_hat) / prevs_true).sum(axis=-1)/(prevs_true.shape[-1] - 1 + (1 - min_p) / min_p) -def mnrae(prevs, prevs_hat, eps=None): +def mnrae(prevs_true, prevs_hat, eps=None): """Computes the mean normalized relative absolute error (see :meth:`quapy.error.nrae`) across the sample pairs. The distributions are smoothed using the `eps` factor (see :meth:`quapy.error.smooth`). - :param prevs: array-like of shape `(n_samples, n_classes,)` with the true + :param prevs_true: array-like of shape `(n_samples, n_classes,)` with the true prevalence values :param prevs_hat: array-like of shape `(n_samples, n_classes,)` with the predicted prevalence values @@ -282,57 +288,61 @@ def mnrae(prevs, prevs_hat, eps=None): the environment variable `SAMPLE_SIZE` (which has thus to be set beforehand). :return: mean normalized relative absolute error """ - return nrae(prevs, prevs_hat, eps).mean() + return nrae(prevs_true, prevs_hat, eps).mean()
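Illustrative usage (not part of the patch) of `rae` on a single pair and `mrae` on a batch, with `eps` given explicitly so that `SAMPLE_SIZE` need not be set:

```python
import numpy as np
import quapy as qp

p_true = np.asarray([0.8, 0.1, 0.1])
p_hat = np.asarray([0.6, 0.2, 0.2])
print(qp.error.rae(p_true, p_hat, eps=0.005))   # single pair

batch_true = np.asarray([[0.8, 0.1, 0.1], [0.3, 0.3, 0.4]])
batch_hat = np.asarray([[0.6, 0.2, 0.2], [0.4, 0.3, 0.3]])
print(qp.error.mrae(batch_true, batch_hat, eps=0.005))  # mean across sample pairs
```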
-def nmd(prevs, prevs_hat): +def nmd(prevs_true, prevs_hat): """ Computes the Normalized Match Distance, which is the Match Distance multiplied by the factor `1/(n-1)` to guarantee the measure ranges between 0 (best prediction) and 1 (worst prediction). - :param prevs: array-like of shape `(n_classes,)` or `(n_instances, n_classes)` with the true prevalence values + :param prevs_true: array-like of shape `(n_classes,)` or `(n_instances, n_classes)` with the true prevalence values :param prevs_hat: array-like of shape `(n_classes,)` or `(n_instances, n_classes)` with the predicted prevalence values :return: float in [0,1] """ - n = prevs.shape[-1] - return (1./(n-1))*np.mean(match_distance(prevs, prevs_hat)) + prevs_true = np.asarray(prevs_true) + prevs_hat = np.asarray(prevs_hat) + n = prevs_true.shape[-1] + return (1./(n-1))*np.mean(match_distance(prevs_true, prevs_hat)) -def bias_binary(prevs, prevs_hat): +def bias_binary(prevs_true, prevs_hat): """ Computes the (positive) bias in a binary problem. The bias is simply the difference between the predicted positive value and the true positive value, so that a positive value indicates that the prediction tends to overestimate the true value, and a negative value indicates that it tends to underestimate it. :math:`bias(p,\\hat{p})=\\hat{p}_1-p_1` - :param prevs: array-like of shape `(n_samples, n_classes,)` with the true prevalence values + :param prevs_true: array-like of shape `(n_samples, n_classes,)` with the true prevalence values :param prevs_hat: array-like of shape `(n_samples, n_classes,)` with the predicted prevalence values :return: binary bias """ - assert prevs.shape[-1] == 2 and prevs.shape[-1] == 2, f'bias_binary can only be applied to binary problems' - return prevs_hat[...,1]-prevs[...,1] + prevs_true = np.asarray(prevs_true) + prevs_hat = np.asarray(prevs_hat) + assert prevs_true.shape[-1] == 2 and prevs_hat.shape[-1] == 2, 'bias_binary can only be applied to binary problems' + return prevs_hat[...,1]-prevs_true[...,1] -def mean_bias_binary(prevs, prevs_hat): +def mean_bias_binary(prevs_true, prevs_hat): """ Computes the mean of the (positive) bias in a binary problem. - :param prevs: array-like of shape `(n_classes,)` with the true prevalence values + :param prevs_true: array-like of shape `(n_samples, n_classes,)` with the true prevalence values :param prevs_hat: array-like of shape `(n_samples, n_classes,)` with the predicted prevalence values :return: mean binary bias """ - return np.mean(bias_binary(prevs, prevs_hat)) + return np.mean(bias_binary(prevs_true, prevs_hat))
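A small illustration (not part of the patch) of the new binary bias measures on made-up prevalence vectors (columns are [negative, positive]):

```python
import numpy as np
import quapy as qp

p_true = np.asarray([[0.7, 0.3], [0.5, 0.5], [0.2, 0.8]])
p_hat = np.asarray([[0.6, 0.4], [0.5, 0.5], [0.3, 0.7]])

print(qp.error.bias_binary(p_true, p_hat))       # per-sample bias: [ 0.1  0.  -0.1]
print(qp.error.mean_bias_binary(p_true, p_hat))  # average bias: 0.0
```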
-def md(prevs, prevs_hat, ERROR_TOL=1E-3): +def md(prevs_true, prevs_hat, ERROR_TOL=1E-3): """ Computes the Match Distance, under the assumption that the cost in mistaking class i with class i+1 is 1 in all cases. - :param prevs: array-like of shape `(n_classes,)` or `(n_instances, n_classes)` with the true prevalence values + :param prevs_true: array-like of shape `(n_classes,)` or `(n_instances, n_classes)` with the true prevalence values :param prevs_hat: array-like of shape `(n_classes,)` or `(n_instances, n_classes)` with the predicted prevalence values :return: float """ - P = np.cumsum(prevs, axis=-1) + P = np.cumsum(prevs_true, axis=-1) P_hat = np.cumsum(prevs_hat, axis=-1) assert np.all(np.isclose(P_hat[..., -1], 1.0, rtol=ERROR_TOL)), \ 'arg error in match_distance: the array does not represent a valid distribution' @@ -349,6 +359,7 @@ def smooth(prevs, eps): :param eps: smoothing factor :return: array-like of shape `(n_classes,)` with the smoothed distribution """ + prevs = np.asarray(prevs) n_classes = prevs.shape[-1] return (prevs + eps) / (eps * n_classes + 1) diff --git a/quapy/evaluation.py b/quapy/evaluation.py index 5290e06..2805555 100644 --- a/quapy/evaluation.py +++ b/quapy/evaluation.py @@ -63,7 +63,7 @@ def prediction( protocol_with_predictions = protocol.on_preclassified_instances(pre_classified) return __prediction_helper(model.aggregate, protocol_with_predictions, verbose) else: - return __prediction_helper(model.quantify, protocol, verbose) + return __prediction_helper(model.predict, protocol, verbose) def __prediction_helper(quantification_fn, protocol: AbstractProtocol, verbose=False): diff --git a/quapy/functional.py b/quapy/functional.py index fac6647..408c62a 100644 --- a/quapy/functional.py +++ b/quapy/functional.py @@ -7,6 +7,29 @@ import scipy import numpy as np +# ------------------------------------------------------------------------------------------ +# General utils +# ------------------------------------------------------------------------------------------ + +def classes_from_labels(labels): + """ + Obtains a np.ndarray with the (sorted) classes + :param labels: array-like with the instances' labels + :return: a sorted np.ndarray with the class labels + """ + classes = np.unique(labels) + classes.sort() + return classes + + +def num_classes_from_labels(labels): + """ + Obtains the number of classes from an array-like of instances' labels + :param labels: array-like with the instances' labels + :return: int, the number of classes + """ + return len(classes_from_labels(labels)) + # ------------------------------------------------------------------------------------------ # Counter utils # ------------------------------------------------------------------------------------------
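The two new helpers are straightforward; an illustrative sketch (not part of the patch):

```python
import quapy.functional as F

labels = ['spam', 'ham', 'spam', 'ham', 'ham']
print(F.classes_from_labels(labels))      # ['ham' 'spam']
print(F.num_classes_from_labels(labels))  # 2
```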
diff --git a/quapy/method/__init__.py b/quapy/method/__init__.py index f352ca5..b95cb24 100644 --- a/quapy/method/__init__.py +++ b/quapy/method/__init__.py @@ -1,3 +1,7 @@ +import warnings +from sklearn.exceptions import ConvergenceWarning +warnings.simplefilter("ignore", ConvergenceWarning) + from . import confidence from . import base from . import aggregative @@ -63,3 +67,5 @@ QUANTIFICATION_METHODS = AGGREGATIVE_METHODS | NON_AGGREGATIVE_METHODS | META_ME + + diff --git a/quapy/method/_kdey.py b/quapy/method/_kdey.py index a396324..88613c2 100644 --- a/quapy/method/_kdey.py +++ b/quapy/method/_kdey.py @@ -1,13 +1,8 @@ -from typing import Union import numpy as np -from scipy.optimize import optimize, minimize_scalar - -from quapy.protocol import UPP from sklearn.base import BaseEstimator from sklearn.neighbors import KernelDensity import quapy as qp -from quapy.data import LabelledCollection from quapy.method.aggregative import AggregativeSoftQuantifier import quapy.functional as F @@ -102,82 +97,29 @@ class KDEyML(AggregativeSoftQuantifier, KDEBase): which corresponds to the maximum likelihood estimate. - :param classifier: a sklearn's Estimator that generates a binary classifier. + :param classifier: a scikit-learn's BaseEstimator, or None, in which case the classifier is taken to be + the one indicated in `qp.environ['DEFAULT_CLS']` + :param fit_classifier: whether to train the learner (default is True). Set to False if the + learner has been trained outside the quantifier. :param val_split: specifies the data used for generating classifier predictions. This specification can be made as float in (0, 1) indicating the proportion of stratified held-out validation set to be extracted from the training set; or as an integer (default 5), indicating that the predictions are to be generated in a `k`-fold cross-validation manner (with this integer indicating the value - for `k`); or as a collection defining the specific set of data to use for validation. - Alternatively, this set can be specified at fit time by indicating the exact set of data - on which the predictions are to be generated. + for `k`); or as a tuple (X,y) defining the specific set of data to use for validation.
:param bandwidth: float, the bandwidth of the Kernel :param random_state: a seed to be set before fitting any base quantifier (default None) """ - def __init__(self, classifier: BaseEstimator=None, val_split=5, bandwidth=0.1, auto_reduction=500, auto_repeats=25, random_state=None): - self.classifier = qp._get_classifier(classifier) - self.val_split = val_split - self.bandwidth = bandwidth - if bandwidth!='auto': - self.bandwidth = KDEBase._check_bandwidth(bandwidth) - - assert auto_reduction is None or (isinstance(auto_reduction, int) and auto_reduction>0), \ - (f'param {auto_reduction=} should either be None (no reduction) or a positive integer ' - f'(number of training instances).') - - self.auto_reduction = auto_reduction - self.auto_repeats = auto_repeats + def __init__(self, classifier: BaseEstimator=None, fit_classifier=True, val_split=5, bandwidth=0.1, + random_state=None): + super().__init__(classifier, fit_classifier, val_split) + self.bandwidth = KDEBase._check_bandwidth(bandwidth) self.random_state=random_state - def aggregation_fit(self, classif_predictions: LabelledCollection, data: LabelledCollection): - if self.bandwidth == 'auto': - self.bandwidth_val = self.auto_bandwidth_likelihood(classif_predictions) - else: - self.bandwidth_val = self.bandwidth - self.mix_densities = self.get_mixture_components(*classif_predictions.Xy, data.classes_, self.bandwidth_val) + def aggregation_fit(self, classif_predictions, labels): + self.mix_densities = self.get_mixture_components(classif_predictions, labels, self.classes_, self.bandwidth) return self - def auto_bandwidth_likelihood(self, classif_predictions: LabelledCollection): - train, val = classif_predictions.split_stratified(train_prop=0.5, random_state=self.random_state) - n_classes = classif_predictions.n_classes - epsilon = 1e-8 - repeats = self.auto_repeats - - auto_reduction = self.auto_reduction - if auto_reduction is None: - auto_reduction = len(classif_predictions) - else: - # reduce samples to speed up computation - train = train.sampling(auto_reduction) - - prot = UPP(val, sample_size=auto_reduction, repeats=repeats, random_state=self.random_state) - - def eval_bandwidth_nll(bandwidth): - mix_densities = self.get_mixture_components(*train.Xy, train.classes_, bandwidth) - loss_accum = 0 - for (sample, prevtrue) in prot(): - test_densities = [self.pdf(kde_i, sample) for kde_i in mix_densities] - - def neg_loglikelihood_prev(prev): - test_mixture_likelihood = sum(prev_i * dens_i for prev_i, dens_i in zip(prev, test_densities)) - test_loglikelihood = np.log(test_mixture_likelihood + epsilon) - nll = -np.sum(test_loglikelihood) - return nll - - pred_prev, neglikelihood = F.optim_minimize(neg_loglikelihood_prev, n_classes=n_classes, return_loss=True) - loss_accum += neglikelihood - return loss_accum - - r = minimize_scalar(eval_bandwidth_nll, bounds=(0.0001, 0.2), options={'xatol': 0.005}) - best_band = r.x - best_loss_value = r.fun - nit = r.nit - - # print(f'[{self.__class__.__name__}:autobandwidth] ' - # f'found bandwidth={best_band:.8f} after {nit=} iterations loss_val={best_loss_value:.5f})') - - return best_band - def aggregate(self, posteriors: np.ndarray): """ Searches for the mixture model parameter (the sought prevalence values) that maximizes the likelihood @@ -230,35 +172,35 @@ class KDEyHD(AggregativeSoftQuantifier, KDEBase): where the datapoints (trials) :math:`x_1,\\ldots,x_t\\sim_{\\mathrm{iid}} r` with :math:`r` the uniform distribution. 
- :param classifier: a sklearn's Estimator that generates a binary classifier. + :param classifier: a scikit-learn's BaseEstimator, or None, in which case the classifier is taken to be + the one indicated in `qp.environ['DEFAULT_CLS']` + :param fit_classifier: whether to train the learner (default is True). Set to False if the + learner has been trained outside the quantifier. :param val_split: specifies the data used for generating classifier predictions. This specification can be made as float in (0, 1) indicating the proportion of stratified held-out validation set to be extracted from the training set; or as an integer (default 5), indicating that the predictions are to be generated in a `k`-fold cross-validation manner (with this integer indicating the value - for `k`); or as a collection defining the specific set of data to use for validation. - Alternatively, this set can be specified at fit time by indicating the exact set of data - on which the predictions are to be generated. + for `k`); or as a tuple (X,y) defining the specific set of data to use for validation. :param bandwidth: float, the bandwidth of the Kernel :param random_state: a seed to be set before fitting any base quantifier (default None) :param montecarlo_trials: number of Monte Carlo trials (default 10000) """ - def __init__(self, classifier: BaseEstimator=None, val_split=5, divergence: str='HD', + def __init__(self, classifier: BaseEstimator=None, fit_classifier=True, val_split=5, divergence: str='HD', bandwidth=0.1, random_state=None, montecarlo_trials=10000): - - self.classifier = qp._get_classifier(classifier) - self.val_split = val_split + + super().__init__(classifier, fit_classifier, val_split) self.divergence = divergence self.bandwidth = KDEBase._check_bandwidth(bandwidth) self.random_state=random_state self.montecarlo_trials = montecarlo_trials - def aggregation_fit(self, classif_predictions: LabelledCollection, data: LabelledCollection): - self.mix_densities = self.get_mixture_components(*classif_predictions.Xy, data.classes_, self.bandwidth) + def aggregation_fit(self, classif_predictions, labels): + self.mix_densities = self.get_mixture_components(classif_predictions, labels, self.classes_, self.bandwidth) N = self.montecarlo_trials rs = self.random_state - n = data.n_classes + n = len(self.classes_) self.reference_samples = np.vstack([kde_i.sample(N//n, random_state=rs) for kde_i in self.mix_densities]) self.reference_classwise_densities = np.asarray([self.pdf(kde_j, self.reference_samples) for kde_j in self.mix_densities]) self.reference_density = np.mean(self.reference_classwise_densities, axis=0) # equiv. to (uniform @ self.reference_classwise_densities) @@ -322,20 +264,20 @@ class KDEyCS(AggregativeSoftQuantifier): The authors showed that this distribution matching admits a closed-form solution - :param classifier: a sklearn's Estimator that generates a binary classifier. + :param classifier: a scikit-learn's BaseEstimator, or None, in which case the classifier is taken to be + the one indicated in `qp.environ['DEFAULT_CLS']` + :param fit_classifier: whether to train the learner (default is True). Set to False if the + learner has been trained outside the quantifier. :param val_split: specifies the data used for generating classifier predictions. 
This specification can be made as float in (0, 1) indicating the proportion of stratified held-out validation set to be extracted from the training set; or as an integer (default 5), indicating that the predictions are to be generated in a `k`-fold cross-validation manner (with this integer indicating the value - for `k`); or as a collection defining the specific set of data to use for validation. - Alternatively, this set can be specified at fit time by indicating the exact set of data - on which the predictions are to be generated. + for `k`); or as a tuple (X,y) defining the specific set of data to use for validation. :param bandwidth: float, the bandwidth of the Kernel """ - def __init__(self, classifier: BaseEstimator=None, val_split=5, bandwidth=0.1): - self.classifier = qp._get_classifier(classifier) - self.val_split = val_split + def __init__(self, classifier: BaseEstimator=None, fit_classifier=True, val_split=5, bandwidth=0.1): + super().__init__(classifier, fit_classifier, val_split) self.bandwidth = KDEBase._check_bandwidth(bandwidth) def gram_matrix_mix_sum(self, X, Y=None): @@ -350,17 +292,17 @@ class KDEyCS(AggregativeSoftQuantifier): gram = norm_factor * rbf_kernel(X, Y, gamma=gamma) return gram.sum() - def aggregation_fit(self, classif_predictions: LabelledCollection, data: LabelledCollection): + def aggregation_fit(self, classif_predictions, labels): - P, y = classif_predictions.Xy - n = data.n_classes + P, y = classif_predictions, labels + n = len(self.classes_) assert all(sorted(np.unique(y)) == np.arange(n)), \ 'label name gaps not allowed in current implementation' # counts_inv keeps track of the relative weight of each datapoint within its class # (i.e., the weight in its KDE model) - counts_inv = 1 / (data.counts()) + counts_inv = 1 / (F.counts_from_labels(y, classes=self.classes_)) # tr_tr_sums corresponds to symbol \overline{B} in the paper tr_tr_sums = np.zeros(shape=(n,n), dtype=float) diff --git a/quapy/method/_neural.py b/quapy/method/_neural.py index 28d848a..404090f 100644 --- a/quapy/method/_neural.py +++ b/quapy/method/_neural.py @@ -21,13 +21,13 @@ class QuaNetTrainer(BaseQuantifier): Example: >>> import quapy as qp - >>> from quapy.method_name.meta import QuaNet + >>> from quapy.method.meta import QuaNet >>> from quapy.classification.neural import NeuralClassifierTrainer, CNNnet >>> >>> # use samples of 100 elements >>> qp.environ['SAMPLE_SIZE'] = 100 >>> - >>> # load the kindle dataset as text, and convert words to numerical indexes + >>> # load the Kindle dataset as text, and convert words to numerical indexes >>> dataset = qp.datasets.fetch_reviews('kindle', pickle=True) >>> qp.train.preprocessing.index(dataset, min_df=5, inplace=True) >>> @@ -37,12 +37,14 @@ class QuaNetTrainer(BaseQuantifier): >>> >>> # train QuaNet (QuaNet is an alias to QuaNetTrainer) >>> model = QuaNet(classifier, qp.environ['SAMPLE_SIZE'], device='cuda') - >>> model.fit(dataset.training) - >>> estim_prevalence = model.quantify(dataset.test.instances) + >>> model.fit(*dataset.training.Xy) + >>> estim_prevalence = model.predict(dataset.test.instances) :param classifier: an object implementing `fit` (i.e., that can be trained on labelled data), `predict_proba` (i.e., that can generate posterior probabilities of unlabelled examples) and `transform` (i.e., that can generate embedded representations of the unlabelled instances). + :param fit_classifier: whether to train the learner (default is True). Set to False if the + learner has been trained outside the quantifier. 
:param sample_size: integer, the sample size; default is None, meaning that the sample size should be taken from qp.environ["SAMPLE_SIZE"] :param n_epochs: integer, maximum number of training epochs @@ -64,6 +66,7 @@ class QuaNetTrainer(BaseQuantifier): def __init__(self, classifier, + fit_classifier=True, sample_size=None, n_epochs=100, tr_iter_per_poch=500, @@ -86,6 +89,7 @@ class QuaNetTrainer(BaseQuantifier): f'the classifier {classifier.__class__.__name__} does not seem to be able to produce posterior probabilities ' \ f'since it does not implement the method "predict_proba"' self.classifier = classifier + self.fit_classifier = fit_classifier self.sample_size = qp._get_sample_size(sample_size) self.n_epochs = n_epochs self.tr_iter = tr_iter_per_poch @@ -111,20 +115,21 @@ class QuaNetTrainer(BaseQuantifier): self.__check_params_colision(self.quanet_params, self.classifier.get_params()) self._classes_ = None - def fit(self, data: LabelledCollection, fit_classifier=True): + def fit(self, X, y): """ Trains QuaNet. - :param data: the training data on which to train QuaNet. If `fit_classifier=True`, the data will be split in + :param X: the training instances on which to train QuaNet. If `fit_classifier=True`, the data will be split in 40/40/20 for training the classifier, training QuaNet, and validating QuaNet, respectively. If `fit_classifier=False`, the data will be split in 66/34 for training QuaNet and validating it, respectively. - :param fit_classifier: if True, trains the classifier on a split containing 40% of the data + :param y: the labels of X :return: self """ + data = LabelledCollection(X, y) self._classes_ = data.classes_ os.makedirs(self.checkpointdir, exist_ok=True) - if fit_classifier: + if self.fit_classifier: classifier_data, unused_data = data.split_stratified(0.4) train_data, valid_data = unused_data.split_stratified(0.66) # 0.66 split of 60% makes 40% and 20% self.classifier.fit(*classifier_data.Xy) @@ -144,13 +149,13 @@ class QuaNetTrainer(BaseQuantifier): train_data_embed = LabelledCollection(self.classifier.transform(train_data.instances), train_data.labels, self._classes_) self.quantifiers = { - 'cc': CC(self.classifier).fit(None, fit_classifier=False), - 'acc': ACC(self.classifier).fit(None, fit_classifier=False, val_split=valid_data), - 'pcc': PCC(self.classifier).fit(None, fit_classifier=False), - 'pacc': PACC(self.classifier).fit(None, fit_classifier=False, val_split=valid_data), + 'cc': CC(self.classifier, fit_classifier=False).fit(*valid_data.Xy), + 'acc': ACC(self.classifier, fit_classifier=False).fit(*valid_data.Xy), + 'pcc': PCC(self.classifier, fit_classifier=False).fit(*valid_data.Xy), + 'pacc': PACC(self.classifier, fit_classifier=False).fit(*valid_data.Xy), } if classifier_data is not None: - self.quantifiers['emq'] = EMQ(self.classifier).fit(classifier_data, fit_classifier=False) + self.quantifiers['emq'] = EMQ(self.classifier, fit_classifier=False).fit(*valid_data.Xy) self.status = { 'tr-loss': -1, @@ -201,9 +206,9 @@ class QuaNetTrainer(BaseQuantifier): return prevs_estim - def quantify(self, instances): - posteriors = self.classifier.predict_proba(instances) - embeddings = self.classifier.transform(instances) + def predict(self, X): + posteriors = self.classifier.predict_proba(X) + embeddings = self.classifier.transform(X) quant_estims = self._get_aggregative_estims(posteriors) self.quanet.eval() with torch.no_grad(): diff --git a/quapy/method/_threshold_optim.py b/quapy/method/_threshold_optim.py index 72d0c95..628f01a 100644 --- 
a/quapy/method/_threshold_optim.py +++ b/quapy/method/_threshold_optim.py @@ -18,18 +18,23 @@ class ThresholdOptimization(BinaryAggregativeQuantifier): that would allow for more true positives and many more false positives, on the grounds this would deliver larger denominators. - :param classifier: a sklearn's Estimator that generates a classifier - :param val_split: indicates the proportion of data to be used as a stratified held-out validation set in which the - misclassification rates are to be estimated. - This parameter can be indicated as a real value (between 0 and 1), representing a proportion of - validation data, or as an integer, indicating that the misclassification rates should be estimated via - `k`-fold cross validation (this integer stands for the number of folds `k`, defaults 5), or as a - :class:`quapy.data.base.LabelledCollection` (the split itself). + :param classifier: a scikit-learn's BaseEstimator, or None, in which case the classifier is taken to be + the one indicated in `qp.environ['DEFAULT_CLS']` + + :param fit_classifier: whether to train the learner (default is True). Set to False if the + learner has been trained outside the quantifier. + + :param val_split: specifies the data used for generating classifier predictions. This specification + can be made as float in (0, 1) indicating the proportion of stratified held-out validation set to + be extracted from the training set; or as an integer (default 5), indicating that the predictions + are to be generated in a `k`-fold cross-validation manner (with this integer indicating the value + for `k`); or as a tuple (X,y) defining the specific set of data to use for validation. + + :param n_jobs: number of parallel workers """ - def __init__(self, classifier: BaseEstimator=None, val_split=None, n_jobs=None): - self.classifier = qp._get_classifier(classifier) - self.val_split = val_split + def __init__(self, classifier: BaseEstimator=None, fit_classifier=True, val_split=None, n_jobs=None): + super().__init__(classifier, fit_classifier, val_split) self.n_jobs = qp._get_njobs(n_jobs) @abstractmethod @@ -115,8 +120,8 @@ class ThresholdOptimization(BinaryAggregativeQuantifier): return 0 return FP / (FP + TN) - def aggregation_fit(self, classif_predictions: LabelledCollection, data: LabelledCollection): - decision_scores, y = classif_predictions.Xy + def aggregation_fit(self, classif_predictions, labels): + decision_scores, y = classif_predictions, labels # the standard behavior is to keep the best threshold only self.tpr, self.fpr, self.threshold = self._eval_candidate_thresholds(decision_scores, y)[0] return self @@ -134,17 +139,22 @@ class T50(ThresholdOptimization): for the threshold that makes `tpr` closest to 0.5. The goal is to bring improved stability to the denominator of the adjustment. - :param classifier: a sklearn's Estimator that generates a classifier - :param val_split: indicates the proportion of data to be used as a stratified held-out validation set in which the - misclassification rates are to be estimated. - This parameter can be indicated as a real value (between 0 and 1), representing a proportion of - validation data, or as an integer, indicating that the misclassification rates should be estimated via - `k`-fold cross validation (this integer stands for the number of folds `k`, defaults 5), or as a - :class:`quapy.data.base.LabelledCollection` (the split itself). 
+ :param classifier: a scikit-learn's BaseEstimator, or None, in which case the classifier is taken to be + the one indicated in `qp.environ['DEFAULT_CLS']` + + :param fit_classifier: whether to train the learner (default is True). Set to False if the + learner has been trained outside the quantifier. + + :param val_split: specifies the data used for generating classifier predictions. This specification + can be made as float in (0, 1) indicating the proportion of stratified held-out validation set to + be extracted from the training set; or as an integer (default 5), indicating that the predictions + are to be generated in a `k`-fold cross-validation manner (with this integer indicating the value + for `k`); or as a tuple (X,y) defining the specific set of data to use for validation. + """ - def __init__(self, classifier: BaseEstimator=None, val_split=5): - super().__init__(classifier, val_split) + def __init__(self, classifier: BaseEstimator=None, fit_classifier=True, val_split=5): + super().__init__(classifier, fit_classifier, val_split) def condition(self, tpr, fpr) -> float: return abs(tpr - 0.5) @@ -158,17 +168,20 @@ class MAX(ThresholdOptimization): for the threshold that maximizes `tpr-fpr`. The goal is to bring improved stability to the denominator of the adjustment. - :param classifier: a sklearn's Estimator that generates a classifier - :param val_split: indicates the proportion of data to be used as a stratified held-out validation set in which the - misclassification rates are to be estimated. - This parameter can be indicated as a real value (between 0 and 1), representing a proportion of - validation data, or as an integer, indicating that the misclassification rates should be estimated via - `k`-fold cross validation (this integer stands for the number of folds `k`, defaults 5), or as a - :class:`quapy.data.base.LabelledCollection` (the split itself). + :param classifier: a scikit-learn's BaseEstimator, or None, in which case the classifier is taken to be + the one indicated in `qp.environ['DEFAULT_CLS']` + :param fit_classifier: whether to train the learner (default is True). Set to False if the + learner has been trained outside the quantifier. + :param val_split: specifies the data used for generating classifier predictions. This specification + can be made as float in (0, 1) indicating the proportion of stratified held-out validation set to + be extracted from the training set; or as an integer (default 5), indicating that the predictions + are to be generated in a `k`-fold cross-validation manner (with this integer indicating the value + for `k`); or as a tuple (X,y) defining the specific set of data to use for validation. + """ - def __init__(self, classifier: BaseEstimator=None, val_split=5): - super().__init__(classifier, val_split) + def __init__(self, classifier: BaseEstimator=None, fit_classifier=True, val_split=5): + super().__init__(classifier, fit_classifier, val_split) def condition(self, tpr, fpr) -> float: # MAX strives to maximize (tpr - fpr), which is equivalent to minimize (fpr - tpr) @@ -183,17 +196,20 @@ class X(ThresholdOptimization): for the threshold that yields `tpr=1-fpr`. The goal is to bring improved stability to the denominator of the adjustment. - :param classifier: a sklearn's Estimator that generates a classifier - :param val_split: indicates the proportion of data to be used as a stratified held-out validation set in which the - misclassification rates are to be estimated. 
- This parameter can be indicated as a real value (between 0 and 1), representing a proportion of - validation data, or as an integer, indicating that the misclassification rates should be estimated via - `k`-fold cross validation (this integer stands for the number of folds `k`, defaults 5), or as a - :class:`quapy.data.base.LabelledCollection` (the split itself). + :param classifier: a scikit-learn's BaseEstimator, or None, in which case the classifier is taken to be + the one indicated in `qp.environ['DEFAULT_CLS']` + :param fit_classifier: whether to train the learner (default is True). Set to False if the + learner has been trained outside the quantifier. + :param val_split: specifies the data used for generating classifier predictions. This specification + can be made as float in (0, 1) indicating the proportion of stratified held-out validation set to + be extracted from the training set; or as an integer (default 5), indicating that the predictions + are to be generated in a `k`-fold cross-validation manner (with this integer indicating the value + for `k`); or as a tuple (X,y) defining the specific set of data to use for validation. + """ - def __init__(self, classifier: BaseEstimator=None, val_split=5): - super().__init__(classifier, val_split) + def __init__(self, classifier: BaseEstimator=None, fit_classifier=True, val_split=5): + super().__init__(classifier, fit_classifier, val_split) def condition(self, tpr, fpr) -> float: return abs(1 - (tpr + fpr)) @@ -207,22 +223,25 @@ class MS(ThresholdOptimization): class prevalence estimates for all decision thresholds and returns the median of them all. The goal is to bring improved stability to the denominator of the adjustment. - :param classifier: a sklearn's Estimator that generates a classifier - :param val_split: indicates the proportion of data to be used as a stratified held-out validation set in which the - misclassification rates are to be estimated. - This parameter can be indicated as a real value (between 0 and 1), representing a proportion of - validation data, or as an integer, indicating that the misclassification rates should be estimated via - `k`-fold cross validation (this integer stands for the number of folds `k`, defaults 5), or as a - :class:`quapy.data.base.LabelledCollection` (the split itself). + :param classifier: a scikit-learn's BaseEstimator, or None, in which case the classifier is taken to be + the one indicated in `qp.environ['DEFAULT_CLS']` + :param fit_classifier: whether to train the learner (default is True). Set to False if the + learner has been trained outside the quantifier. + :param val_split: specifies the data used for generating classifier predictions. This specification + can be made as float in (0, 1) indicating the proportion of stratified held-out validation set to + be extracted from the training set; or as an integer (default 5), indicating that the predictions + are to be generated in a `k`-fold cross-validation manner (with this integer indicating the value + for `k`); or as a tuple (X,y) defining the specific set of data to use for validation. 
""" - def __init__(self, classifier: BaseEstimator=None, val_split=5): - super().__init__(classifier, val_split) + + def __init__(self, classifier: BaseEstimator=None, fit_classifier=True, val_split=5): + super().__init__(classifier, fit_classifier, val_split) def condition(self, tpr, fpr) -> float: return 1 - def aggregation_fit(self, classif_predictions: LabelledCollection, data: LabelledCollection): - decision_scores, y = classif_predictions.Xy + def aggregation_fit(self, classif_predictions, labels): + decision_scores, y = classif_predictions, labels # keeps all candidates tprs_fprs_thresholds = self._eval_candidate_thresholds(decision_scores, y) self.tprs = tprs_fprs_thresholds[:, 0] @@ -246,16 +265,19 @@ class MS2(MS): which `tpr-fpr>0.25` The goal is to bring improved stability to the denominator of the adjustment. - :param classifier: a sklearn's Estimator that generates a classifier - :param val_split: indicates the proportion of data to be used as a stratified held-out validation set in which the - misclassification rates are to be estimated. - This parameter can be indicated as a real value (between 0 and 1), representing a proportion of - validation data, or as an integer, indicating that the misclassification rates should be estimated via - `k`-fold cross validation (this integer stands for the number of folds `k`, defaults 5), or as a - :class:`quapy.data.base.LabelledCollection` (the split itself). + :param classifier: a scikit-learn's BaseEstimator, or None, in which case the classifier is taken to be + the one indicated in `qp.environ['DEFAULT_CLS']` + :param fit_classifier: whether to train the learner (default is True). Set to False if the + learner has been trained outside the quantifier. + :param val_split: specifies the data used for generating classifier predictions. This specification + can be made as float in (0, 1) indicating the proportion of stratified held-out validation set to + be extracted from the training set; or as an integer (default 5), indicating that the predictions + are to be generated in a `k`-fold cross-validation manner (with this integer indicating the value + for `k`); or as a tuple (X,y) defining the specific set of data to use for validation. 
""" - def __init__(self, classifier: BaseEstimator=None, val_split=5): - super().__init__(classifier, val_split) + + def __init__(self, classifier: BaseEstimator=None, fit_classifier=True, val_split=5): + super().__init__(classifier, fit_classifier, val_split) def discard(self, tpr, fpr) -> bool: return (tpr-fpr) <= 0.25 diff --git a/quapy/method/aggregative.py b/quapy/method/aggregative.py index 80e2e08..aa4d816 100644 --- a/quapy/method/aggregative.py +++ b/quapy/method/aggregative.py @@ -1,12 +1,16 @@ from abc import ABC, abstractmethod +from argparse import ArgumentError from copy import deepcopy from typing import Callable, Literal, Union import numpy as np from abstention.calibration import NoBiasVectorScaling, TempScaling, VectorScaling +from numpy.f2py.crackfortran import true_intent_list from sklearn.base import BaseEstimator from sklearn.calibration import CalibratedClassifierCV +from sklearn.exceptions import NotFittedError from sklearn.metrics import confusion_matrix -from sklearn.model_selection import cross_val_predict +from sklearn.model_selection import cross_val_predict, train_test_split +from sklearn.utils.validation import check_is_fitted import quapy as qp import quapy.functional as F @@ -14,6 +18,11 @@ from quapy.functional import get_divergence from quapy.classification.svmperf import SVMperf from quapy.data import LabelledCollection from quapy.method.base import BaseQuantifier, BinaryQuantifier, OneVsAllGeneric +from quapy.method import _bayesian + +# import warnings +# from sklearn.exceptions import ConvergenceWarning +# warnings.filterwarnings("ignore", category=ConvergenceWarning) # Abstract classes @@ -33,20 +42,80 @@ class AggregativeQuantifier(BaseQuantifier, ABC): The method :meth:`quantify` comes with a default implementation based on :meth:`classify` and :meth:`aggregate`. + + :param classifier: a scikit-learn's BaseEstimator, or None, in which case the classifier is taken to be + the one indicated in `qp.environ['DEFAULT_CLS']` + :param fit_classifier: whether to train the learner (default is True). Set to False if the + learner has been trained outside the quantifier. + :param val_split: specifies the data used for generating classifier predictions. This specification + can be made as float in (0, 1) indicating the proportion of stratified held-out validation set to + be extracted from the training set; or as an integer (default 5), indicating that the predictions + are to be generated in a `k`-fold cross-validation manner (with this integer indicating the value + for `k`); or as a tuple `(X,y)` defining the specific set of data to use for validation. Set to + None when the method does not require any validation data, in order to avoid that some portion of + the training data be wasted. """ - val_split_ = None + def __init__(self, + classifier: Union[None,BaseEstimator], + fit_classifier:bool=True, + val_split:Union[int,float,tuple,None]=5): - @property - def val_split(self): - return self.val_split_ + self.classifier = qp._get_classifier(classifier) + self.fit_classifier = fit_classifier + self.val_split = val_split - @val_split.setter - def val_split(self, val_split): - if isinstance(val_split, LabelledCollection): - print('warning: setting val_split with a LabelledCollection will be inefficient in' - 'model selection. 
Rather pass the LabelledCollection at fit time') - self.val_split_ = val_split + # basic type checks + assert hasattr(self.classifier, 'fit'), \ + 'the classifier does not implement "fit"' + + assert isinstance(fit_classifier, bool), \ + f'unexpected type for {fit_classifier=}; must be True or False' + + # val_split is indicated as a number of folds for cross-validation + if isinstance(val_split, int): + assert val_split > 1, \ + (f'when {val_split=} is indicated as an integer, it represents the number of folds in a kFCV ' + f'and must thus be >1') + if val_split==5 and not fit_classifier: + print(f'Warning: {val_split=} will be ignored when the classifier is already trained ' + f'({fit_classifier=}). Parameter {self.val_split=} will be set to None. Set {val_split=} ' + f'to None to avoid this warning.') + self.val_split=None + if val_split!=5: + assert fit_classifier, (f'Parameter {val_split=} has been modified, but {fit_classifier=} ' + f'indicates the classifier should not be retrained.') + # val_split is indicated as a fraction of validation instances + elif isinstance(val_split, float): + assert 0 < val_split < 1, \ + (f'when {val_split=} is indicated as a float, it represents the fraction of training instances ' + f'to be used for validation, and must thus be in the range (0,1)') + assert fit_classifier, (f'when {val_split=} is indicated as a float (the fraction of training instances ' + f'to be used for validation), the parameter {fit_classifier=} must be True') + # val_split is indicated as a validation collection (X,y) + elif isinstance(val_split, tuple): + assert len(val_split) == 2, \ + (f'when {val_split=} is indicated as a tuple, it represents the collection (X,y) on which the ' + f'validation must be performed, but this seems to have different cardinality') + elif val_split is None: + pass + else: + raise ValueError(f'unexpected type for {val_split=}') + + # classifier is fitted? + try: + check_is_fitted(self.classifier) + fitted = True + except NotFittedError: + fitted = False + + # consistency checks: fit_classifier? + if self.fit_classifier: + if fitted: + raise RuntimeWarning(f'the classifier is already fitted, but {fit_classifier=} was requested') + else: + assert fitted, (f'{fit_classifier=} requires the classifier to be already trained, ' + f'but this does not seem to be the case') def _check_init_parameters(self): """ @@ -58,124 +127,89 @@ class AggregativeQuantifier(BaseQuantifier, ABC): """ pass - def _check_non_empty_classes(self, data: LabelledCollection): + def _check_non_empty_classes(self, y): """ Asserts all classes have positive instances. - :param data: LabelledCollection + :param y: array-like of shape `(n_instances,)` with the label for each instance :return: Nothing. May raise an exception. """ - sample_prevs = data.prevalence() - empty_classes = np.argwhere(sample_prevs==0).flatten() - if len(empty_classes)>0: - empty_class_names = data.classes_[empty_classes] + sample_prevs = F.prevalence_from_labels(y, self.classes_) + empty_classes = np.argwhere(sample_prevs == 0).flatten() + if len(empty_classes) > 0: + empty_class_names = self.classes_[empty_classes] raise ValueError(f'classes {empty_class_names} have no training examples')
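Taken together, these construction-time checks admit a few canonical configurations; a hedged sketch (not part of the patch), on synthetic data, of the new `fit(X, y)` interface and the accepted `val_split` forms:

```python
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from quapy.method.aggregative import PACC

Xdata, ydata = make_classification(n_samples=500, random_state=0)  # synthetic data
Xtr, Xval, ytr, yval = train_test_split(Xdata, ydata, test_size=0.25, stratify=ydata)

PACC(LogisticRegression(), val_split=5).fit(Xtr, ytr)             # k-fold CV predictions (k=5)
PACC(LogisticRegression(), val_split=0.3).fit(Xtr, ytr)           # 30% stratified held-out split
PACC(LogisticRegression(), val_split=(Xval, yval)).fit(Xtr, ytr)  # explicit validation collection

# a classifier trained outside the quantifier: disable refitting and pass validation data
clf = LogisticRegression().fit(Xtr, ytr)
PACC(clf, fit_classifier=False, val_split=(Xval, yval)).fit(Xtr, ytr)
```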
This comes down to training a classifier and an aggregation function. + Trains the aggregative quantifier. This comes down to training a classifier (if requested) and an + aggregation function. - :param data: a :class:`quapy.data.base.LabelledCollection` consisting of the training data - :param fit_classifier: whether to train the learner (default is True). Set to False if the - learner has been trained outside the quantifier. - :param val_split: specifies the data used for generating classifier predictions. This specification - can be made as float in (0, 1) indicating the proportion of stratified held-out validation set to - be extracted from the training set; or as an integer (default 5), indicating that the predictions - are to be generated in a `k`-fold cross-validation manner (with this integer indicating the value - for `k`); or as a collection defining the specific set of data to use for validation. - Alternatively, this set can be specified at fit time by indicating the exact set of data - on which the predictions are to be generated. + :param X: array-like of shape `(n_samples, n_features)`, the training instances + :param y: array-like of shape `(n_samples,)`, the labels :return: self """ self._check_init_parameters() - classif_predictions = self.classifier_fit_predict(data, fit_classifier, predict_on=val_split) - self.aggregation_fit(classif_predictions, data) + classif_predictions, labels = self.classifier_fit_predict(X, y) + self.aggregation_fit(classif_predictions, labels) return self - def classifier_fit_predict(self, data: LabelledCollection, fit_classifier=True, predict_on=None): + def classifier_fit_predict(self, X, y): """ Trains the classifier if requested (`fit_classifier=True`) and generate the necessary predictions to train the aggregation function. - :param data: a :class:`quapy.data.base.LabelledCollection` consisting of the training data - :param fit_classifier: whether to train the learner (default is True). Set to False if the - learner has been trained outside the quantifier. - :param predict_on: specifies the set on which predictions need to be issued. This parameter can - be specified as None (default) to indicate no prediction is needed; a float in (0, 1) to - indicate the proportion of instances to be used for predictions (the remainder is used for - training); an integer >1 to indicate that the predictions must be generated via k-fold - cross-validation, using this integer as k; or the data sample itself on which to generate - the predictions. 
+ :param X: array-like of shape `(n_samples, n_features)`, the training instances + :param y: array-like of shape `(n_samples,)`, the labels """ - assert isinstance(fit_classifier, bool), 'unexpected type for "fit_classifier", must be boolean' + self._check_classifier(adapt_if_necessary=self.fit_classifier) - self._check_classifier(adapt_if_necessary=(self._classifier_method() == 'predict_proba')) + # self._check_non_empty_classes(y) - if fit_classifier: - self._check_non_empty_classes(data) - - if predict_on is None: - if not fit_classifier: - predict_on = data - if isinstance(self.val_split, LabelledCollection) and self.val_split!=predict_on: - raise ValueError(f'{fit_classifier=} but a LabelledCollection was provided as val_split ' - f'in __init__ that is not the same as the LabelledCollection provided in fit.') - if predict_on is None: - predict_on = self.val_split - - if predict_on is None: - if fit_classifier: - self.classifier.fit(*data.Xy) - predictions = None - elif isinstance(predict_on, float): - if fit_classifier: - if not (0. < predict_on < 1.): - raise ValueError(f'proportion {predict_on=} out of range, must be in (0,1)') - train, val = data.split_stratified(train_prop=(1 - predict_on)) - self.classifier.fit(*train.Xy) - predictions = LabelledCollection(self.classify(val.X), val.y, classes=data.classes_) + predictions, labels = None, None + if isinstance(self.val_split, int): + assert self.fit_classifier, f'{self.__class__}: unexpected value for {self.fit_classifier=}' + num_folds = self.val_split + n_jobs = self.n_jobs if hasattr(self, 'n_jobs') else qp._get_njobs(None) + predictions = cross_val_predict( + self.classifier, X, y, cv=num_folds, n_jobs=n_jobs, method=self._classifier_method() + ) + labels = y + self.classifier.fit(X, y) + elif isinstance(self.val_split, float): + assert self.fit_classifier, f'unexpected value for {self.fit_classifier=}' + train_prop = 1. - self.val_split + Xtr, Xval, ytr, yval = train_test_split(X, y, train_size=train_prop, stratify=y) + self.classifier.fit(Xtr, ytr) + predictions = self.classify(Xval) + labels = yval + elif isinstance(self.val_split, tuple): + Xval, yval = self.val_split + if self.fit_classifier: + self.classifier.fit(X, y) + predictions = self.classify(Xval) + labels = yval + elif self.val_split is None: + if self.fit_classifier: + self.classifier.fit(X, y) + predictions, labels = None, None else: - raise ValueError(f'wrong type for predict_on: since fit_classifier=False, ' - f'the set on which predictions have to be issued must be ' - f'explicitly indicated') - - elif isinstance(predict_on, LabelledCollection): - if fit_classifier: - self.classifier.fit(*data.Xy) - predictions = LabelledCollection(self.classify(predict_on.X), predict_on.y, classes=predict_on.classes_) - - elif isinstance(predict_on, int): - if fit_classifier: - if predict_on <= 1: - raise ValueError(f'invalid value {predict_on} in fit. 
' f'Specify a integer >1 for kFCV estimation.') - else: - n_jobs = self.n_jobs if hasattr(self, 'n_jobs') else qp._get_njobs(None) - predictions = cross_val_predict( - self.classifier, *data.Xy, cv=predict_on, n_jobs=n_jobs, method=self._classifier_method()) - predictions = LabelledCollection(predictions, data.y, classes=data.classes_) - self.classifier.fit(*data.Xy) - else: - raise ValueError(f'wrong type for predict_on: since fit_classifier=False, ' - f'the set on which predictions have to be issued must be ' - f'explicitly indicated') + else: - raise ValueError( - f'error: param "predict_on" ({type(predict_on)}) not understood; ' - f'use either a float indicating the split proportion, or a ' - f'tuple (X,y) indicating the validation partition') + raise ValueError(f'unexpected type for {self.val_split=}') - return predictions + return predictions, labels @abstractmethod - def aggregation_fit(self, classif_predictions: LabelledCollection, data: LabelledCollection): + def aggregation_fit(self, classif_predictions, labels): """ Trains the aggregation function. - :param classif_predictions: a :class:`quapy.data.base.LabelledCollection` containing, - as instances, the predictions issued by the classifier and, as labels, the true labels - :param data: a :class:`quapy.data.base.LabelledCollection` consisting of the training data + :param classif_predictions: array-like with the classification predictions + (whatever the method :meth:`classify` returns) + :param labels: array-like with the true labels associated to each classifier prediction """ ... @@ -197,16 +231,16 @@ class AggregativeQuantifier(BaseQuantifier, ABC): """ self.classifier_ = classifier - def classify(self, instances): + def classify(self, X): """ Provides the label predictions for the given instances. The predictions should respect the format expected by :meth:`aggregate`, e.g., posterior probabilities for probabilistic quantifiers, or crisp predictions for non-probabilistic quantifiers. The default one is "decision_function". - :param instances: array-like of shape `(n_instances, n_features,)` - :return: np.ndarray of shape `(n_instances,)` with label predictions + :param X: array-like of shape `(n_samples, n_features)`, the data instances + :return: np.ndarray of shape `(n_instances,)` with classifier predictions """ - return getattr(self.classifier, self._classifier_method())(instances) + return getattr(self.classifier, self._classifier_method())(X) def _classifier_method(self): """ @@ -221,28 +255,28 @@ class AggregativeQuantifier(BaseQuantifier, ABC): Guarantees that the underlying classifier implements the method required for issuing predictions, i.e., the method indicated by the :meth:`_classifier_method` - :param adapt_if_necessary: if True, the method will try to comply with the required specifications + :param adapt_if_necessary: unused unless overridden """ assert hasattr(self.classifier, self._classifier_method()), \ f"the classifier does not implement the required {self._classifier_method()} method" - def quantify(self, instances): + def predict(self, X): """ Generates class prevalence estimates for the sample's instances by aggregating the label predictions generated by the classifier. - :param instances: array-like + :param X: array-like of shape `(n_samples, n_features)`, the data instances :return: `np.ndarray` of shape `(n_classes,)` with class prevalence estimates.
""" - classif_predictions = self.classify(instances) + classif_predictions = self.classify(X) return self.aggregate(classif_predictions) @abstractmethod def aggregate(self, classif_predictions: np.ndarray): """ - Implements the aggregation of label predictions. + Implements the aggregation of the classifier predictions. - :param classif_predictions: `np.ndarray` of label predictions + :param classif_predictions: `np.ndarray` of classifier predictions :return: `np.ndarray` of shape `(n_classes,)` with class prevalence estimates. """ ... @@ -253,7 +287,7 @@ class AggregativeQuantifier(BaseQuantifier, ABC): Class labels, in the same order in which class prevalence values are to be computed. This default implementation actually returns the class labels of the learner. - :return: array-like + :return: array-like, the class labels """ return self.classifier.classes_ @@ -315,7 +349,7 @@ class AggregativeSoftQuantifier(AggregativeQuantifier, ABC): class BinaryAggregativeQuantifier(AggregativeQuantifier, BinaryQuantifier): - + @property def pos_label(self): return self.classifier.classes_[1] @@ -324,9 +358,9 @@ class BinaryAggregativeQuantifier(AggregativeQuantifier, BinaryQuantifier): def neg_label(self): return self.classifier.classes_[0] - def fit(self, data: LabelledCollection, fit_classifier=True, val_split=None): - self._check_binary(data, self.__class__.__name__) - return super().fit(data, fit_classifier, val_split) + def fit(self, X, y): + self._check_binary(y, self.__class__.__name__) + return super().fit(X, y) # Methods @@ -338,16 +372,15 @@ class CC(AggregativeCrispQuantifier): :param classifier: a sklearn's Estimator that generates a classifier """ + def __init__(self, classifier: BaseEstimator = None, fit_classifier: bool = True): + super().__init__(classifier, fit_classifier, val_split=None) - def __init__(self, classifier: BaseEstimator=None): - self.classifier = qp._get_classifier(classifier) - - def aggregation_fit(self, classif_predictions: LabelledCollection, data: LabelledCollection): + def aggregation_fit(self, classif_predictions, labels): """ Nothing to do here! - :param classif_predictions: not used - :param data: not used + :param classif_predictions: unused + :param labels: unused """ pass @@ -355,7 +388,7 @@ class CC(AggregativeCrispQuantifier): """ Computes class prevalence estimates by counting the prevalence of each of the predicted labels. - :param classif_predictions: array-like with label predictions + :param classif_predictions: array-like with classifier predictions :return: `np.ndarray` of shape `(n_classes,)` with class prevalence estimates. """ return F.prevalence_from_labels(classif_predictions, self.classes_) @@ -369,15 +402,15 @@ class PCC(AggregativeSoftQuantifier): :param classifier: a sklearn's Estimator that generates a classifier """ - def __init__(self, classifier: BaseEstimator=None): - self.classifier = qp._get_classifier(classifier) + def __init__(self, classifier: BaseEstimator = None, fit_classifier: bool = True): + super().__init__(classifier, fit_classifier, val_split=None) - def aggregation_fit(self, classif_predictions: LabelledCollection, data: LabelledCollection): + def aggregation_fit(self, classif_predictions, labels): """ Nothing to do here! 
- :param classif_predictions: not used - :param data: not used + :param classif_predictions: unused + :param labels: unused """ pass @@ -391,15 +424,17 @@ class ACC(AggregativeCrispQuantifier): the "adjusted" variant of :class:`CC`, that corrects the predictions of CC according to the `misclassification rates`. - :param classifier: a sklearn's Estimator that generates a classifier + :param classifier: a scikit-learn's BaseEstimator, or None, in which case the classifier is taken to be + the one indicated in `qp.environ['DEFAULT_CLS']` + + :param fit_classifier: whether to train the learner (default is True). Set to False if the + learner has been trained outside the quantifier. :param val_split: specifies the data used for generating classifier predictions. This specification can be made as float in (0, 1) indicating the proportion of stratified held-out validation set to be extracted from the training set; or as an integer (default 5), indicating that the predictions are to be generated in a `k`-fold cross-validation manner (with this integer indicating the value - for `k`); or as a collection defining the specific set of data to use for validation. - Alternatively, this set can be specified at fit time by indicating the exact set of data - on which the predictions are to be generated. + for `k`); or as a tuple (X,y) defining the specific set of data to use for validation. :param str method: adjustment method to be used: @@ -430,17 +465,18 @@ class ACC(AggregativeCrispQuantifier): :param n_jobs: number of parallel workers """ + def __init__( self, - classifier: BaseEstimator=None, - val_split=5, + classifier: BaseEstimator = None, + fit_classifier = True, + val_split = 5, solver: Literal['minimize', 'exact', 'exact-raise', 'exact-cc'] = 'minimize', method: Literal['inversion', 'invariant-ratio'] = 'inversion', norm: Literal['clip', 'mapsimplex', 'condsoftmax'] = 'clip', n_jobs=None, ): - self.classifier = qp._get_classifier(classifier) - self.val_split = val_split + super().__init__(classifier, fit_classifier, val_split) self.n_jobs = qp._get_njobs(n_jobs) self.solver = solver self.method = method @@ -451,24 +487,29 @@ class ACC(AggregativeCrispQuantifier): NORMALIZATIONS = ['clip', 'mapsimplex', 'condsoftmax', None]
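For concreteness, a sketch (not part of the patch) showing that all of ACC's adjustment options are now fixed at construction time, on synthetic data:

```python
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from quapy.method.aggregative import ACC

Xtr, ytr = make_classification(n_samples=500, random_state=0)  # synthetic data

acc = ACC(LogisticRegression(),
          fit_classifier=True,
          val_split=5,          # 5-fold CV predictions for estimating the misclassification rates
          solver='minimize',    # one of ACC.SOLVERS
          method='inversion',   # or 'invariant-ratio'
          norm='clip')          # one of ACC.NORMALIZATIONS
acc.fit(Xtr, ytr)
```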
- Alternatively, this set can be specified at fit time by indicating the exact set of data - on which the predictions are to be generated. + can be made as float in (0, 1) indicating the proportion of stratified held-out validation set to + be extracted from the training set; or as an integer (default 5), indicating that the predictions + are to be generated in a `k`-fold cross-validation manner (with this integer indicating the value + for `k`); or as a tuple (X,y) defining the specific set of data to use for validation. + :param n_jobs: number of parallel workers + :return: an instance of ACC configured so that it implements the Invariant Ratio Estimator """ - return ACC(classifier, val_split=val_split, method='invariant-ratio', norm='mapsimplex', n_jobs=n_jobs) + return ACC(classifier, fit_classifier=fit_classifier, val_split=val_split, method='invariant-ratio', norm='mapsimplex', n_jobs=n_jobs) def _check_init_parameters(self): if self.solver not in ACC.SOLVERS: @@ -478,16 +519,15 @@ class ACC(AggregativeCrispQuantifier): if self.norm not in ACC.NORMALIZATIONS: raise ValueError(f"unknown normalization; valid ones are {ACC.NORMALIZATIONS}") - def aggregation_fit(self, classif_predictions: LabelledCollection, data: LabelledCollection): + def aggregation_fit(self, classif_predictions, labels): """ Estimates the misclassification rates. - - :param classif_predictions: a :class:`quapy.data.base.LabelledCollection` containing, - as instances, the label predictions issued by the classifier and, as labels, the true labels - :param data: a :class:`quapy.data.base.LabelledCollection` consisting of the training data + :param classif_predictions: array-like with the predicted labels + :param labels: array-like with the true labels associated to each predicted label """ - pred_labels, true_labels = classif_predictions.Xy - self.cc = CC(self.classifier) + true_labels = labels + pred_labels = classif_predictions + self.cc = CC(self.classifier, fit_classifier=False) self.Pte_cond_estim_ = ACC.getPteCondEstim(self.classifier.classes_, true_labels, pred_labels) @classmethod @@ -527,14 +567,17 @@ class PACC(AggregativeSoftQuantifier): `Probabilistic Adjusted Classify & Count `_, the probabilistic variant of ACC that relies on the posterior probabilities returned by a probabilistic classifier. - :param classifier: a sklearn's Estimator that generates a classifier + :param classifier: a scikit-learn's BaseEstimator, or None, in which case the classifier is taken to be + the one indicated in `qp.environ['DEFAULT_CLS']` + + :param fit_classifier: whether to train the learner (default is True). Set to False if the + learner has been trained outside the quantifier. :param val_split: specifies the data used for generating classifier predictions. This specification can be made as float in (0, 1) indicating the proportion of stratified held-out validation set to be extracted from the training set; or as an integer (default 5), indicating that the predictions are to be generated in a `k`-fold cross-validation manner (with this integer indicating the value - for `k`). Alternatively, this set can be specified at fit time by indicating the exact set of data - on which the predictions are to be generated. + for `k`); or as a tuple (X,y) defining the specific set of data to use for validation. 
:param str method: adjustment method to be used: @@ -565,17 +608,18 @@ class PACC(AggregativeSoftQuantifier): :param n_jobs: number of parallel workers """ + def __init__( self, - classifier: BaseEstimator=None, + classifier: BaseEstimator = None, + fit_classifier=True, val_split=5, solver: Literal['minimize', 'exact', 'exact-raise', 'exact-cc'] = 'minimize', method: Literal['inversion', 'invariant-ratio'] = 'inversion', norm: Literal['clip', 'mapsimplex', 'condsoftmax'] = 'clip', n_jobs=None ): - self.classifier = qp._get_classifier(classifier) - self.val_split = val_split + super().__init__(classifier, fit_classifier, val_split) self.n_jobs = qp._get_njobs(n_jobs) self.solver = solver self.method = method @@ -589,16 +633,16 @@ class PACC(AggregativeSoftQuantifier): if self.norm not in ACC.NORMALIZATIONS: raise ValueError(f"unknown normalization; valid ones are {ACC.NORMALIZATIONS}") - def aggregation_fit(self, classif_predictions: LabelledCollection, data: LabelledCollection): + def aggregation_fit(self, classif_predictions, labels): """ Estimates the misclassification rates - :param classif_predictions: a :class:`quapy.data.base.LabelledCollection` containing, - as instances, the posterior probabilities issued by the classifier and, as labels, the true labels - :param data: a :class:`quapy.data.base.LabelledCollection` consisting of the training data + :param classif_predictions: array-like with posterior probabilities + :param labels: array-like with the true labels associated to each vector of posterior probabilities """ - posteriors, true_labels = classif_predictions.Xy - self.pcc = PCC(self.classifier) + posteriors = classif_predictions + true_labels = labels + self.pcc = PCC(self.classifier, fit_classifier=False) self.Pte_cond_estim_ = PACC.getPteCondEstim(self.classifier.classes_, true_labels, posteriors) def aggregate(self, classif_posteriors): @@ -639,114 +683,190 @@ class EMQ(AggregativeSoftQuantifier): prevalence, an estimate of it obtained via k-fold cross validation (instead of the true training prevalence), and to recalibrate the posterior probabilities of the classifier. - :param classifier: a sklearn's Estimator that generates a classifier - :param val_split: specifies the data used for generating classifier predictions. This specification - can be made as float in (0, 1) indicating the proportion of stratified held-out validation set to - be extracted from the training set; or as an integer, indicating that the predictions - are to be generated in a `k`-fold cross-validation manner (with this integer indicating the value - for `k`, default 5); or as a collection defining the specific set of data to use for validation. - Alternatively, this set can be specified at fit time by indicating the exact set of data - on which the predictions are to be generated. This hyperparameter is only meant to be used when the - heuristics are to be applied, i.e., if a recalibration is required. The default value is None (meaning - the recalibration is not required). In case this hyperparameter is set to a value other than None, but - the recalibration is not required (recalib=None), a warning message will be raised. - :param exact_train_prev: set to True (default) for using the true training prevalence as the initial observation; - set to False for computing the training prevalence as an estimate of it, i.e., as the expected - value of the posterior probabilities of the training instances. - :param recalib: a string indicating the method of recalibration. 
- Available choices include "nbvs" (No-Bias Vector Scaling), "bcts" (Bias-Corrected Temperature Scaling, - default), "ts" (Temperature Scaling), and "vs" (Vector Scaling). Default is None (no recalibration). + :param classifier: a scikit-learn's BaseEstimator, or None, in which case the classifier is taken to be + the one indicated in `qp.environ['DEFAULT_CLS']` + + :param fit_classifier: whether to train the classifier (default is True). Set to False if the + given classifier has already been trained. + + :param val_split: specifies the data used for generating the classifier predictions on which the + aggregation function is to be trained. This specification can be made as float in (0, 1) indicating + the proportion of stratified held-out validation set to be extracted from the training set; or as + an integer (default 5), indicating that the predictions are to be generated in a `k`-fold + cross-validation manner (with this integer indicating the value for `k`); or as a tuple (X,y) defining + the specific set of data to use for validation. This hyperparameter is only meant to be used when + the heuristics are to be applied, i.e., if a calibration is required. The default value is None + (meaning the calibration is not required). In case this hyperparameter is set to a value other than + None, but the calibration is not required (calib=None), a warning message will be raised. + + :param exact_train_prev: set to True (default) for using the true training prevalence as the initial + observation; set to False for computing the training prevalence as an estimate of it, i.e., as the + expected value of the posterior probabilities of the training instances. + + :param calib: a string indicating the method of calibration. + Available choices include "nbvs" (No-Bias Vector Scaling), "bcts" (Bias-Corrected Temperature Scaling), + "ts" (Temperature Scaling), and "vs" (Vector Scaling). Default is None (no calibration). + + :param on_calib_error: a string indicating the policy to follow in case the calibrator fails at runtime. + Options include "raise" (default), in which case a RuntimeException is raised; and "backup", in which + case the calibrator is silently skipped. + :param n_jobs: number of parallel workers. Only used for recalibrating the classifier if `val_split` is set to an integer `k` --the number of folds. 
""" MAX_ITER = 1000 EPSILON = 1e-4 + ON_CALIB_ERROR_VALUES = ['raise', 'backup'] + CALIB_OPTIONS = [None, 'nbvs', 'bcts', 'ts', 'vs'] - def __init__(self, classifier: BaseEstimator=None, val_split=None, exact_train_prev=True, recalib=None, n_jobs=None): - self.classifier = qp._get_classifier(classifier) - self.val_split = val_split + def __init__(self, classifier: BaseEstimator = None, fit_classifier=True, val_split=None, exact_train_prev=True, + calib=None, on_calib_error='raise', n_jobs=None): + + assert calib in EMQ.CALIB_OPTIONS, \ + f'invalid value for {calib=}; valid ones are {EMQ.CALIB_OPTIONS}' + assert on_calib_error in EMQ.ON_CALIB_ERROR_VALUES, \ + f'invalid value for {on_calib_error=}; valid ones are {EMQ.ON_CALIB_ERROR_VALUES}' + + super().__init__(classifier, fit_classifier, val_split) self.exact_train_prev = exact_train_prev - self.recalib = recalib + self.calib = calib + self.on_calib_error = on_calib_error self.n_jobs = n_jobs @classmethod - def EMQ_BCTS(cls, classifier: BaseEstimator, n_jobs=None): + def EMQ_BCTS(cls, classifier: BaseEstimator, fit_classifier=True, val_split=5, on_calib_error="raise", n_jobs=None): """ Constructs an instance of EMQ using the best configuration found in the `Alexandari et al. paper `_, i.e., one that relies on Bias-Corrected Temperature - Scaling (BCTS) as a recalibration function, and that uses an estimate of the training prevalence instead of + Scaling (BCTS) as a calibration function, and that uses an estimate of the training prevalence instead of the true training prevalence. - :param classifier: a sklearn's Estimator that generates a classifier - :param n_jobs: number of parallel workers. + :param classifier: a scikit-learn's BaseEstimator, or None, in which case the classifier is taken to be + the one indicated in `qp.environ['DEFAULT_CLS']` + + :param fit_classifier: whether to train the learner (default is True). Set to False if the + learner has been trained outside the quantifier. + + :param val_split: specifies the data used for generating classifier predictions. This specification + can be made as float in (0, 1) indicating the proportion of stratified held-out validation set to + be extracted from the training set; or as an integer (default 5), indicating that the predictions + are to be generated in a `k`-fold cross-validation manner (with this integer indicating the value + for `k`); or as a tuple (X,y) defining the specific set of data to use for validation. + + :param on_calib_error: a string indicating the policy to follow in case the calibrator fails at runtime. + Options include "raise" (default), in which case a RuntimeException is raised; and "backup", in which + case the calibrator is silently skipped. + + :param n_jobs: number of parallel workers. Only used for recalibrating the classifier if `val_split` is set to + an integer `k` --the number of folds. + :return: An instance of EMQ with BCTS """ - return EMQ(classifier, val_split=5, exact_train_prev=False, recalib='bcts', n_jobs=n_jobs) + return EMQ(classifier, fit_classifier=fit_classifier, val_split=val_split, exact_train_prev=False, + calib='bcts', on_calib_error=on_calib_error, n_jobs=n_jobs) def _check_init_parameters(self): if self.val_split is not None: - if self.exact_train_prev and self.recalib is None: + if self.exact_train_prev and self.calib is None: raise RuntimeWarning(f'The parameter {self.val_split=} was specified for EMQ, while the parameters ' - f'{self.exact_train_prev=} and {self.recalib=}. 
This has no effect and causes an unnecessary '
-                                 f'overload.')
+                                 f'{self.exact_train_prev=} and {self.calib=}. This has no effect and causes an '
+                                 f'unnecessary overload.')
         else:
-            if self.recalib is not None:
-                print(f'[warning] The parameter {self.recalib=} requires the val_split be different from None. '
+            if self.calib is not None:
+                print(f'[warning] The parameter {self.calib=} requires val_split to be different from None. '
                       f'This parameter will be set to 5. To avoid this warning, set this value to a float value '
                       f'indicating the proportion of training data to be used as validation, or to an integer '
                       f'indicating the number of folds for kFCV.')
-                self.val_split=5
+                self.val_split = 5

-    def classify(self, instances):
+    def classify(self, X):
         """
-        Provides the posterior probabilities for the given instances. If the classifier was required
-        to be recalibrated, then these posteriors are recalibrated accordingly.
+        Provides the posterior probabilities for the given instances. The calibration function, if required,
+        has no effect in this step, and is only involved in the aggregate method.

-        :param instances: array-like of shape `(n_instances, n_dimensions,)`
+        :param X: array-like of shape `(n_instances, n_dimensions,)`
         :return: np.ndarray of shape `(n_instances, n_classes,)` with posterior probabilities
         """
-        posteriors = self.classifier.predict_proba(instances)
-        if hasattr(self, 'calibration_function') and self.calibration_function is not None:
-            posteriors = self.calibration_function(posteriors)
-        return posteriors
+        return self.classifier.predict_proba(X)

-    def aggregation_fit(self, classif_predictions: LabelledCollection, data: LabelledCollection):
+    def classifier_fit_predict(self, X, y):
+        classif_predictions = super().classifier_fit_predict(X, y)
+        self.train_prevalence = F.prevalence_from_labels(y, classes=self.classes_)
+        return classif_predictions
+
+    def _fit_calibration(self, calibrator, P, y):
+        n_classes = len(self.classes_)
+
+        if not np.issubdtype(y.dtype, np.number):
+            y = np.searchsorted(self.classes_, y)
+
+        try:
+            self.calibration_function = calibrator(P, np.eye(n_classes)[y], posterior_supplied=True)
+        except Exception as e:
+            if self.on_calib_error == 'raise':
+                raise RuntimeError(f'calibration {self.calib} failed at fit time: {e}')
+            elif self.on_calib_error == 'backup':
+                self.calibration_function = lambda P: P
+
+    def _calibrate_if_requested(self, uncalib_posteriors):
+        if hasattr(self, 'calibration_function') and self.calibration_function is not None:
+            try:
+                calib_posteriors = self.calibration_function(uncalib_posteriors)
+            except Exception as e:
+                if self.on_calib_error == 'raise':
+                    raise RuntimeError(f'calibration {self.calib} failed at predict time: {e}')
+                elif self.on_calib_error == 'backup':
+                    calib_posteriors = uncalib_posteriors
+                else:
+                    raise ValueError(f'unexpected {self.on_calib_error=}; '
+                                     f'valid options are {EMQ.ON_CALIB_ERROR_VALUES}')
+            return calib_posteriors
+        return uncalib_posteriors
+
+    def aggregation_fit(self, classif_predictions, labels):
         """
         Trains the aggregation function of EMQ. This comes down to recalibrating the posterior probabilities if requested.
- :param classif_predictions: a :class:`quapy.data.base.LabelledCollection` containing, - as instances, the posterior probabilities issued by the classifier and, as labels, the true labels - :param data: a :class:`quapy.data.base.LabelledCollection` consisting of the training data + :param classif_predictions: array-like with the raw (i.e., uncalibrated) posterior probabilities + returned by the classifier + :param labels: array-like with the true labels associated to each classifier prediction """ - if self.recalib is not None: - P, y = classif_predictions.Xy - if self.recalib == 'nbvs': - calibrator = NoBiasVectorScaling() - elif self.recalib == 'bcts': - calibrator = TempScaling(bias_positions='all') - elif self.recalib == 'ts': - calibrator = TempScaling() - elif self.recalib == 'vs': - calibrator = VectorScaling() - else: - raise ValueError('invalid param argument for recalibration method; available ones are ' - '"nbvs", "bcts", "ts", and "vs".') + P = classif_predictions + y = labels - if not np.issubdtype(y.dtype, np.number): - y = np.searchsorted(data.classes_, y) - self.calibration_function = calibrator(P, np.eye(data.n_classes)[y], posterior_supplied=True) + requires_predictions = (self.calib is not None) or (not self.exact_train_prev) + if P is None and requires_predictions: + # classifier predictions were not generated because val_split=None + raise ArgumentError(self.val_split, self.__class__.__name__ + + ": Classifier predictions for the aggregative fit were not generated because " + "val_split=None. This usually happens when you enable calibrations or heuristics " + "during model selection but left val_split set to its default value (None). " + "Please provide one of the following values for val_split: (i) an integer >1 " + "(e.g. val_split=5) for k-fold cross-validation; (ii) a float in (0,1) (e.g. " + "val_split=0.3) for a proportion split; or (iii) a tuple (X, y) with explicit " + "validation data") - if self.exact_train_prev: - self.train_prevalence = data.prevalence() - else: - train_posteriors = classif_predictions.X - if self.recalib is not None: - train_posteriors = self.calibration_function(train_posteriors) - self.train_prevalence = F.prevalence_from_probabilities(train_posteriors) + if self.calib is not None: + calibrator = { + 'nbvs': NoBiasVectorScaling(), + 'bcts': TempScaling(bias_positions='all'), + 'ts': TempScaling(), + 'vs': VectorScaling() + }.get(self.calib, None) + + if calibrator is None: + raise ValueError(f'invalid value for {self.calib=}; valid ones are {EMQ.CALIB_OPTIONS}') + + self._fit_calibration(calibrator, P, y) + + if not self.exact_train_prev: + P = self._calibrate_if_requested(P) + self.train_prevalence = F.prevalence_from_probabilities(P) def aggregate(self, classif_posteriors, epsilon=EPSILON): + classif_posteriors = self._calibrate_if_requested(classif_posteriors) priors, posteriors = self.EM(self.train_prevalence, classif_posteriors, epsilon) return priors @@ -759,6 +879,7 @@ class EMQ(AggregativeSoftQuantifier): :return: np.ndarray of shape `(n_instances, n_classes)` """ classif_posteriors = self.classify(instances) + classif_posteriors = self._calibrate_if_requested(classif_posteriors) priors, posteriors = self.EM(self.train_prevalence, classif_posteriors, epsilon) return posteriors @@ -816,24 +937,30 @@ class HDy(AggregativeSoftQuantifier, BinaryAggregativeQuantifier): class-conditional distributions of the posterior probabilities returned for the positive and negative validation examples, respectively. 
The parameters of the mixture thus represent the estimates of the class prevalence values. - :param classifier: a sklearn's Estimator that generates a binary classifier - :param val_split: a float in range (0,1) indicating the proportion of data to be used as a stratified held-out - validation distribution, or a :class:`quapy.data.base.LabelledCollection` (the split itself), or an integer indicating the number of folds (default 5).. + :param classifier: a scikit-learn's BaseEstimator, or None, in which case the classifier is taken to be + the one indicated in `qp.environ['DEFAULT_CLS']` + + :param fit_classifier: whether to train the learner (default is True). Set to False if the + learner has been trained outside the quantifier. + + :param val_split: specifies the data used for generating classifier predictions. This specification + can be made as float in (0, 1) indicating the proportion of stratified held-out validation set to + be extracted from the training set; or as an integer (default 5), indicating that the predictions + are to be generated in a `k`-fold cross-validation manner (with this integer indicating the value + for `k`); or as a tuple (X,y) defining the specific set of data to use for validation. """ - def __init__(self, classifier: BaseEstimator=None, val_split=5): - self.classifier = qp._get_classifier(classifier) - self.val_split = val_split + def __init__(self, classifier: BaseEstimator = None, fit_classifier=True, val_split=5): + super().__init__(classifier, fit_classifier, val_split) - def aggregation_fit(self, classif_predictions: LabelledCollection, data: LabelledCollection): + def aggregation_fit(self, classif_predictions, labels): """ Trains the aggregation function of HDy. - :param classif_predictions: a :class:`quapy.data.base.LabelledCollection` containing, - as instances, the posterior probabilities issued by the classifier and, as labels, the true labels - :param data: a :class:`quapy.data.base.LabelledCollection` consisting of the training data + :param classif_predictions: array-like with the posterior probabilities returned by the classifier + :param labels: array-like with the true labels associated to each posterior """ - P, y = classif_predictions.Xy + P, y = classif_predictions, labels Px = P[:, self.pos_label] # takes only the P(y=+1|x) self.Pxy1 = Px[y == self.pos_label] self.Pxy0 = Px[y == self.neg_label] @@ -887,19 +1014,31 @@ class DyS(AggregativeSoftQuantifier, BinaryAggregativeQuantifier): minimizes the distance between distributions. Details for the ternary search have been got from - :param classifier: a sklearn's Estimator that generates a binary classifier - :param val_split: a float in range (0,1) indicating the proportion of data to be used as a stratified held-out - validation distribution, or a :class:`quapy.data.base.LabelledCollection` (the split itself), or an integer indicating the number of folds (default 5).. + :param classifier: a scikit-learn's BaseEstimator, or None, in which case the classifier is taken to be + the one indicated in `qp.environ['DEFAULT_CLS']` + + :param fit_classifier: whether to train the learner (default is True). Set to False if the + learner has been trained outside the quantifier. + + :param val_split: specifies the data used for generating classifier predictions. 
This specification
+        can be made as float in (0, 1) indicating the proportion of stratified held-out validation set to
+        be extracted from the training set; or as an integer (default 5), indicating that the predictions
+        are to be generated in a `k`-fold cross-validation manner (with this integer indicating the value
+        for `k`); or as a tuple (X,y) defining the specific set of data to use for validation.
+
     :param n_bins: an int with the number of bins to use to compute the histograms.
+
     :param divergence: a str indicating the name of the divergence (currently supported ones are "HD" or "topsoe"), or a callable function that computes the divergence between two distributions (two equally sized arrays).
+
     :param tol: a float with the tolerance for the ternary search algorithm.
+
     :param n_jobs: number of parallel workers.
     """

-    def __init__(self, classifier: BaseEstimator=None, val_split=5, n_bins=8, divergence: Union[str, Callable]= 'HD', tol=1e-05, n_jobs=None):
-        self.classifier = qp._get_classifier(classifier)
-        self.val_split = val_split
+    def __init__(self, classifier: BaseEstimator = None, fit_classifier=True, val_split=5, n_bins=8,
+                 divergence: Union[str, Callable] = 'HD', tol=1e-05, n_jobs=None):
+        super().__init__(classifier, fit_classifier, val_split)
         self.tol = tol
         self.divergence = divergence
         self.n_bins = n_bins
@@ -921,15 +1060,14 @@ class DyS(AggregativeSoftQuantifier, BinaryAggregativeQuantifier):
         # Left and right are the current bounds; the maximum is between them
         return (left + right) / 2

-    def aggregation_fit(self, classif_predictions: LabelledCollection, data: LabelledCollection):
+    def aggregation_fit(self, classif_predictions, labels):
         """
         Trains the aggregation function of DyS.

-        :param classif_predictions: a :class:`quapy.data.base.LabelledCollection` containing,
-            as instances, the posterior probabilities issued by the classifier and, as labels, the true labels
-        :param data: a :class:`quapy.data.base.LabelledCollection` consisting of the training data
+        :param classif_predictions: array-like with the posterior probabilities returned by the classifier
+        :param labels: array-like with the true labels associated to each posterior
         """
-        Px, y = classif_predictions.Xy
+        Px, y = classif_predictions, labels
         Px = Px[:, self.pos_label]  # takes only the P(y=+1|x)
         self.Pxy1 = Px[y == self.pos_label]
         self.Pxy0 = Px[y == self.neg_label]
@@ -946,7 +1084,7 @@ class DyS(AggregativeSoftQuantifier, BinaryAggregativeQuantifier):
         def distribution_distance(prev):
             Px_train = prev * self.Pxy1_density + (1 - prev) * self.Pxy0_density
             return divergence(Px_train, Px_test)
-
+
         class1_prev = self._ternary_search(f=distribution_distance, left=0, right=1, tol=self.tol)
         return F.as_binary_prevalence(class1_prev)

@@ -957,36 +1095,42 @@ class SMM(AggregativeSoftQuantifier, BinaryAggregativeQuantifier):
     SMM is a simplification of matching distribution methods where the representation of the examples
     is created using the mean instead of a histogram (conceptually equivalent to PACC).

-    :param classifier: a sklearn's Estimator that generates a binary classifier.
-    :param val_split: a float in range (0,1) indicating the proportion of data to be used as a stratified held-out
-        validation distribution, or a :class:`quapy.data.base.LabelledCollection` (the split itself), or an integer indicating the number of folds (default 5)..
+ :param classifier: a scikit-learn's BaseEstimator, or None, in which case the classifier is taken to be + the one indicated in `qp.environ['DEFAULT_CLS']` + + :param fit_classifier: whether to train the learner (default is True). Set to False if the + learner has been trained outside the quantifier. + + :param val_split: specifies the data used for generating classifier predictions. This specification + can be made as float in (0, 1) indicating the proportion of stratified held-out validation set to + be extracted from the training set; or as an integer (default 5), indicating that the predictions + are to be generated in a `k`-fold cross-validation manner (with this integer indicating the value + for `k`); or as a tuple (X,y) defining the specific set of data to use for validation. """ - def __init__(self, classifier: BaseEstimator=None, val_split=5): - self.classifier = qp._get_classifier(classifier) - self.val_split = val_split - - def aggregation_fit(self, classif_predictions: LabelledCollection, data: LabelledCollection): + def __init__(self, classifier: BaseEstimator = None, fit_classifier=True, val_split=5): + super().__init__(classifier, fit_classifier, val_split) + + def aggregation_fit(self, classif_predictions, labels): """ Trains the aggregation function of SMM. - :param classif_predictions: a :class:`quapy.data.base.LabelledCollection` containing, - as instances, the posterior probabilities issued by the classifier and, as labels, the true labels - :param data: a :class:`quapy.data.base.LabelledCollection` consisting of the training data + :param classif_predictions: array-like with the posterior probabilities returned by the classifier + :param labels: array-like with the true labels associated to each posterior """ - Px, y = classif_predictions.Xy + Px, y = classif_predictions, labels Px = Px[:, self.pos_label] # takes only the P(y=+1|x) self.Pxy1 = Px[y == self.pos_label] self.Pxy0 = Px[y == self.neg_label] - self.Pxy1_mean = np.mean(self.Pxy1) # equiv. TPR + self.Pxy1_mean = np.mean(self.Pxy1) # equiv. TPR self.Pxy0_mean = np.mean(self.Pxy0) # equiv. FPR return self def aggregate(self, classif_posteriors): Px = classif_posteriors[:, self.pos_label] # takes only the P(y=+1|x) Px_mean = np.mean(Px) - - class1_prev = (Px_mean - self.Pxy0_mean)/(self.Pxy1_mean - self.Pxy0_mean) + + class1_prev = (Px_mean - self.Pxy0_mean) / (self.Pxy1_mean - self.Pxy0_mean) return F.as_binary_prevalence(class1_prev, clip_if_necessary=True) @@ -996,25 +1140,32 @@ class DMy(AggregativeSoftQuantifier): probabilities. This implementation takes the number of bins, the divergence, and the possibility to work on CDF as hyperparameters. - :param classifier: a `sklearn`'s Estimator that generates a probabilistic classifier - :param val_split: indicates the proportion of data to be used as a stratified held-out validation set to model the - validation distribution. - This parameter can be indicated as a real value (between 0 and 1), representing a proportion of - validation data, or as an integer, indicating that the validation distribution should be estimated via - `k`-fold cross validation (this integer stands for the number of folds `k`, defaults 5), or as a - :class:`quapy.data.base.LabelledCollection` (the split itself). + :param classifier: a scikit-learn's BaseEstimator, or None, in which case the classifier is taken to be + the one indicated in `qp.environ['DEFAULT_CLS']` + + :param fit_classifier: whether to train the learner (default is True). 
Set to False if the + learner has been trained outside the quantifier. + + :param val_split: specifies the data used for generating classifier predictions. This specification + can be made as float in (0, 1) indicating the proportion of stratified held-out validation set to + be extracted from the training set; or as an integer (default 5), indicating that the predictions + are to be generated in a `k`-fold cross-validation manner (with this integer indicating the value + for `k`); or as a tuple (X,y) defining the specific set of data to use for validation. + :param nbins: number of bins used to discretize the distributions (default 8) + :param divergence: a string representing a divergence measure (currently, "HD" and "topsoe" are implemented) or a callable function taking two ndarrays of the same dimension as input (default "HD", meaning Hellinger Distance) + :param cdf: whether to use CDF instead of PDF (default False) + :param n_jobs: number of parallel workers (default None) """ - def __init__(self, classifier: BaseEstimator=None, val_split=5, nbins=8, divergence: Union[str, Callable]='HD', - cdf=False, search='optim_minimize', n_jobs=None): - self.classifier = qp._get_classifier(classifier) - self.val_split = val_split + def __init__(self, classifier: BaseEstimator = None, fit_classifier=True, val_split=5, nbins=8, + divergence: Union[str, Callable] = 'HD', cdf=False, search='optim_minimize', n_jobs=None): + super().__init__(classifier, fit_classifier, val_split) self.nbins = nbins self.divergence = divergence self.cdf = cdf @@ -1040,12 +1191,12 @@ class DMy(AggregativeSoftQuantifier): histograms.append(hist) counts = np.vstack(histograms) - distributions = counts/counts.sum(axis=1)[:,np.newaxis] + distributions = counts / counts.sum(axis=1)[:, np.newaxis] if self.cdf: distributions = np.cumsum(distributions, axis=1) return distributions - def aggregation_fit(self, classif_predictions: LabelledCollection, data: LabelledCollection): + def aggregation_fit(self, classif_predictions, labels): """ Trains the aggregation function of a distribution matching method. This comes down to generating the validation distributions out of the training data. @@ -1055,16 +1206,15 @@ class DMy(AggregativeSoftQuantifier): distribution of posterior probabilities `P(Y=j|X=x)` for training data labelled with class `i`, and `dij[k]` is the fraction of instances with a value in the `k`-th bin. 
-        :param classif_predictions: a :class:`quapy.data.base.LabelledCollection` containing,
-            as instances, the posterior probabilities issued by the classifier and, as labels, the true labels
-        :param data: a :class:`quapy.data.base.LabelledCollection` consisting of the training data
+        :param classif_predictions: array-like with the posterior probabilities returned by the classifier
+        :param labels: array-like with the true labels associated to each posterior
         """
-        posteriors, true_labels = classif_predictions.Xy
+        posteriors, true_labels = classif_predictions, labels
         n_classes = len(self.classifier.classes_)

         self.validation_distribution = qp.util.parallel(
             func=self._get_distributions,
-            args=[posteriors[true_labels==cat] for cat in range(n_classes)],
+            args=[posteriors[true_labels == cat] for cat in range(n_classes)],
             n_jobs=self.n_jobs,
             backend='threading'
         )

@@ -1083,16 +1233,16 @@ class DMy(AggregativeSoftQuantifier):
         test_distribution = self._get_distributions(posteriors)
         divergence = get_divergence(self.divergence)
         n_classes, n_channels, nbins = self.validation_distribution.shape
+
         def loss(prev):
             prev = np.expand_dims(prev, axis=0)
-            mixture_distribution = (prev @ self.validation_distribution.reshape(n_classes,-1)).reshape(n_channels, -1)
+            mixture_distribution = (prev @ self.validation_distribution.reshape(n_classes, -1)).reshape(n_channels, -1)
             divs = [divergence(test_distribution[ch], mixture_distribution[ch]) for ch in range(n_channels)]
             return np.mean(divs)

         return F.argmin_prevalence(loss, n_classes, method=self.search)

-
 def newELM(svmperf_base=None, loss='01', C=1):
     """
     Explicit Loss Minimization (ELM) quantifiers.
@@ -1145,6 +1295,7 @@ def newSVMQ(svmperf_base=None, C=1):
     """
     return newELM(svmperf_base, loss='q', C=C)

+
 def newSVMKLD(svmperf_base=None, C=1):
     """
     SVM(KLD) is an Explicit Loss Minimization (ELM) quantifier set to optimize for the Kullback-Leibler Divergence
@@ -1195,6 +1346,7 @@ def newSVMKLD(svmperf_base=None, C=1):
     """
     return newELM(svmperf_base, loss='nkld', C=C)

+
 def newSVMAE(svmperf_base=None, C=1):
     """
     SVM(AE) is an Explicit Loss Minimization (ELM) quantifier set to optimize for the Absolute Error as first used by
@@ -1219,6 +1371,7 @@ def newSVMAE(svmperf_base=None, C=1):
     """
     return newELM(svmperf_base, loss='mae', C=C)

+
 def newSVMRAE(svmperf_base=None, C=1):
     """
     SVM(RAE) is an Explicit Loss Minimization (ELM) quantifier set to optimize for the Relative Absolute Error as first
@@ -1269,22 +1422,22 @@ class OneVsAllAggregative(OneVsAllGeneric, AggregativeQuantifier):
         self.n_jobs = qp._get_njobs(n_jobs)
         self.parallel_backend = parallel_backend

-    def classify(self, instances):
+    def classify(self, X):
         """
         If the base quantifier is not probabilistic, returns a matrix of shape `(n,m,)` with `n` the number of
         instances and `m` the number of classes. The entry `(i,j)` is a binary value indicating whether instance
-        `i `belongs to class `j`. The binary classifications are independent of each other, meaning that an instance
+        `i` belongs to class `j`. The binary classifications are independent of each other, meaning that an instance
         can end up being attributed to 0, 1, or more classes.
         If the base quantifier is probabilistic, returns a matrix of shape `(n,m,2)` with `n` the number of instances
         and `m` the number of classes. The entry `(i,j,1)` (resp. `(i,j,0)`) is a value in [0,1] indicating the
         posterior probability that instance `i` belongs (resp. does not belong) to class `j`.
The posterior probabilities are independent of each other, meaning that, in general, they do not sum
         up to one.

-        :param instances: array-like
+        :param X: array-like
         :return: `np.ndarray`
         """
-        classif_predictions = self._parallel(self._delayed_binary_classification, instances)
+        classif_predictions = self._parallel(self._delayed_binary_classification, X)
         if isinstance(self.binary_quantifier, AggregativeSoftQuantifier):
             return np.swapaxes(classif_predictions, 0, 1)
         else:
@@ -1294,6 +1447,10 @@ class OneVsAllAggregative(OneVsAllGeneric, AggregativeQuantifier):
         prevalences = self._parallel(self._delayed_binary_aggregate, classif_predictions)
         return F.normalize_prevalence(prevalences)

+    def aggregation_fit(self, classif_predictions, labels):
+        self._parallel(self._delayed_binary_aggregate_fit, classif_predictions, labels)
+        return self
+
     def _delayed_binary_classification(self, c, X):
         return self.dict_binary_quantifiers[c].classify(X)

@@ -1301,6 +1458,10 @@ class OneVsAllAggregative(OneVsAllGeneric, AggregativeQuantifier):
         # the estimation for the positive class prevalence
         return self.dict_binary_quantifiers[c].aggregate(classif_predictions[:, c])[1]

+    def _delayed_binary_aggregate_fit(self, c, classif_predictions, labels):
+        # trains the aggregation function of the cth quantifier
+        return self.dict_binary_quantifiers[c].aggregation_fit(classif_predictions[:, c], labels)
+

 class AggregativeMedianEstimator(BinaryQuantifier):
     """
     :param base_quantifier: the base, binary quantifier
     :param random_state: a seed to be set before fitting any base quantifier (default None)
     :param param_grid: the grid of parameters towards which the median will be computed
-    :param n_jobs: number of parllel workes
+    :param n_jobs: number of parallel workers
     """
+
     def __init__(self, base_quantifier: AggregativeQuantifier, param_grid: dict, random_state=None, n_jobs=None):
         self.base_quantifier = base_quantifier
         self.param_grid = param_grid
@@ -1328,33 +1490,32 @@ class AggregativeMedianEstimator(BinaryQuantifier):

     def _delayed_fit(self, args):
         with qp.util.temp_seed(self.random_state):
-            params, training = args
+            params, X, y = args
             model = deepcopy(self.base_quantifier)
             model.set_params(**params)
-            model.fit(training)
+            model.fit(X, y)
             return model

     def _delayed_fit_classifier(self, args):
         with qp.util.temp_seed(self.random_state):
-            cls_params, training, kwargs = args
+            cls_params, X, y = args
             model = deepcopy(self.base_quantifier)
             model.set_params(**cls_params)
-            predictions = model.classifier_fit_predict(training, **kwargs)
-            return (model, predictions)
+            predictions, labels = model.classifier_fit_predict(X, y)
+            return (model, predictions, labels)

     def _delayed_fit_aggregation(self, args):
         with qp.util.temp_seed(self.random_state):
-            ((model, predictions), q_params), training = args
+            ((model, predictions, y), q_params) = args
             model = deepcopy(model)
             model.set_params(**q_params)
-            model.aggregation_fit(predictions, training)
+            model.aggregation_fit(predictions, y)
             return model

-
-    def fit(self, training: LabelledCollection, **kwargs):
+    def fit(self, X, y):
         import itertools

-        self._check_binary(training, self.__class__.__name__)
+        self._check_binary(y, self.__class__.__name__)

         if isinstance(self.base_quantifier, AggregativeQuantifier):
             cls_configs, q_configs = qp.model_selection.group_params(self.param_grid)

             if len(cls_configs) > 1:
                 models_preds = qp.util.parallel(
self._delayed_fit_classifier, - ((params, training, kwargs) for params in cls_configs), + ((params, X, y) for params in cls_configs), seed=qp.environ.get('_R_SEED', None), n_jobs=self.n_jobs, asarray=False, @@ -1371,12 +1532,12 @@ class AggregativeMedianEstimator(BinaryQuantifier): else: model = self.base_quantifier model.set_params(**cls_configs[0]) - predictions = model.classifier_fit_predict(training, **kwargs) - models_preds = [(model, predictions)] + predictions, labels = model.classifier_fit_predict(X, y) + models_preds = [(model, predictions, labels)] self.models = qp.util.parallel( self._delayed_fit_aggregation, - ((setup, training) for setup in itertools.product(models_preds, q_configs)), + itertools.product(models_preds, q_configs), seed=qp.environ.get('_R_SEED', None), n_jobs=self.n_jobs, backend='threading' @@ -1385,7 +1546,7 @@ class AggregativeMedianEstimator(BinaryQuantifier): configs = qp.model_selection.expand_grid(self.param_grid) self.models = qp.util.parallel( self._delayed_fit, - ((params, training) for params in configs), + ((params, X, y) for params in configs), seed=qp.environ.get('_R_SEED', None), n_jobs=self.n_jobs, backend='threading' @@ -1394,9 +1555,9 @@ class AggregativeMedianEstimator(BinaryQuantifier): def _delayed_predict(self, args): model, instances = args - return model.quantify(instances) + return model.predict(instances) - def quantify(self, instances): + def predict(self, instances): prev_preds = qp.util.parallel( self._delayed_predict, ((model, instances) for model in self.models), @@ -1407,28 +1568,27 @@ class AggregativeMedianEstimator(BinaryQuantifier): return np.median(prev_preds, axis=0) -#--------------------------------------------------------------- +# --------------------------------------------------------------- # imports -#--------------------------------------------------------------- +# --------------------------------------------------------------- from . import _threshold_optim T50 = _threshold_optim.T50 MAX = _threshold_optim.MAX -X = _threshold_optim.X -MS = _threshold_optim.MS +X = _threshold_optim.X +MS = _threshold_optim.MS MS2 = _threshold_optim.MS2 - from . import _kdey KDEyML = _kdey.KDEyML KDEyHD = _kdey.KDEyHD KDEyCS = _kdey.KDEyCS -#--------------------------------------------------------------- +# --------------------------------------------------------------- # aliases -#--------------------------------------------------------------- +# --------------------------------------------------------------- ClassifyAndCount = CC AdjustedClassifyAndCount = ACC diff --git a/quapy/method/base.py b/quapy/method/base.py index 58cd6f1..85a0525 100644 --- a/quapy/method/base.py +++ b/quapy/method/base.py @@ -14,30 +14,40 @@ import numpy as np class BaseQuantifier(BaseEstimator): """ Abstract Quantifier. A quantifier is defined as an object of a class that implements the method :meth:`fit` on - :class:`quapy.data.base.LabelledCollection`, the method :meth:`quantify`, and the :meth:`set_params` and + a pair X, y, the method :meth:`predict`, and the :meth:`set_params` and :meth:`get_params` for model selection (see :meth:`quapy.model_selection.GridSearchQ`) """ @abstractmethod - def fit(self, data: LabelledCollection): + def fit(self, X, y): """ - Trains a quantifier. + Generates a quantifier. - :param data: a :class:`quapy.data.base.LabelledCollection` consisting of the training data + :param X: array-like, the training instances + :param y: array-like, the labels :return: self """ ... 
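For reference, this is how the refactored interface is meant to be used end to end; a minimal sketch, assuming a scikit-learn classifier and any array-likes `X`, `y` (the random data below is purely illustrative, and the EMQ call further assumes the optional calibration dependency is installed):

```python
import numpy as np
from sklearn.linear_model import LogisticRegression
import quapy as qp

# toy data, for illustration only
rng = np.random.default_rng(0)
X = rng.normal(size=(1000, 10))
y = (X[:, 0] > 0).astype(int)

# classifier, fit_classifier, and val_split are now fixed at construction time
acc = qp.method.aggregative.ACC(LogisticRegression(), fit_classifier=True, val_split=5)
acc.fit(X, y)            # new signature: fit(X, y) instead of fit(data: LabelledCollection)
prevs = acc.predict(X)   # new name; quantify(X) remains available as an alias

# EMQ with the renamed parameter "calib" and the new on_calib_error policy
emq = qp.method.aggregative.EMQ(LogisticRegression(), val_split=5,
                                exact_train_prev=False, calib='bcts', on_calib_error='backup')
emq.fit(X, y)
print(emq.predict(X))
```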
     @abstractmethod
-    def quantify(self, instances):
+    def predict(self, X):
         """
         Generate class prevalence estimates for the sample's instances

-        :param instances: array-like
+        :param X: array-like, the test instances
         :return: `np.ndarray` of shape `(n_classes,)` with class prevalence estimates.
         """
         ...

+    def quantify(self, X):
+        """
+        Alias to :meth:`predict`, kept for backward compatibility
+
+        :param X: array-like
+        :return: `np.ndarray` of shape `(n_classes,)` with class prevalence estimates.
+        """
+        return self.predict(X)
+

 class BinaryQuantifier(BaseQuantifier):
     """
@@ -45,8 +55,9 @@ class BinaryQuantifier(BaseQuantifier):
     (typically, to be interpreted as one class and its complement).
     """

-    def _check_binary(self, data: LabelledCollection, quantifier_name):
-        assert data.binary, f'{quantifier_name} works only on problems of binary classification. ' \
+    def _check_binary(self, y, quantifier_name):
+        n_classes = len(set(y))
+        assert n_classes==2, f'{quantifier_name} works only on problems of binary classification. ' \
                             f'Use the class OneVsAll to enable {quantifier_name} to work on single-label data.'


@@ -66,7 +77,7 @@ def newOneVsAll(binary_quantifier: BaseQuantifier, n_jobs=None):

 class OneVsAllGeneric(OneVsAll, BaseQuantifier):
     """
     Allows any binary quantifier to perform quantification on single-label datasets. The method maintains one binary
-    quantifier for each class, and then l1-normalizes the outputs so that the class prevelence values sum up to 1.
+    quantifier for each class, and then l1-normalizes the outputs so that the class prevalence values sum up to 1.
     """

     def __init__(self, binary_quantifier: BaseQuantifier, n_jobs=None):
@@ -78,32 +89,32 @@ class OneVsAllGeneric(OneVsAll, BaseQuantifier):
         self.binary_quantifier = binary_quantifier
         self.n_jobs = qp._get_njobs(n_jobs)

-    def fit(self, data: LabelledCollection, fit_classifier=True):
-        assert not data.binary, f'{self.__class__.__name__} expect non-binary data'
-        assert fit_classifier == True, 'fit_classifier must be True'
+    def fit(self, X, y):
+        self.classes = sorted(np.unique(y))
+        assert len(self.classes)!=2, f'{self.__class__.__name__} expects non-binary data'

-        self.dict_binary_quantifiers = {c: deepcopy(self.binary_quantifier) for c in data.classes_}
-        self._parallel(self._delayed_binary_fit, data)
+        self.dict_binary_quantifiers = {c: deepcopy(self.binary_quantifier) for c in self.classes}
+        self._parallel(self._delayed_binary_fit, X, y)
         return self

     def _parallel(self, func, *args, **kwargs):
         return np.asarray(
             Parallel(n_jobs=self.n_jobs, backend='threading')(
-                delayed(func)(c, *args, **kwargs) for c in self.classes_
+                delayed(func)(c, *args, **kwargs) for c in self.classes
             )
         )

-    def quantify(self, instances):
-        prevalences = self._parallel(self._delayed_binary_predict, instances)
+    def predict(self, X):
+        prevalences = self._parallel(self._delayed_binary_predict, X)
         return qp.functional.normalize_prevalence(prevalences)

-    @property
-    def classes_(self):
-        return sorted(self.dict_binary_quantifiers.keys())
+    # @property
+    # def classes_(self):
+    #     return sorted(self.dict_binary_quantifiers.keys())

     def _delayed_binary_predict(self, c, X):
-        return self.dict_binary_quantifiers[c].quantify(X)[1]
+        return self.dict_binary_quantifiers[c].predict(X)[1]

-    def _delayed_binary_fit(self, c, data):
-        bindata = LabelledCollection(data.instances, data.labels == c, classes=[False, True])
-        self.dict_binary_quantifiers[c].fit(bindata)
+    def _delayed_binary_fit(self, c, X, y):
+        bindata = LabelledCollection(X, y == c, classes=[False, True])
+        self.dict_binary_quantifiers[c].fit(*bindata.Xy)
diff --git a/quapy/method/composable.py b/quapy/method/composable.py
index 5d40aad..3aacab6 100644
--- a/quapy/method/composable.py
+++ b/quapy/method/composable.py
@@ -1,13 +1,21 @@
 """This module allows the composition of quantification methods from loss functions and feature transformations. This functionality is realized through an integration of the qunfold package: https://github.com/mirkobunse/qunfold."""

-_import_error_message = """qunfold, the back-end of quapy.method.composable, is not properly installed.
-
+__install_instructions = """
 To fix this error, call:
     pip install --upgrade pip setuptools wheel
     pip install "jax[cpu]"
-    pip install "qunfold @ git+https://github.com/mirkobunse/qunfold@v0.1.4"
+    pip install "qunfold @ git+https://github.com/mirkobunse/qunfold@v0.1.5"
 """
+__import_error_message = (
+    "qunfold, the back-end of quapy.method.composable, is not properly installed."
+    + __install_instructions
+)
+__old_version_message = (
+    "The version of qunfold you have installed is not compatible with current quapy's version, "
+    "which requires qunfold>=0.1.5. "
+    + __install_instructions
+)
+
+from packaging.version import Version

 try:
     import qunfold
@@ -51,7 +59,19 @@ try:
         "GaussianRFFKernelTransformer",
     ]
 except ImportError as e:
-    raise ImportError(__import_error_message) from e
+    raise ImportError(__import_error_message) from e
+
+
+def check_compatible_qunfold_version():
+    try:
+        version_str = qunfold.__version__
+    except AttributeError:
+        # versions of qunfold <= 0.1.4 did not declare __version__ in the __init__.py but only in the setup.py
+        version_str = "0.1.4"
+
+    compatible = Version(version_str) >= Version("0.1.5")
+    return compatible
+

 def ComposableQuantifier(loss, transformer, **kwargs):
     """A generic quantification / unfolding method that solves a linear system of equations.
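Downstream code can invoke the same guard explicitly before composing a method. A minimal sketch, assuming qunfold is installed; `ClassTransformer` and `CVClassifier` appear in the docstring example that follows, whereas `LeastSquaresLoss` is assumed to be among the qunfold symbols re-exported by this module:

```python
from sklearn.linear_model import LogisticRegression
from quapy.method.composable import (
    ComposableQuantifier,
    LeastSquaresLoss,      # assumed re-export from qunfold
    ClassTransformer,
    CVClassifier,
    check_compatible_qunfold_version,
)

# fail early, with the install instructions, if qunfold is missing or too old
if not check_compatible_qunfold_version():
    raise ImportError("qunfold >= 0.1.5 is required by quapy.method.composable")

# an ACC-like composable quantifier: a least-squares loss over
# cross-validated classifier predictions
quantifier = ComposableQuantifier(
    LeastSquaresLoss(),
    ClassTransformer(CVClassifier(LogisticRegression(), 10)),
)
```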
@@ -99,4 +119,7 @@ def ComposableQuantifier(loss, transformer, **kwargs):
     >>>     ClassTransformer(CVClassifier(LogisticRegression(), 10))
     >>> )
     """
+    if not check_compatible_qunfold_version():
+        raise ImportError(__old_version_message)
+
     return QuaPyWrapper(qunfold.GenericMethod(loss, transformer, **kwargs))
diff --git a/quapy/method/confidence.py b/quapy/method/confidence.py
index 79a06a3..f54768c 100644
--- a/quapy/method/confidence.py
+++ b/quapy/method/confidence.py
@@ -375,18 +375,20 @@ class AggregativeBootstrap(WithConfidenceABC, AggregativeQuantifier):
         self.region = region
         self.random_state = random_state

-    def aggregation_fit(self, classif_predictions: LabelledCollection, data: LabelledCollection):
+    def aggregation_fit(self, classif_predictions, labels):
+        data = LabelledCollection(classif_predictions, labels, classes=self.classes_)
         self.quantifiers = []
         if self.n_train_samples==1:
-            self.quantifier.aggregation_fit(classif_predictions, data)
+            self.quantifier.aggregation_fit(classif_predictions, labels)
             self.quantifiers.append(self.quantifier)
         else:
             # model-based bootstrap (only on the aggregative part)
-            full_index = np.arange(len(data))
+            n_examples = len(data)
+            full_index = np.arange(n_examples)
             with qp.util.temp_seed(self.random_state):
                 for i in range(self.n_train_samples):
                     quantifier = copy.deepcopy(self.quantifier)
-                    index = resample(full_index, n_samples=len(data))
-                    classif_predictions_i = classif_predictions.sampling_from_index(index)
-                    data_i = data.sampling_from_index(index)
-                    quantifier.aggregation_fit(classif_predictions_i, data_i)
+                    index = resample(full_index, n_samples=n_examples)
+                    data_i = data.sampling_from_index(index)
+                    quantifier.aggregation_fit(*data_i.Xy)
@@ -415,10 +417,10 @@ class AggregativeBootstrap(WithConfidenceABC, AggregativeQuantifier):

         return prev_estim, conf

-    def fit(self, data: LabelledCollection, fit_classifier=True, val_split=None):
+    def fit(self, X, y):
         self.quantifier._check_init_parameters()
-        classif_predictions = self.quantifier.classifier_fit_predict(data, fit_classifier, predict_on=val_split)
-        self.aggregation_fit(classif_predictions, data)
+        classif_predictions, labels = self.quantifier.classifier_fit_predict(X, y)
+        self.aggregation_fit(classif_predictions, labels)
         return self

     def quantify_conf(self, instances, confidence_level=None) -> (np.ndarray, ConfidenceRegionABC):
@@ -446,14 +448,15 @@ class BayesianCC(AggregativeCrispQuantifier, WithConfidenceABC):
     This method relies on extra dependencies, which have to be installed via:
     `$ pip install quapy[bayes]`

-    :param classifier: a sklearn's Estimator that generates a classifier
-    :param val_split: specifies the data used for generating classifier predictions. This specification
+    :param classifier: a scikit-learn's BaseEstimator, or None, in which case the classifier is taken to be
+        the one indicated in `qp.environ['DEFAULT_CLS']`
+    :param val_split: specifies the data used for generating classifier predictions. This specification
         can be made as float in (0, 1) indicating the proportion of stratified held-out validation set to
         be extracted from the training set; or as an integer (default 5), indicating that the predictions
         are to be generated in a `k`-fold cross-validation manner (with this integer indicating the value
-        for `k`); or as a collection defining the specific set of data to use for validation.
-        Alternatively, this set can be specified at fit time by indicating the exact set of data
-        on which the predictions are to be generated.
+        for `k`); or as a tuple `(X,y)` defining the specific set of data to use for validation.
Set to + None when the method does not require any validation data, in order to avoid that some portion of + the training data be wasted. :param num_warmup: number of warmup iterations for the MCMC sampler (default 500) :param num_samples: number of samples to draw from the posterior (default 1000) :param mcmc_seed: random seed for the MCMC sampler (default 0) @@ -464,6 +467,7 @@ class BayesianCC(AggregativeCrispQuantifier, WithConfidenceABC): """ def __init__(self, classifier: BaseEstimator=None, + fit_classifier=True, val_split: int = 5, num_warmup: int = 500, num_samples: int = 1_000, @@ -476,14 +480,11 @@ class BayesianCC(AggregativeCrispQuantifier, WithConfidenceABC): if num_samples <= 0: raise ValueError(f'parameter {num_samples=} must be a positive integer') - # if (not isinstance(val_split, float)) or val_split <= 0 or val_split >= 1: - # raise ValueError(f'val_split must be a float in (0, 1), got {val_split}') - if _bayesian.DEPENDENCIES_INSTALLED is False: - raise ImportError("Auxiliary dependencies are required. Run `$ pip install quapy[bayes]` to install them.") + raise ImportError("Auxiliary dependencies are required. " + "Run `$ pip install quapy[bayes]` to install them.") - self.classifier = qp._get_classifier(classifier) - self.val_split = val_split + super().__init__(classifier, fit_classifier, val_split) self.num_warmup = num_warmup self.num_samples = num_samples self.mcmc_seed = mcmc_seed @@ -498,16 +499,20 @@ class BayesianCC(AggregativeCrispQuantifier, WithConfidenceABC): # Dictionary with posterior samples, set when `aggregate` is provided. self._samples = None - def aggregation_fit(self, classif_predictions: LabelledCollection, data: LabelledCollection): + def aggregation_fit(self, classif_predictions, labels): """ Estimates the misclassification rates. 
- :param classif_predictions: a :class:`quapy.data.base.LabelledCollection` containing, - as instances, the label predictions issued by the classifier and, as labels, the true labels - :param data: a :class:`quapy.data.base.LabelledCollection` consisting of the training data + :param classif_predictions: array-like with the label predictions returned by the classifier + :param labels: array-like with the true labels associated to each classifier prediction """ - pred_labels, true_labels = classif_predictions.Xy - self._n_and_c_labeled = confusion_matrix(y_true=true_labels, y_pred=pred_labels, labels=self.classifier.classes_).astype(float) + pred_labels = classif_predictions + true_labels = labels + self._n_and_c_labeled = confusion_matrix( + y_true=true_labels, + y_pred=pred_labels, + labels=self.classifier.classes_ + ).astype(float) def sample_from_posterior(self, classif_predictions): if self._n_and_c_labeled is None: diff --git a/quapy/method/meta.py b/quapy/method/meta.py index 05dd06c..37749e1 100644 --- a/quapy/method/meta.py +++ b/quapy/method/meta.py @@ -52,19 +52,19 @@ class MedianEstimator2(BinaryQuantifier): def _delayed_fit(self, args): with qp.util.temp_seed(self.random_state): - params, training = args + params, X, y = args model = deepcopy(self.base_quantifier) model.set_params(**params) - model.fit(training) + model.fit(X, y) return model - def fit(self, training: LabelledCollection): - self._check_binary(training, self.__class__.__name__) + def fit(self, X, y): + self._check_binary(y, self.__class__.__name__) configs = qp.model_selection.expand_grid(self.param_grid) self.models = qp.util.parallel( self._delayed_fit, - ((params, training) for params in configs), + ((params, X, y) for params in configs), seed=qp.environ.get('_R_SEED', None), n_jobs=self.n_jobs ) @@ -72,12 +72,12 @@ class MedianEstimator2(BinaryQuantifier): def _delayed_predict(self, args): model, instances = args - return model.quantify(instances) + return model.predict(instances) - def quantify(self, instances): + def predict(self, X): prev_preds = qp.util.parallel( self._delayed_predict, - ((model, instances) for model in self.models), + ((model, X) for model in self.models), seed=qp.environ.get('_R_SEED', None), n_jobs=self.n_jobs ) @@ -95,7 +95,7 @@ class MedianEstimator(BinaryQuantifier): :param base_quantifier: the base, binary quantifier :param random_state: a seed to be set before fitting any base quantifier (default None) :param param_grid: the grid or parameters towards which the median will be computed - :param n_jobs: number of parllel workes + :param n_jobs: number of parallel workers """ def __init__(self, base_quantifier: BinaryQuantifier, param_grid: dict, random_state=None, n_jobs=None): self.base_quantifier = base_quantifier @@ -111,75 +111,33 @@ class MedianEstimator(BinaryQuantifier): def _delayed_fit(self, args): with qp.util.temp_seed(self.random_state): - params, training = args + params, X, y = args model = deepcopy(self.base_quantifier) model.set_params(**params) - model.fit(training) + model.fit(X, y) return model - def _delayed_fit_classifier(self, args): - with qp.util.temp_seed(self.random_state): - cls_params, training = args - model = deepcopy(self.base_quantifier) - model.set_params(**cls_params) - predictions = model.classifier_fit_predict(training, predict_on=model.val_split) - return (model, predictions) + def fit(self, X, y): + self._check_binary(y, self.__class__.__name__) - def _delayed_fit_aggregation(self, args): - with qp.util.temp_seed(self.random_state): - ((model, 
predictions), q_params), training = args - model = deepcopy(model) - model.set_params(**q_params) - model.aggregation_fit(predictions, training) - return model - - - def fit(self, training: LabelledCollection): - self._check_binary(training, self.__class__.__name__) - - if isinstance(self.base_quantifier, AggregativeQuantifier): - cls_configs, q_configs = qp.model_selection.group_params(self.param_grid) - - if len(cls_configs) > 1: - models_preds = qp.util.parallel( - self._delayed_fit_classifier, - ((params, training) for params in cls_configs), - seed=qp.environ.get('_R_SEED', None), - n_jobs=self.n_jobs, - asarray=False - ) - else: - model = self.base_quantifier - model.set_params(**cls_configs[0]) - predictions = model.classifier_fit_predict(training, predict_on=model.val_split) - models_preds = [(model, predictions)] - - self.models = qp.util.parallel( - self._delayed_fit_aggregation, - ((setup, training) for setup in itertools.product(models_preds, q_configs)), - seed=qp.environ.get('_R_SEED', None), - n_jobs=self.n_jobs, - asarray=False - ) - else: - configs = qp.model_selection.expand_grid(self.param_grid) - self.models = qp.util.parallel( - self._delayed_fit, - ((params, training) for params in configs), - seed=qp.environ.get('_R_SEED', None), - n_jobs=self.n_jobs, - asarray=False - ) + configs = qp.model_selection.expand_grid(self.param_grid) + self.models = qp.util.parallel( + self._delayed_fit, + ((params, X, y) for params in configs), + seed=qp.environ.get('_R_SEED', None), + n_jobs=self.n_jobs, + asarray=False + ) return self def _delayed_predict(self, args): model, instances = args - return model.quantify(instances) + return model.predict(instances) - def quantify(self, instances): + def predict(self, X): prev_preds = qp.util.parallel( self._delayed_predict, - ((model, instances) for model in self.models), + ((model, X) for model in self.models), seed=qp.environ.get('_R_SEED', None), n_jobs=self.n_jobs, asarray=False @@ -257,13 +215,14 @@ class Ensemble(BaseQuantifier): if self.verbose: print('[Ensemble]' + msg) - def fit(self, data: qp.data.LabelledCollection, val_split: Union[qp.data.LabelledCollection, float] = None): + def fit(self, X, y): + + data = LabelledCollection(X, y) if self.policy == 'ds' and not data.binary: raise ValueError(f'ds policy is only defined for binary quantification, but this dataset is not binary') - if val_split is None: - val_split = self.val_split + val_split = self.val_split # randomly chooses the prevalences for each member of the ensemble (preventing classes with less than # min_pos positive examples) @@ -294,15 +253,15 @@ class Ensemble(BaseQuantifier): self._sout('Fit [Done]') return self - def quantify(self, instances): + def predict(self, X): predictions = np.asarray( - qp.util.parallel(_delayed_quantify, ((Qi, instances) for Qi in self.ensemble), n_jobs=self.n_jobs) + qp.util.parallel(_delayed_quantify, ((Qi, X) for Qi in self.ensemble), n_jobs=self.n_jobs) ) if self.policy == 'ptr': predictions = self._ptr_policy(predictions) elif self.policy == 'ds': - predictions = self._ds_policy(predictions, instances) + predictions = self._ds_policy(predictions, X) predictions = np.mean(predictions, axis=0) return F.normalize_prevalence(predictions) @@ -455,22 +414,22 @@ def _delayed_new_instance(args): sample = data.sampling_from_index(sample_index) if val_split is not None: - model.fit(sample, val_split=val_split) + model.fit(*sample.Xy, val_split=val_split) else: - model.fit(sample) + model.fit(*sample.Xy) tr_prevalence = sample.prevalence() 
tr_distribution = get_probability_distribution(posteriors[sample_index]) if (posteriors is not None) else None if verbose: - print(f'\t\--fit-ended for prev {F.strprev(prev)}') + print(f'\t--fit-ended for prev {F.strprev(prev)}') return (model, tr_prevalence, tr_distribution, sample if keep_samples else None) def _delayed_quantify(args): quantifier, instances = args - return quantifier[0].quantify(instances) + return quantifier[0].predict(instances) def _draw_simplex(ndim, min_val, max_trials=100): @@ -716,10 +675,10 @@ class SCMQ(AggregativeSoftQuantifier): self.merge_fun = merge_fun self.val_split = val_split - def aggregation_fit(self, classif_predictions: LabelledCollection, data: LabelledCollection): + def aggregation_fit(self, classif_predictions, labels): for quantifier in self.quantifiers: quantifier.classifier = self.classifier - quantifier.aggregation_fit(classif_predictions, data) + quantifier.aggregation_fit(classif_predictions, labels) return self def aggregate(self, classif_predictions: np.ndarray): diff --git a/quapy/method/non_aggregative.py b/quapy/method/non_aggregative.py index 4104a3f..eff2283 100644 --- a/quapy/method/non_aggregative.py +++ b/quapy/method/non_aggregative.py @@ -20,21 +20,23 @@ class MaximumLikelihoodPrevalenceEstimation(BaseQuantifier): def __init__(self): self._classes_ = None - def fit(self, data: LabelledCollection): + def fit(self, X, y): """ Computes the training prevalence and stores it. - :param data: the training sample + :param X: array-like of shape `(n_samples, n_features)`, the training instances + :param y: array-like of shape `(n_samples,)`, the labels :return: self """ - self.estimated_prevalence = data.prevalence() + self._classes_ = F.classes_from_labels(labels=y) + self.estimated_prevalence = F.prevalence_from_labels(y, classes=self._classes_) return self - def quantify(self, instances): + def predict(self, X): """ Ignores the input instances and returns, as the class prevalence estimates, the training prevalence. - :param instances: array-like (ignored) + :param X: array-like (ignored) :return: the class prevalence seen during training """ return self.estimated_prevalence @@ -100,7 +102,7 @@ class DMx(BaseQuantifier): return distributions - def fit(self, data: LabelledCollection): + def fit(self, X, y): """ Generates the validation distributions out of the training data (covariates). The validation distributions have shape `(n, nfeats, nbins)`, with `n` the number of classes, `nfeats` @@ -109,33 +111,33 @@ class DMx(BaseQuantifier): training data labelled with class `i`; while `dij = di[j]` is the discrete distribution for feature j in training data labelled with class `i`, and `dij[k]` is the fraction of instances with a value in the `k`-th bin. - :param data: the training set + :param X: array-like of shape `(n_samples, n_features)`, the training instances + :param y: array-like of shape `(n_samples,)`, the labels """ - X, y = data.Xy - self.nfeats = X.shape[1] self.feat_ranges = _get_features_range(X) + n_classes = len(np.unique(y)) self.validation_distribution = np.asarray( - [self.__get_distributions(X[y==cat]) for cat in range(data.n_classes)] + [self.__get_distributions(X[y==cat]) for cat in range(n_classes)] ) return self - def quantify(self, instances): + def predict(self, X): """ Searches for the mixture model parameter (the sought prevalence values) that yields a validation distribution (the mixture) that best matches the test distribution, in terms of the divergence measure of choice. 
The matching is computed as the average dissimilarity (in terms of the dissimilarity measure of choice) + between all feature-specific discrete distributions. - :param instances: instances in the sample + :param X: instances in the sample :return: a vector of class prevalence estimates """ - assert instances.shape[1] == self.nfeats, f'wrong shape; expected {self.nfeats}, found {instances.shape[1]}' + assert X.shape[1] == self.nfeats, f'wrong shape; expected {self.nfeats}, found {X.shape[1]}' - test_distribution = self.__get_distributions(instances) + test_distribution = self.__get_distributions(X) divergence = get_divergence(self.divergence) n_classes, n_feats, nbins = self.validation_distribution.shape def loss(prev): @@ -147,53 +149,53 @@ class DMx(BaseQuantifier): return F.argmin_prevalence(loss, n_classes, method=self.search) -class ReadMe(BaseQuantifier): - - def __init__(self, bootstrap_trials=100, bootstrap_range=100, bagging_trials=100, bagging_range=25, **vectorizer_kwargs): - raise NotImplementedError('under development ...') - self.bootstrap_trials = bootstrap_trials - self.bootstrap_range = bootstrap_range - self.bagging_trials = bagging_trials - self.bagging_range = bagging_range - self.vectorizer_kwargs = vectorizer_kwargs - - def fit(self, data: LabelledCollection): - X, y = data.Xy - self.vectorizer = CountVectorizer(binary=True, **self.vectorizer_kwargs) - X = self.vectorizer.fit_transform(X) - self.class_conditional_X = {i: X[y==i] for i in range(data.classes_)} - - def quantify(self, instances): - X = self.vectorizer.transform(instances) - - # number of features - num_docs, num_feats = X.shape - - # bootstrap - p_boots = [] - for _ in range(self.bootstrap_trials): - docs_idx = np.random.choice(num_docs, size=self.bootstra_range, replace=False) - class_conditional_X = {i: X[docs_idx] for i, X in self.class_conditional_X.items()} - Xboot = X[docs_idx] - - # bagging - p_bags = [] - for _ in range(self.bagging_trials): - feat_idx = np.random.choice(num_feats, size=self.bagging_range, replace=False) - class_conditional_Xbag = {i: X[:, feat_idx] for i, X in class_conditional_X.items()} - Xbag = Xboot[:,feat_idx] - p = self.std_constrained_linear_ls(Xbag, class_conditional_Xbag) - p_bags.append(p) - p_boots.append(np.mean(p_bags, axis=0)) - - p_mean = np.mean(p_boots, axis=0) - p_std = np.std(p_bags, axis=0) - - return p_mean - - - def std_constrained_linear_ls(self, X, class_cond_X: dict): - pass +# class ReadMe(BaseQuantifier): +# +# def __init__(self, bootstrap_trials=100, bootstrap_range=100, bagging_trials=100, bagging_range=25, **vectorizer_kwargs): +# raise NotImplementedError('under development ...') +# self.bootstrap_trials = bootstrap_trials +# self.bootstrap_range = bootstrap_range +# self.bagging_trials = bagging_trials +# self.bagging_range = bagging_range +# self.vectorizer_kwargs = vectorizer_kwargs +# +# def fit(self, data: LabelledCollection): +# X, y = data.Xy +# self.vectorizer = CountVectorizer(binary=True, **self.vectorizer_kwargs) +# X = self.vectorizer.fit_transform(X) +# self.class_conditional_X = {i: X[y==i] for i in range(data.classes_)} +# +# def predict(self, X): +# X = self.vectorizer.transform(X) +# +# # number of features +# num_docs, num_feats = X.shape +# +# # bootstrap +# p_boots = [] +# for _ in range(self.bootstrap_trials): +# docs_idx = np.random.choice(num_docs, size=self.bootstrap_range, replace=False) +# class_conditional_X = {i: X[docs_idx] for i, X in self.class_conditional_X.items()} +# Xboot = X[docs_idx] +# +# # bagging +# p_bags = [] 
+# for _ in range(self.bagging_trials): +# feat_idx = np.random.choice(num_feats, size=self.bagging_range, replace=False) +# class_conditional_Xbag = {i: X[:, feat_idx] for i, X in class_conditional_X.items()} +# Xbag = Xboot[:,feat_idx] +# p = self.std_constrained_linear_ls(Xbag, class_conditional_Xbag) +# p_bags.append(p) +# p_boots.append(np.mean(p_bags, axis=0)) +# +# p_mean = np.mean(p_boots, axis=0) +# p_std = np.std(p_bags, axis=0) +# +# return p_mean +# +# +# def std_constrained_linear_ls(self, X, class_cond_X: dict): +# pass def _get_features_range(X): diff --git a/quapy/model_selection.py b/quapy/model_selection.py index ad2825f..0937fa8 100644 --- a/quapy/model_selection.py +++ b/quapy/model_selection.py @@ -86,14 +86,14 @@ class GridSearchQ(BaseQuantifier): self.n_jobs = qp._get_njobs(n_jobs) self.raise_errors = raise_errors self.verbose = verbose - self.__check_error(error) + self.__check_error_measure(error) assert isinstance(protocol, AbstractProtocol), 'unknown protocol' def _sout(self, msg): if self.verbose: print(f'[{self.__class__.__name__}:{self.model.__class__.__name__}]: {msg}') - def __check_error(self, error): + def __check_error_measure(self, error): if error in qp.error.QUANTIFICATION_ERROR: self.error = error elif isinstance(error, str): @@ -109,7 +109,7 @@ class GridSearchQ(BaseQuantifier): def job(cls_params): model.set_params(**cls_params) - predictions = model.classifier_fit_predict(self._training) + predictions = model.classifier_fit_predict(self._training_X, self._training_y) return predictions predictions, status, took = self._error_handler(job, cls_params) @@ -123,7 +123,8 @@ class GridSearchQ(BaseQuantifier): def job(q_params): model.set_params(**q_params) - model.aggregation_fit(predictions, self._training) + P, y = predictions + model.aggregation_fit(P, y) score = evaluation.evaluate(model, protocol=self.protocol, error_metric=self.error) return score @@ -136,7 +137,7 @@ class GridSearchQ(BaseQuantifier): def job(params): model.set_params(**params) - model.fit(self._training) + model.fit(self._training_X, self._training_y) score = evaluation.evaluate(model, protocol=self.protocol, error_metric=self.error) return score @@ -159,17 +160,19 @@ class GridSearchQ(BaseQuantifier): return False return True - def _compute_scores_aggregative(self, training): + def _compute_scores_aggregative(self, X, y): # break down the set of hyperparameters into two: classifier-specific, quantifier-specific cls_configs, q_configs = group_params(self.param_grid) # train all classifiers and get the predictions - self._training = training + self._training_X = X + self._training_y = y cls_outs = qp.util.parallel( self._prepare_classifier, cls_configs, seed=qp.environ.get('_R_SEED', None), - n_jobs=self.n_jobs + n_jobs=self.n_jobs, + asarray=False ) # filter out classifier configurations that yielded any error @@ -194,9 +197,10 @@ class GridSearchQ(BaseQuantifier): return aggr_outs - def _compute_scores_nonaggregative(self, training): + def _compute_scores_nonaggregative(self, X, y): configs = expand_grid(self.param_grid) - self._training = training + self._training_X = X + self._training_y = y scores = qp.util.parallel( self._prepare_nonaggr_model, configs, @@ -211,11 +215,12 @@ class GridSearchQ(BaseQuantifier): else: self._sout(f'error={status}') - def fit(self, training: LabelledCollection): + def fit(self, X, y): """ Learning routine. Fits methods with all combinations of hyperparameters and selects the one minimizing the error metric. 
- :param training: the training set on which to optimize the hyperparameters + :param X: array-like, training covariates + :param y: array-like, labels of training data :return: self """ @@ -231,9 +236,9 @@ class GridSearchQ(BaseQuantifier): self._sout(f'starting model selection with n_jobs={self.n_jobs}') if self._break_down_fit(): - results = self._compute_scores_aggregative(training) + results = self._compute_scores_aggregative(X, y) else: - results = self._compute_scores_nonaggregative(training) + results = self._compute_scores_nonaggregative(X, y) self.param_scores_ = {} self.best_score_ = None @@ -266,7 +271,10 @@ class GridSearchQ(BaseQuantifier): if isinstance(self.protocol, OnLabelledCollectionProtocol): tinit = time() self._sout(f'refitting on the whole development set') - self.best_model_.fit(training + self.protocol.get_labelled_collection()) + validation_collection = self.protocol.get_labelled_collection() + training_collection = LabelledCollection(X, y, classes=validation_collection.classes) + devel_collection = training_collection + validation_collection + self.best_model_.fit(*devel_collection.Xy) tend = time() - tinit self.refit_time_ = tend else: @@ -275,15 +283,15 @@ class GridSearchQ(BaseQuantifier): return self - def quantify(self, instances): + def predict(self, X): """Estimate class prevalence values using the best model found after calling the :meth:`fit` method. - :param instances: sample contanining the instances + :param X: sample containing the instances :return: an ndarray of shape `(n_classes)` with class prevalence estimates according to the best model found by the model selection process. """ assert hasattr(self, 'best_model_'), 'quantify called before fit' - return self.best_model().quantify(instances) + return self.best_model().predict(X) def set_params(self, **parameters): """Sets the hyper-parameters to explore. @@ -364,8 +372,8 @@ def cross_val_predict(quantifier: BaseQuantifier, data: LabelledCollection, nfol total_prev = np.zeros(shape=data.n_classes) for train, test in data.kFCV(nfolds=nfolds, random_state=random_state): - quantifier.fit(train) - fold_prev = quantifier.quantify(test.X) + quantifier.fit(*train.Xy) + fold_prev = quantifier.predict(test.X) rel_size = 1. * len(test) / len(data) total_prev += fold_prev*rel_size diff --git a/quapy/plot.py b/quapy/plot.py index 4d9b896..5c5e4b4 100644 --- a/quapy/plot.py +++ b/quapy/plot.py @@ -23,21 +23,29 @@ def binary_diagonal(method_names, true_prevs, estim_prevs, pos_class=1, title=No indicating which class is to be taken as the positive class. (For multiclass quantification problems, other plots like the :meth:`error_by_drift` might be preferable though). + The format convention is as follows: `method_names`, `true_prevs`, and `estim_prevs` are array-like of the same + length, with the i-th element describing the output of an independent experiment. The elements of `true_prevs` and + `estim_prevs` are `ndarrays` with coherent shape for the same experiment. Experiments for the same method on + different datasets can be used, in which case the method name can appear more than once in `method_names`.
+ :param method_names: array-like with the method names for each experiment - :param true_prevs: array-like with the true prevalence values (each being a ndarray with n_classes components) for - each experiment - :param estim_prevs: array-like with the estimated prevalence values (each being a ndarray with n_classes components) - for each experiment - :param pos_class: index of the positive class - :param title: the title to be displayed in the plot - :param show_std: whether or not to show standard deviations (represented by color bands). This might be inconvenient + :param true_prevs: array-like with the true prevalence values for each experiment. Each entry is an ndarray of + shape `(n_samples, n_classes)`. + :param estim_prevs: array-like with the estimated prevalence values for each experiment. Each entry is an ndarray of + shape `(n_samples, n_classes)`, and `n_samples` must coincide with the corresponding entry in + `true_prevs`. + :param pos_class: index of the positive class (default 1) + :param title: the title to be displayed in the plot (default None) + :param show_std: whether to show standard deviations (represented by color bands). This might be inconvenient for cases in which many methods are compared, or when the standard deviations are high (default True) - :param legend: whether or not to display the leyend (default True) - :param train_prev: if indicated (default is None), the training prevalence (for the positive class) is hightlighted - in the plot. This is convenient when all the experiments have been conducted in the same dataset. + :param legend: whether to display the legend (default True) + :param train_prev: if indicated (default is None), the training prevalence (for the positive class) is highlighted + in the plot. This is convenient when all the experiments have been conducted in the same dataset, or in + datasets with the same training prevalence. :param savepath: path where to save the plot. If not indicated (as default), the plot is shown. :param method_order: if indicated (default is None), imposes the order in which the methods are processed (i.e., listed in the legend and associated with matplotlib colors). + :return: returns (fig, ax) matplotlib objects for further customisation """ fig, ax = plt.subplots() ax.set_aspect('equal') @@ -78,13 +86,9 @@ def binary_diagonal(method_names, true_prevs, estim_prevs, pos_class=1, title=No if legend: ax.legend(loc='center left', bbox_to_anchor=(1, 0.5)) - # box = ax.get_position() - # ax.set_position([box.x0, box.y0, box.width * 0.8, box.height]) - # ax.legend(loc='lower center', - # bbox_to_anchor=(1, -0.5), - # ncol=(len(method_names)+1)//2) _save_or_show(savepath) + return fig, ax def binary_bias_global(method_names, true_prevs, estim_prevs, pos_class=1, title=None, savepath=None): @@ -92,14 +96,21 @@ def binary_bias_global(method_names, true_prevs, estim_prevs, pos_class=1, title Box-plots displaying the global bias (i.e., signed error computed as the estimated value minus the true value) for each quantification method with respect to a given positive class. + The format convention is as follows: `method_names`, `true_prevs`, and `estim_prevs` are array-like of the same + length, with the i-th element describing the output of an independent experiment. The elements of `true_prevs` and + `estim_prevs` are `ndarrays` with coherent shape for the same experiment.
Experiments for the same method on + different datasets can be used, in which case the method name can appear more than once in `method_names`. + :param method_names: array-like with the method names for each experiment - :param true_prevs: array-like with the true prevalence values (each being a ndarray with n_classes components) for - each experiment - :param estim_prevs: array-like with the estimated prevalence values (each being a ndarray with n_classes components) - for each experiment + :param true_prevs: array-like with the true prevalence values for each experiment. Each entry is an ndarray of + shape `(n_samples, n_classes)`. + :param estim_prevs: array-like with the estimated prevalence values for each experiment. Each entry is an ndarray of + shape `(n_samples, n_classes)`, and `n_samples` must coincide with the corresponding entry in + `true_prevs`. :param pos_class: index of the positive class - :param title: the title to be displayed in the plot + :param title: the title to be displayed in the plot (default None) :param savepath: path where to save the plot. If not indicated (as default), the plot is shown. + :return: returns (fig, ax) matplotlib objects for further customisation """ method_names, true_prevs, estim_prevs = _merge(method_names, true_prevs, estim_prevs) @@ -120,25 +131,34 @@ def binary_bias_global(method_names, true_prevs, estim_prevs, pos_class=1, title _save_or_show(savepath) + return fig, ax + def binary_bias_bins(method_names, true_prevs, estim_prevs, pos_class=1, title=None, nbins=5, colormap=cm.tab10, vertical_xticks=False, legend=True, savepath=None): """ Box-plots displaying the local bias (i.e., signed error computed as the estimated value minus the true value) - for different bins of (true) prevalence of the positive classs, for each quantification method. + for different bins of (true) prevalence of the positive class, for each quantification method. + + The format convention is as follows: `method_names`, `true_prevs`, and `estim_prevs` are array-like of the same + length, with the i-th element describing the output of an independent experiment. The elements of `true_prevs` and + `estim_prevs` are `ndarrays` with coherent shape for the same experiment. Experiments for the same method on + different datasets can be used, in which case the method name can appear more than once in `method_names`. :param method_names: array-like with the method names for each experiment - :param true_prevs: array-like with the true prevalence values (each being a ndarray with n_classes components) for - each experiment - :param estim_prevs: array-like with the estimated prevalence values (each being a ndarray with n_classes components) - for each experiment + :param true_prevs: array-like with the true prevalence values for each experiment. Each entry is an ndarray of + shape `(n_samples, n_classes)`. + :param estim_prevs: array-like with the estimated prevalence values for each experiment. Each entry is an ndarray of + shape `(n_samples, n_classes)`, and `n_samples` must coincide with the corresponding entry in + `true_prevs`.
:param pos_class: index of the positive class - :param title: the title to be displayed in the plot - :param nbins: number of bins + :param title: the title to be displayed in the plot (default None) + :param nbins: number of bins (default 5) :param colormap: the matplotlib colormap to use (default cm.tab10) :param vertical_xticks: whether or not to add secondary grid (default is False) :param legend: whether or not to display the legend (default is True) :param savepath: path where to save the plot. If not indicated (as default), the plot is shown. + :return: returns (fig, ax) matplotlib objects for further customisation """ from pylab import boxplot, plot, setp @@ -210,13 +230,15 @@ def binary_bias_bins(method_names, true_prevs, estim_prevs, pos_class=1, title=N _save_or_show(savepath) + return fig, ax + def error_by_drift(method_names, true_prevs, estim_prevs, tr_prevs, n_bins=20, error_name='ae', show_std=False, show_density=True, show_legend=True, logscale=False, - title=f'Quantification error as a function of distribution shift', + title=None, vlines=None, method_order=None, savepath=None): @@ -227,11 +249,17 @@ def error_by_drift(method_names, true_prevs, estim_prevs, tr_prevs, fare in different regions of the prior probability shift spectrum (e.g., in the low-shift regime vs. in the high-shift regime). + The format convention is as follows: `method_names`, `true_prevs`, and `estim_prevs` are array-like of the same + length, with the i-th element describing the output of an independent experiment. The elements of `true_prevs` and + `estim_prevs` are `ndarrays` with coherent shape for the same experiment. Experiments for the same method on + different datasets can be used, in which case the method name can appear more than once in `method_names`. + :param method_names: array-like with the method names for each experiment - :param true_prevs: array-like with the true prevalence values (each being a ndarray with n_classes components) for - each experiment - :param estim_prevs: array-like with the estimated prevalence values (each being a ndarray with n_classes components) - for each experiment + :param true_prevs: array-like with the true prevalence values for each experiment. Each entry is an ndarray of + shape `(n_samples, n_classes)`. + :param estim_prevs: array-like with the estimated prevalence values for each experiment. Each entry is an ndarray of + shape `(n_samples, n_classes)`, and `n_samples` must coincide with the corresponding entry in + `true_prevs`. :param tr_prevs: training prevalence of each experiment :param n_bins: number of bins in which the y-axis is to be divided (default is 20) :param error_name: a string representing the name of an error function (as defined in `quapy.error`, default is "ae") @@ -239,12 +267,13 @@ def error_by_drift(method_names, true_prevs, estim_prevs, tr_prevs, :param show_density: whether or not to display the distribution of experiments for each bin (default is True) :param show_legend: whether or not to display the legend of the chart (default is True) :param logscale: whether or not to log-scale the y-error measure (default is False) - :param title: title of the plot (default is "Quantification error as a function of distribution shift") + :param title: title of the plot (default is None) :param vlines: array-like list of values (default is None). If indicated, highlights some regions of the space using vertical dotted lines.
:param method_order: if indicated (default is None), imposes the order in which the methods are processed (i.e., listed in the legend and associated with matplotlib colors). :param savepath: path where to save the plot. If not indicated (as default), the plot is shown. + :return: returns (fig, ax) matplotlib objects for further customisation """ fig, ax = plt.subplots() @@ -253,14 +282,14 @@ def error_by_drift(method_names, true_prevs, estim_prevs, tr_prevs, x_error = qp.error.ae y_error = getattr(qp.error, error_name) + if method_order is None: + method_order = [] + # get all data as a dictionary {'m':{'x':ndarray, 'y':ndarray}} where 'm' is a method name (in the same # order as in method_order (if specified), and where 'x' are the train-test shifts (computed according to # x_error function) and 'y' is the estim-test shift (computed according to y_error) data = _join_data_by_drift(method_names, true_prevs, estim_prevs, tr_prevs, x_error, y_error, method_order) - if method_order is None: - method_order = method_names _set_colors(ax, n_methods=len(method_order)) bins = np.linspace(0, 1, n_bins+1) @@ -313,11 +342,11 @@ def error_by_drift(method_names, true_prevs, estim_prevs, tr_prevs, ax2.spines['right'].set_color('g') ax2.tick_params(axis='y', colors='g') - ax.set(xlabel=f'Distribution shift between training set and test sample', - ylabel=f'{error_name.upper()} (true distribution, predicted distribution)', + ax.set(xlabel=f'Prior shift between training set and test sample', + ylabel=f'{error_name.upper()} (true prev, predicted prev)', title=title) - box = ax.get_position() - ax.set_position([box.x0, box.y0, box.width * 0.8, box.height]) + # box = ax.get_position() + # ax.set_position([box.x0, box.y0, box.width * 0.8, box.height]) if vlines: for vline in vlines: ax.axvline(vline, 0, 1, linestyle='--', color='k') @@ -327,14 +356,15 @@ def error_by_drift(method_names, true_prevs, estim_prevs, tr_prevs, # nice scale for the logarithmic axis ax.set_ylim(0,10 ** math.ceil(math.log10(max_y))) - if show_legend: - fig.legend(loc='lower center', + fig.legend(loc='center left', bbox_to_anchor=(1, 0.5), - ncol=(len(method_names)+1)//2) - + ncol=1) + _save_or_show(savepath) + return fig, ax + def brokenbar_supremacy_by_drift(method_names, true_prevs, estim_prevs, tr_prevs, n_bins=20, binning='isomerous', @@ -350,11 +380,17 @@ def brokenbar_supremacy_by_drift(method_names, true_prevs, estim_prevs, tr_prevs plot is displayed on top, showing the distribution of experiments for each bin (when binning="isometric") or the percentile points of the distribution (when binning="isomerous"). + The format convention is as follows: `method_names`, `true_prevs`, and `estim_prevs` are array-like of the same + length, with the i-th element describing the output of an independent experiment. The elements of `true_prevs` and + `estim_prevs` are `ndarrays` with coherent shape for the same experiment. Experiments for the same method on + different datasets can be used, in which case the method name can appear more than once in `method_names`. + :param method_names: array-like with the method names for each experiment - :param true_prevs: array-like with the true prevalence values (each being a ndarray with n_classes components) for - each experiment - :param estim_prevs: array-like with the estimated prevalence values (each being a ndarray with n_classes components) - for each experiment + :param true_prevs: array-like with the true prevalence values for each experiment. 
Each entry is an ndarray of + shape `(n_samples, n_classes)`. + :param estim_prevs: array-like with the estimated prevalence values for each experiment. Each entry is an ndarray of + shape `(n_samples, n_classes)`, and `n_samples` must coincide with the corresponding entry in + `true_prevs`. :param tr_prevs: training prevalence of each experiment :param n_bins: number of bins in which the y-axis is to be divided (default is 20) :param binning: type of binning, either "isomerous" (default) or "isometric" @@ -371,13 +407,16 @@ def brokenbar_supremacy_by_drift(method_names, true_prevs, estim_prevs, tr_prevs :param method_order: if indicated (default is None), imposes the order in which the methods are processed (i.e., listed in the legend and associated with matplotlib colors). :param savepath: path where to save the plot. If not indicated (as default), the plot is shown. - :return: + :return: returns (fig, ax) matplotlib objects for further customisation """ assert binning in ['isomerous', 'isometric'], 'unknown binning type; valid types are "isomerous" and "isometric"' x_error = getattr(qp.error, x_error) y_error = getattr(qp.error, y_error) + if method_order is None: + method_order = [] + # get all data as a dictionary {'m':{'x':ndarray, 'y':ndarray}} where 'm' is a method name (in the same # order as in method_order (if specified), and where 'x' are the train-test shifts (computed according to # x_error function) and 'y' is the estim-test shift (computed according to y_error) @@ -518,6 +557,8 @@ def brokenbar_supremacy_by_drift(method_names, true_prevs, estim_prevs, tr_prevs _save_or_show(savepath) + return fig, ax + def _merge(method_names, true_prevs, estim_prevs): ndims = true_prevs[0].shape[1] @@ -535,8 +576,9 @@ def _merge(method_names, true_prevs, estim_prevs): def _set_colors(ax, n_methods): NUM_COLORS = n_methods - cm = plt.get_cmap('tab20') - ax.set_prop_cycle(color=[cm(1. * i / NUM_COLORS) for i in range(NUM_COLORS)]) + if NUM_COLORS>10: + cm = plt.get_cmap('tab20') + ax.set_prop_cycle(color=[cm(1. 
* i / NUM_COLORS) for i in range(NUM_COLORS)]) def _save_or_show(savepath): diff --git a/quapy/tests/test_datasets.py b/quapy/tests/test_datasets.py index a8587b2..cc09f16 100644 --- a/quapy/tests/test_datasets.py +++ b/quapy/tests/test_datasets.py @@ -17,8 +17,8 @@ class TestDatasets(unittest.TestCase): def _check_dataset(self, dataset): q = self.new_quantifier() print(f'testing method {q} in {dataset.name}...', end='') - q.fit(dataset.training) - estim_prevalences = q.quantify(dataset.test.instances) + q.fit(*dataset.training.Xy) + estim_prevalences = q.predict(dataset.test.instances) self.assertTrue(F.check_prevalence_vector(estim_prevalences)) print(f'[done]') @@ -26,7 +26,7 @@ class TestDatasets(unittest.TestCase): for X, p in gen(): if vectorizer is not None: X = vectorizer.transform(X) - estim_prevalences = q.quantify(X) + estim_prevalences = q.predict(X) self.assertTrue(F.check_prevalence_vector(estim_prevalences)) max_samples_test -= 1 if max_samples_test == 0: @@ -52,18 +52,12 @@ class TestDatasets(unittest.TestCase): def test_UCIBinaryDataset(self): for dataset_name in UCI_BINARY_DATASETS: - try: - print(f'loading dataset {dataset_name}...', end='') - dataset = fetch_UCIBinaryDataset(dataset_name) - dataset.stats() - dataset.reduce() - print(f'[done]') - self._check_dataset(dataset) - except FileNotFoundError as fnfe: - if dataset_name == 'pageblocks.5' and fnfe.args[0].find( - 'If this is the first time you attempt to load this dataset') > 0: - print('The pageblocks.5 dataset requires some hand processing to be usable; skipping this test.') - continue + print(f'loading dataset {dataset_name}...', end='') + dataset = fetch_UCIBinaryDataset(dataset_name) + dataset.stats() + dataset.reduce() + print(f'[done]') + self._check_dataset(dataset) def test_UCIMultiDataset(self): for dataset_name in UCI_MULTICLASS_DATASETS: @@ -83,18 +77,18 @@ class TestDatasets(unittest.TestCase): return for dataset_name in LEQUA2022_VECTOR_TASKS: - print(f'loading dataset {dataset_name}...', end='') + print(f'LeQua2022: loading dataset {dataset_name}...', end='') train, gen_val, gen_test = fetch_lequa2022(dataset_name) train.stats() n_classes = train.n_classes train = train.sampling(100, *F.uniform_prevalence(n_classes)) q = self.new_quantifier() - q.fit(train) + q.fit(*train.Xy) self._check_samples(gen_val, q, max_samples_test=5) self._check_samples(gen_test, q, max_samples_test=5) for dataset_name in LEQUA2022_TEXT_TASKS: - print(f'loading dataset {dataset_name}...', end='') + print(f'LeQua2022: loading dataset {dataset_name}...', end='') train, gen_val, gen_test = fetch_lequa2022(dataset_name) train.stats() n_classes = train.n_classes @@ -102,10 +96,26 @@ class TestDatasets(unittest.TestCase): tfidf = TfidfVectorizer() train.instances = tfidf.fit_transform(train.instances) q = self.new_quantifier() - q.fit(train) + q.fit(*train.Xy) self._check_samples(gen_val, q, max_samples_test=5, vectorizer=tfidf) self._check_samples(gen_test, q, max_samples_test=5, vectorizer=tfidf) + def test_lequa2024(self): + if os.environ.get('QUAPY_TESTS_OMIT_LARGE_DATASETS'): + print("omitting test_lequa2024 because QUAPY_TESTS_OMIT_LARGE_DATASETS is set") + return + + for task in LEQUA2024_TASKS: + print(f'LeQua2024: loading task {task}...', end='') + train, gen_val, gen_test = fetch_lequa2024(task, merge_T3=True) + train.stats() + n_classes = train.n_classes + train = train.sampling(100, *F.uniform_prevalence(n_classes)) + q = self.new_quantifier() + q.fit(*train.Xy) + self._check_samples(gen_val, q, max_samples_test=5) + 
self._check_samples(gen_test, q, max_samples_test=5) + def test_IFCB(self): if os.environ.get('QUAPY_TESTS_OMIT_LARGE_DATASETS'): diff --git a/quapy/tests/test_evaluation.py b/quapy/tests/test_evaluation.py index 137a38a..05d661a 100644 --- a/quapy/tests/test_evaluation.py +++ b/quapy/tests/test_evaluation.py @@ -29,7 +29,7 @@ class EvalTestCase(unittest.TestCase): time.sleep(1) return super().predict_proba(X) - emq = EMQ(SlowLR()).fit(train) + emq = EMQ(SlowLR()).fit(*train.Xy) tinit = time() score = qp.evaluation.evaluate(emq, protocol, error_metric='mae', verbose=True, aggr_speedup='force') @@ -41,14 +41,14 @@ class EvalTestCase(unittest.TestCase): def __init__(self, cls): self.emq = EMQ(cls) - def quantify(self, instances): - return self.emq.quantify(instances) + def predict(self, X): + return self.emq.predict(X) - def fit(self, data): - self.emq.fit(data) + def fit(self, X, y): + self.emq.fit(X, y) return self - emq = NonAggregativeEMQ(SlowLR()).fit(train) + emq = NonAggregativeEMQ(SlowLR()).fit(*train.Xy) tinit = time() score = qp.evaluation.evaluate(emq, protocol, error_metric='mae', verbose=True) @@ -69,7 +69,7 @@ class EvalTestCase(unittest.TestCase): protocol = qp.protocol.APP(test, random_state=0) - q = PCC(LogisticRegression()).fit(train) + q = PCC(LogisticRegression()).fit(*train.Xy) single_errors = list(QUANTIFICATION_ERROR_SINGLE_NAMES) averaged_errors = ['m'+e for e in single_errors] diff --git a/quapy/tests/test_methods.py b/quapy/tests/test_methods.py index cf5bf39..533bf1a 100644 --- a/quapy/tests/test_methods.py +++ b/quapy/tests/test_methods.py @@ -10,14 +10,17 @@ from quapy.method import AGGREGATIVE_METHODS, BINARY_METHODS, NON_AGGREGATIVE_ME from quapy.functional import check_prevalence_vector # a random selection of composed methods to test the qunfold integration +from quapy.method.composable import check_compatible_qunfold_version + from quapy.method.composable import ( ComposableQuantifier, LeastSquaresLoss, HellingerSurrogateLoss, ClassTransformer, HistogramTransformer, - CVClassifier, + CVClassifier ) + COMPOSABLE_METHODS = [ ComposableQuantifier( # ACC LeastSquaresLoss(), @@ -48,10 +51,10 @@ class TestMethods(unittest.TestCase): print(f'skipping the test of binary model {model.__name__} on multiclass dataset {dataset.name}') continue - q = model(learner) + q = model(learner, fit_classifier=False) print('testing', q) - q.fit(dataset.training, fit_classifier=False) - estim_prevalences = q.quantify(dataset.test.X) + q.fit(*dataset.training.Xy) + estim_prevalences = q.predict(dataset.test.X) self.assertTrue(check_prevalence_vector(estim_prevalences)) def test_non_aggregative(self): @@ -64,12 +67,11 @@ class TestMethods(unittest.TestCase): q = model() print(f'testing {q} on dataset {dataset.name}') - q.fit(dataset.training) - estim_prevalences = q.quantify(dataset.test.X) + q.fit(*dataset.training.Xy) + estim_prevalences = q.predict(dataset.test.X) self.assertTrue(check_prevalence_vector(estim_prevalences)) def test_ensembles(self): - qp.environ['SAMPLE_SIZE'] = 10 base_quantifier = ACC(LogisticRegression()) @@ -80,8 +82,8 @@ class TestMethods(unittest.TestCase): print(f'testing {base_quantifier} on dataset {dataset.name} with {policy=}') ensemble = Ensemble(quantifier=base_quantifier, size=3, policy=policy, n_jobs=-1) - ensemble.fit(dataset.training) - estim_prevalences = ensemble.quantify(dataset.test.instances) + ensemble.fit(*dataset.training.Xy) + estim_prevalences = ensemble.predict(dataset.test.instances) 
self.assertTrue(check_prevalence_vector(estim_prevalences)) def test_quanet(self): @@ -106,17 +108,23 @@ class TestMethods(unittest.TestCase): from quapy.method.meta import QuaNet model = QuaNet(learner, device='cpu', n_epochs=2, tr_iter_per_poch=10, va_iter_per_poch=10, patience=2) - model.fit(dataset.training) - estim_prevalences = model.quantify(dataset.test.instances) + model.fit(*dataset.training.Xy) + estim_prevalences = model.predict(dataset.test.instances) self.assertTrue(check_prevalence_vector(estim_prevalences)) def test_composable(self): - for dataset in TestMethods.datasets: - for q in COMPOSABLE_METHODS: - print('testing', q) - q.fit(dataset.training) - estim_prevalences = q.quantify(dataset.test.X) - self.assertTrue(check_prevalence_vector(estim_prevalences)) + from packaging.version import Version + if check_compatible_qunfold_version(): + for dataset in TestMethods.datasets: + for q in COMPOSABLE_METHODS: + print('testing', q) + q.fit(*dataset.training.Xy) + estim_prevalences = q.predict(dataset.test.X) + print(estim_prevalences) + self.assertTrue(check_prevalence_vector(estim_prevalences)) + else: + from quapy.method.composable import __old_version_message + print(__old_version_message) if __name__ == '__main__': diff --git a/quapy/tests/test_modsel.py b/quapy/tests/test_modsel.py index 36b35ca..c13b665 100644 --- a/quapy/tests/test_modsel.py +++ b/quapy/tests/test_modsel.py @@ -26,7 +26,7 @@ class ModselTestCase(unittest.TestCase): app = APP(validation, sample_size=100, random_state=1) q = GridSearchQ( q, param_grid, protocol=app, error='mae', refit=False, timeout=-1, verbose=True, n_jobs=-1 - ).fit(training) + ).fit(*training.Xy) print('best params', q.best_params_) print('best score', q.best_score_) @@ -51,7 +51,7 @@ class ModselTestCase(unittest.TestCase): tinit = time.time() modsel = GridSearchQ( q, param_grid, protocol=app, error='mae', refit=False, timeout=-1, n_jobs=1, verbose=True - ).fit(training) + ).fit(*training.Xy) tend_seq = time.time()-tinit best_c_seq = modsel.best_params_['classifier__C'] print(f'[done] took {tend_seq:.2f}s best C = {best_c_seq}') @@ -60,7 +60,7 @@ class ModselTestCase(unittest.TestCase): tinit = time.time() modsel = GridSearchQ( q, param_grid, protocol=app, error='mae', refit=False, timeout=-1, n_jobs=-1, verbose=True - ).fit(training) + ).fit(*training.Xy) tend_par = time.time() - tinit best_c_par = modsel.best_params_['classifier__C'] print(f'[done] took {tend_par:.2f}s best C = {best_c_par}') @@ -90,7 +90,7 @@ class ModselTestCase(unittest.TestCase): q, param_grid, protocol=app, timeout=3, n_jobs=-1, verbose=True, raise_errors=True ) with self.assertRaises(TimeoutError): - modsel.fit(training) + modsel.fit(*training.Xy) print('Expecting ValueError to be raised') modsel = GridSearchQ( @@ -99,7 +99,7 @@ class ModselTestCase(unittest.TestCase): with self.assertRaises(ValueError): # this exception is not raised because of the timeout, but because no combination of hyperparams # succeeded (in this case, a ValueError is raised, regardless of "raise_errors" - modsel.fit(training) + modsel.fit(*training.Xy) if __name__ == '__main__': diff --git a/quapy/tests/test_protocols.py b/quapy/tests/test_protocols.py index 87bd358..4850bd4 100644 --- a/quapy/tests/test_protocols.py +++ b/quapy/tests/test_protocols.py @@ -71,7 +71,7 @@ class TestProtocols(unittest.TestCase): # surprisingly enough, for some n_prevalences the test fails, notwithstanding # everything is correct. 
The problem is that in function APP.prevalence_grid() # there is sometimes one rounding error that gets accumulated and - # surpasses 1.0 (by a very small float value, 0.0000000000002 or sthe like) + # surpasses 1.0 (by a very small float value, 0.0000000000002 or the like) # so these tuples are mistakenly removed... I have tried with np.close, and # other workarounds, but it eventually happens that there is some negative probability # in the sampling function... diff --git a/quapy/tests/test_replicability.py b/quapy/tests/test_replicability.py index 434d44b..a174992 100644 --- a/quapy/tests/test_replicability.py +++ b/quapy/tests/test_replicability.py @@ -13,17 +13,18 @@ class TestReplicability(unittest.TestCase): def test_prediction_replicability(self): dataset = qp.datasets.fetch_UCIBinaryDataset('yeast') + train, test = dataset.train_test with qp.util.temp_seed(0): lr = LogisticRegression(random_state=0, max_iter=10000) pacc = PACC(lr) - prev = pacc.fit(dataset.training).quantify(dataset.test.X) + prev = pacc.fit(*train.Xy).predict(test.X) str_prev1 = strprev(prev, prec=5) with qp.util.temp_seed(0): lr = LogisticRegression(random_state=0, max_iter=10000) pacc = PACC(lr) - prev2 = pacc.fit(dataset.training).quantify(dataset.test.X) + prev2 = pacc.fit(*train.Xy).predict(test.X) str_prev2 = strprev(prev2, prec=5) self.assertEqual(str_prev1, str_prev2) @@ -83,19 +84,19 @@ class TestReplicability(unittest.TestCase): test = test.sampling(500, *[0.1, 0.0, 0.1, 0.1, 0.2, 0.5, 0.0]) with qp.util.temp_seed(10): - pacc = PACC(LogisticRegression(), val_split=2, n_jobs=2) - pacc.fit(train, val_split=0.5) - prev1 = F.strprev(pacc.quantify(test.instances)) + pacc = PACC(LogisticRegression(), val_split=.5, n_jobs=2) + pacc.fit(*train.Xy) + prev1 = F.strprev(pacc.predict(test.instances)) with qp.util.temp_seed(0): - pacc = PACC(LogisticRegression(), val_split=2, n_jobs=2) - pacc.fit(train, val_split=0.5) - prev2 = F.strprev(pacc.quantify(test.instances)) + pacc = PACC(LogisticRegression(), val_split=.5, n_jobs=2) + pacc.fit(*train.Xy) + prev2 = F.strprev(pacc.predict(test.instances)) with qp.util.temp_seed(0): - pacc = PACC(LogisticRegression(), val_split=2, n_jobs=2) - pacc.fit(train, val_split=0.5) - prev3 = F.strprev(pacc.quantify(test.instances)) + pacc = PACC(LogisticRegression(), val_split=.5, n_jobs=2) + pacc.fit(*train.Xy) + prev3 = F.strprev(pacc.predict(test.instances)) print(prev1) print(prev2)
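Taken together, these changes complete the migration from the `fit(LabelledCollection)` / `quantify(instances)` interface to `fit(X, y)` / `predict(X)`. What follows is a minimal sketch of client code under the new interface; it runs on synthetic data, and the names, the choice of PACC, and the hyperparameter values are illustrative (they mirror test_modsel.py above), not prescriptive:

```python
# Minimal sketch of the v0.2.0 interface on synthetic data (illustrative only;
# PACC and the hyperparameter grid are arbitrary choices).
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from quapy.data import LabelledCollection
from quapy.method.aggregative import PACC
from quapy.model_selection import GridSearchQ
from quapy.protocol import APP

# synthetic binary data wrapped in a LabelledCollection
X, y = make_classification(n_samples=2000, n_classes=2, random_state=0)
training, test = LabelledCollection(X, y).split_stratified(train_prop=0.7)
training, validation = training.split_stratified(train_prop=0.6)

param_grid = {'classifier__C': [0.1, 1.0, 10.0]}
app = APP(validation, sample_size=100, random_state=1)

modsel = GridSearchQ(PACC(LogisticRegression()), param_grid, protocol=app, error='mae')
modsel.fit(*training.Xy)             # fit now takes (X, y); *.Xy unpacks a LabelledCollection
estim_prev = modsel.predict(test.X)  # predict() supersedes quantify(), kept as an alias
```

The `*collection.Xy` idiom used throughout the updated tests is the intended bridge between `LabelledCollection` objects and the new `(X, y)` signature.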