From ad11b86168ba09d7cb876c3f9cdba7f4f299aaec Mon Sep 17 00:00:00 2001
From: Alejandro Moreo <alejandro.moreo@isti.cnr.it>
Date: Thu, 30 May 2024 10:53:53 +0200
Subject: [PATCH] add the N_JOBS environment variable, and add a default
 classifier (sklearn's LogisticRegression) for aggregative quantifiers when
 none is specified

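A minimal sketch of the intended usage (assuming the default environment):

> import quapy as qp
> pacc = qp.method.aggregative.PACC()  # equivalent to PACC(LogisticRegression(max_iter=3000))

and, from the shell:

> N_JOBS=-1 python3 your_script.py  # sets qp.environ['N_JOBS'] for the whole run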
---
 CHANGE_LOG.txt                     | 24 ++++++++++---
 examples/0.basics.py               |  5 +--
 examples/1.model_selection.py      | 16 +++++----
 examples/8.ucimulti_experiments.py |  3 +-
 quapy/__init__.py                  | 24 +++++++++++--
 quapy/method/_kdey.py              | 55 +++++++++++++-----------------
 quapy/method/_threshold_optim.py   | 14 ++++----
 quapy/method/aggregative.py        | 42 +++++++++++------------
 quapy/model_selection.py           |  2 +-
 9 files changed, 108 insertions(+), 77 deletions(-)

diff --git a/CHANGE_LOG.txt b/CHANGE_LOG.txt
index c3e1e64..8c99a6b 100644
--- a/CHANGE_LOG.txt
+++ b/CHANGE_LOG.txt
@@ -1,10 +1,26 @@
 Change Log 0.1.9
 ----------------
 - [TODO] add LeQua2024
-- [TODO] add njobs to env
-- [TODO] add basic examples
-- [TODO] add default classifier to env
-- [TODO] add default classifier to env
+
+- Added a default classifier for aggregative quantifiers, which can now be instantiated without specifying
+    the classifier. The default classifier can be accessed in qp.environ['DEFAULT_CLS'] and is assigned to
+    sklearn.linear_model.LogisticRegression(max_iter=3000). If the classifier is not specified, a clone
+    of this default classifier is used. E.g.:
+    > pacc = PACC()
+    is equivalent to:
+    > pacc = PACC(classifier=LogisticRegression(max_iter=3000))
+
+- Improved error logging in model selection. In v0.1.8 only Status.INVALID was reported; in v0.1.9 it is
+    now accompanied by a textual description of the error
+
+- The number of parallel workers can now be set via an environment variable by running, e.g.:
+    > N_JOBS=10 python3 your_script.py
+    which has the same effect as writing the following code at the beginning of your_script.py:
+    > import quapy as qp
+    > qp.environ["N_JOBS"] = 10
+
+- Some examples have been added to the ./examples/ dir, which now contains numbered examples from basics (0)
+    to advanced topics (higher numbers)
 
 - Moved the wiki documents to the ./docs/ folder so that they become editable via PR for the community
 
diff --git a/examples/0.basics.py b/examples/0.basics.py
index ea3a7c1..6325434 100644
--- a/examples/0.basics.py
+++ b/examples/0.basics.py
@@ -33,9 +33,10 @@ import quapy.functional as F  # <- this module has some functional utilities, li
 print(f'training prevalence = {F.strprev(train.prevalence())}')
 
 # let us train one quantifier, for example, PACC using a sklearn's Logistic Regressor as the underlying classifier
-classifier = LogisticRegression()
+# classifier = LogisticRegression()
 
-pacc = qp.method.aggregative.PACC(classifier)
+# pacc = qp.method.aggregative.PACC(classifier)
+pacc = qp.method.aggregative.PACC()
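+# with no arguments, PACC uses a clone of qp.environ['DEFAULT_CLS'],
+# i.e., LogisticRegression(max_iter=3000)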
 
 print(f'training {pacc}')
 pacc.fit(train)
diff --git a/examples/1.model_selection.py b/examples/1.model_selection.py
index 130b542..61b7087 100644
--- a/examples/1.model_selection.py
+++ b/examples/1.model_selection.py
@@ -1,10 +1,7 @@
 import quapy as qp
-from method._kdey import KDEyML
-from quapy.method.non_aggregative import DMx
-from quapy.protocol import APP, UPP
+from quapy.protocol import UPP
 from quapy.method.aggregative import DMy
 from sklearn.linear_model import LogisticRegression
-from examples.comparing_gridsearch import OLD_GridSearchQ
 import numpy as np
 from time import time
 
@@ -12,10 +9,15 @@ from time import time
 In this example, we show how to perform model selection on a DistributionMatching quantifier.
 """
 
-model = DMy(LogisticRegression())
+model = DMy()
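+# note: with no classifier specified, DMy falls back on a clone of qp.environ['DEFAULT_CLS']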
 
 qp.environ['SAMPLE_SIZE'] = 100
-qp.environ['N_JOBS'] = -1
+
+print(f'running model selection with N_JOBS={qp.environ["N_JOBS"]}; '
+      f'to increase the number of jobs use:\n> N_JOBS=-1 python3 1.model_selection.py\n'
+      f'alternatively, you can set this variable within the script as:\n'
+      f'import quapy as qp\n'
+      f'qp.environ["N_JOBS"]=-1')
 
 training, test = qp.datasets.fetch_UCIMulticlassDataset('letter').train_test
 
@@ -42,7 +44,7 @@ with qp.util.temp_seed(0):
     # different configurations of the quantifier. In other words, quapy avoids to train
     # the classifier 7x7 times.
     param_grid = {
-        'classifier__C': np.logspace(-3,3,7),
+        'classifier__C': np.logspace(-3, 3, 7),
         'nbins': [2, 3, 4, 5, 10, 15, 20]
     }
 
diff --git a/examples/8.ucimulti_experiments.py b/examples/8.ucimulti_experiments.py
index 5193376..e2a8d97 100644
--- a/examples/8.ucimulti_experiments.py
+++ b/examples/8.ucimulti_experiments.py
@@ -7,7 +7,7 @@ import numpy as np
 from sklearn.linear_model import LogisticRegression
 
 import quapy as qp
-from quapy.method.aggregative import PACC, EMQ, KDEyML
+from quapy.method.aggregative import PACC, EMQ
 from quapy.model_selection import GridSearchQ
 from quapy.protocol import UPP
 from pathlib import Path
@@ -52,6 +52,7 @@ def load_timings(result_path):
     df = pd.read_csv(result_path+'.csv', sep='\t')
     return timings | df.pivot_table(index='Dataset', columns='Method', values='t_train').to_dict()
 
+
 if __name__ == '__main__':
 
     qp.environ['SAMPLE_SIZE'] = 500
diff --git a/quapy/__init__.py b/quapy/__init__.py
index 8e4ceb2..db34bbb 100644
--- a/quapy/__init__.py
+++ b/quapy/__init__.py
@@ -1,15 +1,18 @@
 """QuaPy module for quantification"""
+from sklearn.linear_model import LogisticRegression
+
 from quapy.data import datasets
 from . import error
 from . import data
 from . import functional
-# from . import method
+from . import method
 from . import evaluation
 from . import protocol
 from . import plot
 from . import util
 from . import model_selection
 from . import classification
+import os
 
 __version__ = '0.1.9'
 
@@ -20,7 +23,8 @@ environ = {
     'PAD_TOKEN': '[PAD]',
     'PAD_INDEX': 1,
     'SVMPERF_HOME': './svm_perf_quantification',
-    'N_JOBS': 1
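+    # 'N_JOBS' is taken from the homonymous environment variable, if defined (defaults to 1);
+    # 'DEFAULT_CLS' is the classifier used by aggregative quantifiers when none is specified
+    # (quantifiers receive a clone of this instance, not the instance itself)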
+    'N_JOBS': int(os.getenv('N_JOBS', 1)),
+    'DEFAULT_CLS': LogisticRegression(max_iter=3000)
 }
 
 
@@ -48,3 +52,19 @@ def _get_sample_size(sample_size):
     if sample_size is None:
         raise ValueError('neither sample_size nor qp.environ["SAMPLE_SIZE"] have been specified')
     return sample_size
+
+
+def _get_classifier(classifier):
+    """
+    If `classifier` is None, then it returns a clone of `environ['DEFAULT_CLS']`;
+    otherwise, it returns `classifier` unchanged.
+
+    :param classifier: sklearn's estimator or None
+    :return: sklearn's estimator
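+
+    A sketch of the intended behavior (assuming the default environment):
+
+    >>> import quapy as qp
+    >>> clf = qp._get_classifier(None)
+    >>> clf is qp.environ['DEFAULT_CLS']  # a fresh clone, never the shared default
+    False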
+    """
+    if classifier is None:
+        if environ['DEFAULT_CLS'] is None:
+            raise ValueError('neither classifier nor qp.environ["DEFAULT_CLS"] have been specified')
+        from sklearn.base import clone
+        classifier = clone(environ['DEFAULT_CLS'])
+    return classifier
diff --git a/quapy/method/_kdey.py b/quapy/method/_kdey.py
index e678563..e50c99c 100644
--- a/quapy/method/_kdey.py
+++ b/quapy/method/_kdey.py
@@ -24,12 +24,14 @@ class KDEBase:
         Checks that the bandwidth parameter is correct
 
         :param bandwidth: either a string (see BANDWIDTH_METHOD) or a float
-        :return: nothing, but raises an exception for invalid values
+        :return: the validated bandwidth; an exception is raised for invalid values
         """
         assert bandwidth in KDEBase.BANDWIDTH_METHOD or isinstance(bandwidth, float), \
             f'invalid bandwidth, valid ones are {KDEBase.BANDWIDTH_METHOD} or float values'
         if isinstance(bandwidth, float):
-            assert 0 < bandwidth < 1,  "the bandwith for KDEy should be in (0,1), since this method models the unit simplex"
+            assert 0 < bandwidth < 1, \
+                "the bandwidth for KDEy should be in (0,1), since this method models the unit simplex"
+        return bandwidth
 
     def get_kde_function(self, X, bandwidth):
         """
@@ -106,16 +108,13 @@ class KDEyML(AggregativeSoftQuantifier, KDEBase):
         Alternatively, this set can be specified at fit time by indicating the exact set of data
         on which the predictions are to be generated.
     :param bandwidth: float, the bandwidth of the Kernel
-    :param n_jobs: number of parallel workers
     :param random_state: a seed to be set before fitting any base quantifier (default None)
     """
 
-    def __init__(self, classifier: BaseEstimator, val_split=10, bandwidth=0.1, n_jobs=None, random_state=None):
-        self._check_bandwidth(bandwidth)
-        self.classifier = classifier
+    def __init__(self, classifier: BaseEstimator=None, val_split=5, bandwidth=0.1, random_state=None):
+        self.classifier = qp._get_classifier(classifier)
         self.val_split = val_split
-        self.bandwidth = bandwidth
-        self.n_jobs = n_jobs
+        self.bandwidth = KDEBase._check_bandwidth(bandwidth)
         self.random_state=random_state
 
     def aggregation_fit(self, classif_predictions: LabelledCollection, data: LabelledCollection):
@@ -130,17 +129,17 @@ class KDEyML(AggregativeSoftQuantifier, KDEBase):
         :param posteriors: instances in the sample converted into posterior probabilities
         :return: a vector of class prevalence estimates
         """
-        np.random.RandomState(self.random_state)
-        epsilon = 1e-10
-        n_classes = len(self.mix_densities)
-        test_densities = [self.pdf(kde_i, posteriors) for kde_i in self.mix_densities]
+        with qp.util.temp_seed(self.random_state):
+            epsilon = 1e-10
+            n_classes = len(self.mix_densities)
+            test_densities = [self.pdf(kde_i, posteriors) for kde_i in self.mix_densities]
 
-        def neg_loglikelihood(prev):
-            test_mixture_likelihood = sum(prev_i * dens_i for prev_i, dens_i in zip (prev, test_densities))
-            test_loglikelihood = np.log(test_mixture_likelihood + epsilon)
-            return  -np.sum(test_loglikelihood)
+            def neg_loglikelihood(prev):
+                test_mixture_likelihood = sum(prev_i * dens_i for prev_i, dens_i in zip(prev, test_densities))
+                test_loglikelihood = np.log(test_mixture_likelihood + epsilon)
+                return -np.sum(test_loglikelihood)
 
-        return F.optim_minimize(neg_loglikelihood, n_classes)
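+            # ML estimate: find the prevalence vector in the probability simplex that
+            # minimizes the negative log-likelihood of the test posteriors under the mixture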
+            return F.optim_minimize(neg_loglikelihood, n_classes)
 
 
 class KDEyHD(AggregativeSoftQuantifier, KDEBase):
@@ -183,20 +182,17 @@ class KDEyHD(AggregativeSoftQuantifier, KDEBase):
         Alternatively, this set can be specified at fit time by indicating the exact set of data
         on which the predictions are to be generated.
     :param bandwidth: float, the bandwidth of the Kernel
-    :param n_jobs: number of parallel workers
     :param random_state: a seed to be set before fitting any base quantifier (default None)
     :param montecarlo_trials: number of Monte Carlo trials (default 10000)
     """
 
-    def __init__(self, classifier: BaseEstimator, val_split=10, divergence: str='HD',
-                 bandwidth=0.1, n_jobs=None, random_state=None, montecarlo_trials=10000):
+    def __init__(self, classifier: BaseEstimator=None, val_split=5, divergence: str='HD',
+                 bandwidth=0.1, random_state=None, montecarlo_trials=10000):
         
-        self._check_bandwidth(bandwidth)
-        self.classifier = classifier
+        self.classifier = qp._get_classifier(classifier)
         self.val_split = val_split
         self.divergence = divergence
-        self.bandwidth = bandwidth
-        self.n_jobs = n_jobs
+        self.bandwidth = KDEBase._check_bandwidth(bandwidth)
         self.random_state=random_state
         self.montecarlo_trials = montecarlo_trials
 
@@ -278,15 +274,12 @@ class KDEyCS(AggregativeSoftQuantifier):
         Alternatively, this set can be specified at fit time by indicating the exact set of data
         on which the predictions are to be generated.
     :param bandwidth: float, the bandwidth of the Kernel
-    :param n_jobs: number of parallel workers
     """
 
-    def __init__(self, classifier: BaseEstimator, val_split=10, bandwidth=0.1, n_jobs=None):
-        KDEBase._check_bandwidth(bandwidth)
-        self.classifier = classifier
+    def __init__(self, classifier: BaseEstimator=None, val_split=5, bandwidth=0.1):
+        self.classifier = qp._get_classifier(classifier)
         self.val_split = val_split
-        self.bandwidth = bandwidth
-        self.n_jobs = n_jobs
+        self.bandwidth = KDEBase._check_bandwidth(bandwidth)
 
     def gram_matrix_mix_sum(self, X, Y=None):
         # this adapts the output of the rbf_kernel function (pairwise evaluations of Gaussian kernels k(x,y))
@@ -355,7 +348,7 @@ class KDEyCS(AggregativeSoftQuantifier):
             # called \overline{r} in the paper
             alpha_ratio = alpha * self.counts_inv
 
-            # recal that tr_te_sums already accounts for the constant terms (1/Li)*(1/M)
+            # recall that tr_te_sums already accounts for the constant terms (1/Li)*(1/M)
             partA = -np.log((alpha_ratio @ tr_te_sums) * Minv)
             partB = 0.5 * np.log(alpha_ratio @ tr_tr_sums @ alpha_ratio)
             return partA + partB #+ partC
diff --git a/quapy/method/_threshold_optim.py b/quapy/method/_threshold_optim.py
index a9d2723..72d0c95 100644
--- a/quapy/method/_threshold_optim.py
+++ b/quapy/method/_threshold_optim.py
@@ -27,8 +27,8 @@ class ThresholdOptimization(BinaryAggregativeQuantifier):
         :class:`quapy.data.base.LabelledCollection` (the split itself).
     """
 
-    def __init__(self, classifier: BaseEstimator, val_split=None, n_jobs=None):
-        self.classifier = classifier
+    def __init__(self, classifier: BaseEstimator=None, val_split=None, n_jobs=None):
+        self.classifier = qp._get_classifier(classifier)
         self.val_split = val_split
         self.n_jobs = qp._get_njobs(n_jobs)
 
@@ -143,7 +143,7 @@ class T50(ThresholdOptimization):
         :class:`quapy.data.base.LabelledCollection` (the split itself).
     """
 
-    def __init__(self, classifier: BaseEstimator, val_split=5):
+    def __init__(self, classifier: BaseEstimator=None, val_split=5):
         super().__init__(classifier, val_split)
 
     def condition(self, tpr, fpr) -> float:
@@ -167,7 +167,7 @@ class MAX(ThresholdOptimization):
         :class:`quapy.data.base.LabelledCollection` (the split itself).
     """
 
-    def __init__(self, classifier: BaseEstimator, val_split=5):
+    def __init__(self, classifier: BaseEstimator=None, val_split=5):
         super().__init__(classifier, val_split)
 
     def condition(self, tpr, fpr) -> float:
@@ -192,7 +192,7 @@ class X(ThresholdOptimization):
         :class:`quapy.data.base.LabelledCollection` (the split itself).
     """
 
-    def __init__(self, classifier: BaseEstimator, val_split=5):
+    def __init__(self, classifier: BaseEstimator=None, val_split=5):
         super().__init__(classifier, val_split)
 
     def condition(self, tpr, fpr) -> float:
@@ -215,7 +215,7 @@ class MS(ThresholdOptimization):
         `k`-fold cross validation (this integer stands for the number of folds `k`, defaults 5), or as a
         :class:`quapy.data.base.LabelledCollection` (the split itself).
     """
-    def __init__(self, classifier: BaseEstimator, val_split=5):
+    def __init__(self, classifier: BaseEstimator=None, val_split=5):
         super().__init__(classifier, val_split)
 
     def condition(self, tpr, fpr) -> float:
@@ -254,7 +254,7 @@ class MS2(MS):
         `k`-fold cross validation (this integer stands for the number of folds `k`, defaults 5), or as a
         :class:`quapy.data.base.LabelledCollection` (the split itself).
     """
-    def __init__(self, classifier: BaseEstimator, val_split=5):
+    def __init__(self, classifier: BaseEstimator=None, val_split=5):
         super().__init__(classifier, val_split)
 
     def discard(self, tpr, fpr) -> bool:
diff --git a/quapy/method/aggregative.py b/quapy/method/aggregative.py
index 81165c2..c7a9dbd 100644
--- a/quapy/method/aggregative.py
+++ b/quapy/method/aggregative.py
@@ -3,7 +3,6 @@ from copy import deepcopy
 from typing import Callable, Literal, Union
 import numpy as np
 from abstention.calibration import NoBiasVectorScaling, TempScaling, VectorScaling
-from scipy import optimize
 from sklearn.base import BaseEstimator
 from sklearn.calibration import CalibratedClassifierCV
 from sklearn.metrics import confusion_matrix
@@ -12,7 +11,6 @@ from sklearn.model_selection import cross_val_predict
 import quapy as qp
 import quapy.functional as F
 from quapy.functional import get_divergence
-from quapy.classification.calibration import NBVSCalibration, BCTSCalibration, TSCalibration, VSCalibration
 from quapy.classification.svmperf import SVMperf
 from quapy.data import LabelledCollection
 from quapy.method.base import BaseQuantifier, BinaryQuantifier, OneVsAllGeneric
@@ -343,8 +341,8 @@ class CC(AggregativeCrispQuantifier):
     :param classifier: a sklearn's Estimator that generates a classifier
     """
 
-    def __init__(self, classifier: BaseEstimator):
-        self.classifier = classifier
+    def __init__(self, classifier: BaseEstimator=None):
+        self.classifier = qp._get_classifier(classifier)
 
     def aggregation_fit(self, classif_predictions: LabelledCollection, data: LabelledCollection):
         """
@@ -373,8 +371,8 @@ class PCC(AggregativeSoftQuantifier):
     :param classifier: a sklearn's Estimator that generates a classifier
     """
 
-    def __init__(self, classifier: BaseEstimator):
-        self.classifier = classifier
+    def __init__(self, classifier: BaseEstimator=None):
+        self.classifier = qp._get_classifier(classifier)
 
     def aggregation_fit(self, classif_predictions: LabelledCollection, data: LabelledCollection):
         """
@@ -436,14 +434,14 @@ class ACC(AggregativeCrispQuantifier):
     """
     def __init__(
             self,
-            classifier: BaseEstimator,
+            classifier: BaseEstimator=None,
             val_split=5,
             solver: Literal['minimize', 'exact', 'exact-raise', 'exact-cc'] = 'minimize',
             method: Literal['inversion', 'invariant-ratio'] = 'inversion',
             norm: Literal['clip', 'mapsimplex', 'condsoftmax'] = 'clip',
             n_jobs=None,
     ):
-        self.classifier = classifier
+        self.classifier = qp._get_classifier(classifier)
         self.val_split = val_split
         self.n_jobs = qp._get_njobs(n_jobs)
         self.solver = solver
@@ -571,14 +569,14 @@ class PACC(AggregativeSoftQuantifier):
     """
     def __init__(
             self,
-            classifier: BaseEstimator,
+            classifier: BaseEstimator=None,
             val_split=5,
             solver: Literal['minimize', 'exact', 'exact-raise', 'exact-cc'] = 'minimize',
             method: Literal['inversion', 'invariant-ratio'] = 'inversion',
             norm: Literal['clip', 'mapsimplex', 'condsoftmax'] = 'clip',
             n_jobs=None
     ):
-        self.classifier = classifier
+        self.classifier = qp._get_classifier(classifier)
         self.val_split = val_split
         self.n_jobs = qp._get_njobs(n_jobs)
         self.solver = solver
@@ -668,8 +666,8 @@ class EMQ(AggregativeSoftQuantifier):
     MAX_ITER = 1000
     EPSILON = 1e-4
 
-    def __init__(self, classifier: BaseEstimator, val_split=None, exact_train_prev=True, recalib=None, n_jobs=None):
-        self.classifier = classifier
+    def __init__(self, classifier: BaseEstimator=None, val_split=None, exact_train_prev=True, recalib=None, n_jobs=None):
+        self.classifier = qp._get_classifier(classifier)
         self.val_split = val_split
         self.exact_train_prev = exact_train_prev
         self.recalib = recalib
@@ -832,7 +830,7 @@ class BayesianCC(AggregativeCrispQuantifier):
     :param mcmc_seed: random seed for the MCMC sampler (default 0)
     """
     def __init__(self,
-                 classifier: BaseEstimator,
+                 classifier: BaseEstimator=None,
                  val_split: float = 0.75,
                  num_warmup: int = 500,
                  num_samples: int = 1_000,
@@ -849,7 +847,7 @@ class BayesianCC(AggregativeCrispQuantifier):
         if _bayesian.DEPENDENCIES_INSTALLED is False:
             raise ImportError("Auxiliary dependencies are required. Run `$ pip install quapy[bayes]` to install them.")
 
-        self.classifier = classifier
+        self.classifier = qp._get_classifier(classifier)
         self.val_split = val_split
         self.num_warmup = num_warmup
         self.num_samples = num_samples
@@ -919,8 +917,8 @@ class HDy(AggregativeSoftQuantifier, BinaryAggregativeQuantifier):
         validation distribution, or a :class:`quapy.data.base.LabelledCollection` (the split itself), or an integer indicating the number of folds (default 5)..
     """
 
-    def __init__(self, classifier: BaseEstimator, val_split=5):
-        self.classifier = classifier
+    def __init__(self, classifier: BaseEstimator=None, val_split=5):
+        self.classifier = qp._get_classifier(classifier)
         self.val_split = val_split
 
     def aggregation_fit(self, classif_predictions: LabelledCollection, data: LabelledCollection):
@@ -995,8 +993,8 @@ class DyS(AggregativeSoftQuantifier, BinaryAggregativeQuantifier):
     :param n_jobs: number of parallel workers.
     """
 
-    def __init__(self, classifier: BaseEstimator, val_split=5, n_bins=8, divergence: Union[str, Callable]= 'HD', tol=1e-05, n_jobs=None):
-        self.classifier = classifier
+    def __init__(self, classifier: BaseEstimator=None, val_split=5, n_bins=8, divergence: Union[str, Callable]= 'HD', tol=1e-05, n_jobs=None):
+        self.classifier = qp._get_classifier(classifier)
         self.val_split = val_split
         self.tol = tol
         self.divergence = divergence
@@ -1060,8 +1058,8 @@ class SMM(AggregativeSoftQuantifier, BinaryAggregativeQuantifier):
         validation distribution, or a :class:`quapy.data.base.LabelledCollection` (the split itself), or an integer indicating the number of folds (default 5)..
     """
 
-    def __init__(self, classifier: BaseEstimator, val_split=5):
-        self.classifier = classifier
+    def __init__(self, classifier: BaseEstimator=None, val_split=5):
+        self.classifier = qp._get_classifier(classifier)
         self.val_split = val_split
       
     def aggregation_fit(self, classif_predictions: LabelledCollection, data: LabelledCollection):
@@ -1109,9 +1107,9 @@ class DMy(AggregativeSoftQuantifier):
     :param n_jobs: number of parallel workers (default None)
     """
 
-    def __init__(self, classifier, val_split=5, nbins=8, divergence: Union[str, Callable]='HD',
+    def __init__(self, classifier: BaseEstimator=None, val_split=5, nbins=8, divergence: Union[str, Callable]='HD',
                  cdf=False, search='optim_minimize', n_jobs=None):
-        self.classifier = classifier
+        self.classifier = qp._get_classifier(classifier)
         self.val_split = val_split
         self.nbins = nbins
         self.divergence = divergence
diff --git a/quapy/model_selection.py b/quapy/model_selection.py
index 107f303..0c600ca 100644
--- a/quapy/model_selection.py
+++ b/quapy/model_selection.py
@@ -328,7 +328,7 @@ class GridSearchQ(BaseQuantifier):
             if self.raise_errors:
                 raise exception
             else:
-                return ConfigStatus(params, status)
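+                # attach the exception message, so that invalid configurations are reported
+                # along with a textual description of the error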
+                return ConfigStatus(params, status, msg=str(exception))
 
         try:
             with timeout(self.timeout):