From 524ec37f8353c8fd2fefcd1c7d1eb60a3290ac36 Mon Sep 17 00:00:00 2001
From: Alex Moreo <alejandro.moreo@isti.cnr.it>
Date: Tue, 12 Apr 2022 17:13:38 +0200
Subject: [PATCH] sample_size can now be set to None to indicate that the value
 has to be resolved from QuaPy's environment setting qp.environ['SAMPLE_SIZE']

---
 quapy/error.py           |  2 +-
 quapy/evaluation.py      | 66 +++++++++++++++++++++++++-------------------
 quapy/model_selection.py |  8 ++++--
 quapy/util.py            | 10 +++++++
 4 files changed, 53 insertions(+), 33 deletions(-)
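
Usage sketch (illustrative only; the dataset, quantifier and hyper-parameter
values below are examples and are not part of this change):

    import quapy as qp
    from quapy.method.aggregative import PACC
    from sklearn.linear_model import LogisticRegression

    # set the sample size once, through QuaPy's environment settings
    qp.environ['SAMPLE_SIZE'] = 500

    dataset = qp.datasets.fetch_reviews('kindle', tfidf=True, min_df=5)
    model = PACC(LogisticRegression()).fit(dataset.training)

    # sample_size can now be omitted (i.e., left to None); it is resolved
    # internally by _check_sample_size from qp.environ['SAMPLE_SIZE']
    report = qp.evaluation.artificial_prevalence_report(
        model, dataset.test, n_prevpoints=21, error_metrics=['mae', 'mrae'])

    # the same applies to model selection: no explicit sample_size is needed
    tuned = qp.model_selection.GridSearchQ(
        model, param_grid={'C': [1, 10, 100]}, n_prevpoints=21, error='mae'
    ).fit(dataset.training)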

diff --git a/quapy/error.py b/quapy/error.py
index 3375470..a71ed46 100644
--- a/quapy/error.py
+++ b/quapy/error.py
@@ -215,7 +215,7 @@ def __check_eps(eps=None):
 
 
 CLASSIFICATION_ERROR = {f1e, acce}
-QUANTIFICATION_ERROR = {mae, mrae, mse, mkld, mnkld}
+QUANTIFICATION_ERROR = {mae, mrae, mse, mkld, mnkld, ae, rae, se, kld, nkld}
 QUANTIFICATION_ERROR_SMOOTH = {kld, nkld, rae, mkld, mnkld, mrae}
 CLASSIFICATION_ERROR_NAMES = {func.__name__ for func in CLASSIFICATION_ERROR}
 QUANTIFICATION_ERROR_NAMES = {func.__name__ for func in QUANTIFICATION_ERROR}
diff --git a/quapy/evaluation.py b/quapy/evaluation.py
index 0846ab0..936b83c 100644
--- a/quapy/evaluation.py
+++ b/quapy/evaluation.py
@@ -6,7 +6,7 @@ import inspect
 import quapy as qp
 from quapy.data import LabelledCollection
 from quapy.method.base import BaseQuantifier
-from quapy.util import temp_seed
+from quapy.util import temp_seed, _check_sample_size
 import quapy.functional as F
 import pandas as pd
 
@@ -14,9 +14,9 @@ import pandas as pd
 def artificial_prevalence_prediction(
         model: BaseQuantifier,
         test: LabelledCollection,
-        sample_size,
+        sample_size=None,
         n_prevpoints=101,
-        repeats=1,
+        n_repetitions=1,
         eval_budget: int = None,
         n_jobs=1,
         random_seed=42,
@@ -31,10 +31,11 @@ def artificial_prevalence_prediction(
 
     :param model: the model in charge of generating the class prevalence estimations
     :param test: the test set on which to perform APP
-    :param sample_size: integer, the size of the samples
+    :param sample_size: integer, the size of the samples; if None, then the sample size is
+        taken from qp.environ['SAMPLE_SIZE']
     :param n_prevpoints: integer, the number of different prevalences to sample (or set to None if eval_budget
         is specified; default 101, i.e., steps of 1%)
-    :param repeats: integer, the number of repetitions for each prevalence (default 1)
+    :param n_repetitions: integer, the number of repetitions for each prevalence (default 1)
     :param eval_budget: integer, if specified, sets a ceil on the number of evaluations to perform. For example, if
-        there are 3 classes, `repeats=1`, and `eval_budget=20`, then `n_prevpoints` will be set to 5, since this
+        there are 3 classes, `n_repetitions=1`, and `eval_budget=20`, then `n_prevpoints` will be set to 5, since this
         will generate 15 different prevalence vectors ([0, 0, 1], [0, 0.25, 0.75], [0, 0.5, 0.5] ... [1, 0, 0]) and
@@ -48,10 +49,11 @@ def artificial_prevalence_prediction(
         for the samples generated while the second one contains the prevalence estimations
     """
 
-    n_prevpoints, _ = qp.evaluation._check_num_evals(test.n_classes, n_prevpoints, eval_budget, repeats, verbose)
+    sample_size = _check_sample_size(sample_size)
+    n_prevpoints, _ = qp.evaluation._check_num_evals(test.n_classes, n_prevpoints, eval_budget, n_repetitions, verbose)
 
     with temp_seed(random_seed):
-        indexes = list(test.artificial_sampling_index_generator(sample_size, n_prevpoints, repeats))
+        indexes = list(test.artificial_sampling_index_generator(sample_size, n_prevpoints, n_repetitions))
 
     return _predict_from_indexes(indexes, model, test, n_jobs, verbose)
 
@@ -59,8 +61,8 @@ def artificial_prevalence_prediction(
 def natural_prevalence_prediction(
         model: BaseQuantifier,
         test: LabelledCollection,
-        sample_size,
-        repeats,
+        sample_size=None,
+        repeats=100,
         n_jobs=1,
         random_seed=42,
         verbose=False):
@@ -71,8 +73,9 @@ def natural_prevalence_prediction(
 
     :param model: the model in charge of generating the class prevalence estimations
     :param test: the test set on which to perform NPP
-    :param sample_size: integer, the size of the samples
-    :param repeats: integer, the number of samples to generate
+    :param sample_size: integer, the size of the samples; if None, then the sample size is
+        taken from qp.environ['SAMPLE_SIZE']
+    :param repeats: integer, the number of samples to generate (default 100)
     :param n_jobs: integer, number of jobs to be run in parallel (default 1)
     :param random_seed: allows to replicate the samplings. The seed is local to the method and does not affect
         any other random process (default 42)
@@ -82,6 +85,7 @@ def natural_prevalence_prediction(
         for the samples generated while the second one contains the prevalence estimations
     """
 
+    sample_size = _check_sample_size(sample_size)
     with temp_seed(random_seed):
         indexes = list(test.natural_sampling_index_generator(sample_size, repeats))
 
@@ -162,9 +166,9 @@ def _predict_from_indexes(
 def artificial_prevalence_report(
         model: BaseQuantifier,
         test: LabelledCollection,
-        sample_size,
+        sample_size=None,
         n_prevpoints=101,
-        repeats=1,
+        n_repetitions=1,
         eval_budget: int = None,
         n_jobs=1,
         random_seed=42,
@@ -184,10 +188,11 @@ def artificial_prevalence_report(
 
     :param model: the model in charge of generating the class prevalence estimations
     :param test: the test set on which to perform APP
-    :param sample_size: integer, the size of the samples
+    :param sample_size: integer, the size of the samples; if None, then the sample size is
+        taken from qp.environ['SAMPLE_SIZE']
     :param n_prevpoints: integer, the number of different prevalences to sample (or set to None if eval_budget
         is specified; default 101, i.e., steps of 1%)
-    :param repeats: integer, the number of repetitions for each prevalence (default 1)
+    :param n_repetitions: integer, the number of repetitions for each prevalence (default 1)
     :param eval_budget: integer, if specified, sets a ceil on the number of evaluations to perform. For example, if
-        there are 3 classes, `repeats=1`, and `eval_budget=20`, then `n_prevpoints` will be set to 5, since this
+        there are 3 classes, `n_repetitions=1`, and `eval_budget=20`, then `n_prevpoints` will be set to 5, since this
         will generate 15 different prevalence vectors ([0, 0, 1], [0, 0.25, 0.75], [0, 0.5, 0.5] ... [1, 0, 0]) and
@@ -205,7 +210,7 @@ def artificial_prevalence_report(
     """
 
     true_prevs, estim_prevs = artificial_prevalence_prediction(
-        model, test, sample_size, n_prevpoints, repeats, eval_budget, n_jobs, random_seed, verbose
+        model, test, sample_size, n_prevpoints, n_repetitions, eval_budget, n_jobs, random_seed, verbose
     )
     return _prevalence_report(true_prevs, estim_prevs, error_metrics)
 
@@ -213,8 +218,8 @@ def artificial_prevalence_report(
 def natural_prevalence_report(
         model: BaseQuantifier,
         test: LabelledCollection,
-        sample_size,
-        repeats=1,
+        sample_size=None,
+        repeats=100,
         n_jobs=1,
         random_seed=42,
         error_metrics:Iterable[Union[str,Callable]]='mae',
@@ -230,8 +235,9 @@ def natural_prevalence_report(
 
     :param model: the model in charge of generating the class prevalence estimations
     :param test: the test set on which to perform NPP
-    :param sample_size: integer, the size of the samples
-    :param repeats: integer, the number of samples to generate
+    :param sample_size: integer, the size of the samples; if None, then the sample size is
+        taken from qp.environ['SAMPLE_SIZE']
+    :param repeats: integer, the number of samples to generate (default 100)
     :param n_jobs: integer, number of jobs to be run in parallel (default 1)
     :param random_seed: allows to replicate the samplings. The seed is local to the method and does not affect
         any other random process (default 42)
@@ -244,7 +250,7 @@ def natural_prevalence_report(
         for the samples generated while the second one contains the prevalence estimations
 
     """
-
+    sample_size = _check_sample_size(sample_size)
     true_prevs, estim_prevs = natural_prevalence_prediction(
         model, test, sample_size, repeats, n_jobs, random_seed, verbose
     )
@@ -300,7 +306,7 @@ def _prevalence_report(
 def artificial_prevalence_protocol(
         model: BaseQuantifier,
         test: LabelledCollection,
-        sample_size,
+        sample_size=None,
         n_prevpoints=101,
         repeats=1,
         eval_budget: int = None,
@@ -318,7 +324,8 @@ def artificial_prevalence_protocol(
 
     :param model: the model in charge of generating the class prevalence estimations
     :param test: the test set on which to perform APP
-    :param sample_size: integer, the size of the samples
+    :param sample_size: integer, the size of the samples; if None, then the sample size is
+        taken from qp.environ['SAMPLE_SIZE']
     :param n_prevpoints: integer, the number of different prevalences to sample (or set to None if eval_budget
         is specified; default 101, i.e., steps of 1%)
     :param repeats: integer, the number of repetitions for each prevalence (default 1)
@@ -350,8 +357,8 @@ def artificial_prevalence_protocol(
 def natural_prevalence_protocol(
         model: BaseQuantifier,
         test: LabelledCollection,
-        sample_size,
-        repeats=1,
+        sample_size=None,
+        repeats=100,
         n_jobs=1,
         random_seed=42,
         error_metric:Union[str,Callable]='mae',
@@ -363,7 +370,8 @@ def natural_prevalence_protocol(
 
     :param model: the model in charge of generating the class prevalence estimations
     :param test: the test set on which to perform NPP
-    :param sample_size: integer, the size of the samples
+    :param sample_size: integer, the size of the samples; if None, then the sample size is
+        taken from qp.environ['SAMPLE_SIZE']
-    :param repeats: integer, the number of samples to generate
+    :param repeats: integer, the number of samples to generate (default 100)
     :param n_jobs: integer, number of jobs to be run in parallel (default 1)
     :param random_seed: allows to replicate the samplings. The seed is local to the method and does not affect
diff --git a/quapy/model_selection.py b/quapy/model_selection.py
index 5af4b2f..86e79f3 100644
--- a/quapy/model_selection.py
+++ b/quapy/model_selection.py
@@ -11,6 +11,8 @@ from quapy.evaluation import artificial_prevalence_prediction, natural_prevalenc
 from quapy.method.aggregative import BaseQuantifier
 import inspect
 
+from quapy.util import _check_sample_size
+
 
 class GridSearchQ(BaseQuantifier):
     """Grid Search optimization targeting a quantification-oriented metric.
@@ -57,7 +59,7 @@ class GridSearchQ(BaseQuantifier):
     def __init__(self,
                  model: BaseQuantifier,
                  param_grid: dict,
-                 sample_size: Union[int, None],
+                 sample_size: Union[int, None] = None,
                  protocol='app',
                  n_prevpoints: int = None,
                  n_repetitions: int = 1,
@@ -105,7 +107,7 @@ class GridSearchQ(BaseQuantifier):
             return training, validation
         elif isinstance(validation, float):
             assert 0. < validation < 1., 'validation proportion should be in (0,1)'
-            training, validation = training.split_stratified(train_prop=1 - validation)
+            training, validation = training.split_stratified(train_prop=1 - validation, random_state=self.random_seed)
             return training, validation
         elif self.protocol=='gen' and inspect.isgenerator(validation()):
             return training, validation
@@ -163,7 +165,7 @@ class GridSearchQ(BaseQuantifier):
             val_split = self.val_split
         training, val_split = self.__check_training_validation(training, val_split)
         if self.protocol != 'gen':
-            assert isinstance(self.sample_size, int) and self.sample_size > 0, 'sample_size must be a positive integer'
+            self.sample_size = _check_sample_size(self.sample_size)
 
         params_keys = list(self.param_grid.keys())
         params_values = list(self.param_grid.values())
diff --git a/quapy/util.py b/quapy/util.py
index 9d44633..12ffc23 100644
--- a/quapy/util.py
+++ b/quapy/util.py
@@ -176,6 +176,16 @@ def pickled_resource(pickle_path:str, generation_func:callable, *args):
             return instance
 
 
+def _check_sample_size(sample_size):
+    if sample_size is None:
+        assert qp.environ['SAMPLE_SIZE'] is not None, \
+            'error: sample_size set to None, and cannot be resolved from the environment'
+        sample_size = qp.environ['SAMPLE_SIZE']
+    assert isinstance(sample_size, int) and sample_size > 0, \
+        'error: sample_size is not a positive integer'
+    return sample_size
+
+
 class EarlyStop:
     """
     A class implementing the early-stopping condition typically used for training neural networks.
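
Resolution order implemented by the new _check_sample_size helper, sketched
as a few illustrative assertions (the concrete values are examples):

    import quapy as qp
    from quapy.util import _check_sample_size

    # an explicit positive integer is validated and returned unchanged,
    # regardless of the environment setting
    assert _check_sample_size(250) == 250

    # with the environment setting in place, None falls back to it
    qp.environ['SAMPLE_SIZE'] = 500
    assert _check_sample_size(None) == 500

    # an explicit value still takes precedence over the environment
    assert _check_sample_size(100) == 100

    # if sample_size is None and qp.environ['SAMPLE_SIZE'] is also unset (None),
    # the helper raises an AssertionError rather than failing silently later on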