From 8876d311e784c0b6fae9af8ebb20300b0a718b8c Mon Sep 17 00:00:00 2001
From: Alejandro Moreo <alejandro.moreo@isti.cnr.it>
Date: Mon, 2 Dec 2024 17:39:06 +0100
Subject: [PATCH] working on SCMQ, MCSQ, MCMQ ensembles

---
 examples/16.KDEy_bandwidth.py | 83 +++++++++++++++++++++++++++++++++++
 examples/ensembles.py         | 26 +++++++++--
 quapy/functional.py           | 14 ++++--
 quapy/method/_kdey.py         | 63 ++++++++++++++++++++++++--
 quapy/method/meta.py          | 75 ++++++++++++++++++++++++-------
 quapy/model_selection.py      |  4 +-
 6 files changed, 236 insertions(+), 29 deletions(-)
 create mode 100644 examples/16.KDEy_bandwidth.py

diff --git a/examples/16.KDEy_bandwidth.py b/examples/16.KDEy_bandwidth.py
new file mode 100644
index 0000000..cc81ade
--- /dev/null
+++ b/examples/16.KDEy_bandwidth.py
@@ -0,0 +1,83 @@
+import quapy as qp
+import numpy as np
+from quapy.protocol import UPP
+from quapy.method.aggregative import KDEyML
+import quapy.functional as F
+from time import time
+
+"""
+Let see one example:  
+"""
+
+# load some data
+qp.environ['SAMPLE_SIZE'] = 100
+data = qp.datasets.fetch_UCIMulticlassDataset('molecular')
+training, test = data.train_test
+training, validation = training.split_stratified(train_prop=0.7, random_state=0)
+protocol = UPP(validation)
+
+hyper_C = np.logspace(-3, 3, 7)
+
+model = KDEyML()
+
+with qp.util.temp_seed(0):
+
+    param_grid = {
+        'classifier__C': hyper_C,
+        'bandwidth': np.linspace(0.01, 0.20, 20) # [0.01, 0.02, 0.03, ..., 0.20]
+    }
+
+    model = qp.model_selection.GridSearchQ(
+        model=model,
+        param_grid=param_grid,
+        protocol=protocol,
+        error='mae',  # the error to optimize is the MAE (a quantification-oriented loss)
+        refit=False,  # retrain on the whole labelled set once done
+        n_jobs=-1,
+        verbose=True  # show information as the process goes on
+    ).fit(training)
+
+best_params = model.best_params_
+took = model.fit_time_
+model = model.best_model_
+print(f'model selection ended: best hyper-parameters={best_params}')
+
+# evaluation in terms of MAE
+# we use the same evaluation protocol (APP) on the test set
+mae_score = qp.evaluation.evaluate(model, protocol=UPP(test), error_metric='mae')
+
+print(f'MAE={mae_score:.5f}')
+print(f'model selection took {took:.1f}s')
+
+
+model = KDEyML(bandwidth='auto')
+
+with qp.util.temp_seed(0):
+
+    param_grid = {
+        'classifier__C': hyper_C,
+    }
+
+    model = qp.model_selection.GridSearchQ(
+        model=model,
+        param_grid=param_grid,
+        protocol=protocol,
+        error='mae',  # the error to optimize is the MAE (a quantification-oriented loss)
+        refit=False,  # retrain on the whole labelled set once done
+        n_jobs=-1,
+        verbose=True  # show information as the process goes on
+    ).fit(training)
+
+best_params = model.best_params_
+took = model.fit_time_
+model = model.best_model_
+bandwidth = model.bandwidth_val
+print(f'model selection ended: best hyper-parameters={best_params} ({bandwidth=})')
+
+# evaluation in terms of MAE
+# we use the same evaluation protocol (APP) on the test set
+mae_score = qp.evaluation.evaluate(model, protocol=UPP(test), error_metric='mae')
+
+print(f'MAE={mae_score:.5f}')
+print(f'model selection took {took:.1f}s')
+
diff --git a/examples/ensembles.py b/examples/ensembles.py
index 8c0d07e..84aeb2c 100644
--- a/examples/ensembles.py
+++ b/examples/ensembles.py
@@ -1,10 +1,16 @@
+from sklearn.exceptions import ConvergenceWarning
 from sklearn.linear_model import LogisticRegression
+from sklearn.naive_bayes import MultinomialNB
+from sklearn.neighbors import KNeighborsClassifier
 from statsmodels.sandbox.distributions.genpareto import quant
 
 import quapy as qp
 from quapy.protocol import UPP
 from quapy.method.aggregative import PACC, DMy, EMQ, KDEyML
-from quapy.method.meta import SCMQ
+from quapy.method.meta import SCMQ, MCMQ, MCSQ
+import warnings
+warnings.filterwarnings("ignore", category=DeprecationWarning)
+warnings.filterwarnings("ignore", category=ConvergenceWarning)
 
 qp.environ["SAMPLE_SIZE"]=100
 
@@ -32,5 +38,19 @@ scmq = SCMQ(classifier, quantifiers)
 
 train_and_test_model(scmq, train, test)
 
-for quantifier in quantifiers:
-    train_and_test_model(quantifier, train, test)
\ No newline at end of file
+# for quantifier in quantifiers:
+#     train_and_test_model(quantifier, train, test)
+
+classifiers = [
+    LogisticRegression(),
+    KNeighborsClassifier(),
+    # MultinomialNB()
+]
+
+mcmq = MCMQ(classifiers, quantifiers)
+
+train_and_test_model(mcmq, train, test)
+
+mcsq = MCSQ(classifiers, PACC())
+
+train_and_test_model(mcsq, train, test)
\ No newline at end of file
diff --git a/quapy/functional.py b/quapy/functional.py
index fd7a88f..fac6647 100644
--- a/quapy/functional.py
+++ b/quapy/functional.py
@@ -416,7 +416,7 @@ def argmin_prevalence(loss: Callable,
         raise NotImplementedError()
 
 
-def optim_minimize(loss: Callable, n_classes: int):
+def optim_minimize(loss: Callable, n_classes: int, return_loss=False):
     """
     Searches for the optimal prevalence values, i.e., an `n_classes`-dimensional vector of the (`n_classes`-1)-simplex
     that yields the smallest lost. This optimization is carried out by means of a constrained search using scipy's
@@ -424,18 +424,24 @@ def optim_minimize(loss: Callable, n_classes: int):
 
     :param loss: (callable) the function to minimize
     :param n_classes: (int) the number of classes, i.e., the dimensionality of the prevalence vector
-    :return: (ndarray) the best prevalence vector found
+    :param return_loss: bool, if True, returns also the value of the loss (default is False).
+    :return: (ndarray) the best prevalence vector found or a tuple which also contains the value of the loss
+        if return_loss=True
     """
     from scipy import optimize
 
     # the initial point is set as the uniform distribution
-    uniform_distribution = np.full(fill_value=1 / n_classes, shape=(n_classes,))
+    uniform_distribution = uniform_prevalence(n_classes=n_classes)
 
     # solutions are bounded to those contained in the unit-simplex
     bounds = tuple((0, 1) for _ in range(n_classes))  # values in [0,1]
     constraints = ({'type': 'eq', 'fun': lambda x: 1 - sum(x)})  # values summing up to 1
     r = optimize.minimize(loss, x0=uniform_distribution, method='SLSQP', bounds=bounds, constraints=constraints)
-    return r.x
+    
+    if return_loss:
+        return r.x, r.fun
+    else:
+        return r.x
 
 
 def linear_search(loss: Callable, n_classes: int):
diff --git a/quapy/method/_kdey.py b/quapy/method/_kdey.py
index e50c99c..a396324 100644
--- a/quapy/method/_kdey.py
+++ b/quapy/method/_kdey.py
@@ -1,5 +1,8 @@
 from typing import Union
 import numpy as np
+from scipy.optimize import optimize, minimize_scalar
+
+from quapy.protocol import UPP
 from sklearn.base import BaseEstimator
 from sklearn.neighbors import KernelDensity
 
@@ -111,16 +114,70 @@ class KDEyML(AggregativeSoftQuantifier, KDEBase):
     :param random_state: a seed to be set before fitting any base quantifier (default None)
     """
 
-    def __init__(self, classifier: BaseEstimator=None, val_split=5, bandwidth=0.1, random_state=None):
+    def __init__(self, classifier: BaseEstimator=None, val_split=5, bandwidth=0.1, auto_reduction=500, auto_repeats=25, random_state=None):
         self.classifier = qp._get_classifier(classifier)
         self.val_split = val_split
-        self.bandwidth = KDEBase._check_bandwidth(bandwidth)
+        self.bandwidth = bandwidth
+        if bandwidth!='auto':
+            self.bandwidth = KDEBase._check_bandwidth(bandwidth)
+
+        assert auto_reduction is None or (isinstance(auto_reduction, int) and auto_reduction>0), \
+            (f'param {auto_reduction=} should either be None (no reduction) or a positive integer '
+             f'(number of training instances).')
+
+        self.auto_reduction = auto_reduction
+        self.auto_repeats = auto_repeats
         self.random_state=random_state
 
     def aggregation_fit(self, classif_predictions: LabelledCollection, data: LabelledCollection):
-        self.mix_densities = self.get_mixture_components(*classif_predictions.Xy, data.classes_, self.bandwidth)
+        if self.bandwidth == 'auto':
+            self.bandwidth_val = self.auto_bandwidth_likelihood(classif_predictions)
+        else:
+            self.bandwidth_val = self.bandwidth
+        self.mix_densities = self.get_mixture_components(*classif_predictions.Xy, data.classes_, self.bandwidth_val)
         return self
 
+    def auto_bandwidth_likelihood(self, classif_predictions: LabelledCollection):
+        train, val = classif_predictions.split_stratified(train_prop=0.5, random_state=self.random_state)
+        n_classes = classif_predictions.n_classes
+        epsilon = 1e-8
+        repeats = self.auto_repeats
+
+        auto_reduction = self.auto_reduction
+        if auto_reduction is None:
+            auto_reduction = len(classif_predictions)
+        else:
+            # reduce samples to speed up computation
+            train = train.sampling(auto_reduction)
+
+        prot = UPP(val, sample_size=auto_reduction, repeats=repeats, random_state=self.random_state)
+
+        def eval_bandwidth_nll(bandwidth):
+            mix_densities = self.get_mixture_components(*train.Xy, train.classes_, bandwidth)
+            loss_accum = 0
+            for (sample, prevtrue) in prot():
+                test_densities = [self.pdf(kde_i, sample) for kde_i in mix_densities]
+
+                def neg_loglikelihood_prev(prev):
+                    test_mixture_likelihood = sum(prev_i * dens_i for prev_i, dens_i in zip(prev, test_densities))
+                    test_loglikelihood = np.log(test_mixture_likelihood + epsilon)
+                    nll = -np.sum(test_loglikelihood)
+                    return nll
+
+                pred_prev, neglikelihood = F.optim_minimize(neg_loglikelihood_prev, n_classes=n_classes, return_loss=True)
+                loss_accum += neglikelihood
+            return loss_accum
+
+        r = minimize_scalar(eval_bandwidth_nll, bounds=(0.0001, 0.2), options={'xatol': 0.005})
+        best_band = r.x
+        best_loss_value = r.fun
+        nit = r.nit
+
+        # print(f'[{self.__class__.__name__}:autobandwidth] '
+        #       f'found bandwidth={best_band:.8f} after {nit=} iterations loss_val={best_loss_value:.5f})')
+
+        return best_band
+
     def aggregate(self, posteriors: np.ndarray):
         """
         Searches for the mixture model parameter (the sought prevalence values) that maximizes the likelihood
diff --git a/quapy/method/meta.py b/quapy/method/meta.py
index 0fc8916..05dd06c 100644
--- a/quapy/method/meta.py
+++ b/quapy/method/meta.py
@@ -693,14 +693,26 @@ def EEMQ(classifier, param_grid=None, optim=None, param_mod_sel=None, **kwargs):
     return ensembleFactory(classifier, EMQ, param_grid, optim, param_mod_sel, **kwargs)
 
 
+def merge(prev_predictions, merge_fun):
+    prev_predictions = np.asarray(prev_predictions)
+    if merge_fun == 'median':
+        prevalences = np.median(prev_predictions, axis=0)
+        prevalences = F.normalize_prevalence(prevalences, method='l1')
+    elif merge_fun == 'mean':
+        prevalences = np.mean(prev_predictions, axis=0)
+    else:
+        raise NotImplementedError(f'merge function {merge_fun} not implemented!')
+    return prevalences
+
+
 class SCMQ(AggregativeSoftQuantifier):
 
-    MERGE_FUNCTIONS = ['median']
+    MERGE_FUNCTIONS = ['median', 'mean']
 
     def __init__(self, classifier, quantifiers: List[AggregativeSoftQuantifier], merge_fun='median', val_split=5):
         self.classifier = classifier
-        self.quantifiers = quantifiers
-        assert merge_fun in self.MERGE_FUNCTIONS, f'unknwon {merge_fun=}, valid ones are {self.MERGE_FUNCTIONS}'
+        self.quantifiers = [deepcopy(q) for q in quantifiers]
+        assert merge_fun in self.MERGE_FUNCTIONS, f'unknown {merge_fun=}, valid ones are {self.MERGE_FUNCTIONS}'
         self.merge_fun = merge_fun
         self.val_split = val_split
 
@@ -715,22 +727,51 @@ class SCMQ(AggregativeSoftQuantifier):
         for quantifier_i in self.quantifiers:
             prevalence_i = quantifier_i.aggregate(classif_predictions)
             prev_predictions.append(prevalence_i)
-        return self.merge(prev_predictions)
-
-    def merge(self, prev_predictions):
-        prev_predictions = np.asarray(prev_predictions)
-        if self.merge_fun == 'median':
-            prevalences = np.median(prev_predictions, axis=0)
-            prevalences = F.normalize_prevalence(prevalences, method='l1')
-        elif self.merge_fun == 'mean':
-            prevalences = np.mean(prev_predictions, axis=0)
-        else:
-            raise NotImplementedError(f'merge function {self.merge_fun} not implemented!')
-        return prevalences
-
-
+        return merge(prev_predictions, merge_fun=self.merge_fun)
 
 
+class MCSQ(BaseQuantifier):
+    def __init__(self, classifiers, quantifier: AggregativeSoftQuantifier, merge_fun='median', val_split=5):
+        self.merge_fun = merge_fun
+        self.val_split = val_split
+        self.mcsqs = []
+        for classifier in classifiers:
+            quantifier = deepcopy(quantifier)
+            quantifier.classifier = classifier
+            self.mcsqs.append(quantifier)
+
+    def fit(self, data: LabelledCollection):
+        for q in self.mcsqs:
+            q.fit(data, val_split=self.val_split)
+        return self
+
+    def quantify(self, instances):
+        prev_predictions = []
+        for q in self.mcsqs:
+            prevalence_i = q.quantify(instances)
+            prev_predictions.append(prevalence_i)
+        return merge(prev_predictions, merge_fun=self.merge_fun)
+
+
+class MCMQ(BaseQuantifier):
+    def __init__(self, classifiers, quantifiers: List[AggregativeSoftQuantifier], merge_fun='median', val_split=5):
+        self.merge_fun = merge_fun
+        self.scmqs = []
+        for classifier in classifiers:
+            self.scmqs.append(SCMQ(classifier, quantifiers, val_split=val_split))
+
+    def fit(self, data: LabelledCollection):
+        for q in self.scmqs:
+            q.fit(data)
+        return self
+
+    def quantify(self, instances):
+        prev_predictions = []
+        for q in self.scmqs:
+            prevalence_i = q.quantify(instances)
+            prev_predictions.append(prevalence_i)
+        return merge(prev_predictions, merge_fun=self.merge_fun)
+
 
 
 
diff --git a/quapy/model_selection.py b/quapy/model_selection.py
index 0c600ca..ad2825f 100644
--- a/quapy/model_selection.py
+++ b/quapy/model_selection.py
@@ -248,13 +248,13 @@ class GridSearchQ(BaseQuantifier):
                 self.param_scores_[str(params)] = status.status
                 self.error_collector.append(status)
 
-        tend = time()-tinit
+        self.fit_time_ = time()-tinit
 
         if self.best_score_ is None:
             raise ValueError('no combination of hyperparameters seemed to work')
 
         self._sout(f'optimization finished: best params {self.best_params_} (score={self.best_score_:.5f}) '
-                   f'[took {tend:.4f}s]')
+                   f'[took {self.fit_time_:.4f}s]')
 
         no_errors = len(self.error_collector)
         if no_errors>0: