From eb860e9678c396d5ce5bcba703fb8e36a4ad0403 Mon Sep 17 00:00:00 2001
From: Alejandro Moreo <alejandro.moreo@isti.cnr.it>
Date: Mon, 12 Dec 2022 09:34:09 +0100
Subject: [PATCH] adding the possibility to estimate the training prevalence,
 instead of using the true training prevalence, as a starting point in emq

---
 examples/lequa2022_experiments.py | 13 ++++++++++---
 quapy/method/aggregative.py       | 12 ++++++++++--
 2 files changed, 20 insertions(+), 5 deletions(-)

diff --git a/examples/lequa2022_experiments.py b/examples/lequa2022_experiments.py
index 0df7d15..31ec651 100644
--- a/examples/lequa2022_experiments.py
+++ b/examples/lequa2022_experiments.py
@@ -1,6 +1,8 @@
 import numpy as np
+from sklearn.calibration import CalibratedClassifierCV
 from sklearn.linear_model import LogisticRegression
 import quapy as qp
+import quapy.functional as F
 from data.datasets import LEQUA2022_SAMPLE_SIZE, fetch_lequa2022
 from evaluation import evaluation_report
 from method.aggregative import EMQ
@@ -14,7 +16,8 @@ qp.environ['SAMPLE_SIZE'] = LEQUA2022_SAMPLE_SIZE[task]
 training, val_generator, test_generator = fetch_lequa2022(task=task)
 
 # define the quantifier
-quantifier = EMQ(learner=LogisticRegression())
+learner = CalibratedClassifierCV(LogisticRegression())
+quantifier = EMQ(learner=learner)
 
 # model selection
 param_grid = {'C': np.logspace(-3, 3, 7), 'class_weight': ['balanced', None]}
@@ -24,6 +27,10 @@ quantifier = model_selection.fit(training)
 # evaluation
 report = evaluation_report(quantifier, protocol=test_generator, error_metrics=['mae', 'mrae', 'mkld'], verbose=True)
 
-pd.set_option('display.max_columns', None)
-pd.set_option('display.width', 1000)
+# printing results
+pd.set_option('display.expand_frame_repr', False)
+report['estim-prev'] = report['estim-prev'].map(F.strprev)
 print(report)
+
+print('Averaged values:')
+print(report.mean())
diff --git a/quapy/method/aggregative.py b/quapy/method/aggregative.py
index 19d365b..202b5dd 100644
--- a/quapy/method/aggregative.py
+++ b/quapy/method/aggregative.py
@@ -501,17 +501,25 @@ class EMQ(AggregativeProbabilisticQuantifier):
     maximum-likelihood estimation, in a mutually recursive way, until convergence.
 
     :param learner: a sklearn's Estimator that generates a classifier
+    :param exact_train_prev: set to True (default) for using, as the initial observation, the true training prevalence;
+        or set to False for computing the training prevalence as an estimate, akin to PCC, i.e., as the expected
+        value of the posterior probabilities of the training documents, as suggested in the
+        `Alexandari et al. paper <http://proceedings.mlr.press/v119/alexandari20a.html>`_.
     """
 
     MAX_ITER = 1000
     EPSILON = 1e-4
 
-    def __init__(self, learner: BaseEstimator):
+    def __init__(self, learner: BaseEstimator, exact_train_prev=True):
         self.learner = learner
+        self.exact_train_prev = exact_train_prev
 
     def fit(self, data: LabelledCollection, fit_learner=True):
         self.learner, _ = _training_helper(self.learner, data, fit_learner, ensure_probabilistic=True)
-        self.train_prevalence = F.prevalence_from_labels(data.labels, self.classes_)
+        if self.exact_train_prev:
+            self.train_prevalence = F.prevalence_from_labels(data.labels, self.classes_)
+        else:
+            self.train_prevalence = PCC(learner=self.learner).fit(data, fit_learner=False).quantify(data.X)
         return self
 
     def aggregate(self, classif_posteriors, epsilon=EPSILON):