From 6ea627449ce69da85a203dea518f7cd41a38bfef Mon Sep 17 00:00:00 2001
From: Alex Moreo <alejandro.moreo@isti.cnr.it>
Date: Wed, 26 Jan 2022 19:04:13 +0100
Subject: [PATCH] Add histogram plot of posterior probabilities

---
 eDiscovery/functions.py |  21 ++++---
 eDiscovery/main.py      |  60 +++++++++++---------
 eDiscovery/method.py    | 119 +++++++++++++++++++++++++++++++++++++---
 eDiscovery/plot.py      |  47 ++++++++++++----
 4 files changed, 196 insertions(+), 51 deletions(-)

diff --git a/eDiscovery/functions.py b/eDiscovery/functions.py
index 7d4a3fe..170dfa8 100644
--- a/eDiscovery/functions.py
+++ b/eDiscovery/functions.py
@@ -6,7 +6,8 @@ from sklearn.linear_model import LogisticRegression
 from sklearn.svm import LinearSVC, SVC
 
 import quapy as qp
-from eDiscovery.method import RegionAdjustment, RegionProbAdjustment, RegionProbAdjustmentGlobal
+from eDiscovery.method import RegionAdjustment, RegionProbAdjustment, RegionProbAdjustmentGlobal, RegionAdjustmentQ, \
+    ClassWeightPCC
 from quapy.data import LabelledCollection
 from quapy.method.aggregative import EMQ, CC, PACC, PCC, HDy, ACC
 import numpy as np
@@ -27,6 +28,8 @@ def NewQuantifier(quantifiername, classifiername):
     if quantifiername == 'EMQ':
         return EMQ(CalibratedClassifierCV(NewClassifier(classifiername)))
         # return EMQ(NewClassifier(classifier))
+    if quantifiername == 'CC':
+        return CC(NewClassifier(classifiername))
     if quantifiername == 'HDy':
         return HDy(NewClassifier(classifiername))
     if quantifiername == 'PCC':
@@ -35,14 +38,18 @@ def NewQuantifier(quantifiername, classifiername):
         return ACC(NewClassifier(classifiername), val_split=0.4)
     if quantifiername == 'PACC':
         return PACC(NewClassifier(classifiername), val_split=0.4)
-    if quantifiername == 'RACC':
-        return RegionAdjustment(NewClassifier(classifiername), val_split=0.4)
-    if quantifiername == 'RPACC':
-        return RegionProbAdjustment(NewClassifier(classifiername), val_split=0.4, k=10)
-    if quantifiername == 'GRPACC':
+    if quantifiername == 'CW':  # class-weight corrected PCC
+        return ClassWeightPCC()
+    if quantifiername == 'SRSQ':  # supervised regions, then single-label quantification
+        #q = EMQ(CalibratedClassifierCV(NewClassifier(classifiername)))
+        #q = PACC(NewClassifier(classifiername), val_split=0.4)
+        q = ACC(NewClassifier(classifiername))
+        return RegionAdjustmentQ(q, k=4)
+    if quantifiername == 'URBQ':  # unsupervised regions, then binary quantifications
         def newQ():
             # return PACC(NewClassifier(classifiername), val_split=0.4)
-            return EMQ(CalibratedClassifierCV(NewClassifier(classifiername)))
+            # return CC(CalibratedClassifierCV(NewClassifier(classifiername)))
+            return ClassWeightPCC()
         return RegionProbAdjustmentGlobal(newQ, k=10, clustering='kmeans')
     raise ValueError('unknown quantifier', quantifiername)
 
diff --git a/eDiscovery/main.py b/eDiscovery/main.py
index 6581b3d..ded1f62 100644
--- a/eDiscovery/main.py
+++ b/eDiscovery/main.py
@@ -33,6 +33,8 @@ def main(args):
 
     fig = eDiscoveryPlot(args.output)
 
+    skip_first_steps = 0
+
     with qp.util.temp_seed(args.seed):
         # initial labelled data selection
         if args.initprev == -1:
@@ -61,40 +63,43 @@ def main(args):
             while True:
 
                 pool_p_hat_cc, classifier = fn.estimate_prev_CC(train, pool, clf_name)
-                pool_p_hat_q, q_classifier = fn.estimate_prev_Q(train, pool, q_name, clf_name)
-                # q.fit(train)
-                # pool_p_hat_q = q.quantify(pool.instances)
-                # q_classifier = q.learner
 
-                f1_clf = eval_classifier(classifier, pool)
-                f1_q = 0 #eval_classifier(q_classifier, pool)
-
-                tr_p = train.prevalence()
-                te_p = pool.prevalence()
                 nDtr = len(train)
                 nDte = len(pool)
-
-                r_hat_cc = fn.recall(tr_p, pool_p_hat_cc, nDtr, nDte)
-                r_hat_q = fn.recall(tr_p, pool_p_hat_q, nDtr, nDte)
-                r = fn.recall(tr_p, te_p, nDtr, nDte)
-                tr_te_shift = qp.error.ae(tr_p, te_p)
-
                 progress = 100 * nDtr / nD
 
-                ae_q = qp.error.ae(te_p, pool_p_hat_q)
-                ae_cc = qp.error.ae(te_p, pool_p_hat_cc)
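+                # from iteration skip_first_steps onwards: quantify, evaluate, log, and plot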
+                if i >= skip_first_steps:
+                    pool_p_hat_q, q_classifier = fn.estimate_prev_Q(train, pool, q_name, clf_name)
+                    # q.fit(train)
+                    # pool_p_hat_q = q.quantify(pool.instances)
+                    # q_classifier = q.learner
 
-                tee(f'{i}\t{progress:.2f}\t{nDtr}\t{nDte}\t{tr_p[1]:.3f}\t{te_p[1]:.3f}\t{pool_p_hat_q[1]:.3f}\t{pool_p_hat_cc[1]:.3f}'
-                    f'\t{r:.3f}\t{r_hat_q:.3f}\t{r_hat_cc:.3f}\t{tr_te_shift:.5f}\t{ae_q:.4f}\t{ae_cc:.4f}\t{f1_q:.3f}\t{f1_clf:.3f}')
+                    f1_clf = eval_classifier(classifier, pool)
+                    f1_q = 0 #eval_classifier(q_classifier, pool)
 
-                fig.plot()
+                    tr_p = train.prevalence()
+                    te_p = pool.prevalence()
 
-                if nDte < k:
-                    print('[stop] too few documents remaining')
-                    break
-                elif i+1 == max_iterations:
-                    print('[stop] maximum number of iterations reached')
-                    break
+                    r_hat_cc = fn.recall(tr_p, pool_p_hat_cc, nDtr, nDte)
+                    r_hat_q = fn.recall(tr_p, pool_p_hat_q, nDtr, nDte)
+                    r = fn.recall(tr_p, te_p, nDtr, nDte)
+                    tr_te_shift = qp.error.ae(tr_p, te_p)
+
+                    ae_q = qp.error.ae(te_p, pool_p_hat_q)
+                    ae_cc = qp.error.ae(te_p, pool_p_hat_cc)
+
+                    tee(f'{i}\t{progress:.2f}\t{nDtr}\t{nDte}\t{tr_p[1]:.3f}\t{te_p[1]:.3f}\t{pool_p_hat_q[1]:.3f}\t{pool_p_hat_cc[1]:.3f}'
+                        f'\t{r:.3f}\t{r_hat_q:.3f}\t{r_hat_cc:.3f}\t{tr_te_shift:.5f}\t{ae_q:.4f}\t{ae_cc:.4f}\t{f1_q:.3f}\t{f1_clf:.3f}')
+
+                    posteriors = classifier.predict_proba(pool.instances)
+                    fig.plot(posteriors, pool.labels)
+
+                    if nDte < k:
+                        print('[stop] too few documents remaining')
+                        break
+                    elif i+1 == max_iterations:
+                        print('[stop] maximum number of iterations reached')
+                        break
 
                 top_relevant_idx = sampling_fn(pool, classifier, k, progress)
                 train, pool = fn.move_documents(train, pool, top_relevant_idx)
@@ -142,4 +147,7 @@ if __name__ == '__main__':
     if outputdir:
         qp.util.create_if_not_exist(outputdir)
 
+    # echo the run configuration
+    for k, v in args.__dict__.items():
+        print(f'{k}={v}')
+
     main(args)
diff --git a/eDiscovery/method.py b/eDiscovery/method.py
index 3278e9c..5b74708 100644
--- a/eDiscovery/method.py
+++ b/eDiscovery/method.py
@@ -3,10 +3,59 @@ import numpy as np
 from sklearn.base import BaseEstimator, clone
 from sklearn.cluster import KMeans, OPTICS
 from sklearn.decomposition import TruncatedSVD
+from sklearn.linear_model import LogisticRegression
 from sklearn.mixture import GaussianMixture
 from quapy.method.base import BaseQuantifier, BinaryQuantifier
 from quapy.data import LabelledCollection
-from quapy.method.aggregative import ACC, PACC
+from quapy.method.aggregative import ACC, PACC, PCC
+
+
+class RegionAdjustmentQ(BaseQuantifier):
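+    # clusters each class of the training set into KMeans regions, fits the given quantifier over the
+    # resulting k regions, and maps the estimated region prevalences back onto the two binary classes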
+
+    def __init__(self, quantifier: BaseQuantifier, k=10):
+        self.quantifier = quantifier
+        self.k = k # number of regions
+
+    def fit(self, data: LabelledCollection):
+        X, y = data.Xy
+        Xp, Xn = X[y==1], X[y==0]
+
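+        # allocate the k regions between the negative and positive classes proportionally to the training prevalence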
+        nk_per_class = (data.prevalence() * self.k).round().astype(int)
+        print(f'number of regions per class {nk_per_class}')
+
+        kmeans_neg = KMeans(n_clusters=nk_per_class[0])
+        rn = kmeans_neg.fit_predict(Xn)  # regions negative
+
+        kmeans_pos = KMeans(n_clusters=nk_per_class[1])
+        rp = kmeans_pos.fit_predict(Xp) + nk_per_class[0]  # regions positive
+
+        classes = np.arange(self.k)
+        pos = LabelledCollection(Xp, rp, classes_=classes)
+        neg = LabelledCollection(Xn, rn, classes_=classes)
+
+        region_data = pos + neg
+        self.quantifier.fit(region_data)
+
+        self.reg2class = {r: (0 if r < nk_per_class[0] else 1) for r in range(self.k)}
+
+        return self
+
+    def quantify(self, instances):
+        region_prevalence = self.quantifier.quantify(instances)
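+        # collapse the k estimated region prevalences back into the binary (negative, positive) prevalence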
+        bin_prevalence = np.zeros(shape=2, dtype=float)
+        for r, prev in enumerate(region_prevalence):
+            bin_prevalence[self.reg2class[r]] += prev
+        return bin_prevalence
+
+    def set_params(self, **parameters):
+        pass
+
+    def get_params(self, deep=True):
+        pass
+
+    @property
+    def classes_(self):
+        return np.asarray([0,1])
 
 
 class RegionAdjustment(ACC):
@@ -20,15 +69,25 @@ class RegionAdjustment(ACC):
     def fit(self, data: LabelledCollection, fit_learner=True, val_split: Union[float, int, LabelledCollection] = None):
         X, y = data.Xy
         Xp, Xn = X[y==1], X[y==0]
-        kmeans = KMeans(n_clusters=self.k)
-        rn = kmeans.fit_predict(Xn)  # regions negative
-        rp = kmeans.fit_predict(Xp)+self.k  # regions positive
-        classes = np.arange(self.k*2)
+
+        nk_per_class = (data.prevalence() * self.k).round().astype(int)
+        print(f'number of clusters per class {nk_per_class}')
+
+        kmeans_neg = KMeans(n_clusters=nk_per_class[0])
+        rn = kmeans_neg.fit_predict(Xn)  # regions negative
+
+        kmeans_pos = KMeans(n_clusters=nk_per_class[1])
+        rp = kmeans_pos.fit_predict(Xp) + nk_per_class[0]  # regions positive
+
+        classes = np.arange(self.k)
         pos = LabelledCollection(Xp, rp, classes_=classes)
         neg = LabelledCollection(Xn, rn, classes_=classes)
+
         region_data = pos + neg
-        super(RegionAdjustment, self).fit(region_data, fit_learner, val_split)
-        self.reg2class = {r:(0 if r < self.k else 1) for r in range(2*self.k)}
+        super(RegionAdjustment, self).fit(region_data, fit_learner, val_split)
+
+        self.reg2class = {r: (0 if r < nk_per_class[0] else 1) for r in range(self.k)}
+
         return self
 
     def classify(self, data):
@@ -218,4 +277,48 @@ class TrivialAcceptorQuantifier(BinaryQuantifier):
 
     @property
     def classes_(self):
-        return np.asarray([0,1])
\ No newline at end of file
+        return np.asarray([0,1])
+
+
+class ClassWeightPCC(BaseQuantifier):
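+    # PCC variant whose underlying classifier is retrained with class_weight derived from a first prevalence estimate (PACC)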
+
+    def __init__(self, estimator=LogisticRegression):
+        self.estimator = estimator
+        self.learner = PACC(self.estimator())
+        self.deployed = False
+
+    def fit(self, data: LabelledCollection, fit_learner=True):
+        self.train = data
+        self.learner.fit(self.train)
+        return self
+
+    def quantify(self, instances):
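+        # first pass: estimate the test prevalence with PACC; then refit a PCC whose classifier is rebalanced accordingly via class_weight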
+        guessed_prevalence = self.learner.quantify(instances)
+        class_weight = self._get_class_weight(guessed_prevalence)
+        base_estimator = clone(self.learner.learner)
+        base_estimator.set_params(class_weight=class_weight)
+        pcc = PCC(base_estimator)
+        return pcc.fit(self.train).quantify(instances)
+
+    def _get_class_weight(self, prevalence):
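+        # weight each class by the ratio between the estimated test prevalence and the training prevalence,
+        # normalized so that the smallest weight equals 1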
+        # class_weight = compute_class_weight('balanced', classes=[0, 1], y=mock_y(prevalence))
+        # return {0: class_weight[1], 1: class_weight[0]}
+        # weights = prevalence/prevalence.min()
+        weights = prevalence / self.train.prevalence()
+        normfactor = weights.min()
+        if normfactor <= 0:
+            normfactor = 1E-3
+        weights /= normfactor
+        return {0:weights[0], 1:weights[1]}
+
+    def set_params(self, **parameters):
+        # parameters = {p:v for p,v in parameters.items()}
+        # print(parameters)
+        self.learner.set_params(**parameters)
+
+    def get_params(self, deep=True):
+        return self.learner.get_params()
+
+    @property
+    def classes_(self):
+        return self.train.classes_
\ No newline at end of file
diff --git a/eDiscovery/plot.py b/eDiscovery/plot.py
index 064d98f..0d1c4ed 100644
--- a/eDiscovery/plot.py
+++ b/eDiscovery/plot.py
@@ -16,14 +16,16 @@ class eDiscoveryPlot:
             plt.rcParams['figure.figsize'] = [12, 12]
             plt.rcParams['figure.dpi'] = 200
         else:
-            plt.rcParams['figure.figsize'] = [17, 17]
-            plt.rcParams['figure.dpi'] = 60
-
+            plt.rcParams['figure.figsize'] = [14, 18]
+            plt.rcParams['figure.dpi'] = 50
+            plt.rcParams.update({'font.size': 15})
 
         # plot the data
         self.fig, self.axs = plt.subplots(5)
+        self.calls = 0
+
+    def plot(self, posteriors, y):
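+        # posteriors: posterior probabilities of the pool documents; y: true pool labels (used only for the histogram panel)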
 
-    def plot(self):
         fig, axs = self.fig, self.axs
         loop, save = self.loop, self.save
 
@@ -38,7 +40,6 @@ class eDiscoveryPlot:
         axs[aXn].plot(xs, y_rhat, label='$\hat{R}_{Q}$')
         axs[aXn].plot(xs, y_rhatCC, label='$\hat{R}_{CC}$')
         axs[aXn].plot(xs, y_r, label='$R$')
-        axs[aXn].legend()
         axs[aXn].grid()
         axs[aXn].set_ylabel('Recall')
         axs[aXn].set_ylim(0, 1)
@@ -52,7 +53,7 @@ class eDiscoveryPlot:
         axs[aXn].plot(xs, y_r, label='te-$Pr(\oplus)$')
         axs[aXn].legend()
         axs[aXn].grid()
-        axs[aXn].set_ylabel('Prevalence')
+        axs[aXn].set_ylabel('Pool prevalence')
         aXn += 1
 
         y_ae = df['AE']
@@ -64,14 +65,28 @@ class eDiscoveryPlot:
         axs[aXn].set_ylabel('Quantification error')
         aXn += 1
 
-        axs[aXn].plot(xs, df['MF1_Q'], label='$F_1(clf(Q))$')
-        axs[aXn].plot(xs, df['MF1_Clf'], label='$F_1(clf(CC))$')
+        # classifier performance (not very reliable)
+        #axs[aXn].plot(xs, df['MF1_Q'], label='$F_1(clf(Q))$')
+        #axs[aXn].plot(xs, df['MF1_Clf'], label='$F_1(clf(CC))$')
+        #axs[aXn].legend()
+        #axs[aXn].grid()
+        #axs[aXn].set_ylabel('Classifiers performance')
+        #aXn += 1
+
+        # distribution of posterior probabilities in the pool
+        positive_posteriors = posteriors[y==1,1]
+        negative_posteriors = posteriors[y==0,1]
+        #axs[aXn].hist([negative_posteriors, positive_posteriors], bins=50,
+        #         label=['negative', 'positive'])
+        axs[aXn].hist(negative_posteriors, bins=50, label='negative', density=True, alpha=.75)
+        axs[aXn].hist(positive_posteriors, bins=50, label='positive', density=True, alpha=.75)
         axs[aXn].legend()
         axs[aXn].grid()
-        axs[aXn].set_ylabel('Classifiers performance')
+        axs[aXn].set_xlim(0, 1)
+        axs[aXn].set_ylabel('te-$Pr(\oplus)$ distribution')
         aXn += 1
 
-        axs[aXn].plot(xs, df['Shift'], '--k', label='tr-te shift (AE)')
+        axs[aXn].plot(xs, df['Shift'], '--k', label='shift (AE)')
         axs[aXn].plot(xs, df['tr-prev'], 'y', label='tr-$Pr(\oplus)$')
         axs[aXn].plot(xs, df['te-prev'], 'r', label='te-$Pr(\oplus)$')
         axs[aXn].legend()
@@ -79,6 +94,16 @@ class eDiscoveryPlot:
         axs[aXn].set_ylabel('Train-Test Shift')
         aXn += 1
 
+        for i in range(aXn):
+            if self.calls==0:
+                # Shrink current axis by 20%
+                box = axs[i].get_position()
+                axs[i].set_position([box.x0, box.y0, box.width * 0.8, box.height])
+                fig.tight_layout()
+
+            # Put a legend to the right of the current axis
+            axs[i].legend(loc='center left', bbox_to_anchor=(1, 0.5))
+
         if save:
             os.makedirs(self.outdir, exist_ok=True)
             plt.savefig(f'{self.outdir}/{self.plotname}')
@@ -88,6 +113,8 @@ class eDiscoveryPlot:
             for i in range(aXn):
                 axs[i].cla()
 
+        self.calls += 1
+
 
 if __name__ == '__main__':