From 8a48f679a75bb47f2571310d2c25386277677e82 Mon Sep 17 00:00:00 2001
From: Alex Moreo <alejandro.moreo@isti.cnr.it>
Date: Tue, 15 Feb 2022 18:24:45 +0100
Subject: [PATCH] trying to create a quantifier that applies local adjustment
 conditioned on the posterior probabilities, which is not working

---
 eDiscovery/functions.py     | 27 +++++++++++-
 eDiscovery/main.py          | 32 ++++++--------
 eDiscovery/method.py        | 86 +++++++++++++++++++++++++++++++++++++
 eDiscovery/plot.py          | 52 ++++++++++++++++------
 quapy/method/aggregative.py |  1 +
 5 files changed, 164 insertions(+), 34 deletions(-)

diff --git a/eDiscovery/functions.py b/eDiscovery/functions.py
index 170dfa8..b5800f2 100644
--- a/eDiscovery/functions.py
+++ b/eDiscovery/functions.py
@@ -3,11 +3,12 @@ import sklearn
 from sklearn.base import BaseEstimator
 from sklearn.calibration import CalibratedClassifierCV
 from sklearn.linear_model import LogisticRegression
+from sklearn.metrics import f1_score
 from sklearn.svm import LinearSVC, SVC
 
 import quapy as qp
 from eDiscovery.method import RegionAdjustment, RegionProbAdjustment, RegionProbAdjustmentGlobal, RegionAdjustmentQ, \
-    ClassWeightPCC
+    ClassWeightPCC, PosteriorConditionalAdjustemnt
 from quapy.data import LabelledCollection
 from quapy.method.aggregative import EMQ, CC, PACC, PCC, HDy, ACC
 import numpy as np
@@ -51,6 +52,8 @@ def NewQuantifier(quantifiername, classifiername):
             # return CC(CalibratedClassifierCV(NewClassifier(classifiername)))
             return ClassWeightPCC()
         return RegionProbAdjustmentGlobal(newQ, k=10, clustering='kmeans')
+    if quantifiername == 'PCAD':  # posterior-conditional adjustment
+        return PosteriorConditionalAdjustemnt()
     raise ValueError('unknown quantifier', quantifiername)
 
 
@@ -171,6 +174,26 @@ def estimate_prev_Q(train, pool, quantifiername, classifiername):
     q.fit(train)
 
     prev = q.quantify(pool.instances)
-    return prev, None
+    return prev, q
 
 
+def eval_classifier(learner, test:LabelledCollection):
+    predictions = learner.predict(test.instances)
+    true_labels = test.labels
+    # f1 = f1_score(true_labels, predictions, average='macro')
+    f1 = f1_score(true_labels, predictions, average='binary')
+    # f1 = (true_labels==predictions).mean()
+    return f1
+
+
+def ideal_cost(classifier, pool):
+    # returns the cost (in terms of number of documents) to review until the last relevant document
+    # is processed, assuming the rank produced by this classifier. The cost is said to be "idealized" since
+    # one assumes to be able to stop reviewing when the last relevant is encountered
+
+    prob = classifier.predict_proba(pool.instances)
+    order = np.argsort(prob[:,0])  # col 0 has negative posterior prob, so the natural order is "by relevance"
+    ranked_labels = pool.labels[order]
+    num_relevant = np.sum(pool.labels)
+    idealized_cost = np.argwhere(np.cumsum(ranked_labels)==num_relevant).min()
+    return idealized_cost
\ No newline at end of file
diff --git a/eDiscovery/main.py b/eDiscovery/main.py
index 7f9ece1..0a740c1 100644
--- a/eDiscovery/main.py
+++ b/eDiscovery/main.py
@@ -9,14 +9,6 @@ from quapy.data import LabelledCollection
 from plot import eDiscoveryPlot
 
 
-def eval_classifier(learner, test:LabelledCollection):
-    predictions = learner.predict(test.instances)
-    true_labels = test.labels
-    # f1 = f1_score(true_labels, predictions, average='macro')
-    f1 = f1_score(true_labels, predictions, average='binary')
-    # f1 = (true_labels==predictions).mean()
-    return f1
-
 
 def main(args):
 
@@ -33,7 +25,7 @@ def main(args):
 
     fig = eDiscoveryPlot(args.output)
 
-    skip_first_steps = 0
+    skip_first_steps = 20
 
     with qp.util.temp_seed(args.seed):
         # initial labelled data selection
@@ -58,28 +50,31 @@ def main(args):
                 foo.flush()
                 print(msg)
 
-            tee('it\t%\ttr-size\tte-size\ttr-prev\tte-prev\tte-estim\tte-estimCC\tR\tRhat\tRhatCC\tShift\tAE\tAE_CC\tMF1_Q\tMF1_Clf')
+            tee('it\t%\ttr-size\tte-size\ttr-prev\tte-prev\tte-estim\tte-estimCC\tR\tRhat\tRhatCC\tShift\tAE\tAE_CC\tMF1_Q\tMF1_Clf\tICost\tremaining')
 
             while True:
 
                 pool_p_hat_cc, classifier = fn.estimate_prev_CC(train, pool, clf_name)
+                ideal_cost = fn.ideal_cost(classifier, pool)
 
                 nDtr = len(train)
                 nDte = len(pool)
                 progress = 100 * nDtr / nD
 
                 if i >= skip_first_steps:
-                    pool_p_hat_q, q_classifier = fn.estimate_prev_Q(train, pool, q_name, clf_name)
-                    # q.fit(train)
-                    # pool_p_hat_q = q.quantify(pool.instances)
-                    # q_classifier = q.learner
+                    pool_p_hat_q, q = fn.estimate_prev_Q(train, pool, q_name, clf_name)
 
-                    f1_clf = eval_classifier(classifier, pool)
+                    f1_clf = 0 # eval_classifier(classifier, pool)
                     f1_q = 0 #eval_classifier(q_classifier, pool)
 
                     tr_p = train.prevalence()
                     te_p = pool.prevalence()
 
+                    # this is based on an observation by D.Lewis "it is convenient to have the same kind of systematic"
+                    # error both in the numerator and in the denominator
+                    #tr_p_hat = q.quantify(train.instances)
+                    #r_hat_q = fn.recall(tr_p_hat, pool_p_hat_q, nDtr, nDte)
+
                     r_hat_cc = fn.recall(tr_p, pool_p_hat_cc, nDtr, nDte)
                     r_hat_q = fn.recall(tr_p, pool_p_hat_q, nDtr, nDte)
                     r = fn.recall(tr_p, te_p, nDtr, nDte)
@@ -89,9 +84,8 @@ def main(args):
                     ae_cc = qp.error.ae(te_p, pool_p_hat_cc)
 
                     tee(f'{i}\t{progress:.2f}\t{nDtr}\t{nDte}\t{tr_p[1]:.3f}\t{te_p[1]:.3f}\t{pool_p_hat_q[1]:.3f}\t{pool_p_hat_cc[1]:.3f}'
-                        f'\t{r:.3f}\t{r_hat_q:.3f}\t{r_hat_cc:.3f}\t{tr_te_shift:.5f}\t{ae_q:.4f}\t{ae_cc:.4f}\t{f1_q:.3f}\t{f1_clf:.3f}')
-
-                    raise Exception('add idealized costs for each iteration and plots')
+                        f'\t{r:.3f}\t{r_hat_q:.3f}\t{r_hat_cc:.3f}\t{tr_te_shift:.5f}\t{ae_q:.4f}\t{ae_cc:.4f}\t{f1_q:.3f}\t{f1_clf:.3f}'
+                        f'\t{ideal_cost}\t{pool.labels.sum()}')
 
                     posteriors = classifier.predict_proba(pool.instances)
                     fig.plot(posteriors, pool.labels)
@@ -124,7 +118,7 @@ if __name__ == '__main__':
     parser.add_argument('--initsize', metavar='SIZE', type=int, help='number of labelled documents at the beginning',
                         default=2)
     parser.add_argument('--initprev', metavar='PREV', type=float,
-                        help='prevalence of the initial sample (-1 for uniform sampling, default)',
+                        help='prevalence of the initial sample (-1 for uniform sampling)',
                         default=0.5)
     parser.add_argument('--seed', metavar='SEED', type=int,
                         help='random seed',
diff --git a/eDiscovery/method.py b/eDiscovery/method.py
index 5b74708..48f5ad5 100644
--- a/eDiscovery/method.py
+++ b/eDiscovery/method.py
@@ -5,9 +5,12 @@ from sklearn.cluster import KMeans, OPTICS
 from sklearn.decomposition import TruncatedSVD
 from sklearn.linear_model import LogisticRegression
 from sklearn.mixture import GaussianMixture
+from sklearn.model_selection import cross_val_predict
+
 from quapy.method.base import BaseQuantifier, BinaryQuantifier
 from quapy.data import LabelledCollection
 from quapy.method.aggregative import ACC, PACC, PCC
+import quapy.functional as F
 
 
 class RegionAdjustmentQ(BaseQuantifier):
@@ -319,6 +322,89 @@ class ClassWeightPCC(BaseQuantifier):
     def get_params(self, deep=True):
         return self.learner.get_params()
 
+    @property
+    def classes_(self):
+        return self.train.classes_
+
+
+class PosteriorConditionalAdjustemnt(BaseQuantifier):
+
+    def __init__(self):
+        self.estimator = LogisticRegression()
+        self.k = 3
+
+    def get_adjustment_matrix(self, y, prob):
+        n_classes = 2
+        classes = [0, 1]
+        confusion = np.empty(shape=(n_classes, n_classes))
+        for i, class_ in enumerate(classes):
+            index = y == class_
+            if any(index):
+                confusion[i] = prob[index].mean(axis=0)
+            else:
+                if i == 0:
+                    confusion[i] = np.asarray([1,0])
+                else:
+                    confusion[i] = np.asarray([0, 1])
+
+        confusion = confusion.T
+        return confusion
+
+    def fit(self, data: LabelledCollection, fit_learner=True):
+        X, y = data.Xy
+        proba = cross_val_predict(self.estimator, X, y, n_jobs=-1, method='predict_proba')
+
+        order = np.argsort(proba[:,1])
+        proba = proba[order]
+        y = y[order]
+        X = X[order]  # to keep the alignment for the final classifier
+        n = len(data)
+        bucket_size = n // self.k
+        bucket_remainder = n % bucket_size
+        self.buckets = {}
+        self.prob_separations = []
+        for bucket in range(self.k):
+            from_pos = bucket*bucket_size
+            to_pos = (bucket+1)*bucket_size + (bucket_remainder if bucket==self.k-1 else 0)
+            slice_b = slice(from_pos, to_pos)
+            y_b = y[slice_b]
+            proba_b = proba[slice_b]
+            self.buckets[bucket] = self.get_adjustment_matrix(y_b, proba_b)
+            self.prob_separations.append(proba_b[-1,1])
+        self.prob_separations[-1] = 1  # the last one should account for the entire prob
+
+        self.estimator.fit(X,y)
+        return self
+
+    def quantify(self, instances):
+        proba = self.estimator.predict_proba(instances)
+        #proba = sorted(proba, key=lambda p:p[1])
+
+        prev = np.zeros(shape=2, dtype=np.float)
+        n = proba.shape[0]
+        last_prob_sep = 0
+        for b, prob_sep in enumerate(self.prob_separations):
+            proba_b = proba[np.logical_and(proba[:,1] >= last_prob_sep, proba[:,1] < prob_sep)]
+            last_prob_sep=prob_sep
+            if proba_b.size > 0:
+                pcc_b = F.prevalence_from_probabilities(proba_b, binarize=False)
+                adj_matrix = self.buckets[b]
+                pacc_b = ACC.solve_adjustment(adj_matrix, pcc_b)
+                bucket_prev = proba_b.shape[0] / n
+                print(f'bucket {b} -> {F.strprev(pacc_b)} with prop {bucket_prev:.4f}')
+                prev += (pacc_b*bucket_prev)
+
+        print(F.strprev(prev))
+        return prev
+
+    def set_params(self, **parameters):
+        # parameters = {p:v for p,v in parameters.items()}
+        # print(parameters)
+        self.learner.set_params(**parameters)
+
+    def get_params(self, deep=True):
+        return self.learner.get_params()
+
     @property
     def classes_(self):
         return self.train.classes_
\ No newline at end of file
diff --git a/eDiscovery/plot.py b/eDiscovery/plot.py
index 0d1c4ed..67f33ca 100644
--- a/eDiscovery/plot.py
+++ b/eDiscovery/plot.py
@@ -5,12 +5,21 @@ import sys, os, pathlib
 
 class eDiscoveryPlot:
 
-    def __init__(self, datapath, outdir='./plots', loop=True, save=True):
+    def __init__(self, datapath, outdir='./plots', loop=True, save=True, showYdist=False, showCost=True, refreshEach=10):
         self.outdir = outdir
         self.datapath = datapath
         self.plotname = pathlib.Path(datapath).name.replace(".csv", ".png")
         self.loop = loop
         self.save = save
+        self.showYdist = showYdist
+        self.showCost = showCost
+        self.refreshEach = refreshEach
+
+        nPlots = 4
+        if showYdist:
+            nPlots+=1
+        if showCost:
+            nPlots += 1
 
         if not loop:
             plt.rcParams['figure.figsize'] = [12, 12]
@@ -21,11 +30,15 @@ class eDiscoveryPlot:
             plt.rcParams.update({'font.size': 15})
 
         # plot the data
-        self.fig, self.axs = plt.subplots(5)
+        self.fig, self.axs = plt.subplots(nPlots)
         self.calls=0
 
     def plot(self, posteriors, y):
 
+        if (self.calls+1) % self.refreshEach != 0:
+            self.calls+=1
+            return
+
         fig, axs = self.fig, self.axs
         loop, save = self.loop, self.save
 
@@ -73,18 +86,31 @@ class eDiscoveryPlot:
         #axs[aXn].set_ylabel('Classifiers performance')
         #aXn += 1
 
+        if self.showCost:
+            cost = df['tr-size']
+            idealcost = df['ICost']
+            totalcost = cost + idealcost
+            axs[aXn].plot(xs, cost, label='Cost')
+            axs[aXn].plot(xs, idealcost, label='IdealCost')
+            axs[aXn].plot(xs, totalcost, label='TotalCost')
+            axs[aXn].legend()
+            axs[aXn].grid()
+            axs[aXn].set_ylabel('Cost')
+            aXn += 1
+
         # distribution of posterior probabilities in the pool
-        positive_posteriors = posteriors[y==1,1]
-        negative_posteriors = posteriors[y==0,1]
-        #axs[aXn].hist([negative_posteriors, positive_posteriors], bins=50,
-        #         label=['negative', 'positive'])
-        axs[aXn].hist(negative_posteriors, bins=50, label='negative', density=True, alpha=.75)
-        axs[aXn].hist(positive_posteriors, bins=50, label='positive', density=True, alpha=.75)
-        axs[aXn].legend()
-        axs[aXn].grid()
-        axs[aXn].set_xlim(0, 1)
-        axs[aXn].set_ylabel('te-$Pr(\oplus)$ distribution')
-        aXn += 1
+        if self.showYdist:
+            positive_posteriors = posteriors[y==1,1]
+            negative_posteriors = posteriors[y==0,1]
+            #axs[aXn].hist([negative_posteriors, positive_posteriors], bins=50,
+            #         label=['negative', 'positive'])
+            axs[aXn].hist(negative_posteriors, bins=50, label='negative', density=True, alpha=.75)
+            axs[aXn].hist(positive_posteriors, bins=50, label='positive', density=True, alpha=.75)
+            axs[aXn].legend()
+            axs[aXn].grid()
+            axs[aXn].set_xlim(0, 1)
+            axs[aXn].set_ylabel('te-$Pr(\oplus)$ distribution')
+            aXn += 1
 
         axs[aXn].plot(xs, df['Shift'], '--k', label='shift (AE)')
         axs[aXn].plot(xs, df['tr-prev'], 'y', label='tr-$Pr(\oplus)$')
diff --git a/quapy/method/aggregative.py b/quapy/method/aggregative.py
index 08ab4c3..ce51998 100644
--- a/quapy/method/aggregative.py
+++ b/quapy/method/aggregative.py
@@ -226,6 +226,7 @@ class ACC(AggregativeQuantifier):
 
         # estimate the matrix with entry (i,j) being the estimate of P(yi|yj), that is, the probability that a
         # document that belongs to yj ends up being classified as belonging to yi
+        self.conf_matrix_ = confusion_matrix(y, y_).T
         self.Pte_cond_estim_ = confusion_matrix(y, y_).T / class_count
 
         return self