
plotting y distributions over time

Alejandro Moreo Fernandez 2022-02-23 15:29:39 +01:00
parent 9053ae34a6
commit 29993386ae
3 changed files with 73 additions and 24 deletions

View File

@@ -162,14 +162,6 @@ def estimate_prev_CC(train, pool: LabelledCollection, classifiername:str):

 def estimate_prev_Q(train, pool, quantifiername, classifiername):
-    # q = qp.model_selection.GridSearchQ(
-    #     ACC(LogisticRegression()),
-    #     param_grid={'C':np.logspace(-3,3,7), 'class_weight':[None, 'balanced']},
-    #     sample_size=len(train),
-    #     protocol='app',
-    #     n_prevpoints=21,
-    #     n_repetitions=10)
     q = NewQuantifier(quantifiername, classifiername)
     # q._find_regions((train+pool).instances)
     q.fit(train)
@@ -181,16 +173,14 @@ def estimate_prev_Q(train, pool, quantifiername, classifiername):

 def eval_classifier(learner, test:LabelledCollection):
     predictions = learner.predict(test.instances)
     true_labels = test.labels
-    # f1 = f1_score(true_labels, predictions, average='macro')
     f1 = f1_score(true_labels, predictions, average='binary')
-    # f1 = (true_labels==predictions).mean()
     return f1


 def ideal_cost(classifier, pool):
     # returns the cost (in terms of number of documents) to review until the last relevant document
     # is processed, assuming the rank produced by this classifier. The cost is said to be "idealized" since
-    # one assumes to be able to stop reviewing when the last relevant is encountered
+    # one assumes to know the optimal stopping point (reached after the last relevant is encountered)
     prob = classifier.predict_proba(pool.instances)
     order = np.argsort(prob[:,0])  # col 0 has negative posterior prob, so the natural order is "by relevance"
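The body of ideal_cost is cut off by the hunk. For clarity, here is a minimal sketch of the computation the comment describes: rank the pool by the classifier's relevance score and count how many documents must be reviewed before the last relevant one is reached. This completion is an assumption made for illustration, not the code in the repository:

    import numpy as np

    def ideal_cost_sketch(classifier, pool):
        prob = classifier.predict_proba(pool.instances)
        order = np.argsort(prob[:, 0])   # col 0 is the negative posterior, so ascending order ranks by relevance
        ranked_labels = pool.labels[order]
        relevant_positions = np.flatnonzero(ranked_labels == 1)
        # 1-based position of the last relevant document in the ranking; 0 if the pool has no relevant documents
        return int(relevant_positions[-1]) + 1 if len(relevant_positions) > 0 else 0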

View File

@@ -6,8 +6,7 @@ import functions as fn
 import quapy as qp
 import argparse
 from quapy.data import LabelledCollection
-from plot import eDiscoveryPlot
+from plot import eDiscoveryPlot, InOutDistPlot


 def main(args):
@@ -23,9 +22,10 @@ def main(args):
     collection = qp.util.pickled_resource(f'./dataset/{datasetname}.pkl', fn.create_dataset, datasetname)
     nD = len(collection)

-    fig = eDiscoveryPlot(args.output)
+    # fig = eDiscoveryPlot(args.output)
+    fig_dist = InOutDistPlot()

-    skip_first_steps = 20
+    skip_first_steps = 1

     with qp.util.temp_seed(args.seed):
         # initial labelled data selection
@@ -34,23 +34,20 @@ def main(args):
         else:
             idx = collection.sampling_index(init_nD, *[1 - args.initprev, args.initprev])
         train, pool = fn.split_from_index(collection, idx)
-        #first_train = LabelledCollection(train.instances, train.labels)

         # recall_target = 0.99

         i = 0
-        # q = fn.NewQuantifier(q_name, clf_name)
-        # print('searching regions')
-        # q._find_regions((train+pool).instances)
-        # print('[done]')

         with open(args.output, 'wt') as foo:
             def tee(msg):
                 foo.write(msg + '\n')
                 foo.flush()
                 print(msg)

-            tee('it\t%\ttr-size\tte-size\ttr-prev\tte-prev\tte-estim\tte-estimCC\tR\tRhat\tRhatCC\tShift\tAE\tAE_CC\tMF1_Q\tMF1_Clf\tICost\tremaining')
+            tee('it\t%\ttr-size\tte-size\ttr-prev\tte-prev\tte-estim\tte-estimCC\tR\tRhat\tRhatCC\tShift\tAE\tAE_CC'
+                '\tMF1_Q\tMF1_Clf\tICost\tremaining\tba-prev\tba-estim')
+
+            batch_prev_estim, batch_prev_true, q = 0, 0, None

             while True:
@@ -85,10 +82,12 @@ def main(args):
             tee(f'{i}\t{progress:.2f}\t{nDtr}\t{nDte}\t{tr_p[1]:.3f}\t{te_p[1]:.3f}\t{pool_p_hat_q[1]:.3f}\t{pool_p_hat_cc[1]:.3f}'
                 f'\t{r:.3f}\t{r_hat_q:.3f}\t{r_hat_cc:.3f}\t{tr_te_shift:.5f}\t{ae_q:.4f}\t{ae_cc:.4f}\t{f1_q:.3f}\t{f1_clf:.3f}'
-                f'\t{ideal_cost}\t{pool.labels.sum()}')
+                f'\t{ideal_cost}\t{pool.labels.sum()}\t{batch_prev_true}\t{batch_prev_estim:.3f}')

             posteriors = classifier.predict_proba(pool.instances)
-            fig.plot(posteriors, pool.labels)
+            in_posteriors = classifier.predict_proba(train.instances)
+            # fig.plot(posteriors, pool.labels)
+            fig_dist.plot(in_posteriors, train.labels, posteriors, pool.labels)

             if nDte < k:
                 print('[stop] too few documents remaining')
@@ -98,6 +97,12 @@ def main(args):
                 break

             top_relevant_idx = sampling_fn(pool, classifier, k, progress)
+
+            if q is not None:
+                batch = pool.sampling_from_index(top_relevant_idx)
+                batch_prev_estim = q.quantify(batch.instances)[1]
+                batch_prev_true = batch.prevalence()[1]
+
             train, pool = fn.move_documents(train, pool, top_relevant_idx)

             i += 1
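The block added at the end of the loop takes the batch of documents about to be labelled and records both the quantifier's estimate of its positive prevalence (logged as ba-estim) and the true prevalence computed from the batch labels (ba-prev). A self-contained sketch of this estimated-versus-true comparison using QuaPy's public API follows; the ACC quantifier, the logistic regression, and the synthetic data are stand-ins for the script's NewQuantifier and document pool, and it assumes a QuaPy version where an aggregative quantifier can be fitted directly on a LabelledCollection:

    import numpy as np
    from sklearn.linear_model import LogisticRegression
    from quapy.data import LabelledCollection
    from quapy.method.aggregative import ACC

    # synthetic binary data standing in for the document collection
    X = np.random.rand(1000, 5)
    y = (X[:, 0] > 0.6).astype(int)
    data = LabelledCollection(X, y)
    train, pool = data.split_stratified(train_prop=0.5)

    q = ACC(LogisticRegression())
    q.fit(train)

    batch = pool.sampling(100, 0.5)                     # a batch of 100 documents drawn from the pool at prevalence 0.5
    batch_prev_estim = q.quantify(batch.instances)[1]   # estimated prevalence of the positive class
    batch_prev_true = batch.prevalence()[1]             # true prevalence of the positive class
    print(f'ba-estim={batch_prev_estim:.3f}  ba-prev={batch_prev_true:.3f}')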

View File

@@ -142,6 +142,60 @@ class eDiscoveryPlot:
         self.calls += 1


+class InOutDistPlot:
+
+    def __init__(self, refreshEach=1):
+        self.refreshEach = refreshEach
+
+        # plot the data
+        self.fig, self.axs = plt.subplots(2)
+        self.calls = 0
+
+    def _plot_dist(self, posteriors, y, aXn, title):
+        positive_posteriors = posteriors[y == 1, 1]
+        negative_posteriors = posteriors[y == 0, 1]
+        self.axs[aXn].hist(negative_posteriors, bins=50, label='$Pr(x|\ominus)$', density=False, alpha=.75)
+        self.axs[aXn].hist(positive_posteriors, bins=50, label='$Pr(x|\oplus)$', density=False, alpha=.75)
+        self.axs[aXn].legend()
+        self.axs[aXn].grid()
+        self.axs[aXn].set_xlim(0, 1)
+        self.axs[aXn].set_ylabel(title)
+
+    def plot(self, in_posteriors, in_y, out_posteriors, out_y):
+
+        if (self.calls+1) % self.refreshEach != 0:
+            self.calls += 1
+            return
+
+        fig, axs = self.fig, self.axs
+
+        aXn = 0
+
+        # in-posteriors distribution
+        self._plot_dist(in_posteriors, in_y, aXn, title='training distribution')
+        aXn += 1
+
+        # out-posteriors distribution
+        self._plot_dist(out_posteriors, out_y, aXn, title='pool distribution')
+        aXn += 1
+
+        for i in range(aXn):
+            if self.calls==0:
+                # Shrink current axis by 20%
+                box = axs[i].get_position()
+                axs[i].set_position([box.x0, box.y0, box.width * 0.8, box.height])
+                fig.tight_layout()
+
+            # Put a legend to the right of the current axis
+            axs[i].legend(loc='center left', bbox_to_anchor=(1, 0.5))
+
+        plt.pause(.5)
+
+        for i in range(aXn):
+            axs[i].cla()
+
+        self.calls += 1
+
+
 if __name__ == '__main__':

     assert len(sys.argv) == 3, f'wrong args, syntax is: python {sys.argv[0]} <result_input_path> <dynamic (0|1)>'
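Taken together, the new InOutDistPlot refreshes (every refreshEach calls) two stacked panels: histograms of the positive-class posterior for negative and positive documents, computed once on the training set (in) and once on the pool (out), which is what the commit message refers to as plotting the y distributions over time. A minimal usage sketch with synthetic posteriors is given below; the helper fake_posteriors and the random labels are invented purely for illustration, and an interactive matplotlib backend is assumed since the class relies on plt.pause to refresh the figure:

    import numpy as np
    from plot import InOutDistPlot  # the module the main script imports as `plot`

    def fake_posteriors(y):
        # positive-class posterior biased towards the true label, stacked as [Pr(-|x), Pr(+|x)]
        pos = 0.3 * np.random.rand(len(y)) + 0.6 * y
        return np.column_stack([1 - pos, pos])

    fig_dist = InOutDistPlot(refreshEach=1)
    for step in range(5):
        in_y = np.random.randint(0, 2, size=200)     # labels of the (growing) training set
        out_y = np.random.randint(0, 2, size=1000)   # labels of the remaining pool
        fig_dist.plot(fake_posteriors(in_y), in_y, fake_posteriors(out_y), out_y)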