refactoring main, argparse, etc

2022-01-17 17:57:14 +01:00 · 2022-01-17 17:57:14 +01:00 · 833476ebf8
parent b051ed4781
commit 833476ebf8
3 changed files with 151 additions and 192 deletions
--- a/eDiscovery/main.py
+++ b/eDiscovery/main.py
@@ -1,182 +1,115 @@
 import os.path
-import sys

-import sklearn
-
-from sklearn.base import BaseEstimator
-from sklearn.calibration import CalibratedClassifierCV
-from sklearn.linear_model import LogisticRegression
-from sklearn.svm import LinearSVC
+from sklearn.metrics import f1_score

+import functions as fn
 import quapy as qp
-from method.base import BaseQuantifier
-from quapy.data import LabelledCollection
-from quapy.method.aggregative import EMQ, CC, PACC, PCC, HDy, ACC
-from quapy import functional as F
-import numpy as np
-from itertools import chain
+import argparse
+from data import LabelledCollection


-def split_from_index(collection: LabelledCollection, index: np.ndarray):
-    in_index_set = set(index)
-    out_index_set = set(range(len(collection))) - in_index_set
-    out_index = np.asarray(list(out_index_set), dtype=int)
-    return collection.sampling_from_index(index), collection.sampling_from_index(out_index)
+def eval_classifier(learner, test:LabelledCollection):
+    predictions = learner.predict(test.instances)
+    true_labels = test.labels
+    # f1 = f1_score(true_labels, predictions, average='macro')
+    f1 = f1_score(true_labels, predictions, average='binary')
+    # f1 = (true_labels==predictions).mean()
+    return f1


-def relevance_sampling_index(pool: LabelledCollection, classifier: BaseEstimator, k: int):
-    prob = classifier.predict_proba(pool.instances)[:, 1].flatten()
-    top_relevant_idx = np.argsort(-prob)[:k]
-    return top_relevant_idx
+def main(args):
+
+    datasetname = args.dataset
+    k = args.k
+    init_nD = args.initsize
+    init_prev = [1-args.initprev, args.initprev]
+    sampling_fn = getattr(fn, args.sampling)
+    max_iterations = args.iter
+    outputdir = './results'
+
+    collection = qp.util.pickled_resource(f'./dataset/{datasetname}.pkl', fn.create_dataset, datasetname)
+    nD = len(collection)
+
+    with qp.util.temp_seed(args.seed):
+        # initial labelled data selection
+        idx = collection.sampling_index(init_nD, *init_prev)
+        train, pool = fn.split_from_index(collection, idx)
+        first_train = LabelledCollection(train.instances, train.labels)
+
+        # recall_target = 0.99
+        qp.util.create_if_not_exist(outputdir)
+
+        i = 0
+        with open(os.path.join(outputdir, fn.experiment_name(args)), 'wt') as foo:
+            def tee(msg):
+                foo.write(msg + '\n')
+                foo.flush()
+                print(msg)
+
+            tee('it\t%\ttr-size\tte-size\ttr-prev\tte-prev\tte-estim\tte-estimCC\tR\tRhat\tRhatCC\tShift\tAE\tAE_CC\tMF1_Q\tMF1_Clf')
+
+            while True:
+
+                pool_p_hat_cc, classifier = fn.estimate_prev_CC(train, pool)
+                pool_p_hat, q_classifier = fn.estimate_prev_Q(train, pool, args.quantifier)
+
+                f1_clf = eval_classifier(classifier, pool)
+                f1_q = eval_classifier(q_classifier, pool)
+
+                tr_p = train.prevalence()
+                te_p = pool.prevalence()
+                nDtr = len(train)
+                nDte = len(pool)
+
+                r_hat_cc = fn.recall(tr_p, pool_p_hat_cc, nDtr, nDte)
+                r_hat = fn.recall(tr_p, pool_p_hat, nDtr, nDte)
+                r = fn.recall(tr_p, te_p, nDtr, nDte)
+                tr_te_shift = qp.error.ae(tr_p, te_p)
+
+                progress = 100 * nDtr / nD
+
+                q_ae = qp.error.ae(te_p, pool_p_hat)
+                cc_ae = qp.error.ae(te_p, pool_p_hat_cc)
+
+                tee(f'{i}\t{progress:.2f}\t{nDtr}\t{nDte}\t{tr_p[1]:.3f}\t{te_p[1]:.3f}\t{pool_p_hat[1]:.3f}\t{pool_p_hat_cc[1]:.3f}'
+                    f'\t{r:.3f}\t{r_hat:.3f}\t{r_hat_cc:.3f}\t{tr_te_shift:.5f}\t{q_ae:.4f}\t{cc_ae:.4f}\t{f1_q:.3f}\t{f1_clf:.3f}')
+
+                if nDte < k:
+                    print('[stop] too few documents remaining')
+                    break
+                elif i+1 == max_iterations:
+                    print('[stop] maximum number of iterations reached')
+                    break
+
+                top_relevant_idx = sampling_fn(pool, classifier, k, progress)
+                selected, pool = fn.split_from_index(pool, top_relevant_idx)
+                train = train + selected
+
+                i += 1


-def uncertainty_sampling_index(pool: LabelledCollection, classifier: BaseEstimator, k: int):
-    prob = classifier.predict_proba(pool.instances)[:, 1].flatten()
-    top_uncertain_idx = np.argsort(np.abs(prob - 0.5))[:k]
-    return top_uncertain_idx
+if __name__=='__main__':
+    parser = argparse.ArgumentParser(description='e-Discovery')
+    parser.add_argument('--dataset', metavar='DATASET', type=str, help='Dataset name',
+                        default='RCV1.C4')
+    parser.add_argument('--quantifier', metavar='METHOD', type=str, help='Quantification method',
+                        default='EMQ')
+    parser.add_argument('--sampling', metavar='SAMPLING', type=str, help='Sampling criterion',
+                        default='relevance_sampling')
+    parser.add_argument('--iter', metavar='INT', type=int, help='number of iterations (-1 to set no limit)',
+                        default=-1)
+    parser.add_argument('--k', metavar='BATCH', type=int, help='number of documents in a batch',
+                        default=100)
+    parser.add_argument('--initsize', metavar='SIZE', type=int, help='number of labelled documents at the beginning',
+                        default=1000)
+    parser.add_argument('--initprev', metavar='PREV', type=float,
+                        help='prevalence of the positive class in the initial sample; must be in (0., 1.)',
+                        default=0.5)
+    parser.add_argument('--seed', metavar='SEED', type=int,
+                        help='random seed',
+                        default=1)
+    args = parser.parse_args()

+    assert 0 < args.initprev < 1, 'wrong value for initprev; should be in (0., 1.)'

-def mix_rel_unc_sampling_index(pool: LabelledCollection, classifier: BaseEstimator, k: int):
-    relevance_idx = relevance_sampling_index(pool, classifier, k)
-    uncertanty_idx = uncertainty_sampling_index(pool, classifier, k)
-    interleave_idx = np.asarray(list(chain.from_iterable(zip(relevance_idx, uncertanty_idx))))
-    _, unique_idx = np.unique(interleave_idx, return_index=True)
-    top_interleaved_idx = interleave_idx[np.sort(unique_idx)][:k]
-    return top_interleaved_idx
-
-
-def negative_sampling_index(pool: LabelledCollection, classifier: BaseEstimator, k: int):
-    prob = classifier.predict_proba(pool.instances)[:, 0].flatten()
-    top_relevant_idx = np.argsort(-prob)[:k]
-    return top_relevant_idx
-
-
-def recall(train_prev, pool_prev, train_size, pool_size):
-    frac_tr_pos = train_prev[1]
-    frac_te_pos = pool_prev[1]
-    recall = (frac_tr_pos * train_size) / (frac_tr_pos * train_size + frac_te_pos * pool_size)
-    return recall
-
-
-def NewClassifier():
-    # return CalibratedClassifierCV(LinearSVC(class_weight='balanced'))
-    return LogisticRegression(class_weight=None)
-
-
-def NewQuantifier():
-    return EMQ(CalibratedClassifierCV(NewClassifier()))
-
-
-def create_dataset(datasetname):
-    if datasetname == 'imdb.10K.75p':
-        data = qp.datasets.fetch_reviews('imdb', tfidf=True, min_df=5)
-        collection = data.training.sampling(10000, 0.75)
-        return collection
-
-    elif datasetname == 'RCV1.C4':
-        X, y = sklearn.datasets.fetch_rcv1(subset='train', return_X_y=True)
-        y = y.toarray()
-        prev = y.mean(axis=0).flatten()
-        # choose the first category having a positive prevalence between [0.1,0.2] (realistic scenario for e-Discovery)
-        # this category happens to be the cat with id 4
-        target_cat = np.argwhere(np.logical_and(prev > 0.1, prev < 0.2)).flatten()[0]
-        print('chosen cat', target_cat)
-        y = y[:, target_cat].flatten()
-        return LabelledCollection(X, y)
-
-
-def estimate_prev_CC(train, pool):
-    q = CC(NewClassifier()).fit(train)
-    return q.quantify(pool.instances), q.learner
-
-
-def estimate_prev_Q(train, pool, classifier):
-    # q = qp.model_selection.GridSearchQ(
-    #     ACC(LogisticRegression()),
-    #     param_grid={'C':np.logspace(-3,3,7), 'class_weight':[None, 'balanced']},
-    #     sample_size=len(train),
-    #     protocol='app',
-    #     n_prevpoints=21,
-    #     n_repetitions=10)
-
-    q = NewQuantifier()
-    # q = ACC(NewClassifier())
-    # borrow (supposedly negative) pool documents
-    # train_pos = train.counts()[1]
-    # train_negs = train.counts()[0]
-    # neg_idx = negative_sampling_index(pool, classifier, max(train_pos-train_negs, 5))
-    # neg_sample = pool.sampling_from_index(neg_idx)
-    # train_augmented = train + LabelledCollection(neg_sample.instances, [0]*len(neg_sample))
-    # q.fit(train_augmented)
-    q.fit(train)
-    # q.fit(first_train)
-    # bootstrap_prev = qp.evaluation.natural_prevalence_prediction(q, pool, sample_size=len(train), n_repetitions=50)[1].mean(axis=0).flatten()
-    prev = q.quantify(pool.instances)
-    return prev, None
-    # return q.quantify(pool_instances), None
-
-
-def tee(msg):
-    foo.write(msg + '\n')
-    foo.flush()
-    print(msg)
-
-
-datasetname = 'RCV1.C4'
-collection = qp.util.pickled_resource(f'./dataset/{datasetname}.pkl', create_dataset, datasetname)
-nD = len(collection)
-
-# initial labelled data selection
-init_nD = 1000
-init_prev = [0.5, 0.5]
-idx = collection.sampling_index(init_nD, *init_prev)
-train, pool = split_from_index(collection, idx)
-#first_train = LabelledCollection(train.instances, train.labels)
-
-k = 100
-recall_target = 0.99
-
-outputdir = './results'
-qp.util.create_if_not_exist(outputdir)
-
-# sampling_fn, sampling_name = relevance_sampling_index, 'relevance'
-sampling_fn, sampling_name = mix_rel_unc_sampling_index, 'mix'
-q_name = NewQuantifier().__class__.__name__
-
-experiment_suffix = f'{sampling_name}_{q_name}'
-
-i = 0
-with open(os.path.join(outputdir, f'{datasetname}_{experiment_suffix}.csv'), 'wt') as foo:
-    tee('it\t%\ttr-size\tte-size\ttr-prev\tte-prev\tte-estim\tte-estimCC\tR\tRhat\tRhatCC\tShift\tAE\tAE_CC')
-    while True:
-
-        pool_p_hat_cc, classifier = estimate_prev_CC(train, pool)
-        pool_p_hat, _ = estimate_prev_Q(train, pool, classifier)
-
-        tr_p = train.prevalence()
-        te_p = pool.prevalence()
-        nDtr = len(train)
-        nDte = len(pool)
-
-        r_hat_cc = recall(tr_p, pool_p_hat_cc, nDtr, nDte)
-        r_hat = recall(tr_p, pool_p_hat, nDtr, nDte)
-        r = recall(tr_p, te_p, nDtr, nDte)
-        tr_te_shift = qp.error.ae(tr_p, te_p)
-
-        progress = 100 * nDtr / nD
-
-        q_ae = qp.error.ae(te_p, pool_p_hat)
-        cc_ae = qp.error.ae(te_p, pool_p_hat_cc)
-
-        tee(
-            f'{i}\t{progress:.2f}\t{nDtr}\t{nDte}\t{tr_p[1]:.3f}\t{te_p[1]:.3f}\t{pool_p_hat[1]:.3f}\t{pool_p_hat_cc[1]:.3f}'
-            f'\t{r:.3f}\t{r_hat:.3f}\t{r_hat_cc:.3f}\t{tr_te_shift:.5f}\t{q_ae:.4f}\t{cc_ae:.4f}')
-
-        if nDte < k:
-            break
-
-        top_relevant_idx = sampling_fn(pool, classifier, k)
-        selected, pool = split_from_index(pool, top_relevant_idx)
-        train = train + selected
-
-        i += 1
+    main(args)
--- a/eDiscovery/plot.py
+++ b/eDiscovery/plot.py
@@ -5,16 +5,23 @@ import sys, os, pathlib

 assert len(sys.argv) == 3, f'wrong args, syntax is: python {sys.argv[0]} <result_input_path> <dynamic (0|1)>'

-file = sys.argv[1]  #'./results/RCV1.C4.csv'
+file = sys.argv[1]
 loop = bool(int(sys.argv[2]))

 plotname = pathlib.Path(file).name.replace(".csv", ".png")

+if not loop:
+    plt.rcParams['figure.figsize'] = [12, 12]
+    plt.rcParams['figure.dpi'] = 200
+
 # plot the data
-fig, axs = plt.subplots(3)
+fig, axs = plt.subplots(5)
+
+

 try:
    while True:
+        aXn = 0
        df = pd.read_csv(file, sep='\t')

        xs = df['it']
@@ -22,29 +29,49 @@ try:
        y_r = df['R']
        y_rhat = df['Rhat']
        y_rhatCC = df['RhatCC']
-        label='R'
-        axs[0].plot(xs, y_rhat, label='$\hat{'+label+'}$')
-        axs[0].plot(xs, y_rhatCC, label='$\hat{'+label+'}_{CC}$')
-        axs[0].plot(xs, y_r, label=label)
-        axs[0].legend()
-        axs[0].grid()
+        axs[aXn].plot(xs, y_rhat, label='$\hat{R}_{Q}$')
+        axs[aXn].plot(xs, y_rhatCC, label='$\hat{R}_{CC}$')
+        axs[aXn].plot(xs, y_r, label='$R$')
+        axs[aXn].legend()
+        axs[aXn].grid()
+        axs[aXn].set_ylabel('Recall estimation')
+        axs[aXn].set_ylim(0,1)
+        aXn+=1

        y_r = df['te-prev']
        y_rhat = df['te-estim']
        y_rhatCC = df['te-estimCC']
-        label='P'
-        axs[1].plot(xs, y_rhat, label='$\hat{'+label+'}$')
-        axs[1].plot(xs, y_rhatCC, label='$\hat{'+label+'}_{CC}$')
-        axs[1].plot(xs, y_r, label=label)
-        axs[1].legend()
-        axs[1].grid()
+        axs[aXn].plot(xs, y_rhat, label='te-$\hat{Pr}(\oplus)_{Q}$')
+        axs[aXn].plot(xs, y_rhatCC, label='te-$\hat{Pr}(\oplus)_{CC}$')
+        axs[aXn].plot(xs, y_r, label='te-$Pr(\oplus)$')
+        axs[aXn].legend()
+        axs[aXn].grid()
+        axs[aXn].set_ylabel('Prevalence estimation')
+        aXn += 1

        y_ae = df['AE']
        y_ae_cc = df['AE_CC']
-        axs[2].plot(xs, y_ae, label='AE')
-        axs[2].plot(xs, y_ae_cc, label='AE-CC')
-        axs[2].legend()
-        axs[2].grid()
+        axs[aXn].plot(xs, y_ae, label='AE$_{Q}$')
+        axs[aXn].plot(xs, y_ae_cc, label='AE$_{CC}$')
+        axs[aXn].legend()
+        axs[aXn].grid()
+        axs[aXn].set_ylabel('Quantification error')
+        aXn += 1
+
+        axs[aXn].plot(xs, df['Shift'], label='tr-te shift (AE)')
+        axs[aXn].plot(xs, df['tr-prev'], label='tr-$Pr(\oplus)$')
+        axs[aXn].plot(xs, df['te-prev'], label='te-$Pr(\oplus)$')
+        axs[aXn].legend()
+        axs[aXn].grid()
+        axs[aXn].set_ylabel('Train-Test Shift')
+        aXn += 1
+
+        axs[aXn].plot(xs, df['MF1_Q'], label='$F_1(clf(Q))$')
+        axs[aXn].plot(xs, df['MF1_Clf'], label='$F_1(clf(CC))$')
+        axs[aXn].legend()
+        axs[aXn].grid()
+        axs[aXn].set_ylabel('Classifiers performance')
+        aXn += 1

        os.makedirs('./plots', exist_ok=True)
        plt.savefig(f'./plots/{plotname}')
@@ -53,9 +80,8 @@ try:
            break
        else:
            plt.pause(.5)
-            axs[0].cla()
-            axs[1].cla()
-            axs[2].cla()
+            for i in range(aXn):
+                axs[i].cla()

 except KeyboardInterrupt:
    print("\n[exit]")
--- a/quapy/data/base.py
+++ b/quapy/data/base.py
@@ -143,7 +143,7 @@ class LabelledCollection:
        else:
            raise NotImplementedError('unsupported operation for collection types')
        labels = np.concatenate([self.labels, other.labels])
-        return LabelledCollection(join_instances, labels)
+        return LabelledCollection(join_instances, labels, classes_=self.classes_)

    @property
    def Xy(self):