From 833476ebf852814a3de7b612de60af4d84535014 Mon Sep 17 00:00:00 2001 From: Alex Moreo Date: Mon, 17 Jan 2022 17:57:14 +0100 Subject: [PATCH] refactoring main, argparse, etc --- eDiscovery/main.py | 273 +++++++++++++++++---------------------------- eDiscovery/plot.py | 68 +++++++---- quapy/data/base.py | 2 +- 3 files changed, 151 insertions(+), 192 deletions(-) diff --git a/eDiscovery/main.py b/eDiscovery/main.py index c9d096b..99a6811 100644 --- a/eDiscovery/main.py +++ b/eDiscovery/main.py @@ -1,182 +1,115 @@ import os.path -import sys -import sklearn - -from sklearn.base import BaseEstimator -from sklearn.calibration import CalibratedClassifierCV -from sklearn.linear_model import LogisticRegression -from sklearn.svm import LinearSVC +from sklearn.metrics import f1_score +import functions as fn import quapy as qp -from method.base import BaseQuantifier -from quapy.data import LabelledCollection -from quapy.method.aggregative import EMQ, CC, PACC, PCC, HDy, ACC -from quapy import functional as F -import numpy as np -from itertools import chain +import argparse +from data import LabelledCollection -def split_from_index(collection: LabelledCollection, index: np.ndarray): - in_index_set = set(index) - out_index_set = set(range(len(collection))) - in_index_set - out_index = np.asarray(list(out_index_set), dtype=int) - return collection.sampling_from_index(index), collection.sampling_from_index(out_index) +def eval_classifier(learner, test:LabelledCollection): + predictions = learner.predict(test.instances) + true_labels = test.labels + # f1 = f1_score(true_labels, predictions, average='macro') + f1 = f1_score(true_labels, predictions, average='binary') + # f1 = (true_labels==predictions).mean() + return f1 -def relevance_sampling_index(pool: LabelledCollection, classifier: BaseEstimator, k: int): - prob = classifier.predict_proba(pool.instances)[:, 1].flatten() - top_relevant_idx = np.argsort(-prob)[:k] - return top_relevant_idx +def main(args): + + datasetname = 
args.dataset + k = args.k + init_nD = args.initsize + init_prev = [1-args.initprev, args.initprev] + sampling_fn = getattr(fn, args.sampling) + max_iterations = args.iter + outputdir = './results' + + collection = qp.util.pickled_resource(f'./dataset/{datasetname}.pkl', fn.create_dataset, datasetname) + nD = len(collection) + + with qp.util.temp_seed(args.seed): + # initial labelled data selection + idx = collection.sampling_index(init_nD, *init_prev) + train, pool = fn.split_from_index(collection, idx) + first_train = LabelledCollection(train.instances, train.labels) + + # recall_target = 0.99 + qp.util.create_if_not_exist(outputdir) + + i = 0 + with open(os.path.join(outputdir, fn.experiment_name(args)), 'wt') as foo: + def tee(msg): + foo.write(msg + '\n') + foo.flush() + print(msg) + + tee('it\t%\ttr-size\tte-size\ttr-prev\tte-prev\tte-estim\tte-estimCC\tR\tRhat\tRhatCC\tShift\tAE\tAE_CC\tMF1_Q\tMF1_Clf') + + while True: + + pool_p_hat_cc, classifier = fn.estimate_prev_CC(train, pool) + pool_p_hat, q_classifier = fn.estimate_prev_Q(train, pool, args.quantifier) + + f1_clf = eval_classifier(classifier, pool) + f1_q = eval_classifier(q_classifier, pool) + + tr_p = train.prevalence() + te_p = pool.prevalence() + nDtr = len(train) + nDte = len(pool) + + r_hat_cc = fn.recall(tr_p, pool_p_hat_cc, nDtr, nDte) + r_hat = fn.recall(tr_p, pool_p_hat, nDtr, nDte) + r = fn.recall(tr_p, te_p, nDtr, nDte) + tr_te_shift = qp.error.ae(tr_p, te_p) + + progress = 100 * nDtr / nD + + q_ae = qp.error.ae(te_p, pool_p_hat) + cc_ae = qp.error.ae(te_p, pool_p_hat_cc) + + tee(f'{i}\t{progress:.2f}\t{nDtr}\t{nDte}\t{tr_p[1]:.3f}\t{te_p[1]:.3f}\t{pool_p_hat[1]:.3f}\t{pool_p_hat_cc[1]:.3f}' + f'\t{r:.3f}\t{r_hat:.3f}\t{r_hat_cc:.3f}\t{tr_te_shift:.5f}\t{q_ae:.4f}\t{cc_ae:.4f}\t{f1_q:.3f}\t{f1_clf:.3f}') + + if nDte < k: + print('[stop] too few documents remaining') + break + elif i+1 == max_iterations: + print('[stop] maximum number of iterations reached') + break + + top_relevant_idx = 
sampling_fn(pool, classifier, k, progress) + selected, pool = fn.split_from_index(pool, top_relevant_idx) + train = train + selected + + i += 1 -def uncertainty_sampling_index(pool: LabelledCollection, classifier: BaseEstimator, k: int): - prob = classifier.predict_proba(pool.instances)[:, 1].flatten() - top_uncertain_idx = np.argsort(np.abs(prob - 0.5))[:k] - return top_uncertain_idx +if __name__=='__main__': + parser = argparse.ArgumentParser(description='e-Discovery') + parser.add_argument('--dataset', metavar='DATASET', type=str, help='Dataset name', + default='RCV1.C4') + parser.add_argument('--quantifier', metavar='METHOD', type=str, help='Quantification method', + default='EMQ') + parser.add_argument('--sampling', metavar='SAMPLING', type=str, help='Sampling criterion', + default='relevance_sampling') + parser.add_argument('--iter', metavar='INT', type=int, help='number of iterations (-1 to set no limit)', + default=-1) + parser.add_argument('--k', metavar='BATCH', type=int, help='number of documents in a batch', + default=100) + parser.add_argument('--initsize', metavar='SIZE', type=int, help='number of labelled documents at the beginning', + default=1000) + parser.add_argument('--initprev', metavar='PREV', type=float, + help='prevalence of the initial sample (-1 for uniform sampling)', + default=0.5) + parser.add_argument('--seed', metavar='SEED', type=int, + help='random seed', + default=1) + args = parser.parse_args() + assert 0 < args.initprev < 1, 'wrong value for initprev; should be in (0., 1.)' -def mix_rel_unc_sampling_index(pool: LabelledCollection, classifier: BaseEstimator, k: int): - relevance_idx = relevance_sampling_index(pool, classifier, k) - uncertanty_idx = uncertainty_sampling_index(pool, classifier, k) - interleave_idx = np.asarray(list(chain.from_iterable(zip(relevance_idx, uncertanty_idx)))) - _, unique_idx = np.unique(interleave_idx, return_index=True) - top_interleaved_idx = interleave_idx[np.sort(unique_idx)][:k] - return 
top_interleaved_idx - - -def negative_sampling_index(pool: LabelledCollection, classifier: BaseEstimator, k: int): - prob = classifier.predict_proba(pool.instances)[:, 0].flatten() - top_relevant_idx = np.argsort(-prob)[:k] - return top_relevant_idx - - -def recall(train_prev, pool_prev, train_size, pool_size): - frac_tr_pos = train_prev[1] - frac_te_pos = pool_prev[1] - recall = (frac_tr_pos * train_size) / (frac_tr_pos * train_size + frac_te_pos * pool_size) - return recall - - -def NewClassifier(): - # return CalibratedClassifierCV(LinearSVC(class_weight='balanced')) - return LogisticRegression(class_weight=None) - - -def NewQuantifier(): - return EMQ(CalibratedClassifierCV(NewClassifier())) - - -def create_dataset(datasetname): - if datasetname == 'imdb.10K.75p': - data = qp.datasets.fetch_reviews('imdb', tfidf=True, min_df=5) - collection = data.training.sampling(10000, 0.75) - return collection - - elif datasetname == 'RCV1.C4': - X, y = sklearn.datasets.fetch_rcv1(subset='train', return_X_y=True) - y = y.toarray() - prev = y.mean(axis=0).flatten() - # choose the first category having a positive prevalence between [0.1,0.2] (realistic scenario for e-Discovery) - # this category happens to be the cat with id 4 - target_cat = np.argwhere(np.logical_and(prev > 0.1, prev < 0.2)).flatten()[0] - print('chosen cat', target_cat) - y = y[:, target_cat].flatten() - return LabelledCollection(X, y) - - -def estimate_prev_CC(train, pool): - q = CC(NewClassifier()).fit(train) - return q.quantify(pool.instances), q.learner - - -def estimate_prev_Q(train, pool, classifier): - # q = qp.model_selection.GridSearchQ( - # ACC(LogisticRegression()), - # param_grid={'C':np.logspace(-3,3,7), 'class_weight':[None, 'balanced']}, - # sample_size=len(train), - # protocol='app', - # n_prevpoints=21, - # n_repetitions=10) - - q = NewQuantifier() - # q = ACC(NewClassifier()) - # borrow (supposedly negative) pool documents - # train_pos = train.counts()[1] - # train_negs = train.counts()[0] 
- # neg_idx = negative_sampling_index(pool, classifier, max(train_pos-train_negs, 5)) - # neg_sample = pool.sampling_from_index(neg_idx) - # train_augmented = train + LabelledCollection(neg_sample.instances, [0]*len(neg_sample)) - # q.fit(train_augmented) - q.fit(train) - # q.fit(first_train) - # bootstrap_prev = qp.evaluation.natural_prevalence_prediction(q, pool, sample_size=len(train), n_repetitions=50)[1].mean(axis=0).flatten() - prev = q.quantify(pool.instances) - return prev, None - # return q.quantify(pool_instances), None - - -def tee(msg): - foo.write(msg + '\n') - foo.flush() - print(msg) - - -datasetname = 'RCV1.C4' -collection = qp.util.pickled_resource(f'./dataset/{datasetname}.pkl', create_dataset, datasetname) -nD = len(collection) - -# initial labelled data selection -init_nD = 1000 -init_prev = [0.5, 0.5] -idx = collection.sampling_index(init_nD, *init_prev) -train, pool = split_from_index(collection, idx) -#first_train = LabelledCollection(train.instances, train.labels) - -k = 100 -recall_target = 0.99 - -outputdir = './results' -qp.util.create_if_not_exist(outputdir) - -# sampling_fn, sampling_name = relevance_sampling_index, 'relevance' -sampling_fn, sampling_name = mix_rel_unc_sampling_index, 'mix' -q_name = NewQuantifier().__class__.__name__ - -experiment_suffix = f'{sampling_name}_{q_name}' - -i = 0 -with open(os.path.join(outputdir, f'{datasetname}_{experiment_suffix}.csv'), 'wt') as foo: - tee('it\t%\ttr-size\tte-size\ttr-prev\tte-prev\tte-estim\tte-estimCC\tR\tRhat\tRhatCC\tShift\tAE\tAE_CC') - while True: - - pool_p_hat_cc, classifier = estimate_prev_CC(train, pool) - pool_p_hat, _ = estimate_prev_Q(train, pool, classifier) - - tr_p = train.prevalence() - te_p = pool.prevalence() - nDtr = len(train) - nDte = len(pool) - - r_hat_cc = recall(tr_p, pool_p_hat_cc, nDtr, nDte) - r_hat = recall(tr_p, pool_p_hat, nDtr, nDte) - r = recall(tr_p, te_p, nDtr, nDte) - tr_te_shift = qp.error.ae(tr_p, te_p) - - progress = 100 * nDtr / nD - - q_ae = 
qp.error.ae(te_p, pool_p_hat) - cc_ae = qp.error.ae(te_p, pool_p_hat_cc) - - tee( - f'{i}\t{progress:.2f}\t{nDtr}\t{nDte}\t{tr_p[1]:.3f}\t{te_p[1]:.3f}\t{pool_p_hat[1]:.3f}\t{pool_p_hat_cc[1]:.3f}' - f'\t{r:.3f}\t{r_hat:.3f}\t{r_hat_cc:.3f}\t{tr_te_shift:.5f}\t{q_ae:.4f}\t{cc_ae:.4f}') - - if nDte < k: - break - - top_relevant_idx = sampling_fn(pool, classifier, k) - selected, pool = split_from_index(pool, top_relevant_idx) - train = train + selected - - i += 1 + main(args) diff --git a/eDiscovery/plot.py b/eDiscovery/plot.py index 2d1880d..013a5f0 100644 --- a/eDiscovery/plot.py +++ b/eDiscovery/plot.py @@ -5,16 +5,23 @@ import sys, os, pathlib assert len(sys.argv) == 3, f'wrong args, syntax is: python {sys.argv[0]} ' -file = sys.argv[1] #'./results/RCV1.C4.csv' +file = sys.argv[1] loop = bool(int(sys.argv[2])) plotname = pathlib.Path(file).name.replace(".csv", ".png") +if not loop: + plt.rcParams['figure.figsize'] = [12, 12] + plt.rcParams['figure.dpi'] = 200 + # plot the data -fig, axs = plt.subplots(3) +fig, axs = plt.subplots(5) + + try: while True: + aXn = 0 df = pd.read_csv(file, sep='\t') xs = df['it'] @@ -22,29 +29,49 @@ try: y_r = df['R'] y_rhat = df['Rhat'] y_rhatCC = df['RhatCC'] - label='R' - axs[0].plot(xs, y_rhat, label='$\hat{'+label+'}$') - axs[0].plot(xs, y_rhatCC, label='$\hat{'+label+'}_{CC}$') - axs[0].plot(xs, y_r, label=label) - axs[0].legend() - axs[0].grid() + axs[aXn].plot(xs, y_rhat, label='$\hat{R}_{Q}$') + axs[aXn].plot(xs, y_rhatCC, label='$\hat{R}_{CC}$') + axs[aXn].plot(xs, y_r, label='$R$') + axs[aXn].legend() + axs[aXn].grid() + axs[aXn].set_ylabel('Recall estimation') + axs[aXn].set_ylim(0,1) + aXn+=1 y_r = df['te-prev'] y_rhat = df['te-estim'] y_rhatCC = df['te-estimCC'] - label='P' - axs[1].plot(xs, y_rhat, label='$\hat{'+label+'}$') - axs[1].plot(xs, y_rhatCC, label='$\hat{'+label+'}_{CC}$') - axs[1].plot(xs, y_r, label=label) - axs[1].legend() - axs[1].grid() + axs[aXn].plot(xs, y_rhat, label='te-$\hat{Pr}(\oplus)_{Q}$') + 
axs[aXn].plot(xs, y_rhatCC, label='te-$\hat{Pr}(\oplus)_{CC}$') + axs[aXn].plot(xs, y_r, label='te-$Pr(\oplus)$') + axs[aXn].legend() + axs[aXn].grid() + axs[aXn].set_ylabel('Prevalence estimation') + aXn += 1 y_ae = df['AE'] y_ae_cc = df['AE_CC'] - axs[2].plot(xs, y_ae, label='AE') - axs[2].plot(xs, y_ae_cc, label='AE-CC') - axs[2].legend() - axs[2].grid() + axs[aXn].plot(xs, y_ae, label='AE$_{Q}$') + axs[aXn].plot(xs, y_ae_cc, label='AE$_{CC}$') + axs[aXn].legend() + axs[aXn].grid() + axs[aXn].set_ylabel('Quantification error') + aXn += 1 + + axs[aXn].plot(xs, df['Shift'], label='tr-te shift (AE)') + axs[aXn].plot(xs, df['tr-prev'], label='tr-$Pr(\oplus)$') + axs[aXn].plot(xs, df['te-prev'], label='te-$Pr(\oplus)$') + axs[aXn].legend() + axs[aXn].grid() + axs[aXn].set_ylabel('Train-Test Shift') + aXn += 1 + + axs[aXn].plot(xs, df['MF1_Q'], label='$F_1(clf(Q))$') + axs[aXn].plot(xs, df['MF1_Clf'], label='$F_1(clf(CC))$') + axs[aXn].legend() + axs[aXn].grid() + axs[aXn].set_ylabel('Classifiers performance') + aXn += 1 os.makedirs('./plots', exist_ok=True) plt.savefig(f'./plots/{plotname}') @@ -53,9 +80,8 @@ try: break else: plt.pause(.5) - axs[0].cla() - axs[1].cla() - axs[2].cla() + for i in range(aXn): + axs[i].cla() except KeyboardInterrupt: print("\n[exit]") \ No newline at end of file diff --git a/quapy/data/base.py b/quapy/data/base.py index b482548..e70c64c 100644 --- a/quapy/data/base.py +++ b/quapy/data/base.py @@ -143,7 +143,7 @@ class LabelledCollection: else: raise NotImplementedError('unsupported operation for collection types') labels = np.concatenate([self.labels, other.labels]) - return LabelledCollection(join_instances, labels) + return LabelledCollection(join_instances, labels, classes_=self.classes_) @property def Xy(self):