"""e-Discovery experiment: iterative relevance sampling with recall estimation.

Post-patch state of ``eDiscovery/main.py`` as introduced by commit
13fc48eccac06db3fd8370cc1e5f2f360ec6a089 ("negative results so far",
Alex Moreo, Mon, 6 Dec 2021 10:29:46 +0100).

NOTE(review): this text was recovered from a whitespace-collapsed copy of the
patch; indentation and blank lines were re-derived from the code's syntax.
Only the lines covered by the patch hunks are present here:
``split_from_index`` and ``relevance_sampling_index`` are defined in the part
of the file that the hunks do not show.
"""
import os.path
import sys

import sklearn

from sklearn.base import BaseEstimator
from sklearn.calibration import CalibratedClassifierCV
from sklearn.linear_model import LogisticRegression
from sklearn.svm import OneClassSVM, LinearSVC

import quapy as qp
from method.base import BaseQuantifier
from quapy.data import LabelledCollection
from quapy.method.aggregative import EMQ, CC, PACC, PCC, HDy, ACC
from quapy import functional as F
import numpy as np


def negative_sampling_index(pool: LabelledCollection, classifier: BaseEstimator, k: int):
    """Return the indexes of the ``k`` pool documents ranked highest by the
    first column of ``classifier.predict_proba``.

    :param pool: collection whose ``instances`` are scored
    :param classifier: fitted estimator exposing ``predict_proba``
    :param k: number of indexes to return
    :return: array with (at most) ``k`` indexes into ``pool``, sorted by
        decreasing score
    """
    # column 0 is presumably the negative class under the 0/1 labelling used
    # in this script -- TODO confirm against classifier.classes_
    first_class_scores = classifier.predict_proba(pool.instances)[:, 0].flatten()
    return np.argsort(-first_class_scores)[:k]


def recall(train_prev, pool_prev, train_size, pool_size):
    """Fraction of all positives that are already in the training set.

    :param train_prev: class prevalences of the training set (index 1 = positive)
    :param pool_prev: class prevalences (true or estimated) of the pool
    :param train_size: number of training documents
    :param pool_size: number of pool documents
    :return: positives-in-train / (positives-in-train + positives-in-pool)
    """
    # NOTE(review): undefined (division by zero) when neither set holds any
    # positive mass; the script never guards against that case.
    positives_in_train = train_prev[1] * train_size
    positives_in_pool = pool_prev[1] * pool_size
    return positives_in_train / (positives_in_train + positives_in_pool)


def NewClassifier():
    """Build the classifier used throughout: a class-balanced ``LinearSVC``
    wrapped in ``CalibratedClassifierCV`` so that ``predict_proba`` is available.
    """
    return CalibratedClassifierCV(LinearSVC(class_weight='balanced'))


def create_dataset(datasetname):
    """Build the ``LabelledCollection`` identified by ``datasetname``.

    Supported names:

    * ``'imdb.10K.75p'``: a sample drawn with ``sampling(10000, 0.75)`` from
      the IMDB reviews training set (tf-idf, ``min_df=5``).
    * ``'RCV1.C4'``: the RCV1 training subset, binarised on one category.

    :param datasetname: one of the names listed above
    :return: the corresponding ``LabelledCollection``
    :raises ValueError: if ``datasetname`` is not supported (previously the
        function fell through and returned ``None``)
    """
    if datasetname == 'imdb.10K.75p':
        data = qp.datasets.fetch_reviews('imdb', tfidf=True, min_df=5)
        return data.training.sampling(10000, 0.75)

    if datasetname == 'RCV1.C4':
        # a bare ``import sklearn`` does not load the ``datasets`` submodule,
        # so import the loader explicitly
        from sklearn.datasets import fetch_rcv1

        X, y = fetch_rcv1(subset='train', return_X_y=True)
        y = y.toarray()
        prev = y.mean(axis=0).flatten()
        # choose the first category having a positive prevalence between [0.1,0.2] (realistic scenario for e-Discovery)
        # this category happens to be the cat with id 4
        target_cat = np.argwhere(np.logical_and(prev > 0.1, prev < 0.2)).flatten()[0]
        print('chosen cat', target_cat)
        y = y[:, target_cat].flatten()
        return LabelledCollection(X, y)

    raise ValueError(
        f"unknown dataset {datasetname!r}; expected 'imdb.10K.75p' or 'RCV1.C4'"
    )


def estimate_prev_CC(train, pool):
    """Estimate the pool prevalence with Classify & Count.

    :param train: labelled collection used to fit the quantifier
    :param pool: collection whose ``instances`` are quantified
    :return: tuple ``(estimated prevalences of pool, q.learner)``; the second
        item is what the main loop passes on as the ranking classifier
    """
    q = CC(NewClassifier()).fit(train)
    return q.quantify(pool.instances), q.learner


def estimate_prev_Q(train, pool, classifier):
    """Estimate the pool prevalence with ACC.

    :param train: labelled collection used to fit the quantifier
    :param pool: collection whose ``instances`` are quantified
    :param classifier: currently unused; kept for the disabled experiment that
        borrowed (supposedly negative) pool documents via
        ``negative_sampling_index``
    :return: tuple ``(estimated prevalences of pool, None)``
    """
    # Alternatives explored in the original and left disabled:
    #   - model selection with qp.model_selection.GridSearchQ over
    #     ACC(LogisticRegression()) ('C' in logspace(-3,3,7), class_weight in
    #     [None, 'balanced'], protocol='app', n_prevpoints=21, n_repetitions=10)
    #   - augmenting `train` with pool documents picked by
    #     negative_sampling_index and relabelled as 0
    #   - fitting on `first_train` instead of `train`
    #   - a bootstrap estimate via qp.evaluation.natural_prevalence_prediction
    q = ACC(NewClassifier())
    # the quantifier has to be fitted before it can quantify the pool
    q.fit(train)
    return q.quantify(pool.instances), None


def tee(msg):
    """Write ``msg`` plus a newline to the results file and echo it to stdout.

    Relies on the module-level handle ``foo`` opened by the ``with`` statement
    of the main loop below; calling it outside that block fails.
    """
    foo.write(msg+'\n')
    foo.flush()
    print(msg)


datasetname = 'RCV1.C4'
# presumably loads the pickle when it exists and otherwise calls
# create_dataset(datasetname) -- confirm against quapy.util.pickled_resource
collection = qp.util.pickled_resource(f'./dataset/{datasetname}.pkl', create_dataset, datasetname)
nD = len(collection)

# initial labelled data selection
init_nD = 1000
init_prev = [0.5, 0.5]
idx = collection.sampling_index(init_nD, *init_prev)
train, pool = split_from_index(collection, idx)
# snapshot of the initial training set (only used by a disabled experiment)
first_train = LabelledCollection(train.instances, train.labels)

k = 25
# NOTE(review): no longer consulted; the loop stops when the pool is exhausted
recall_target = 0.95

# Quantifier set-ups tried at this point in the original and left disabled:
# ClassifyAndCount over CalibratedClassifierCV(LinearSVC()), PACC over a
# class-balanced LogisticRegression, and GridSearchQ over
# PACC(LogisticRegression(), val_split=3).

outputdir = './results'
qp.util.create_if_not_exist(outputdir)

i = 0
with open(os.path.join(outputdir, f'{datasetname}.csv'), 'wt') as foo:
    tee('it\t%\ttr-size\tte-size\ttr-prev\tte-prev\tte-estim\tte-estimCC\tR\tRhat\tRhatCC\tShift\tAE\tAE_CC')
    while True:

        pool_p_hat_cc, classifier = estimate_prev_CC(train, pool)
        pool_p_hat, _ = estimate_prev_Q(train, pool, classifier)

        tr_p = train.prevalence()
        te_p = pool.prevalence()
        nDtr = len(train)
        nDte = len(pool)

        # recall from the CC estimate, from the ACC estimate, and from the
        # true pool prevalence
        r_hat_cc = recall(tr_p, pool_p_hat_cc, nDtr, nDte)
        r_hat = recall(tr_p, pool_p_hat, nDtr, nDte)
        r = recall(tr_p, te_p, nDtr, nDte)
        tr_te_shift = qp.error.ae(tr_p, te_p)

        proc_percent = 100*nDtr/nD

        q_ae = qp.error.ae(te_p, pool_p_hat)
        cc_ae = qp.error.ae(te_p, pool_p_hat_cc)

        tee(f'{i}\t{proc_percent:.2f}\t{nDtr}\t{nDte}\t{tr_p[1]:.3f}\t{te_p[1]:.3f}\t{pool_p_hat[1]:.3f}\t{pool_p_hat_cc[1]:.3f}'
            f'\t{r:.3f}\t{r_hat:.3f}\t{r_hat_cc:.3f}\t{tr_te_shift:.5f}\t{q_ae:.4f}\t{cc_ae:.4f}')

        # stop once fewer than k documents remain to be reviewed
        if nDte < k:
            break

        # move the k selected pool documents into the training set
        top_relevant_idx = relevance_sampling_index(pool, classifier, k)
        selected, pool = split_from_index(pool, top_relevant_idx)
        train = train + selected

        i += 1