diff --git a/eDiscovery/main.py b/eDiscovery/main.py index e15e8b1..c9d096b 100644 --- a/eDiscovery/main.py +++ b/eDiscovery/main.py @@ -6,7 +6,7 @@ import sklearn from sklearn.base import BaseEstimator from sklearn.calibration import CalibratedClassifierCV from sklearn.linear_model import LogisticRegression -from sklearn.svm import OneClassSVM, LinearSVC +from sklearn.svm import LinearSVC import quapy as qp from method.base import BaseQuantifier @@ -14,22 +14,38 @@ from quapy.data import LabelledCollection from quapy.method.aggregative import EMQ, CC, PACC, PCC, HDy, ACC from quapy import functional as F import numpy as np +from itertools import chain -def split_from_index(collection:LabelledCollection, index:np.ndarray): +def split_from_index(collection: LabelledCollection, index: np.ndarray): in_index_set = set(index) out_index_set = set(range(len(collection))) - in_index_set out_index = np.asarray(list(out_index_set), dtype=int) return collection.sampling_from_index(index), collection.sampling_from_index(out_index) -def relevance_sampling_index(pool:LabelledCollection, classifier:BaseEstimator, k:int): +def relevance_sampling_index(pool: LabelledCollection, classifier: BaseEstimator, k: int): prob = classifier.predict_proba(pool.instances)[:, 1].flatten() top_relevant_idx = np.argsort(-prob)[:k] return top_relevant_idx -def negative_sampling_index(pool:LabelledCollection, classifier:BaseEstimator, k:int): +def uncertainty_sampling_index(pool: LabelledCollection, classifier: BaseEstimator, k: int): + prob = classifier.predict_proba(pool.instances)[:, 1].flatten() + top_uncertain_idx = np.argsort(np.abs(prob - 0.5))[:k] + return top_uncertain_idx + + +def mix_rel_unc_sampling_index(pool: LabelledCollection, classifier: BaseEstimator, k: int): + relevance_idx = relevance_sampling_index(pool, classifier, k) + uncertanty_idx = uncertainty_sampling_index(pool, classifier, k) + interleave_idx = np.asarray(list(chain.from_iterable(zip(relevance_idx, uncertanty_idx)))) + _, unique_idx = np.unique(interleave_idx, return_index=True) + top_interleaved_idx = interleave_idx[np.sort(unique_idx)][:k] + return top_interleaved_idx + + +def negative_sampling_index(pool: LabelledCollection, classifier: BaseEstimator, k: int): prob = classifier.predict_proba(pool.instances)[:, 0].flatten() top_relevant_idx = np.argsort(-prob)[:k] return top_relevant_idx @@ -43,22 +59,27 @@ def recall(train_prev, pool_prev, train_size, pool_size): def NewClassifier(): - return CalibratedClassifierCV(LinearSVC(class_weight='balanced')) + # return CalibratedClassifierCV(LinearSVC(class_weight='balanced')) + return LogisticRegression(class_weight=None) + + +def NewQuantifier(): + return EMQ(CalibratedClassifierCV(NewClassifier())) def create_dataset(datasetname): - if datasetname=='imdb.10K.75p': + if datasetname == 'imdb.10K.75p': data = qp.datasets.fetch_reviews('imdb', tfidf=True, min_df=5) collection = data.training.sampling(10000, 0.75) return collection - elif datasetname=='RCV1.C4': + elif datasetname == 'RCV1.C4': X, y = sklearn.datasets.fetch_rcv1(subset='train', return_X_y=True) y = y.toarray() prev = y.mean(axis=0).flatten() # choose the first category having a positive prevalence between [0.1,0.2] (realistic scenario for e-Discovery) # this category happens to be the cat with id 4 - target_cat = np.argwhere(np.logical_and(prev>0.1, prev<0.2)).flatten()[0] + target_cat = np.argwhere(np.logical_and(prev > 0.1, prev < 0.2)).flatten()[0] print('chosen cat', target_cat) y = y[:, target_cat].flatten() return LabelledCollection(X, y) @@ -78,7 +99,8 @@ def estimate_prev_Q(train, pool, classifier): # n_prevpoints=21, # n_repetitions=10) - q = ACC(NewClassifier()) + q = NewQuantifier() + # q = ACC(NewClassifier()) # borrow (supposedly negative) pool documents # train_pos = train.counts()[1] # train_negs = train.counts()[0] @@ -86,7 +108,7 @@ def estimate_prev_Q(train, pool, classifier): # neg_sample = pool.sampling_from_index(neg_idx) # train_augmented = train + LabelledCollection(neg_sample.instances, [0]*len(neg_sample)) # q.fit(train_augmented) - # q.fit(train) + q.fit(train) # q.fit(first_train) # bootstrap_prev = qp.evaluation.natural_prevalence_prediction(q, pool, sample_size=len(train), n_repetitions=50)[1].mean(axis=0).flatten() prev = q.quantify(pool.instances) @@ -95,17 +117,13 @@ def estimate_prev_Q(train, pool, classifier): def tee(msg): - foo.write(msg+'\n') + foo.write(msg + '\n') foo.flush() print(msg) datasetname = 'RCV1.C4' collection = qp.util.pickled_resource(f'./dataset/{datasetname}.pkl', create_dataset, datasetname) - -# data = qp.datasets.fetch_reviews('imdb', tfidf=True, min_df=5) -# collection = data.training + data.test -# collection = data.training.sampling(10000, 0.75) nD = len(collection) # initial labelled data selection @@ -113,30 +131,22 @@ init_nD = 1000 init_prev = [0.5, 0.5] idx = collection.sampling_index(init_nD, *init_prev) train, pool = split_from_index(collection, idx) -first_train = LabelledCollection(train.instances, train.labels) - -k = 25 -recall_target = 0.95 - -# Q, q_name = ClassifyAndCount(CalibratedClassifierCV(LinearSVC())), "CC" -# CC = ClassifyAndCount(LogisticRegression(class_weight='balanced')) - -# Q, q_name = qp.model_selection.GridSearchQ( -# PACC(LogisticRegression(), val_split=3), -# param_grid={'C':np.logspace(-2,2,5), 'class_weight':[None, 'balanced']}, -# sample_size=1000, -# protocol='app', -# n_prevpoints=21, -# n_repetitions=10), "PACC" -# Q, q_name = PACC(LogisticRegression(class_weight='balanced')), 'PACC' -# CC = ClassifyAndCount(LogisticRegression(class_weight='balanced')) +#first_train = LabelledCollection(train.instances, train.labels) +k = 100 +recall_target = 0.99 outputdir = './results' qp.util.create_if_not_exist(outputdir) +# sampling_fn, sampling_name = relevance_sampling_index, 'relevance' +sampling_fn, sampling_name = mix_rel_unc_sampling_index, 'mix' +q_name = NewQuantifier().__class__.__name__ + +experiment_suffix = f'{sampling_name}_{q_name}' + i = 0 -with open(os.path.join(outputdir, f'{datasetname}.csv'), 'wt') as foo: +with open(os.path.join(outputdir, f'{datasetname}_{experiment_suffix}.csv'), 'wt') as foo: tee('it\t%\ttr-size\tte-size\ttr-prev\tte-prev\tte-estim\tte-estimCC\tR\tRhat\tRhatCC\tShift\tAE\tAE_CC') while True: @@ -153,23 +163,20 @@ with open(os.path.join(outputdir, f'{datasetname}.csv'), 'wt') as foo: r = recall(tr_p, te_p, nDtr, nDte) tr_te_shift = qp.error.ae(tr_p, te_p) - proc_percent = 100*nDtr/nD + progress = 100 * nDtr / nD q_ae = qp.error.ae(te_p, pool_p_hat) cc_ae = qp.error.ae(te_p, pool_p_hat_cc) - tee(f'{i}\t{proc_percent:.2f}\t{nDtr}\t{nDte}\t{tr_p[1]:.3f}\t{te_p[1]:.3f}\t{pool_p_hat[1]:.3f}\t{pool_p_hat_cc[1]:.3f}' + tee( + f'{i}\t{progress:.2f}\t{nDtr}\t{nDte}\t{tr_p[1]:.3f}\t{te_p[1]:.3f}\t{pool_p_hat[1]:.3f}\t{pool_p_hat_cc[1]:.3f}' f'\t{r:.3f}\t{r_hat:.3f}\t{r_hat_cc:.3f}\t{tr_te_shift:.5f}\t{q_ae:.4f}\t{cc_ae:.4f}') if nDte < k: break - top_relevant_idx = relevance_sampling_index(pool, classifier, k) + top_relevant_idx = sampling_fn(pool, classifier, k) selected, pool = split_from_index(pool, top_relevant_idx) train = train + selected i += 1 - - - - diff --git a/eDiscovery/plot.py b/eDiscovery/plot.py index 1291cb8..2d1880d 100644 --- a/eDiscovery/plot.py +++ b/eDiscovery/plot.py @@ -1,49 +1,61 @@ import matplotlib.pyplot as plt import numpy as np import pandas as pd +import sys, os, pathlib + +assert len(sys.argv) == 3, f'wrong args, syntax is: python {sys.argv[0]} ' + +file = sys.argv[1] #'./results/RCV1.C4.csv' +loop = bool(int(sys.argv[2])) + +plotname = pathlib.Path(file).name.replace(".csv", ".png") -file = './results/RCV1.C4.csv' # plot the data fig, axs = plt.subplots(3) -while True: - df = pd.read_csv(file, sep='\t') +try: + while True: + df = pd.read_csv(file, sep='\t') - xs = df['it'] + xs = df['it'] - y_r = df['R'] - y_rhat = df['Rhat'] - y_rhatCC = df['RhatCC'] - label='R' - axs[0].plot(xs, y_rhat, label='$\hat{'+label+'}$') - axs[0].plot(xs, y_rhatCC, label='$\hat{'+label+'}_{CC}$') - axs[0].plot(xs, y_r, label=label) - axs[0].legend() - axs[0].grid() + y_r = df['R'] + y_rhat = df['Rhat'] + y_rhatCC = df['RhatCC'] + label='R' + axs[0].plot(xs, y_rhat, label='$\hat{'+label+'}$') + axs[0].plot(xs, y_rhatCC, label='$\hat{'+label+'}_{CC}$') + axs[0].plot(xs, y_r, label=label) + axs[0].legend() + axs[0].grid() - y_r = df['te-prev'] - y_rhat = df['te-estim'] - y_rhatCC = df['te-estimCC'] - label='P' - axs[1].plot(xs, y_rhat, label='$\hat{'+label+'}$') - axs[1].plot(xs, y_rhatCC, label='$\hat{'+label+'}_{CC}$') - axs[1].plot(xs, y_r, label=label) - axs[1].legend() - axs[1].grid() + y_r = df['te-prev'] + y_rhat = df['te-estim'] + y_rhatCC = df['te-estimCC'] + label='P' + axs[1].plot(xs, y_rhat, label='$\hat{'+label+'}$') + axs[1].plot(xs, y_rhatCC, label='$\hat{'+label+'}_{CC}$') + axs[1].plot(xs, y_r, label=label) + axs[1].legend() + axs[1].grid() - y_ae = df['AE'] - y_ae_cc = df['AE_CC'] - axs[2].plot(xs, y_ae, label='AE') - axs[2].plot(xs, y_ae_cc, label='AE-CC') - axs[2].legend() - axs[2].grid() + y_ae = df['AE'] + y_ae_cc = df['AE_CC'] + axs[2].plot(xs, y_ae, label='AE') + axs[2].plot(xs, y_ae_cc, label='AE-CC') + axs[2].legend() + axs[2].grid() - #plt.pause(1.0) - axs[0].cla() - axs[1].cla() - axs[2].cla() + os.makedirs('./plots', exist_ok=True) + plt.savefig(f'./plots/{plotname}') - plt.savefig('./plot.png') - break + if not loop: + break + else: + plt.pause(.5) + axs[0].cla() + axs[1].cla() + axs[2].cla() -#plt.show() +except KeyboardInterrupt: + print("\n[exit]") \ No newline at end of file