diff --git a/eDiscovery/experiments.sh b/eDiscovery/experiments.sh
index 77c5c08..11eb52e 100755
--- a/eDiscovery/experiments.sh
+++ b/eDiscovery/experiments.sh
@@ -1,18 +1,34 @@
 #!/bin/bash
 set -x
 
-dataset=RCV1.C4
-iter=50
-k=100
-initsize=1000
-initprev=0.5
-seed=1
-
-commons="--dataset $dataset --iter $iter --k $k --initsize $initsize --initprev $initprev --seed $seed"
-
-for Q in PCC ACC PACC EMQ HDy ; do
-  for sampling in relevance_sampling uncertainty_sampling mix_sampling ; do
+#dataset=RCV1.C4
+#iter=50
+#k=100
+#initsize=1000
+#initprev=0.5
+#seed=1
+#
+#commons="--dataset $dataset --iter $iter --k $k --initsize $initsize --initprev $initprev --seed $seed"
+#
+#for Q in PCC ACC PACC EMQ HDy ; do
+#  for sampling in relevance_sampling uncertainty_sampling mix_sampling ; do
 #    PYTHONPATH='.:..' python3 main.py --quantifier $Q --sampling $sampling $commons
-    PYTHONPATH='.:..' python3 plot.py "./results/$dataset"_"$sampling"_"$Q.csv" 0
-  done
+#    PYTHONPATH='.:..' python3 plot.py "./results/$dataset"_"$sampling"_"$Q.csv" 0
+#  done
+#done
+
+dataset=RCV1.C4
+iter=40
+k=100
+initsize=500
+initprev=-1
+seed=1
+Q=RPACC
+CLS=lr
+
+for sampling in relevance_sampling uncertainty_sampling adaptive_sampling mix_sampling ; do
+
+  filepath="./results/classifier:"$CLS"__dataset:"$dataset"__initprev:"$initprev"__initsize:"$initsize"__iter:"$iter"__k:"$k"__quantifier:"$Q"__sampling:"$sampling"__seed:"$seed".csv"
+  PYTHONPATH='.:..' python3 main.py --quantifier $Q --classifier $CLS --sampling $sampling --dataset $dataset --iter $iter --k $k --initsize $initsize --initprev $initprev --seed $seed
+
 done
diff --git a/eDiscovery/functions.py b/eDiscovery/functions.py
index 7bc3354..7d4a3fe 100644
--- a/eDiscovery/functions.py
+++ b/eDiscovery/functions.py
@@ -6,6 +6,8 @@ from sklearn.linear_model import LogisticRegression
 from sklearn.svm import LinearSVC, SVC
+from sklearn.calibration import CalibratedClassifierCV  # used by NewClassifier and NewQuantifier below
 import quapy as qp
+from eDiscovery.method import RegionAdjustment, RegionProbAdjustment, RegionProbAdjustmentGlobal
 from quapy.data import LabelledCollection
 from quapy.method.aggregative import EMQ, CC, PACC, PCC, HDy, ACC
 import numpy as np
@@ -18,7 +19,8 @@ def NewClassifier(classifiername):
     if classifiername== 'lr':
         return LogisticRegression(class_weight='balanced')
     elif classifiername== 'svm':
-        return SVC(class_weight='balanced', probability=True, kernel='linear')
+        # return SVC(class_weight='balanced', probability=True, kernel='linear')
+        return CalibratedClassifierCV(LinearSVC(class_weight='balanced'))
 
 
 def NewQuantifier(quantifiername, classifiername):
@@ -30,9 +32,18 @@ def NewQuantifier(quantifiername, classifiername):
     if quantifiername == 'PCC':
         return PCC(NewClassifier(classifiername))
     if quantifiername == 'ACC':
-        return ACC(NewClassifier(classifiername), val_split=5)
+        return ACC(NewClassifier(classifiername), val_split=0.4)
     if quantifiername == 'PACC':
-        return PACC(NewClassifier(classifiername), val_split=5)
+        return PACC(NewClassifier(classifiername), val_split=0.4)
+    if quantifiername == 'RACC':
+        return RegionAdjustment(NewClassifier(classifiername), val_split=0.4)
+    if quantifiername == 'RPACC':
+        return RegionProbAdjustment(NewClassifier(classifiername), val_split=0.4, k=10)
+    if quantifiername == 'GRPACC':
+        def newQ():
+            # return PACC(NewClassifier(classifiername), val_split=0.4)
+            return EMQ(CalibratedClassifierCV(NewClassifier(classifiername)))
+        return RegionProbAdjustmentGlobal(newQ, k=10, clustering='kmeans')
     raise ValueError('unknown quantifier', quantifiername)
@@ -136,10 +147,7 @@ def create_dataset(datasetname):
 
 
 def estimate_prev_CC(train, pool: LabelledCollection, classifiername:str):
     q = CC(NewClassifier(classifiername)).fit(train)
-    # q = NewQuantifier("PCC").fit(train)
     return q.quantify(pool.instances), q.learner
-    # small_pool = pool.sampling(100, *pool.prevalence())
-    # return q.quantify(small_pool.instances), q.learner
 
 
@@ -152,21 +160,10 @@ def estimate_prev_Q(train, pool, quantifiername, classifiername):
     #                                              n_repetitions=10)
 
     q = NewQuantifier(quantifiername, classifiername)
-    # q = ACC(NewClassifier())
-    # borrow (supposedly negative) pool documents
-    # train_pos = train.counts()[1]
-    # train_negs = train.counts()[0]
-    # neg_idx = negative_sampling_index(pool, classifier, max(train_pos-train_negs, 5))
-    # neg_sample = pool.sampling_from_index(neg_idx)
-    # train_augmented = train + LabelledCollection(neg_sample.instances, [0]*len(neg_sample))
-    # q.fit(train_augmented)
+    # q._find_regions((train+pool).instances)
+
     q.fit(train)
-    # q.fit(first_train)
-    # bootstrap_prev = qp.evaluation.natural_prevalence_prediction(q, pool, sample_size=len(train), n_repetitions=50)[1].mean(axis=0).flatten()
+
     prev = q.quantify(pool.instances)
-    return prev, q.learner
-    # small_pool = pool.sampling(100, *pool.prevalence())
-    # return q.quantify(small_pool.instances), q.learner
+
+    return prev, None
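For reference, the quantifier factory in functions.py is meant to be consumed roughly as follows; the synthetic data below is invented for illustration (real experiments go through main.py):

    # usage sketch of NewQuantifier; toy data only, not from the repo
    import numpy as np
    from quapy.data import LabelledCollection
    from eDiscovery.functions import NewQuantifier

    rng = np.random.RandomState(0)
    X = rng.randn(200, 10)                        # stand-in feature vectors
    y = (X[:, 0] > 0).astype(int)                 # stand-in binary labels
    train = LabelledCollection(X[:100], y[:100])
    pool = LabelledCollection(X[100:], y[100:])

    q = NewQuantifier('PACC', 'lr')               # or 'RPACC', 'GRPACC', ...
    q.fit(train)                                  # val_split=0.4 is held out internally
    print(q.quantify(pool.instances))             # estimated class prevalences in the pool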
diff --git a/eDiscovery/main.py b/eDiscovery/main.py
index b9be957..6581b3d 100644
--- a/eDiscovery/main.py
+++ b/eDiscovery/main.py
@@ -1,9 +1,12 @@
 import os.path
+import pathlib
+
 from sklearn.metrics import f1_score
 import functions as fn
 import quapy as qp
 import argparse
 from quapy.data import LabelledCollection
+from plot import eDiscoveryPlot
 
 
 def eval_classifier(learner, test:LabelledCollection):
@@ -22,15 +25,14 @@ def main(args):
     init_nD = args.initsize
     sampling_fn = getattr(fn, args.sampling)
     max_iterations = args.iter
-    outputdir = './results'
     clf_name = args.classifier
     q_name = args.quantifier
 
-    qp.util.create_if_not_exist(outputdir)
-
     collection = qp.util.pickled_resource(f'./dataset/{datasetname}.pkl', fn.create_dataset, datasetname)
     nD = len(collection)
 
+    fig = eDiscoveryPlot(args.output)
+
     with qp.util.temp_seed(args.seed):
         # initial labelled data selection
         if args.initprev == -1:
@@ -42,7 +44,13 @@ def main(args):
         # recall_target = 0.99
 
         i = 0
-        with open(os.path.join(outputdir, fn.experiment_name(args)), 'wt') as foo:
+
+        # q = fn.NewQuantifier(q_name, clf_name)
+        # print('searching regions')
+        # q._find_regions((train+pool).instances)
+        # print('[done]')
+
+        with open(args.output, 'wt') as foo:
             def tee(msg):
                 foo.write(msg + '\n')
                 foo.flush()
@@ -54,9 +62,12 @@ def main(args):
                 pool_p_hat_cc, classifier = fn.estimate_prev_CC(train, pool, clf_name)
                 pool_p_hat_q, q_classifier = fn.estimate_prev_Q(train, pool, q_name, clf_name)
+                # q.fit(train)
+                # pool_p_hat_q = q.quantify(pool.instances)
+                # q_classifier = q.learner
 
                 f1_clf = eval_classifier(classifier, pool)
-                f1_q = eval_classifier(q_classifier, pool)
+                f1_q = 0  # eval_classifier(q_classifier, pool)
 
                 tr_p = train.prevalence()
                 te_p = pool.prevalence()
@@ -76,6 +87,8 @@ def main(args):
                 tee(f'{i}\t{progress:.2f}\t{nDtr}\t{nDte}\t{tr_p[1]:.3f}\t{te_p[1]:.3f}\t{pool_p_hat_q[1]:.3f}\t{pool_p_hat_cc[1]:.3f}'
                     f'\t{r:.3f}\t{r_hat_q:.3f}\t{r_hat_cc:.3f}\t{tr_te_shift:.5f}\t{ae_q:.4f}\t{ae_cc:.4f}\t{f1_q:.3f}\t{f1_clf:.3f}')
 
+                fig.plot()
+
                 if nDte < k:
                     print('[stop] too few documents remaining')
                     break
@@ -112,10 +125,21 @@ if __name__ == '__main__':
     parser.add_argument('--classifier', metavar='CLS',
                         type=str, help='classifier type (svm, lr)', default='lr')
+    parser.add_argument('--output', metavar='OUT', type=str,
+                        help="name of the file containing the results of the experiment (default is an automatic "
+                             "filename based on the model's parameters in the folder './results/')",
+                        default=None)
     args = parser.parse_args()
 
     assert args.initprev==-1.0 or (0 < args.initprev < 1), 'wrong value for initsize; should be in (0., 1.)'
     if args.initprev==-1: # this is to clean the path, to show initprev:-1 and not initprev:-1.0
         args.initprev = int(args.initprev)
+    if args.output is None:
+        outputdir = './results'
+        args.output = os.path.join(outputdir, fn.experiment_name(args))
+    else:
+        outputdir = str(pathlib.Path(args.output).parent)
+    if outputdir:
+        qp.util.create_if_not_exist(outputdir)
 
     main(args)
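The recall figures logged by tee() above (r, r_hat_q, r_hat_cc) relate the positives already labelled to the positives estimated to remain in the pool; a sketch of the arithmetic, under the assumption that the helpers in functions.py follow the usual technology-assisted-review bookkeeping (the function name below is hypothetical, not code from the repo):

    # hypothetical helper: how a recall estimate follows from a pool-prevalence
    # estimate such as pool_p_hat_q[1] or pool_p_hat_cc[1]
    def estimated_recall(tr_prev_pos: float, nDtr: int, pool_prev_pos_hat: float, nDte: int) -> float:
        tr_pos = tr_prev_pos * nDtr                # positives already labelled
        pool_pos_hat = pool_prev_pos_hat * nDte    # estimated positives left in the pool
        return tr_pos / (tr_pos + pool_pos_hat)    # fraction of all positives retrieved so far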
diff --git a/eDiscovery/plot.py b/eDiscovery/plot.py
index 563d65c..064d98f 100644
--- a/eDiscovery/plot.py
+++ b/eDiscovery/plot.py
@@ -1,29 +1,34 @@
 import matplotlib.pyplot as plt
-import numpy as np
 import pandas as pd
 import sys, os, pathlib
 
-assert len(sys.argv) == 3, f'wrong args, syntax is: python {sys.argv[0]} <file> <loop(0|1)>'
-file = str(sys.argv[1])
-loop = bool(int(sys.argv[2]))
 
-print(file)
+class eDiscoveryPlot:
 
-plotname = pathlib.Path(file).name.replace(".csv", ".png")
-
-if not loop:
-    plt.rcParams['figure.figsize'] = [12, 12]
-    plt.rcParams['figure.dpi'] = 200
-
-# plot the data
-fig, axs = plt.subplots(5)
+    def __init__(self, datapath, outdir='./plots', loop=True, save=True):
+        self.outdir = outdir
+        self.datapath = datapath
+        self.plotname = pathlib.Path(datapath).name.replace(".csv", ".png")
+        self.loop = loop
+        self.save = save
 
-try:
-    while True:
+        if not loop:
+            plt.rcParams['figure.figsize'] = [12, 12]
+            plt.rcParams['figure.dpi'] = 200
+        else:
+            plt.rcParams['figure.figsize'] = [17, 17]
+            plt.rcParams['figure.dpi'] = 60
 
+        # plot the data
+        self.fig, self.axs = plt.subplots(5)
+
+    def plot(self):
+        fig, axs = self.fig, self.axs
+        loop, save = self.loop, self.save
+
         aXn = 0
-        df = pd.read_csv(file, sep='\t')
+        df = pd.read_csv(self.datapath, sep='\t')
 
         xs = df['it']
@@ -36,8 +41,8 @@
         axs[aXn].legend()
         axs[aXn].grid()
         axs[aXn].set_ylabel('Recall')
-        axs[aXn].set_ylim(0,1)
-        aXn+=1
+        axs[aXn].set_ylim(0, 1)
+        aXn += 1
 
         y_r = df['te-prev']
         y_rhat = df['te-estim']
@@ -74,15 +79,27 @@
         axs[aXn].set_ylabel('Train-Test Shift')
         aXn += 1
 
-        os.makedirs('./plots', exist_ok=True)
-        plt.savefig(f'./plots/{plotname}')
+        if save:
+            os.makedirs(self.outdir, exist_ok=True)
+            plt.savefig(f'{self.outdir}/{self.plotname}')
 
-        if not loop:
-            break
-        else:
+        if loop:
             plt.pause(.5)
             for i in range(aXn):
                 axs[i].cla()
-except KeyboardInterrupt:
-    print("\n[exit]")
\ No newline at end of file
+
+
+if __name__ == '__main__':
+
+    assert len(sys.argv) == 3, f'wrong args, syntax is: python {sys.argv[0]} <file> <loop(0|1)>'
+
+    file = str(sys.argv[1])
+    loop = bool(int(sys.argv[2]))
+
+    figure = eDiscoveryPlot(file, loop=loop)
+
+    try:
+        # plot() draws a single frame; keep refreshing while in loop mode
+        while True:
+            figure.plot()
+            if not loop:
+                break
+    except KeyboardInterrupt:
+        print('\n[stop]')
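With this refactor plot.py works both as a script (python plot.py <file> <loop(0|1)>) and as a module, which is how main.py now drives it; a minimal module-style sketch (the CSV path is hypothetical):

    # module-style use of the refactored plotter; the path below is made up
    from plot import eDiscoveryPlot

    fig = eDiscoveryPlot('./results/some_experiment.csv', loop=False, save=True)
    fig.plot()  # reads the CSV once, draws the five panels, saves ./plots/some_experiment.png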
diff --git a/eDiscovery/run.sh b/eDiscovery/run.sh
index 6760461..f1c1035 100755
--- a/eDiscovery/run.sh
+++ b/eDiscovery/run.sh
@@ -2,17 +2,17 @@
 set -x
 
 dataset=RCV1.C4
-iter=50
+iter=100
 k=100
 initsize=500
 initprev=-1
 seed=1
-Q=ACC
+Q=GRPACC
 CLS=lr
-sampling=proportional_sampling
+sampling=relevance_sampling
 
 filepath="./results/classifier:"$CLS"__dataset:"$dataset"__initprev:"$initprev"__initsize:"$initsize"__iter:"$iter"__k:"$k"__quantifier:"$Q"__sampling:"$sampling"__seed:"$seed".csv"
 
-PYTHONPATH='.:..' python3 main.py --quantifier $Q --classifier $CLS --sampling $sampling --dataset $dataset --iter $iter --k $k --initsize $initsize --initprev $initprev --seed $seed &
-sleep 2
-PYTHONPATH='.:..' python3 plot.py $filepath 1
+PYTHONPATH='.:..' python3 main.py --quantifier $Q --classifier $CLS --sampling $sampling --dataset $dataset --iter $iter --k $k --initsize $initsize --initprev $initprev --seed $seed
+#sleep 2
+#PYTHONPATH='.:..' python3 plot.py $filepath 1
diff --git a/quapy/method/aggregative.py b/quapy/method/aggregative.py
index 3df88c1..08ab4c3 100644
--- a/quapy/method/aggregative.py
+++ b/quapy/method/aggregative.py
@@ -200,9 +200,9 @@ class ACC(AggregativeQuantifier):
             # kFCV estimation of parameters
             y, y_ = [], []
             kfcv = StratifiedKFold(n_splits=val_split)
-            pbar = tqdm(kfcv.split(*data.Xy), total=val_split)
-            for k, (training_idx, validation_idx) in enumerate(pbar):
-                pbar.set_description(f'{self.__class__.__name__} fitting fold {k}')
+            # pbar = tqdm(kfcv.split(*data.Xy), total=val_split)
+            for k, (training_idx, validation_idx) in enumerate(kfcv.split(*data.Xy)):
+                # pbar.set_description(f'{self.__class__.__name__} fitting fold {k}')
                 training = data.sampling_from_index(training_idx)
                 validation = data.sampling_from_index(validation_idx)
                 learner, val_data = training_helper(self.learner, training, fit_learner, val_split=validation)
@@ -289,9 +289,8 @@ class PACC(AggregativeProbabilisticQuantifier):
             # kFCV estimation of parameters
             y, y_ = [], []
             kfcv = StratifiedKFold(n_splits=val_split)
-            pbar = tqdm(kfcv.split(*data.Xy), total=val_split)
-            for k, (training_idx, validation_idx) in enumerate(pbar):
-                pbar.set_description(f'{self.__class__.__name__} fitting fold {k}')
+            for k, (training_idx, validation_idx) in enumerate(kfcv.split(*data.Xy)):
+                # pbar.set_description(f'{self.__class__.__name__} fitting fold {k}')
                 training = data.sampling_from_index(training_idx)
                 validation = data.sampling_from_index(validation_idx)
                 learner, val_data = training_helper(