diff --git a/eDiscovery/main.py b/eDiscovery/main.py index 99a6811..9a79694 100644 --- a/eDiscovery/main.py +++ b/eDiscovery/main.py @@ -5,7 +5,7 @@ from sklearn.metrics import f1_score import functions as fn import quapy as qp import argparse -from data import LabelledCollection +from quapy.data import LabelledCollection def eval_classifier(learner, test:LabelledCollection): @@ -32,7 +32,10 @@ def main(args): with qp.util.temp_seed(args.seed): # initial labelled data selection - idx = collection.sampling_index(init_nD, *init_prev) + if args.initprev == -1: + idx = collection.sampling_index(init_nD) + else: + idx = collection.sampling_index(init_nD, *[1 - args.initprev, args.initprev]) train, pool = fn.split_from_index(collection, idx) first_train = LabelledCollection(train.instances, train.labels) @@ -50,8 +53,8 @@ def main(args): while True: - pool_p_hat_cc, classifier = fn.estimate_prev_CC(train, pool) - pool_p_hat, q_classifier = fn.estimate_prev_Q(train, pool, args.quantifier) + pool_p_hat_cc, classifier = fn.estimate_prev_CC(train, pool, args.classifier) + pool_p_hat, q_classifier = fn.estimate_prev_Q(train, pool, args.quantifier, args.classifier) f1_clf = eval_classifier(classifier, pool) f1_q = eval_classifier(q_classifier, pool) @@ -103,13 +106,18 @@ if __name__=='__main__': parser.add_argument('--initsize', metavar='SIZE', type=int, help='number of labelled documents at the beginning', default=1000) parser.add_argument('--initprev', metavar='PREV', type=float, - help='prevalence of the initial sample (-1 for uniform sampling)', - default=0.5) + help='prevalence of the initial sample (-1 for uniform sampling, default)', + default=-1) parser.add_argument('--seed', metavar='SEED', type=int, help='random seed', default=1) + parser.add_argument('--classifier', metavar='CLS', type=str, + help='classifier type (svm, lr)', + default='lr') args = parser.parse_args() - assert 0 < args.initprev < 1, 'wrong value for initsize; should be in (0., 1.)' + assert args.initprev==-1.0 or (0 < args.initprev < 1), 'wrong value for initsize; should be in (0., 1.)' + if args.initprev==-1: # this is to clean the path, to show initprev:-1 and not initprev:-1.0 + args.initprev = int(args.initprev) main(args) diff --git a/eDiscovery/plot.py b/eDiscovery/plot.py index 013a5f0..563d65c 100644 --- a/eDiscovery/plot.py +++ b/eDiscovery/plot.py @@ -5,9 +5,11 @@ import sys, os, pathlib assert len(sys.argv) == 3, f'wrong args, syntax is: python {sys.argv[0]} ' -file = sys.argv[1] +file = str(sys.argv[1]) loop = bool(int(sys.argv[2])) +print(file) + plotname = pathlib.Path(file).name.replace(".csv", ".png") if not loop: @@ -18,7 +20,6 @@ if not loop: fig, axs = plt.subplots(5) - try: while True: aXn = 0 @@ -34,7 +35,7 @@ try: axs[aXn].plot(xs, y_r, label='$R$') axs[aXn].legend() axs[aXn].grid() - axs[aXn].set_ylabel('Recall estimation') + axs[aXn].set_ylabel('Recall') axs[aXn].set_ylim(0,1) aXn+=1 @@ -46,7 +47,7 @@ try: axs[aXn].plot(xs, y_r, label='te-$Pr(\oplus)$') axs[aXn].legend() axs[aXn].grid() - axs[aXn].set_ylabel('Prevalence estimation') + axs[aXn].set_ylabel('Prevalence') aXn += 1 y_ae = df['AE'] @@ -58,14 +59,6 @@ try: axs[aXn].set_ylabel('Quantification error') aXn += 1 - axs[aXn].plot(xs, df['Shift'], label='tr-te shift (AE)') - axs[aXn].plot(xs, df['tr-prev'], label='tr-$Pr(\oplus)$') - axs[aXn].plot(xs, df['te-prev'], label='te-$Pr(\oplus)$') - axs[aXn].legend() - axs[aXn].grid() - axs[aXn].set_ylabel('Train-Test Shift') - aXn += 1 - axs[aXn].plot(xs, df['MF1_Q'], label='$F_1(clf(Q))$') axs[aXn].plot(xs, df['MF1_Clf'], label='$F_1(clf(CC))$') axs[aXn].legend() @@ -73,6 +66,14 @@ try: axs[aXn].set_ylabel('Classifiers performance') aXn += 1 + axs[aXn].plot(xs, df['Shift'], '--k', label='tr-te shift (AE)') + axs[aXn].plot(xs, df['tr-prev'], 'y', label='tr-$Pr(\oplus)$') + axs[aXn].plot(xs, df['te-prev'], 'r', label='te-$Pr(\oplus)$') + axs[aXn].legend() + axs[aXn].grid() + axs[aXn].set_ylabel('Train-Test Shift') + aXn += 1 + os.makedirs('./plots', exist_ok=True) plt.savefig(f'./plots/{plotname}')