From a3a5bd8da02736f41a4ad50a3fda57211c315f83 Mon Sep 17 00:00:00 2001
From: Alex Moreo <alejandro.moreo@isti.cnr.it>
Date: Wed, 19 Jan 2022 09:53:07 +0100
Subject: [PATCH] more testing

---
 eDiscovery/main.py | 22 +++++++++++++++-------
 eDiscovery/plot.py | 25 +++++++++++++------------
 2 files changed, 28 insertions(+), 19 deletions(-)

diff --git a/eDiscovery/main.py b/eDiscovery/main.py
index 99a6811..9a79694 100644
--- a/eDiscovery/main.py
+++ b/eDiscovery/main.py
@@ -5,7 +5,7 @@ from sklearn.metrics import f1_score
 import functions as fn
 import quapy as qp
 import argparse
-from data import LabelledCollection
+from quapy.data import LabelledCollection
 
 
 def eval_classifier(learner, test:LabelledCollection):
@@ -32,7 +32,10 @@ def main(args):
 
     with qp.util.temp_seed(args.seed):
         # initial labelled data selection
-        idx = collection.sampling_index(init_nD, *init_prev)
+        if args.initprev == -1:  # -1 means uniform sampling: no target prevalence is passed
+            idx = collection.sampling_index(init_nD)
+        else:  # otherwise pass the class prevalences as (1 - initprev, initprev)
+            idx = collection.sampling_index(init_nD, 1 - args.initprev, args.initprev)
         train, pool = fn.split_from_index(collection, idx)
         first_train = LabelledCollection(train.instances, train.labels)
 
@@ -50,8 +53,8 @@ def main(args):
 
             while True:
 
-                pool_p_hat_cc, classifier = fn.estimate_prev_CC(train, pool)
-                pool_p_hat, q_classifier = fn.estimate_prev_Q(train, pool, args.quantifier)
+                pool_p_hat_cc, classifier = fn.estimate_prev_CC(train, pool, args.classifier)
+                pool_p_hat, q_classifier = fn.estimate_prev_Q(train, pool, args.quantifier, args.classifier)
 
                 f1_clf = eval_classifier(classifier, pool)
                 f1_q = eval_classifier(q_classifier, pool)
@@ -103,13 +106,18 @@ if __name__=='__main__':
     parser.add_argument('--initsize', metavar='SIZE', type=int, help='number of labelled documents at the beginning',
                         default=1000)
     parser.add_argument('--initprev', metavar='PREV', type=float,
-                        help='prevalence of the initial sample (-1 for uniform sampling)',
-                        default=0.5)
+                        help='prevalence of the initial sample (-1 for uniform sampling, default)',
+                        default=-1)
     parser.add_argument('--seed', metavar='SEED', type=int,
                         help='random seed',
                         default=1)
+    parser.add_argument('--classifier', metavar='CLS', type=str,
+                        help='classifier type (svm, lr)',
+                        default='lr')
     args = parser.parse_args()
 
-    assert 0 < args.initprev < 1, 'wrong value for initsize; should be in (0., 1.)'
+    assert args.initprev == -1 or (0 < args.initprev < 1), 'wrong value for initprev; should be -1 or in (0., 1.)'
+    if args.initprev == -1:  # this is to clean the path, to show initprev:-1 and not initprev:-1.0
+        args.initprev = int(args.initprev)
 
     main(args)
diff --git a/eDiscovery/plot.py b/eDiscovery/plot.py
index 013a5f0..563d65c 100644
--- a/eDiscovery/plot.py
+++ b/eDiscovery/plot.py
@@ -5,9 +5,11 @@ import sys, os, pathlib
 
 assert len(sys.argv) == 3, f'wrong args, syntax is: python {sys.argv[0]} <result_input_path> <dynamic (0|1)>'
 
-file = sys.argv[1]
+file = str(sys.argv[1])
 loop = bool(int(sys.argv[2]))
 
+print(file)
+
 plotname = pathlib.Path(file).name.replace(".csv", ".png")
 
 if not loop:
@@ -18,7 +20,6 @@ if not loop:
 fig, axs = plt.subplots(5)
 
 
-
 try:
     while True:
         aXn = 0
@@ -34,7 +35,7 @@ try:
         axs[aXn].plot(xs, y_r, label='$R$')
         axs[aXn].legend()
         axs[aXn].grid()
-        axs[aXn].set_ylabel('Recall estimation')
+        axs[aXn].set_ylabel('Recall')
         axs[aXn].set_ylim(0,1)
         aXn+=1
 
@@ -46,7 +47,7 @@ try:
         axs[aXn].plot(xs, y_r, label='te-$Pr(\oplus)$')
         axs[aXn].legend()
         axs[aXn].grid()
-        axs[aXn].set_ylabel('Prevalence estimation')
+        axs[aXn].set_ylabel('Prevalence')
         aXn += 1
 
         y_ae = df['AE']
@@ -58,14 +59,6 @@ try:
         axs[aXn].set_ylabel('Quantification error')
         aXn += 1
 
-        axs[aXn].plot(xs, df['Shift'], label='tr-te shift (AE)')
-        axs[aXn].plot(xs, df['tr-prev'], label='tr-$Pr(\oplus)$')
-        axs[aXn].plot(xs, df['te-prev'], label='te-$Pr(\oplus)$')
-        axs[aXn].legend()
-        axs[aXn].grid()
-        axs[aXn].set_ylabel('Train-Test Shift')
-        aXn += 1
-
         axs[aXn].plot(xs, df['MF1_Q'], label='$F_1(clf(Q))$')
         axs[aXn].plot(xs, df['MF1_Clf'], label='$F_1(clf(CC))$')
         axs[aXn].legend()
@@ -73,6 +66,14 @@ try:
         axs[aXn].set_ylabel('Classifiers performance')
         aXn += 1
 
+        axs[aXn].plot(xs, df['Shift'], '--k', label='tr-te shift (AE)')
+        axs[aXn].plot(xs, df['tr-prev'], 'y', label=r'tr-$Pr(\oplus)$')  # raw string: '\o' is not a valid escape
+        axs[aXn].plot(xs, df['te-prev'], 'r', label=r'te-$Pr(\oplus)$')  # raw string: '\o' is not a valid escape
+        axs[aXn].legend()
+        axs[aXn].grid()
+        axs[aXn].set_ylabel('Train-Test Shift')
+        aXn += 1
+
         os.makedirs('./plots', exist_ok=True)
         plt.savefig(f'./plots/{plotname}')