more testing

2022-01-19 09:53:07 +01:00 · 2022-01-19 09:53:07 +01:00 · a3a5bd8da0
parent 833476ebf8
commit a3a5bd8da0
2 changed files with 28 additions and 19 deletions
--- a/eDiscovery/main.py
+++ b/eDiscovery/main.py
@ -5,7 +5,7 @@ from sklearn.metrics import f1_score
 import functions as fn
 import quapy as qp
 import argparse
-from data import LabelledCollection
+from quapy.data import LabelledCollection
 def eval_classifier(learner, test:LabelledCollection):
@ -32,7 +32,10 @@ def main(args):
    with qp.util.temp_seed(args.seed):
        # initial labelled data selection
-        idx = collection.sampling_index(init_nD, *init_prev)
+        if args.initprev == -1:
            idx = collection.sampling_index(init_nD)
        else:
            idx = collection.sampling_index(init_nD, *[1 - args.initprev, args.initprev])
        train, pool = fn.split_from_index(collection, idx)
        first_train = LabelledCollection(train.instances, train.labels)
@ -50,8 +53,8 @@ def main(args):
            while True:
-                pool_p_hat_cc, classifier = fn.estimate_prev_CC(train, pool)
+                pool_p_hat_cc, classifier = fn.estimate_prev_CC(train, pool, args.classifier)
-                pool_p_hat, q_classifier = fn.estimate_prev_Q(train, pool, args.quantifier)
+                pool_p_hat, q_classifier = fn.estimate_prev_Q(train, pool, args.quantifier, args.classifier)
                f1_clf = eval_classifier(classifier, pool)
                f1_q = eval_classifier(q_classifier, pool)
@ -103,13 +106,18 @@ if __name__=='__main__':
    parser.add_argument('--initsize', metavar='SIZE', type=int, help='number of labelled documents at the beginning',
                        default=1000)
    parser.add_argument('--initprev', metavar='PREV', type=float,
-                        help='prevalence of the initial sample (-1 for uniform sampling)',
+                        help='prevalence of the initial sample (-1 for uniform sampling, default)',
-                        default=0.5)
+                        default=-1)
    parser.add_argument('--seed', metavar='SEED', type=int,
                        help='random seed',
                        default=1)
    parser.add_argument('--classifier', metavar='CLS', type=str,
                        help='classifier type (svm, lr)',
                        default='lr')
    args = parser.parse_args()
-    assert 0 < args.initprev < 1, 'wrong value for initsize; should be in (0., 1.)'
+    assert args.initprev==-1.0 or (0 < args.initprev < 1), 'wrong value for initsize; should be in (0., 1.)'
    if args.initprev==-1:  # this is to clean the path, to show initprev:-1 and not initprev:-1.0
        args.initprev = int(args.initprev)
    main(args)
--- a/eDiscovery/plot.py
+++ b/eDiscovery/plot.py
@ -5,9 +5,11 @@ import sys, os, pathlib
 assert len(sys.argv) == 3, f'wrong args, syntax is: python {sys.argv[0]} <result_input_path> <dynamic (0|1)>'
-file = sys.argv[1]
+file = str(sys.argv[1])
 loop = bool(int(sys.argv[2]))
 print(file)
 plotname = pathlib.Path(file).name.replace(".csv", ".png")
 if not loop:
@ -18,7 +20,6 @@ if not loop:
 fig, axs = plt.subplots(5)
 try:
    while True:
        aXn = 0
@ -34,7 +35,7 @@ try:
        axs[aXn].plot(xs, y_r, label='$R$')
        axs[aXn].legend()
        axs[aXn].grid()
-        axs[aXn].set_ylabel('Recall estimation')
+        axs[aXn].set_ylabel('Recall')
        axs[aXn].set_ylim(0,1)
        aXn+=1
@ -46,7 +47,7 @@ try:
        axs[aXn].plot(xs, y_r, label='te-$Pr(\oplus)$')
        axs[aXn].legend()
        axs[aXn].grid()
-        axs[aXn].set_ylabel('Prevalence estimation')
+        axs[aXn].set_ylabel('Prevalence')
        aXn += 1
        y_ae = df['AE']
@ -58,14 +59,6 @@ try:
        axs[aXn].set_ylabel('Quantification error')
        aXn += 1
        axs[aXn].plot(xs, df['Shift'], label='tr-te shift (AE)')
        axs[aXn].plot(xs, df['tr-prev'], label='tr-$Pr(\oplus)$')
        axs[aXn].plot(xs, df['te-prev'], label='te-$Pr(\oplus)$')
        axs[aXn].legend()
        axs[aXn].grid()
        axs[aXn].set_ylabel('Train-Test Shift')
        aXn += 1
        axs[aXn].plot(xs, df['MF1_Q'], label='$F_1(clf(Q))$')
        axs[aXn].plot(xs, df['MF1_Clf'], label='$F_1(clf(CC))$')
        axs[aXn].legend()
@ -73,6 +66,14 @@ try:
        axs[aXn].set_ylabel('Classifiers performance')
        aXn += 1
        axs[aXn].plot(xs, df['Shift'], '--k', label='tr-te shift (AE)')
        axs[aXn].plot(xs, df['tr-prev'], 'y', label='tr-$Pr(\oplus)$')
        axs[aXn].plot(xs, df['te-prev'], 'r', label='te-$Pr(\oplus)$')
        axs[aXn].legend()
        axs[aXn].grid()
        axs[aXn].set_ylabel('Train-Test Shift')
        aXn += 1
        os.makedirs('./plots', exist_ok=True)
        plt.savefig(f'./plots/{plotname}')