forked from moreo/QuaPy
negative results so far
This commit is contained in:
parent 671ef1efea
commit 13fc48ecca
@@ -1,11 +1,17 @@
 import os.path
 import sys
+
+import sklearn.datasets
+
 from sklearn.base import BaseEstimator
 from sklearn.calibration import CalibratedClassifierCV
 from sklearn.linear_model import LogisticRegression
+
+from sklearn.svm import OneClassSVM, LinearSVC
+
 import quapy as qp
 from method.base import BaseQuantifier
 from quapy.data import LabelledCollection
-from quapy.method.aggregative import EMQ, ClassifyAndCount, PACC
+from quapy.method.aggregative import EMQ, CC, PACC, PCC, HDy, ACC
 from quapy import functional as F
 import numpy as np
@@ -23,57 +29,146 @@ def relevance_sampling_index(pool:LabelledCollection, classifier:BaseEstimator,
     return top_relevant_idx


-def recall(train_prev, pool_prev, train_len, pool_len):
-    nD = train_len + pool_len
-    pTr = train_len / nD
-    pPool = pool_len / nD
-    recall = train_prev[1] * pTr / (train_prev[1] * pTr + pool_prev[1] * pPool)
+def negative_sampling_index(pool:LabelledCollection, classifier:BaseEstimator, k:int):
+    prob = classifier.predict_proba(pool.instances)[:, 0].flatten()
+    top_relevant_idx = np.argsort(-prob)[:k]
+    return top_relevant_idx
+
+
+def recall(train_prev, pool_prev, train_size, pool_size):
+    frac_tr_pos = train_prev[1]
+    frac_te_pos = pool_prev[1]
+    recall = (frac_tr_pos * train_size) / (frac_tr_pos * train_size + frac_te_pos * pool_size)
     return recall
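For intuition: the recall estimate counts every reviewed positive in train as already found, and relies on the quantifier's estimate of the pool prevalence for the positives still missing. A minimal numeric sketch, with made-up figures:

    # 1000 reviewed docs at 40% positive -> 400 positives already found;
    # 9000 pool docs at an estimated 5% positive -> ~450 positives still in the pool
    recall([0.6, 0.4], [0.95, 0.05], 1000, 9000)  # = 400/850 ~= 0.47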
-data = qp.datasets.fetch_reviews('imdb', tfidf=True, min_df=5)
+def NewClassifier():
+    return CalibratedClassifierCV(LinearSVC(class_weight='balanced'))
+
+
+def create_dataset(datasetname):
+    if datasetname=='imdb.10K.75p':
+        data = qp.datasets.fetch_reviews('imdb', tfidf=True, min_df=5)
+        collection = data.training.sampling(10000, 0.75)
+        return collection
+
+    elif datasetname=='RCV1.C4':
+        X, y = sklearn.datasets.fetch_rcv1(subset='train', return_X_y=True)
+        y = y.toarray()
+        prev = y.mean(axis=0).flatten()
+        # choose the first category having a positive prevalence in [0.1, 0.2] (a realistic scenario for e-Discovery);
+        # this category happens to be the cat with id 4
+        target_cat = np.argwhere(np.logical_and(prev>0.1, prev<0.2)).flatten()[0]
+        print('chosen cat', target_cat)
+        y = y[:, target_cat].flatten()
+        return LabelledCollection(X, y)
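create_dataset is consumed further down through qp.util.pickled_resource, which calls the factory on a cache miss and pickles the result, so RCV1 is fetched and binarized only on the first run:

    collection = qp.util.pickled_resource(f'./dataset/{datasetname}.pkl', create_dataset, datasetname)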
+
+
+def estimate_prev_CC(train, pool):
+    q = CC(NewClassifier()).fit(train)
+    return q.quantify(pool.instances), q.learner
+
+
+def estimate_prev_Q(train, pool, classifier):
+    # q = qp.model_selection.GridSearchQ(
+    #     ACC(LogisticRegression()),
+    #     param_grid={'C':np.logspace(-3,3,7), 'class_weight':[None, 'balanced']},
+    #     sample_size=len(train),
+    #     protocol='app',
+    #     n_prevpoints=21,
+    #     n_repetitions=10)
+
+    q = ACC(NewClassifier())
+
+    # borrow (supposedly negative) pool documents
+    # train_pos = train.counts()[1]
+    # train_negs = train.counts()[0]
+    # neg_idx = negative_sampling_index(pool, classifier, max(train_pos-train_negs, 5))
+    # neg_sample = pool.sampling_from_index(neg_idx)
+    # train_augmented = train + LabelledCollection(neg_sample.instances, [0]*len(neg_sample))
+    # q.fit(train_augmented)
+    q.fit(train)
+    # q.fit(first_train)
+    # bootstrap_prev = qp.evaluation.natural_prevalence_prediction(q, pool, sample_size=len(train), n_repetitions=50)[1].mean(axis=0).flatten()
+
+    prev = q.quantify(pool.instances)
+    return prev, None
+    # return q.quantify(pool_instances), None
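For reference on the two estimators: CC simply counts the classifier's positive predictions, while ACC additionally corrects that count with the classifier's estimated tpr/fpr (which QuaPy derives from held-out training data). In the binary case the correction reduces to the sketch below; a simplified rendering for intuition, not QuaPy's actual code:

    def adjusted_count(p_cc, tpr, fpr):
        # Forman-style adjustment behind ACC; clipping keeps the estimate a valid prevalence
        return min(max((p_cc - fpr) / (tpr - fpr), 0.0), 1.0)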
+def tee(msg):
+    foo.write(msg+'\n')
+    foo.flush()
+    print(msg)
+
+
+datasetname = 'RCV1.C4'
+collection = qp.util.pickled_resource(f'./dataset/{datasetname}.pkl', create_dataset, datasetname)
+
+# data = qp.datasets.fetch_reviews('imdb', tfidf=True, min_df=5)
 # collection = data.training + data.test
-collection = data.training.sampling(10000, 0.75)
+# collection = data.training.sampling(10000, 0.75)
 nD = len(collection)

 # initial labelled data selection
-init_nD = 100
-init_prev = 0.5
-idx = collection.sampling_index(init_nD, init_prev)
+init_nD = 1000
+init_prev = [0.5, 0.5]
+idx = collection.sampling_index(init_nD, *init_prev)
 train, pool = split_from_index(collection, idx)
+first_train = LabelledCollection(train.instances, train.labels)

-k = 50
+k = 25
 recall_target = 0.95

-# Q = EMQ(CalibratedClassifierCV(LogisticRegression()))
-# Q = ClassifyAndCount(LogisticRegression())
-Q = PACC(LogisticRegression())
+# Q, q_name = ClassifyAndCount(CalibratedClassifierCV(LinearSVC())), "CC"
+# CC = ClassifyAndCount(LogisticRegression(class_weight='balanced'))
+
+# Q, q_name = qp.model_selection.GridSearchQ(
+#     PACC(LogisticRegression(), val_split=3),
+#     param_grid={'C':np.logspace(-2,2,5), 'class_weight':[None, 'balanced']},
+#     sample_size=1000,
+#     protocol='app',
+#     n_prevpoints=21,
+#     n_repetitions=10), "PACC"
+# Q, q_name = PACC(LogisticRegression(class_weight='balanced')), 'PACC'
+# CC = ClassifyAndCount(LogisticRegression(class_weight='balanced'))
+
+outputdir = './results'
+qp.util.create_if_not_exist(outputdir)

 i = 0
-while True:
-    Q.fit(train)
-    pool_p_hat = Q.quantify(pool.instances)
-    tr_p = train.prevalence()
-    te_p = pool.prevalence()
-    nDtr = len(train)
-    nDte = len(pool)
-
-    r_hat = recall(tr_p, pool_p_hat, nDtr, nDte)
-    r = recall(tr_p, te_p, nDtr, nDte)
-    r_error = abs(r_hat-r)
-
-    proc_percent = 100*nDtr/nD
-
-    print(f'{i}\t [{proc_percent:.2f}%] tr-prev={F.strprev(tr_p)} te-prev={F.strprev(te_p)} te-estim={F.strprev(pool_p_hat)} R={r:.3f} Rhat={r_hat:.3f} E={r_error:.3f}')
-
-    # if r_hat >= recall_target:
-    if proc_percent > 95:
-        break
-
-    top_relevant_idx = relevance_sampling_index(pool, Q.learner, k)
-    selected, pool = split_from_index(pool, top_relevant_idx)
-    train = train + selected
-
-    i += 1
+with open(os.path.join(outputdir, f'{datasetname}.csv'), 'wt') as foo:
+    tee('it\t%\ttr-size\tte-size\ttr-prev\tte-prev\tte-estim\tte-estimCC\tR\tRhat\tRhatCC\tShift\tAE\tAE_CC')
+    while True:
+        pool_p_hat_cc, classifier = estimate_prev_CC(train, pool)
+        pool_p_hat, _ = estimate_prev_Q(train, pool, classifier)
+
+        tr_p = train.prevalence()
+        te_p = pool.prevalence()
+        nDtr = len(train)
+        nDte = len(pool)
+
+        r_hat_cc = recall(tr_p, pool_p_hat_cc, nDtr, nDte)
+        r_hat = recall(tr_p, pool_p_hat, nDtr, nDte)
+        r = recall(tr_p, te_p, nDtr, nDte)
+        tr_te_shift = qp.error.ae(tr_p, te_p)
+
+        proc_percent = 100*nDtr/nD
+
+        q_ae = qp.error.ae(te_p, pool_p_hat)
+        cc_ae = qp.error.ae(te_p, pool_p_hat_cc)
+
+        tee(f'{i}\t{proc_percent:.2f}\t{nDtr}\t{nDte}\t{tr_p[1]:.3f}\t{te_p[1]:.3f}\t{pool_p_hat[1]:.3f}\t{pool_p_hat_cc[1]:.3f}'
+            f'\t{r:.3f}\t{r_hat:.3f}\t{r_hat_cc:.3f}\t{tr_te_shift:.5f}\t{q_ae:.4f}\t{cc_ae:.4f}')
+
+        if nDte < k:
+            break
+
+        top_relevant_idx = relevance_sampling_index(pool, classifier, k)
+        selected, pool = split_from_index(pool, top_relevant_idx)
+        train = train + selected
+
+        i += 1
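Columns in the resulting ./results/RCV1.C4.csv: it (iteration), % (portion of the collection already reviewed), tr-size/te-size (training-set and pool sizes), tr-prev/te-prev (true positive prevalences of training set and pool), te-estim/te-estimCC (pool prevalence as estimated by the quantifier and by CC), R (true recall), Rhat/RhatCC (recall estimated from the two prevalence estimates), Shift (absolute error between training and pool prevalences), and AE/AE_CC (absolute errors of the two prevalence estimates).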