negative results so far

Alejandro Moreo Fernandez 2021-12-06 10:29:46 +01:00
parent 671ef1efea
commit 13fc48ecca
1 changed file with 129 additions and 34 deletions


@@ -1,11 +1,17 @@
import os.path
import sys
import sklearn.datasets
from sklearn.base import BaseEstimator
from sklearn.calibration import CalibratedClassifierCV
from sklearn.linear_model import LogisticRegression
from sklearn.svm import OneClassSVM, LinearSVC
import quapy as qp
from quapy.method.base import BaseQuantifier
from quapy.data import LabelledCollection
from quapy.method.aggregative import EMQ, ClassifyAndCount, PACC
from quapy.method.aggregative import EMQ, CC, PACC, PCC, HDy, ACC
from quapy import functional as F
import numpy as np
@@ -23,57 +29,146 @@ def relevance_sampling_index(pool:LabelledCollection, classifier:BaseEstimator, k:int):
return top_relevant_idx
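# For reference, a plausible sketch of relevance_sampling_index, assuming it
# mirrors negative_sampling_index below but ranks by the positive-class
# probability (column 1 of predict_proba):
#
# def relevance_sampling_index(pool: LabelledCollection, classifier: BaseEstimator, k: int):
#     # probability of the positive (relevant) class for each pool document
#     prob = classifier.predict_proba(pool.instances)[:, 1].flatten()
#     # indices of the k documents most likely to be relevant
#     top_relevant_idx = np.argsort(-prob)[:k]
#     return top_relevant_idx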
def recall(train_prev, pool_prev, train_len, pool_len):
nD = train_len + pool_len
pTr = train_len / nD
pPool = pool_len / nD
recall = train_prev[1] * pTr / (train_prev[1] * pTr + pool_prev[1] * pPool)
def negative_sampling_index(pool:LabelledCollection, classifier:BaseEstimator, k:int):
prob = classifier.predict_proba(pool.instances)[:, 0].flatten()
top_relevant_idx = np.argsort(-prob)[:k]
return top_relevant_idx
def recall(train_prev, pool_prev, train_size, pool_size):
frac_tr_pos = train_prev[1]
frac_te_pos = pool_prev[1]
recall = (frac_tr_pos * train_size) / (frac_tr_pos * train_size + frac_te_pos * pool_size)
return recall
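# The estimate above amounts to: positives already in the training set, divided
# by positives in the whole collection (train + pool), where the pool positives
# come from pool_prev (either the true prevalence or a quantifier's estimate).
# A quick check with illustrative figures (not from any experiment):
#   train_size=1000 at 40% positives -> 400 labelled positives
#   pool_size=9000 at  5% positives  -> 450 (estimated) remaining positives
#   recall([.6, .4], [.95, .05], 1000, 9000) = 400 / (400 + 450) ≈ 0.47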
data = qp.datasets.fetch_reviews('imdb', tfidf=True, min_df=5)
def NewClassifier():
return CalibratedClassifierCV(LinearSVC(class_weight='balanced'))
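# CalibratedClassifierCV adds probability calibration on top of LinearSVC,
# which otherwise has no predict_proba; the *_sampling_index functions above
# rely on those calibrated probabilities.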
def create_dataset(datasetname):
if datasetname=='imdb.10K.75p':
data = qp.datasets.fetch_reviews('imdb', tfidf=True, min_df=5)
collection = data.training.sampling(10000, 0.75)
return collection
elif datasetname=='RCV1.C4':
X, y = sklearn.datasets.fetch_rcv1(subset='train', return_X_y=True)
y = y.toarray()
prev = y.mean(axis=0).flatten()
# choose the first category having a positive prevalence between [0.1,0.2] (realistic scenario for e-Discovery)
# this category happens to be the cat with id 4
target_cat = np.argwhere(np.logical_and(prev>0.1, prev<0.2)).flatten()[0]
print('chosen cat', target_cat)
y = y[:, target_cat].flatten()
return LabelledCollection(X, y)
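# create_dataset is invoked through qp.util.pickled_resource below, so the
# RCV1 download and category selection run only once and are then reloaded
# from ./dataset/<datasetname>.pkl on later runs.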
def estimate_prev_CC(train, pool):
q = CC(NewClassifier()).fit(train)
return q.quantify(pool.instances), q.learner
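# estimate_prev_CC returns both the classify-and-count prevalence estimate and
# the underlying fitted classifier, so the latter can be reused downstream
# without refitting.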
def estimate_prev_Q(train, pool, classifier):
# q = qp.model_selection.GridSearchQ(
# ACC(LogisticRegression()),
# param_grid={'C':np.logspace(-3,3,7), 'class_weight':[None, 'balanced']},
# sample_size=len(train),
# protocol='app',
# n_prevpoints=21,
# n_repetitions=10)
q = ACC(NewClassifier())
# borrow (supposedly negative) pool documents
# train_pos = train.counts()[1]
# train_negs = train.counts()[0]
# neg_idx = negative_sampling_index(pool, classifier, max(train_pos-train_negs, 5))
# neg_sample = pool.sampling_from_index(neg_idx)
# train_augmented = train + LabelledCollection(neg_sample.instances, [0]*len(neg_sample))
# q.fit(train_augmented)
# q.fit(train)
# q.fit(first_train)
# bootstrap_prev = qp.evaluation.natural_prevalence_prediction(q, pool, sample_size=len(train), n_repetitions=50)[1].mean(axis=0).flatten()
prev = q.quantify(pool.instances)
return prev, None
# return q.quantify(pool_instances), None
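# Only the plain ACC estimate is active in estimate_prev_Q; the commented-out
# blocks are alternatives that were explored: grid-search model selection,
# augmenting the training set with presumed-negative documents borrowed from
# the pool, fitting on the initial training set only, and a bootstrapped
# natural-prevalence estimate.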
def tee(msg):
foo.write(msg+'\n')
foo.flush()
print(msg)
datasetname = 'RCV1.C4'
collection = qp.util.pickled_resource(f'./dataset/{datasetname}.pkl', create_dataset, datasetname)
# data = qp.datasets.fetch_reviews('imdb', tfidf=True, min_df=5)
# collection = data.training + data.test
collection = data.training.sampling(10000, 0.75)
# collection = data.training.sampling(10000, 0.75)
nD = len(collection)
# initial labelled data selection
init_nD = 100
init_prev = 0.5
idx = collection.sampling_index(init_nD, init_prev)
init_nD = 1000
init_prev = [0.5, 0.5]
idx = collection.sampling_index(init_nD, *init_prev)
train, pool = split_from_index(collection, idx)
first_train = LabelledCollection(train.instances, train.labels)
k = 50
k = 25
recall_target = 0.95
# Q = EMQ(CalibratedClassifierCV(LogisticRegression()))
# Q = ClassifyAndCount(LogisticRegression())
Q = PACC(LogisticRegression())
# Q, q_name = ClassifyAndCount(CalibratedClassifierCV(LinearSVC())), "CC"
# CC = ClassifyAndCount(LogisticRegression(class_weight='balanced'))
# Q, q_name = qp.model_selection.GridSearchQ(
# PACC(LogisticRegression(), val_split=3),
# param_grid={'C':np.logspace(-2,2,5), 'class_weight':[None, 'balanced']},
# sample_size=1000,
# protocol='app',
# n_prevpoints=21,
# n_repetitions=10), "PACC"
# Q, q_name = PACC(LogisticRegression(class_weight='balanced')), 'PACC'
# CC = ClassifyAndCount(LogisticRegression(class_weight='balanced'))
outputdir = './results'
qp.util.create_if_not_exist(outputdir)
i = 0
while True:
Q.fit(train)
pool_p_hat = Q.quantify(pool.instances)
tr_p = train.prevalence()
te_p = pool.prevalence()
nDtr = len(train)
nDte = len(pool)
with open(os.path.join(outputdir, f'{datasetname}.csv'), 'wt') as foo:
tee('it\t%\ttr-size\tte-size\ttr-prev\tte-prev\tte-estim\tte-estimCC\tR\tRhat\tRhatCC\tShift\tAE\tAE_CC')
while True:
r_hat = recall(tr_p, pool_p_hat, nDtr, nDte)
r = recall(tr_p, te_p, nDtr, nDte)
r_error = abs(r_hat-r)
pool_p_hat_cc, classifier = estimate_prev_CC(train, pool)
pool_p_hat, _ = estimate_prev_Q(train, pool, classifier)
proc_percent = 100*nDtr/nD
tr_p = train.prevalence()
te_p = pool.prevalence()
nDtr = len(train)
nDte = len(pool)
print(f'{i}\t [{proc_percent:.2f}%] tr-prev={F.strprev(tr_p)} te-prev={F.strprev(te_p)} te-estim={F.strprev(pool_p_hat)} R={r:.3f} Rhat={r_hat:.3f} E={r_error:.3f}')
# if r_hat >= recall_target:
if proc_percent > 95:
break
r_hat_cc = recall(tr_p, pool_p_hat_cc, nDtr, nDte)
r_hat = recall(tr_p, pool_p_hat, nDtr, nDte)
r = recall(tr_p, te_p, nDtr, nDte)
tr_te_shift = qp.error.ae(tr_p, te_p)
top_relevant_idx = relevance_sampling_index(pool, Q.learner, k)
selected, pool = split_from_index(pool, top_relevant_idx)
train = train + selected
proc_percent = 100*nDtr/nD
i += 1
q_ae = qp.error.ae(te_p, pool_p_hat)
cc_ae = qp.error.ae(te_p, pool_p_hat_cc)
tee(f'{i}\t{proc_percent:.2f}\t{nDtr}\t{nDte}\t{tr_p[1]:.3f}\t{te_p[1]:.3f}\t{pool_p_hat[1]:.3f}\t{pool_p_hat_cc[1]:.3f}'
f'\t{r:.3f}\t{r_hat:.3f}\t{r_hat_cc:.3f}\t{tr_te_shift:.5f}\t{q_ae:.4f}\t{cc_ae:.4f}')
if nDte < k:
break
top_relevant_idx = relevance_sampling_index(pool, classifier, k)
selected, pool = split_from_index(pool, top_relevant_idx)
train = train + selected
i += 1
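# Each iteration of the loop above estimates the pool prevalence with CC and
# with the quantifier, derives the corresponding recall estimates, logs one
# row to the CSV, and then moves the k pool documents the classifier deems
# most relevant into the training set before repeating.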