
refactoring main, argparse, etc

Alejandro Moreo Fernandez 2022-01-17 17:57:14 +01:00
parent b051ed4781
commit 833476ebf8
3 changed files with 151 additions and 192 deletions

View File

@@ -1,182 +1,115 @@
 import os.path
-import sklearn
-from sklearn.base import BaseEstimator
-from sklearn.calibration import CalibratedClassifierCV
-from sklearn.linear_model import LogisticRegression
-from sklearn.svm import LinearSVC
+import sys
+from sklearn.metrics import f1_score
+import functions as fn
 import quapy as qp
-from method.base import BaseQuantifier
-from quapy.data import LabelledCollection
-from quapy.method.aggregative import EMQ, CC, PACC, PCC, HDy, ACC
-from quapy import functional as F
-import numpy as np
-from itertools import chain
-
-
-def split_from_index(collection: LabelledCollection, index: np.ndarray):
-    in_index_set = set(index)
-    out_index_set = set(range(len(collection))) - in_index_set
-    out_index = np.asarray(list(out_index_set), dtype=int)
-    return collection.sampling_from_index(index), collection.sampling_from_index(out_index)
-
-
-def relevance_sampling_index(pool: LabelledCollection, classifier: BaseEstimator, k: int):
-    prob = classifier.predict_proba(pool.instances)[:, 1].flatten()
-    top_relevant_idx = np.argsort(-prob)[:k]
-    return top_relevant_idx
-
-
-def uncertainty_sampling_index(pool: LabelledCollection, classifier: BaseEstimator, k: int):
-    prob = classifier.predict_proba(pool.instances)[:, 1].flatten()
-    top_uncertain_idx = np.argsort(np.abs(prob - 0.5))[:k]
-    return top_uncertain_idx
-
-
-def mix_rel_unc_sampling_index(pool: LabelledCollection, classifier: BaseEstimator, k: int):
-    relevance_idx = relevance_sampling_index(pool, classifier, k)
-    uncertanty_idx = uncertainty_sampling_index(pool, classifier, k)
-    interleave_idx = np.asarray(list(chain.from_iterable(zip(relevance_idx, uncertanty_idx))))
-    _, unique_idx = np.unique(interleave_idx, return_index=True)
-    top_interleaved_idx = interleave_idx[np.sort(unique_idx)][:k]
-    return top_interleaved_idx
-
-
-def negative_sampling_index(pool: LabelledCollection, classifier: BaseEstimator, k: int):
-    prob = classifier.predict_proba(pool.instances)[:, 0].flatten()
-    top_relevant_idx = np.argsort(-prob)[:k]
-    return top_relevant_idx
-
-
-def recall(train_prev, pool_prev, train_size, pool_size):
-    frac_tr_pos = train_prev[1]
-    frac_te_pos = pool_prev[1]
-    recall = (frac_tr_pos * train_size) / (frac_tr_pos * train_size + frac_te_pos * pool_size)
-    return recall
-
-
-def NewClassifier():
-    # return CalibratedClassifierCV(LinearSVC(class_weight='balanced'))
-    return LogisticRegression(class_weight=None)
-
-
-def NewQuantifier():
-    return EMQ(CalibratedClassifierCV(NewClassifier()))
-
-
-def create_dataset(datasetname):
-    if datasetname == 'imdb.10K.75p':
-        data = qp.datasets.fetch_reviews('imdb', tfidf=True, min_df=5)
-        collection = data.training.sampling(10000, 0.75)
-        return collection
-    elif datasetname == 'RCV1.C4':
-        X, y = sklearn.datasets.fetch_rcv1(subset='train', return_X_y=True)
-        y = y.toarray()
-        prev = y.mean(axis=0).flatten()
-        # choose the first category having a positive prevalence between [0.1,0.2] (realistic scenario for e-Discovery)
-        # this category happens to be the cat with id 4
-        target_cat = np.argwhere(np.logical_and(prev > 0.1, prev < 0.2)).flatten()[0]
-        print('chosen cat', target_cat)
-        y = y[:, target_cat].flatten()
-        return LabelledCollection(X, y)
-
-
-def estimate_prev_CC(train, pool):
-    q = CC(NewClassifier()).fit(train)
-    return q.quantify(pool.instances), q.learner
-
-
-def estimate_prev_Q(train, pool, classifier):
-    # q = qp.model_selection.GridSearchQ(
-    #     ACC(LogisticRegression()),
-    #     param_grid={'C':np.logspace(-3,3,7), 'class_weight':[None, 'balanced']},
-    #     sample_size=len(train),
-    #     protocol='app',
-    #     n_prevpoints=21,
-    #     n_repetitions=10)
-    q = NewQuantifier()
-    # q = ACC(NewClassifier())
-
-    # borrow (supposedly negative) pool documents
-    # train_pos = train.counts()[1]
-    # train_negs = train.counts()[0]
-    # neg_idx = negative_sampling_index(pool, classifier, max(train_pos-train_negs, 5))
-    # neg_sample = pool.sampling_from_index(neg_idx)
-    # train_augmented = train + LabelledCollection(neg_sample.instances, [0]*len(neg_sample))
-    # q.fit(train_augmented)
-    q.fit(train)
-    # q.fit(first_train)
-    # bootstrap_prev = qp.evaluation.natural_prevalence_prediction(q, pool, sample_size=len(train), n_repetitions=50)[1].mean(axis=0).flatten()
-    prev = q.quantify(pool.instances)
-    return prev, None
-    # return q.quantify(pool_instances), None
-
-
-def tee(msg):
-    foo.write(msg + '\n')
-    foo.flush()
-    print(msg)
-
-
-datasetname = 'RCV1.C4'
-collection = qp.util.pickled_resource(f'./dataset/{datasetname}.pkl', create_dataset, datasetname)
-nD = len(collection)
-
-# initial labelled data selection
-init_nD = 1000
-init_prev = [0.5, 0.5]
-idx = collection.sampling_index(init_nD, *init_prev)
-train, pool = split_from_index(collection, idx)
-#first_train = LabelledCollection(train.instances, train.labels)
-
-k = 100
-recall_target = 0.99
-
-outputdir = './results'
-qp.util.create_if_not_exist(outputdir)
-
-# sampling_fn, sampling_name = relevance_sampling_index, 'relevance'
-sampling_fn, sampling_name = mix_rel_unc_sampling_index, 'mix'
-q_name = NewQuantifier().__class__.__name__
-
-experiment_suffix = f'{sampling_name}_{q_name}'
-
-i = 0
-with open(os.path.join(outputdir, f'{datasetname}_{experiment_suffix}.csv'), 'wt') as foo:
-    tee('it\t%\ttr-size\tte-size\ttr-prev\tte-prev\tte-estim\tte-estimCC\tR\tRhat\tRhatCC\tShift\tAE\tAE_CC')
-    while True:
-        pool_p_hat_cc, classifier = estimate_prev_CC(train, pool)
-        pool_p_hat, _ = estimate_prev_Q(train, pool, classifier)
-
-        tr_p = train.prevalence()
-        te_p = pool.prevalence()
-        nDtr = len(train)
-        nDte = len(pool)
-
-        r_hat_cc = recall(tr_p, pool_p_hat_cc, nDtr, nDte)
-        r_hat = recall(tr_p, pool_p_hat, nDtr, nDte)
-        r = recall(tr_p, te_p, nDtr, nDte)
-
-        tr_te_shift = qp.error.ae(tr_p, te_p)
-
-        progress = 100 * nDtr / nD
-
-        q_ae = qp.error.ae(te_p, pool_p_hat)
-        cc_ae = qp.error.ae(te_p, pool_p_hat_cc)
-
-        tee(f'{i}\t{progress:.2f}\t{nDtr}\t{nDte}\t{tr_p[1]:.3f}\t{te_p[1]:.3f}\t{pool_p_hat[1]:.3f}\t{pool_p_hat_cc[1]:.3f}'
-            f'\t{r:.3f}\t{r_hat:.3f}\t{r_hat_cc:.3f}\t{tr_te_shift:.5f}\t{q_ae:.4f}\t{cc_ae:.4f}')
-
-        if nDte < k:
-            break
-
-        top_relevant_idx = sampling_fn(pool, classifier, k)
-        selected, pool = split_from_index(pool, top_relevant_idx)
-        train = train + selected
-
-        i += 1
+import argparse
+from data import LabelledCollection
+
+
+def eval_classifier(learner, test:LabelledCollection):
+    predictions = learner.predict(test.instances)
+    true_labels = test.labels
+    # f1 = f1_score(true_labels, predictions, average='macro')
+    f1 = f1_score(true_labels, predictions, average='binary')
+    # f1 = (true_labels==predictions).mean()
+    return f1
+
+
+def main(args):
+
+    datasetname = args.dataset
+    k = args.k
+    init_nD = args.initsize
+    init_prev = [1-args.initprev, args.initprev]
+    sampling_fn = getattr(fn, args.sampling)
+    max_iterations = args.iter
+    outputdir = './results'
+
+    collection = qp.util.pickled_resource(f'./dataset/{datasetname}.pkl', fn.create_dataset, datasetname)
+    nD = len(collection)
+
+    with qp.util.temp_seed(args.seed):
+        # initial labelled data selection
+        idx = collection.sampling_index(init_nD, *init_prev)
+        train, pool = fn.split_from_index(collection, idx)
+        first_train = LabelledCollection(train.instances, train.labels)
+
+        # recall_target = 0.99
+
+        qp.util.create_if_not_exist(outputdir)
+
+        i = 0
+        with open(os.path.join(outputdir, fn.experiment_name(args)), 'wt') as foo:
+            def tee(msg):
+                foo.write(msg + '\n')
+                foo.flush()
+                print(msg)
+
+            tee('it\t%\ttr-size\tte-size\ttr-prev\tte-prev\tte-estim\tte-estimCC\tR\tRhat\tRhatCC\tShift\tAE\tAE_CC\tMF1_Q\tMF1_Clf')
+
+            while True:
+                pool_p_hat_cc, classifier = fn.estimate_prev_CC(train, pool)
+                pool_p_hat, q_classifier = fn.estimate_prev_Q(train, pool, args.quantifier)
+
+                f1_clf = eval_classifier(classifier, pool)
+                f1_q = eval_classifier(q_classifier, pool)
+
+                tr_p = train.prevalence()
+                te_p = pool.prevalence()
+                nDtr = len(train)
+                nDte = len(pool)
+
+                r_hat_cc = fn.recall(tr_p, pool_p_hat_cc, nDtr, nDte)
+                r_hat = fn.recall(tr_p, pool_p_hat, nDtr, nDte)
+                r = fn.recall(tr_p, te_p, nDtr, nDte)
+
+                tr_te_shift = qp.error.ae(tr_p, te_p)
+
+                progress = 100 * nDtr / nD
+
+                q_ae = qp.error.ae(te_p, pool_p_hat)
+                cc_ae = qp.error.ae(te_p, pool_p_hat_cc)
+
+                tee(f'{i}\t{progress:.2f}\t{nDtr}\t{nDte}\t{tr_p[1]:.3f}\t{te_p[1]:.3f}\t{pool_p_hat[1]:.3f}\t{pool_p_hat_cc[1]:.3f}'
+                    f'\t{r:.3f}\t{r_hat:.3f}\t{r_hat_cc:.3f}\t{tr_te_shift:.5f}\t{q_ae:.4f}\t{cc_ae:.4f}\t{f1_q:.3f}\t{f1_clf:.3f}')
+
+                if nDte < k:
+                    print('[stop] too few documents remaining')
+                    break
+                elif i+1 == max_iterations:
+                    print('[stop] maximum number of iterations reached')
+                    break
+
+                top_relevant_idx = sampling_fn(pool, classifier, k, progress)
+                selected, pool = fn.split_from_index(pool, top_relevant_idx)
+                train = train + selected
+
+                i += 1
+
+
+if __name__=='__main__':
+    parser = argparse.ArgumentParser(description='e-Discovery')
+    parser.add_argument('--dataset', metavar='DATASET', type=str, help='Dataset name',
+                        default='RCV1.C4')
+    parser.add_argument('--quantifier', metavar='METHOD', type=str, help='Quantification method',
+                        default='EMQ')
+    parser.add_argument('--sampling', metavar='SAMPLING', type=str, help='Sampling criterion',
+                        default='relevance_sampling')
+    parser.add_argument('--iter', metavar='INT', type=int, help='number of iterations (-1 to set no limit)',
+                        default=-1)
+    parser.add_argument('--k', metavar='BATCH', type=int, help='number of documents in a batch',
+                        default=100)
+    parser.add_argument('--initsize', metavar='SIZE', type=int, help='number of labelled documents at the beginning',
+                        default=1000)
+    parser.add_argument('--initprev', metavar='PREV', type=float,
+                        help='prevalence of the initial sample (-1 for uniform sampling)',
+                        default=0.5)
+    parser.add_argument('--seed', metavar='SEED', type=int,
+                        help='random seed',
+                        default=1)
+    args = parser.parse_args()

+    assert 0 < args.initprev < 1, 'wrong value for initsize; should be in (0., 1.)'
+
+    main(args)
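Note: the helpers removed here (split_from_index, the sampling policies, recall, create_dataset, estimate_prev_CC/estimate_prev_Q) are the ones the new code reaches through the functions module imported as fn. Assuming fn.recall kept the formula of the deleted recall shown above (its current body is not part of this diff), the Rhat column logged by the loop is computed as in this minimal sketch (hypothetical names):

    def recall_estimate(tr_prev, pool_prev_hat, n_train, n_pool):
        # positives already found: the training set is fully labelled
        found = tr_prev[1] * n_train
        # positives presumably still in the pool, per the quantifier's estimate
        missing = pool_prev_hat[1] * n_pool
        return found / (found + missing)

Assuming the refactored script is saved as main.py (the filename is not shown in this view), a run spelling out the parser's defaults would be:

    python main.py --dataset RCV1.C4 --quantifier EMQ --sampling relevance_sampling --iter -1 --k 100 --initsize 1000 --initprev 0.5 --seed 1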

View File

@@ -5,16 +5,23 @@ import sys, os, pathlib
 assert len(sys.argv) == 3, f'wrong args, syntax is: python {sys.argv[0]} <result_input_path> <dynamic (0|1)>'
-file = sys.argv[1] #'./results/RCV1.C4.csv'
+file = sys.argv[1]
 loop = bool(int(sys.argv[2]))
 
 plotname = pathlib.Path(file).name.replace(".csv", ".png")
 
+if not loop:
+    plt.rcParams['figure.figsize'] = [12, 12]
+    plt.rcParams['figure.dpi'] = 200
+
 # plot the data
-fig, axs = plt.subplots(3)
+fig, axs = plt.subplots(5)
 
 try:
     while True:
+        aXn = 0
         df = pd.read_csv(file, sep='\t')
         xs = df['it']
@@ -22,29 +29,49 @@ try:
         y_r = df['R']
         y_rhat = df['Rhat']
         y_rhatCC = df['RhatCC']
-        label='R'
-        axs[0].plot(xs, y_rhat, label='$\hat{'+label+'}$')
-        axs[0].plot(xs, y_rhatCC, label='$\hat{'+label+'}_{CC}$')
-        axs[0].plot(xs, y_r, label=label)
-        axs[0].legend()
-        axs[0].grid()
+        axs[aXn].plot(xs, y_rhat, label='$\hat{R}_{Q}$')
+        axs[aXn].plot(xs, y_rhatCC, label='$\hat{R}_{CC}$')
+        axs[aXn].plot(xs, y_r, label='$R$')
+        axs[aXn].legend()
+        axs[aXn].grid()
+        axs[aXn].set_ylabel('Recall estimation')
+        axs[aXn].set_ylim(0,1)
+        aXn+=1
 
         y_r = df['te-prev']
         y_rhat = df['te-estim']
         y_rhatCC = df['te-estimCC']
-        label='P'
-        axs[1].plot(xs, y_rhat, label='$\hat{'+label+'}$')
-        axs[1].plot(xs, y_rhatCC, label='$\hat{'+label+'}_{CC}$')
-        axs[1].plot(xs, y_r, label=label)
-        axs[1].legend()
-        axs[1].grid()
+        axs[aXn].plot(xs, y_rhat, label='te-$\hat{Pr}(\oplus)_{Q}$')
+        axs[aXn].plot(xs, y_rhatCC, label='te-$\hat{Pr}(\oplus)_{CC}$')
+        axs[aXn].plot(xs, y_r, label='te-$Pr(\oplus)$')
+        axs[aXn].legend()
+        axs[aXn].grid()
+        axs[aXn].set_ylabel('Prevalence estimation')
+        aXn += 1
 
         y_ae = df['AE']
         y_ae_cc = df['AE_CC']
-        axs[2].plot(xs, y_ae, label='AE')
-        axs[2].plot(xs, y_ae_cc, label='AE-CC')
-        axs[2].legend()
-        axs[2].grid()
+        axs[aXn].plot(xs, y_ae, label='AE$_{Q}$')
+        axs[aXn].plot(xs, y_ae_cc, label='AE$_{CC}$')
+        axs[aXn].legend()
+        axs[aXn].grid()
+        axs[aXn].set_ylabel('Quantification error')
+        aXn += 1
+
+        axs[aXn].plot(xs, df['Shift'], label='tr-te shift (AE)')
+        axs[aXn].plot(xs, df['tr-prev'], label='tr-$Pr(\oplus)$')
+        axs[aXn].plot(xs, df['te-prev'], label='te-$Pr(\oplus)$')
+        axs[aXn].legend()
+        axs[aXn].grid()
+        axs[aXn].set_ylabel('Train-Test Shift')
+        aXn += 1
+
+        axs[aXn].plot(xs, df['MF1_Q'], label='$F_1(clf(Q))$')
+        axs[aXn].plot(xs, df['MF1_Clf'], label='$F_1(clf(CC))$')
+        axs[aXn].legend()
+        axs[aXn].grid()
+        axs[aXn].set_ylabel('Classifiers performance')
+        aXn += 1
 
         os.makedirs('./plots', exist_ok=True)
         plt.savefig(f'./plots/{plotname}')
@@ -53,9 +80,8 @@ try:
             break
         else:
            plt.pause(.5)
-            axs[0].cla()
-            axs[1].cla()
-            axs[2].cla()
+            for i in range(aXn):
+                axs[i].cla()
 
 except KeyboardInterrupt:
     print("\n[exit]")

View File

@@ -143,7 +143,7 @@ class LabelledCollection:
         else:
             raise NotImplementedError('unsupported operation for collection types')
         labels = np.concatenate([self.labels, other.labels])
-        return LabelledCollection(join_instances, labels)
+        return LabelledCollection(join_instances, labels, classes_=self.classes_)
 
     @property
     def Xy(self):
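Note: forwarding classes_=self.classes_ matters for the active-learning loop above, where batches are repeatedly added to the training set: a selected batch can contain documents of a single class only, and without an explicit class list the concatenated collection would re-infer its classes from the labels actually present, silently shortening prevalence vectors downstream. A minimal sketch of the behaviour this preserves (hypothetical data; assumes LabelledCollection handles dense arrays, as in quapy.data):

    import numpy as np
    from quapy.data import LabelledCollection

    X1, X2 = np.random.rand(3, 5), np.random.rand(2, 5)
    a = LabelledCollection(X1, [0, 0, 0], classes_=[0, 1])  # no positives drawn yet
    b = LabelledCollection(X2, [0, 0], classes_=[0, 1])
    print((a + b).prevalence())  # [1. 0.] -- still binary thanks to classes_=self.classes_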