going into region quantification

Alejandro Moreo Fernandez 2022-01-21 09:46:30 +01:00
parent 14dbfb567b
commit 3aed410722
6 changed files with 127 additions and 74 deletions

View File

@@ -1,18 +1,34 @@
#!/bin/bash
set -x
dataset=RCV1.C4
iter=50
k=100
initsize=1000
initprev=0.5
seed=1
commons="--dataset $dataset --iter $iter --k $k --initsize $initsize --initprev $initprev --seed $seed"
for Q in PCC ACC PACC EMQ HDy ; do
for sampling in relevance_sampling uncertainty_sampling mix_sampling ; do
#dataset=RCV1.C4
#iter=50
#k=100
#initsize=1000
#initprev=0.5
#seed=1
#
#commons="--dataset $dataset --iter $iter --k $k --initsize $initsize --initprev $initprev --seed $seed"
#
#for Q in PCC ACC PACC EMQ HDy ; do
# for sampling in relevance_sampling uncertainty_sampling mix_sampling ; do
# PYTHONPATH='.:..' python3 main.py --quantifier $Q --sampling $sampling $commons
PYTHONPATH='.:..' python3 plot.py "./results/$dataset"_"$sampling"_"$Q.csv" 0
done
# PYTHONPATH='.:..' python3 plot.py "./results/$dataset"_"$sampling"_"$Q.csv" 0
# done
#done
dataset=RCV1.C4
iter=40
k=100
initsize=500
initprev=-1
seed=1
Q=RPACC
CLS=lr
for sampling in relevance_sampling uncertainty_sampling adaptive_sampling mix_sampling ; do
filepath="./results/classifier:"$CLS"__dataset:"$dataset"__initprev:"$initprev"__initsize:"$initsize"__iter:"$iter"__k:"$k"__quantifier:"$Q"__sampling:"$sampling"__seed:"$seed".csv"
PYTHONPATH='.:..' python3 main.py --quantifier $Q --classifier $CLS --sampling $sampling --dataset $dataset --iter $iter --k $k --initsize $initsize --initprev $initprev --seed $seed
done

View File

@@ -6,6 +6,7 @@ from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC, SVC
import quapy as qp
from eDiscovery.method import RegionAdjustment, RegionProbAdjustment, RegionProbAdjustmentGlobal
from quapy.data import LabelledCollection
from quapy.method.aggregative import EMQ, CC, PACC, PCC, HDy, ACC
import numpy as np
@@ -18,7 +19,8 @@ def NewClassifier(classifiername):
if classifiername == 'lr':
return LogisticRegression(class_weight='balanced')
elif classifiername == 'svm':
return SVC(class_weight='balanced', probability=True, kernel='linear')
# return SVC(class_weight='balanced', probability=True, kernel='linear')
return CalibratedClassifierCV(LinearSVC(class_weight='balanced'))
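Replacing SVC(probability=True) with CalibratedClassifierCV(LinearSVC(...)) keeps the posterior estimates that PCC, PACC, EMQ, and HDy need while swapping the kernel SVM (and its internal Platt-scaling cross-validation) for the more scalable liblinear solver. A minimal, self-contained sketch of the pattern on toy data, not code from this repository:

from sklearn.calibration import CalibratedClassifierCV
from sklearn.datasets import make_classification
from sklearn.svm import LinearSVC

X, y = make_classification(n_samples=200, random_state=0)
# cross-validated calibration wraps the linear SVM and exposes predict_proba
clf = CalibratedClassifierCV(LinearSVC(class_weight='balanced'))
clf.fit(X, y)
posteriors = clf.predict_proba(X[:5])  # shape (5, 2); usable by probabilistic quantifiers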
def NewQuantifier(quantifiername, classifiername):
@@ -30,9 +32,18 @@ def NewQuantifier(quantifiername, classifiername):
if quantifiername == 'PCC':
return PCC(NewClassifier(classifiername))
if quantifiername == 'ACC':
return ACC(NewClassifier(classifiername), val_split=5)
return ACC(NewClassifier(classifiername), val_split=0.4)
if quantifiername == 'PACC':
return PACC(NewClassifier(classifiername), val_split=5)
return PACC(NewClassifier(classifiername), val_split=0.4)
if quantifiername == 'RACC':
return RegionAdjustment(NewClassifier(classifiername), val_split=0.4)
if quantifiername == 'RPACC':
return RegionProbAdjustment(NewClassifier(classifiername), val_split=0.4, k=10)
if quantifiername == 'GRPACC':
def newQ():
# return PACC(NewClassifier(classifiername), val_split=0.4)
return EMQ(CalibratedClassifierCV(NewClassifier(classifiername)))
return RegionProbAdjustmentGlobal(newQ, k=10, clustering='kmeans')
raise ValueError('unknown quantifier', quantifiername)
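Note that GRPACC receives a zero-argument factory (newQ) rather than a quantifier instance, which suggests RegionProbAdjustmentGlobal fits one independent quantifier per cluster. The real implementation lives in eDiscovery.method and is not part of this diff; the following is only a hedged sketch of that factory pattern, with hypothetical names:

from typing import Callable

class PerRegionQuantifierSketch:
    """Hypothetical illustration: one fresh quantifier per region/cluster."""
    def __init__(self, quantifier_factory: Callable, k: int = 10):
        self.quantifier_factory = quantifier_factory
        self.k = k

    def fit(self, regions):
        # a factory (not an instance) lets every region get its own untrained copy
        self.region_quantifiers_ = [self.quantifier_factory().fit(r) for r in regions]
        return self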
@@ -136,10 +147,7 @@ def create_dataset(datasetname):
def estimate_prev_CC(train, pool: LabelledCollection, classifiername:str):
q = CC(NewClassifier(classifiername)).fit(train)
# q = NewQuantifier("PCC").fit(train)
return q.quantify(pool.instances), q.learner
# small_pool = pool.sampling(100, *pool.prevalence())
# return q.quantify(small_pool.instances), q.learner
def estimate_prev_Q(train, pool, quantifiername, classifiername):
@@ -152,21 +160,10 @@ def estimate_prev_Q(train, pool, quantifiername, classifiername):
# n_repetitions=10)
q = NewQuantifier(quantifiername, classifiername)
# q = ACC(NewClassifier())
# borrow (supposedly negative) pool documents
# train_pos = train.counts()[1]
# train_negs = train.counts()[0]
# neg_idx = negative_sampling_index(pool, classifier, max(train_pos-train_negs, 5))
# neg_sample = pool.sampling_from_index(neg_idx)
# train_augmented = train + LabelledCollection(neg_sample.instances, [0]*len(neg_sample))
# q.fit(train_augmented)
# q._find_regions((train+pool).instances)
q.fit(train)
# q.fit(first_train)
# bootstrap_prev = qp.evaluation.natural_prevalence_prediction(q, pool, sample_size=len(train), n_repetitions=50)[1].mean(axis=0).flatten()
prev = q.quantify(pool.instances)
return prev, q.learner
# small_pool = pool.sampling(100, *pool.prevalence())
# return q.quantify(small_pool.instances), q.learner
return prev, None
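The ACC and PACC constructors above also move from val_split=5 to val_split=0.4, i.e., from 5-fold cross-validation to a single 40% held-out split for estimating the misclassification rates, which is cheaper per active-learning iteration. A short sketch of the two settings, assuming quapy's documented interpretation of val_split (int = number of folds, float = held-out fraction):

from quapy.method.aggregative import PACC
from sklearn.linear_model import LogisticRegression

# int   -> stratified k-fold estimation of the misclassification rates
pacc_kfcv = PACC(LogisticRegression(class_weight='balanced'), val_split=5)
# float -> hold out 40% of the training set once (faster, as used in this commit)
pacc_heldout = PACC(LogisticRegression(class_weight='balanced'), val_split=0.4)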

View File

@@ -1,9 +1,12 @@
import os.path
import pathlib
from sklearn.metrics import f1_score
import functions as fn
import quapy as qp
import argparse
from quapy.data import LabelledCollection
from plot import eDiscoveryPlot
def eval_classifier(learner, test:LabelledCollection):
@@ -22,15 +25,14 @@ def main(args):
init_nD = args.initsize
sampling_fn = getattr(fn, args.sampling)
max_iterations = args.iter
outputdir = './results'
clf_name = args.classifier
q_name = args.quantifier
qp.util.create_if_not_exist(outputdir)
collection = qp.util.pickled_resource(f'./dataset/{datasetname}.pkl', fn.create_dataset, datasetname)
nD = len(collection)
fig = eDiscoveryPlot(args.output)
with qp.util.temp_seed(args.seed):
# initial labelled data selection
if args.initprev == -1:
@@ -42,7 +44,13 @@ def main(args):
# recall_target = 0.99
i = 0
with open(os.path.join(outputdir, fn.experiment_name(args)), 'wt') as foo:
# q = fn.NewQuantifier(q_name, clf_name)
# print('searching regions')
# q._find_regions((train+pool).instances)
# print('[done]')
with open(args.output, 'wt') as foo:
def tee(msg):
foo.write(msg + '\n')
foo.flush()
@@ -54,9 +62,12 @@ def main(args):
pool_p_hat_cc, classifier = fn.estimate_prev_CC(train, pool, clf_name)
pool_p_hat_q, q_classifier = fn.estimate_prev_Q(train, pool, q_name, clf_name)
# q.fit(train)
# pool_p_hat_q = q.quantify(pool.instances)
# q_classifier = q.learner
f1_clf = eval_classifier(classifier, pool)
f1_q = eval_classifier(q_classifier, pool)
f1_q = 0 #eval_classifier(q_classifier, pool)
tr_p = train.prevalence()
te_p = pool.prevalence()
@@ -76,6 +87,8 @@ def main(args):
tee(f'{i}\t{progress:.2f}\t{nDtr}\t{nDte}\t{tr_p[1]:.3f}\t{te_p[1]:.3f}\t{pool_p_hat_q[1]:.3f}\t{pool_p_hat_cc[1]:.3f}'
f'\t{r:.3f}\t{r_hat_q:.3f}\t{r_hat_cc:.3f}\t{tr_te_shift:.5f}\t{ae_q:.4f}\t{ae_cc:.4f}\t{f1_q:.3f}\t{f1_clf:.3f}')
fig.plot()
if nDte < k:
print('[stop] too few documents remaining')
break
@@ -112,10 +125,21 @@ if __name__ == '__main__':
parser.add_argument('--classifier', metavar='CLS', type=str,
help='classifier type (svm, lr)',
default='lr')
parser.add_argument('--output', metavar='OUT', type=str,
help="name of the file containing the results of the experiment (default is an automatic "
"filename based on the model's parameters in the folder './results/')",
default=None)
args = parser.parse_args()
assert args.initprev == -1.0 or (0 < args.initprev < 1), 'wrong value for initprev; should be -1 or in (0., 1.)'
if args.initprev == -1:  # cast to int so the output path shows initprev:-1 rather than initprev:-1.0
args.initprev = int(args.initprev)
if args.output is None:
outputdir = './results'
args.output = os.path.join(outputdir, fn.experiment_name(args))
else:
outputdir = pathlib.Path(args.output).parent.name
if outputdir:
qp.util.create_if_not_exist(outputdir)
main(args)
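When --output is omitted, the result file name is delegated to fn.experiment_name, whose implementation is not shown in this diff. Judging from the filepath pattern used in the launcher scripts (classifier:lr__dataset:...__seed:1.csv), it presumably joins the alphabetically sorted argument key:value pairs; a purely hypothetical sketch:

import argparse

def experiment_name_sketch(args: argparse.Namespace) -> str:
    # hypothetical: join sorted key:value pairs, excluding the output path itself
    fields = sorted((k, v) for k, v in vars(args).items() if k != 'output')
    return '__'.join(f'{k}:{v}' for k, v in fields) + '.csv'

# e.g. ./results/classifier:lr__dataset:RCV1.C4__initprev:-1__initsize:500__iter:40__k:100__quantifier:RPACC__sampling:relevance_sampling__seed:1.csv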

View File

@@ -1,29 +1,34 @@
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sys, os, pathlib
assert len(sys.argv) == 3, f'wrong args, syntax is: python {sys.argv[0]} <result_input_path> <dynamic (0|1)>'
file = str(sys.argv[1])
loop = bool(int(sys.argv[2]))
class eDiscoveryPlot:
print(file)
def __init__(self, datapath, outdir='./plots', loop=True, save=True):
self.outdir = outdir
self.datapath = datapath
self.plotname = pathlib.Path(datapath).name.replace(".csv", ".png")
self.loop = loop
self.save = save
plotname = pathlib.Path(file).name.replace(".csv", ".png")
if not loop:
plt.rcParams['figure.figsize'] = [12, 12]
plt.rcParams['figure.dpi'] = 200
# plot the data
fig, axs = plt.subplots(5)
if not loop:
plt.rcParams['figure.figsize'] = [12, 12]
plt.rcParams['figure.dpi'] = 200
else:
plt.rcParams['figure.figsize'] = [17, 17]
plt.rcParams['figure.dpi'] = 60
try:
while True:
# plot the data
self.fig, self.axs = plt.subplots(5)
def plot(self):
fig, axs = self.fig, self.axs
loop, save = self.loop, self.save
aXn = 0
df = pd.read_csv(file, sep='\t')
df = pd.read_csv(self.datapath, sep='\t')
xs = df['it']
@@ -36,8 +41,8 @@ try:
axs[aXn].legend()
axs[aXn].grid()
axs[aXn].set_ylabel('Recall')
axs[aXn].set_ylim(0,1)
aXn+=1
axs[aXn].set_ylim(0, 1)
aXn += 1
y_r = df['te-prev']
y_rhat = df['te-estim']
@@ -74,15 +79,27 @@ try:
axs[aXn].set_ylabel('Train-Test Shift')
aXn += 1
os.makedirs('./plots', exist_ok=True)
plt.savefig(f'./plots/{plotname}')
if save:
os.makedirs(self.outdir, exist_ok=True)
plt.savefig(f'{self.outdir}/{self.plotname}')
if not loop:
break
else:
if loop:
plt.pause(.5)
for i in range(aXn):
axs[i].cla()
except KeyboardInterrupt:
print("\n[exit]")
if __name__ == '__main__':
assert len(sys.argv) == 3, f'wrong args, syntax is: python {sys.argv[0]} <result_input_path> <dynamic (0|1)>'
file = str(sys.argv[1])
loop = bool(int(sys.argv[2]))
figure = eDiscoveryPlot(file, loop=loop)
try:
figure.plot()
except KeyboardInterrupt:
print('\n[stop]')
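The refactoring turns the former script-level while loop into a reusable class, so main.py can refresh the figure after every active-learning iteration while the CLI entry point still supports one-shot rendering. A small usage sketch (the csv paths are hypothetical):

from plot import eDiscoveryPlot

# one-shot rendering of a finished run, saved under ./plots
eDiscoveryPlot('./results/finished_run.csv', loop=False).plot()

# live monitoring: main.py constructs the figure once and calls plot() each iteration
fig = eDiscoveryPlot('./results/ongoing_run.csv', loop=True, save=True)
fig.plot()  # re-reads the csv, redraws the five panels, pauses briefly when looping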

View File

@@ -2,17 +2,17 @@
set -x
dataset=RCV1.C4
iter=50
iter=100
k=100
initsize=500
initprev=-1
seed=1
Q=ACC
Q=GRPACC
CLS=lr
sampling=proportional_sampling
sampling=relevance_sampling
filepath="./results/classifier:"$CLS"__dataset:"$dataset"__initprev:"$initprev"__initsize:"$initsize"__iter:"$iter"__k:"$k"__quantifier:"$Q"__sampling:"$sampling"__seed:"$seed".csv"
PYTHONPATH='.:..' python3 main.py --quantifier $Q --classifier $CLS --sampling $sampling --dataset $dataset --iter $iter --k $k --initsize $initsize --initprev $initprev --seed $seed &
sleep 2
PYTHONPATH='.:..' python3 plot.py $filepath 1
PYTHONPATH='.:..' python3 main.py --quantifier $Q --classifier $CLS --sampling $sampling --dataset $dataset --iter $iter --k $k --initsize $initsize --initprev $initprev --seed $seed
#sleep 2
#PYTHONPATH='.:..' python3 plot.py $filepath 1

View File

@@ -200,9 +200,9 @@ class ACC(AggregativeQuantifier):
# kFCV estimation of parameters
y, y_ = [], []
kfcv = StratifiedKFold(n_splits=val_split)
pbar = tqdm(kfcv.split(*data.Xy), total=val_split)
for k, (training_idx, validation_idx) in enumerate(pbar):
pbar.set_description(f'{self.__class__.__name__} fitting fold {k}')
# pbar = tqdm(kfcv.split(*data.Xy), total=val_split)
for k, (training_idx, validation_idx) in enumerate(kfcv.split(*data.Xy)):
# pbar.set_description(f'{self.__class__.__name__} fitting fold {k}')
training = data.sampling_from_index(training_idx)
validation = data.sampling_from_index(validation_idx)
learner, val_data = training_helper(self.learner, training, fit_learner, val_split=validation)
@@ -289,9 +289,8 @@ class PACC(AggregativeProbabilisticQuantifier):
# kFCV estimation of parameters
y, y_ = [], []
kfcv = StratifiedKFold(n_splits=val_split)
pbar = tqdm(kfcv.split(*data.Xy), total=val_split)
for k, (training_idx, validation_idx) in enumerate(pbar):
pbar.set_description(f'{self.__class__.__name__} fitting fold {k}')
for k, (training_idx, validation_idx) in enumerate(kfcv.split(*data.Xy)):
# pbar.set_description(f'{self.__class__.__name__} fitting fold {k}')
training = data.sampling_from_index(training_idx)
validation = data.sampling_from_index(validation_idx)
learner, val_data = training_helper(