
going into region quantification

Alejandro Moreo Fernandez 2022-01-21 09:46:30 +01:00
parent 14dbfb567b
commit 3aed410722
6 changed files with 127 additions and 74 deletions

View File

@@ -1,18 +1,34 @@
 #!/bin/bash
 set -x

-dataset=RCV1.C4
+#dataset=RCV1.C4
-iter=50
+#iter=50
-k=100
+#k=100
-initsize=1000
+#initsize=1000
-initprev=0.5
+#initprev=0.5
-seed=1
+#seed=1
+#
-commons="--dataset $dataset --iter $iter --k $k --initsize $initsize --initprev $initprev --seed $seed"
+#commons="--dataset $dataset --iter $iter --k $k --initsize $initsize --initprev $initprev --seed $seed"
+#
-for Q in PCC ACC PACC EMQ HDy ; do
+#for Q in PCC ACC PACC EMQ HDy ; do
-	for sampling in relevance_sampling uncertainty_sampling mix_sampling ; do
+#	for sampling in relevance_sampling uncertainty_sampling mix_sampling ; do
 #	PYTHONPATH='.:..' python3 main.py --quantifier $Q --sampling $sampling $commons
-	PYTHONPATH='.:..' python3 plot.py "./results/$dataset"_"$sampling"_"$Q.csv" 0
+#	PYTHONPATH='.:..' python3 plot.py "./results/$dataset"_"$sampling"_"$Q.csv" 0
-	done
+#	done
+#done
+
+dataset=RCV1.C4
+iter=40
+k=100
+initsize=500
+initprev=-1
+seed=1
+Q=RPACC
+CLS=lr
+
+for sampling in relevance_sampling uncertainty_sampling adaptive_sampling mix_sampling ; do
+	filepath="./results/classifier:"$CLS"__dataset:"$dataset"__initprev:"$initprev"__initsize:"$initsize"__iter:"$iter"__k:"$k"__quantifier:"$Q"__sampling:"$sampling"__seed:"$seed".csv"
+	PYTHONPATH='.:..' python3 main.py --quantifier $Q --classifier $CLS --sampling $sampling --dataset $dataset --iter $iter --k $k --initsize $initsize --initprev $initprev --seed $seed
 done

functions.py View File

@@ -6,6 +6,7 @@ from sklearn.linear_model import LogisticRegression
 from sklearn.svm import LinearSVC, SVC
 import quapy as qp
+from eDiscovery.method import RegionAdjustment, RegionProbAdjustment, RegionProbAdjustmentGlobal
 from quapy.data import LabelledCollection
 from quapy.method.aggregative import EMQ, CC, PACC, PCC, HDy, ACC
 import numpy as np
@@ -18,7 +19,8 @@ def NewClassifier(classifiername):
     if classifiername == 'lr':
         return LogisticRegression(class_weight='balanced')
     elif classifiername == 'svm':
-        return SVC(class_weight='balanced', probability=True, kernel='linear')
+        # return SVC(class_weight='balanced', probability=True, kernel='linear')
+        return CalibratedClassifierCV(LinearSVC(class_weight='balanced'))


 def NewQuantifier(quantifiername, classifiername):
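Note on the hunk above: the `svm` branch now wraps `LinearSVC` in `CalibratedClassifierCV` instead of using `SVC(probability=True)`. `LinearSVC` exposes only `decision_function`, and the calibration wrapper adds cross-validated `predict_proba`, which the probabilistic quantifiers (PCC, PACC, EMQ, HDy) require; it is also typically much faster than kernel `SVC` on large sparse text collections. A minimal sketch of the idea, assuming `from sklearn.calibration import CalibratedClassifierCV` already appears among the imports of functions.py (the import is not visible in this diff):

# Sketch: calibrated posteriors from a margin-based linear SVM.
# LinearSVC has no predict_proba; CalibratedClassifierCV fits a calibrator
# (sigmoid by default) on cross-validation folds and exposes predict_proba.
from sklearn.calibration import CalibratedClassifierCV
from sklearn.datasets import make_classification
from sklearn.svm import LinearSVC

X, y = make_classification(n_samples=500, random_state=0)
clf = CalibratedClassifierCV(LinearSVC(class_weight='balanced'))
clf.fit(X, y)
posteriors = clf.predict_proba(X)  # shape (n_samples, 2), rows sum to 1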
@@ -30,9 +32,18 @@ def NewQuantifier(quantifiername, classifiername):
     if quantifiername == 'PCC':
         return PCC(NewClassifier(classifiername))
     if quantifiername == 'ACC':
-        return ACC(NewClassifier(classifiername), val_split=5)
+        return ACC(NewClassifier(classifiername), val_split=0.4)
     if quantifiername == 'PACC':
-        return PACC(NewClassifier(classifiername), val_split=5)
+        return PACC(NewClassifier(classifiername), val_split=0.4)
+    if quantifiername == 'RACC':
+        return RegionAdjustment(NewClassifier(classifiername), val_split=0.4)
+    if quantifiername == 'RPACC':
+        return RegionProbAdjustment(NewClassifier(classifiername), val_split=0.4, k=10)
+    if quantifiername == 'GRPACC':
+        def newQ():
+            # return PACC(NewClassifier(classifiername), val_split=0.4)
+            return EMQ(CalibratedClassifierCV(NewClassifier(classifiername)))
+        return RegionProbAdjustmentGlobal(newQ, k=10, clustering='kmeans')
     raise ValueError('unknown quantifier', quantifiername)
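Two changes in this hunk. First, `val_split` goes from `5` to `0.4`: in QuaPy an integer requests k-fold cross-validation (the `StratifiedKFold` path patched in quapy/method/aggregative.py below), while a float reserves that fraction of the training set as a single held-out split for estimating the adjustment. Second, the new `RACC`/`RPACC`/`GRPACC` options dispatch to region-based adjustment classes defined in eDiscovery/method.py, which is not part of this commit. As a purely hypothetical sketch of what a global region-based quantifier along the lines of `RegionProbAdjustmentGlobal(newQ, k=10, clustering='kmeans')` could do (cluster the feature space into k regions, fit one factory-built quantifier per region, and combine the per-region prevalences weighted by region mass):

# Hypothetical sketch only: the real RegionProbAdjustmentGlobal lives in
# eDiscovery/method.py (not shown here). Assumes quantifiers with a simple
# fit(X, y) -> self / quantify(X) interface, not QuaPy's LabelledCollection API.
import numpy as np
from sklearn.cluster import KMeans

class GlobalRegionQuantifierSketch:
    def __init__(self, quantifier_factory, k=10):
        self.quantifier_factory = quantifier_factory  # e.g., the newQ closure above
        self.k = k

    def fit(self, X, y):
        y = np.asarray(y)
        self.kmeans = KMeans(n_clusters=self.k, n_init=10).fit(X)
        regions = self.kmeans.labels_
        self.region_qs = {}
        for r in range(self.k):
            idx = np.where(regions == r)[0]
            if len(np.unique(y[idx])) < 2:
                continue  # a single-class region cannot fit a quantifier
            self.region_qs[r] = self.quantifier_factory().fit(X[idx], y[idx])
        return self

    def quantify(self, X):
        # global prevalence = sum over regions of (region mass) * (region prevalence)
        regions = self.kmeans.predict(X)
        prev = np.zeros(2)
        for r, q in self.region_qs.items():
            mask = regions == r
            if mask.any():
                prev += mask.mean() * np.asarray(q.quantify(X[mask]))
        return prev / max(prev.sum(), 1e-12)  # renormalize over covered regions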
@@ -136,10 +147,7 @@ def create_dataset(datasetname):

 def estimate_prev_CC(train, pool: LabelledCollection, classifiername:str):
     q = CC(NewClassifier(classifiername)).fit(train)
-    # q = NewQuantifier("PCC").fit(train)
     return q.quantify(pool.instances), q.learner
-    # small_pool = pool.sampling(100, *pool.prevalence())
-    # return q.quantify(small_pool.instances), q.learner


 def estimate_prev_Q(train, pool, quantifiername, classifiername):
@@ -152,21 +160,10 @@ def estimate_prev_Q(train, pool, quantifiername, classifiername):
     #                                                  n_repetitions=10)
     q = NewQuantifier(quantifiername, classifiername)
-    # q = ACC(NewClassifier())
-    # borrow (supposedly negative) pool documents
-    # train_pos = train.counts()[1]
-    # train_negs = train.counts()[0]
-    # neg_idx = negative_sampling_index(pool, classifier, max(train_pos-train_negs, 5))
-    # neg_sample = pool.sampling_from_index(neg_idx)
-    # train_augmented = train + LabelledCollection(neg_sample.instances, [0]*len(neg_sample))
-    # q.fit(train_augmented)
+    # q._find_regions((train+pool).instances)
     q.fit(train)
-    # q.fit(first_train)
-    # bootstrap_prev = qp.evaluation.natural_prevalence_prediction(q, pool, sample_size=len(train), n_repetitions=50)[1].mean(axis=0).flatten()
     prev = q.quantify(pool.instances)
-    return prev, q.learner
-    # small_pool = pool.sampling(100, *pool.prevalence())
-    # return q.quantify(small_pool.instances), q.learner
+    return prev, None

main.py View File

@@ -1,9 +1,12 @@
 import os.path
+import pathlib
 from sklearn.metrics import f1_score
 import functions as fn
 import quapy as qp
 import argparse
 from quapy.data import LabelledCollection
+from plot import eDiscoveryPlot


 def eval_classifier(learner, test:LabelledCollection):
@@ -22,15 +25,14 @@ def main(args):
     init_nD = args.initsize
     sampling_fn = getattr(fn, args.sampling)
     max_iterations = args.iter
-    outputdir = './results'
     clf_name = args.classifier
     q_name = args.quantifier
-    qp.util.create_if_not_exist(outputdir)

     collection = qp.util.pickled_resource(f'./dataset/{datasetname}.pkl', fn.create_dataset, datasetname)
     nD = len(collection)

+    fig = eDiscoveryPlot(args.output)

     with qp.util.temp_seed(args.seed):
         # initial labelled data selection
         if args.initprev == -1:
@@ -42,7 +44,13 @@ def main(args):
     # recall_target = 0.99
     i = 0

-    with open(os.path.join(outputdir, fn.experiment_name(args)), 'wt') as foo:
+    # q = fn.NewQuantifier(q_name, clf_name)
+    # print('searching regions')
+    # q._find_regions((train+pool).instances)
+    # print('[done]')
+
+    with open(args.output, 'wt') as foo:
         def tee(msg):
             foo.write(msg + '\n')
             foo.flush()
@@ -54,9 +62,12 @@ def main(args):
             pool_p_hat_cc, classifier = fn.estimate_prev_CC(train, pool, clf_name)
             pool_p_hat_q, q_classifier = fn.estimate_prev_Q(train, pool, q_name, clf_name)
+            # q.fit(train)
+            # pool_p_hat_q = q.quantify(pool.instances)
+            # q_classifier = q.learner

             f1_clf = eval_classifier(classifier, pool)
-            f1_q = eval_classifier(q_classifier, pool)
+            f1_q = 0  # eval_classifier(q_classifier, pool)

             tr_p = train.prevalence()
             te_p = pool.prevalence()
@@ -76,6 +87,8 @@ def main(args):
             tee(f'{i}\t{progress:.2f}\t{nDtr}\t{nDte}\t{tr_p[1]:.3f}\t{te_p[1]:.3f}\t{pool_p_hat_q[1]:.3f}\t{pool_p_hat_cc[1]:.3f}'
                 f'\t{r:.3f}\t{r_hat_q:.3f}\t{r_hat_cc:.3f}\t{tr_te_shift:.5f}\t{ae_q:.4f}\t{ae_cc:.4f}\t{f1_q:.3f}\t{f1_clf:.3f}')

+            fig.plot()

             if nDte < k:
                 print('[stop] too few documents remaining')
                 break
@@ -112,10 +125,21 @@ if __name__ == '__main__':
     parser.add_argument('--classifier', metavar='CLS', type=str,
                         help='classifier type (svm, lr)',
                         default='lr')
+    parser.add_argument('--output', metavar='OUT', type=str,
+                        help="name of the file containing the results of the experiment (default is an automatic "
+                             "filename based on the model's parameters in the folder './results/')",
+                        default=None)
     args = parser.parse_args()

     assert args.initprev==-1.0 or (0 < args.initprev < 1), 'wrong value for initprev; should be in (0., 1.)'
     if args.initprev==-1:  # this is to clean the path, to show initprev:-1 and not initprev:-1.0
         args.initprev = int(args.initprev)
+    if args.output is None:
+        outputdir = './results'
+        args.output = os.path.join(outputdir, fn.experiment_name(args))
+    else:
+        outputdir = str(pathlib.Path(args.output).parent)  # full parent dir, not just its last component
+    if outputdir:
+        qp.util.create_if_not_exist(outputdir)

     main(args)
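The new `--output` option defaults to an automatic filename under ./results. `fn.experiment_name` is not shown in this diff, but the filepaths the shell scripts rebuild by hand (classifier:lr__dataset:RCV1.C4__...__seed:1.csv) list the key:value pairs in alphabetical key order, so it plausibly looks something like the following guess (not the actual implementation):

# Hypothetical reconstruction of fn.experiment_name (the real one is not in this diff).
# The key:value pairs in the scripts' filepath appear in alphabetical key order.
def experiment_name(args):
    params = {k: v for k, v in vars(args).items() if k != 'output'}
    return '__'.join(f'{k}:{v}' for k, v in sorted(params.items())) + '.csv'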

plot.py View File

@@ -1,29 +1,34 @@
 import matplotlib.pyplot as plt
+import numpy as np
 import pandas as pd
 import sys, os, pathlib

-assert len(sys.argv) == 3, f'wrong args, syntax is: python {sys.argv[0]} <result_input_path> <dynamic (0|1)>'
-
-file = str(sys.argv[1])
-loop = bool(int(sys.argv[2]))
-
-print(file)
-
-plotname = pathlib.Path(file).name.replace(".csv", ".png")
-
-if not loop:
-    plt.rcParams['figure.figsize'] = [12, 12]
-    plt.rcParams['figure.dpi'] = 200
-
-# plot the data
-fig, axs = plt.subplots(5)
-
-try:
-    while True:
+
+class eDiscoveryPlot:
+
+    def __init__(self, datapath, outdir='./plots', loop=True, save=True):
+        self.outdir = outdir
+        self.datapath = datapath
+        self.plotname = pathlib.Path(datapath).name.replace(".csv", ".png")
+        self.loop = loop
+        self.save = save
+
+        if not loop:
+            plt.rcParams['figure.figsize'] = [12, 12]
+            plt.rcParams['figure.dpi'] = 200
+        else:
+            plt.rcParams['figure.figsize'] = [17, 17]
+            plt.rcParams['figure.dpi'] = 60
+
+        # plot the data
+        self.fig, self.axs = plt.subplots(5)
+
+    def plot(self):
+        fig, axs = self.fig, self.axs
+        loop, save = self.loop, self.save
+
         aXn = 0
-        df = pd.read_csv(file, sep='\t')
+        df = pd.read_csv(self.datapath, sep='\t')
         xs = df['it']
@@ -36,8 +41,8 @@ try:
         axs[aXn].legend()
         axs[aXn].grid()
         axs[aXn].set_ylabel('Recall')
-        axs[aXn].set_ylim(0,1)
-        aXn+=1
+        axs[aXn].set_ylim(0, 1)
+        aXn += 1

         y_r = df['te-prev']
         y_rhat = df['te-estim']
@@ -74,15 +79,27 @@ try:
         axs[aXn].set_ylabel('Train-Test Shift')
         aXn += 1

-        os.makedirs('./plots', exist_ok=True)
-        plt.savefig(f'./plots/{plotname}')
+        if save:
+            os.makedirs(self.outdir, exist_ok=True)
+            plt.savefig(f'{self.outdir}/{self.plotname}')

-        if not loop:
-            break
-        else:
+        if loop:
             plt.pause(.5)
             for i in range(aXn):
                 axs[i].cla()
-except KeyboardInterrupt:
-    print("\n[exit]")
+
+
+if __name__ == '__main__':
+    assert len(sys.argv) == 3, f'wrong args, syntax is: python {sys.argv[0]} <result_input_path> <dynamic (0|1)>'
+
+    file = str(sys.argv[1])
+    loop = bool(int(sys.argv[2]))
+
+    # plot() takes no argument: loop is a constructor flag, and in dynamic mode
+    # the CSV is re-read on every call, so keep calling plot() until interrupted
+    figure = eDiscoveryPlot(file, loop=loop)
+    try:
+        while True:
+            figure.plot()
+            if not loop:
+                break
+    except KeyboardInterrupt:
+        print('\n[stop]')

View File

@@ -2,17 +2,17 @@
 set -x

 dataset=RCV1.C4
-iter=50
+iter=100
 k=100
 initsize=500
 initprev=-1
 seed=1
-Q=ACC
+Q=GRPACC
 CLS=lr
-sampling=proportional_sampling
+sampling=relevance_sampling

 filepath="./results/classifier:"$CLS"__dataset:"$dataset"__initprev:"$initprev"__initsize:"$initsize"__iter:"$iter"__k:"$k"__quantifier:"$Q"__sampling:"$sampling"__seed:"$seed".csv"

-PYTHONPATH='.:..' python3 main.py --quantifier $Q --classifier $CLS --sampling $sampling --dataset $dataset --iter $iter --k $k --initsize $initsize --initprev $initprev --seed $seed &
-sleep 2
-PYTHONPATH='.:..' python3 plot.py $filepath 1
+PYTHONPATH='.:..' python3 main.py --quantifier $Q --classifier $CLS --sampling $sampling --dataset $dataset --iter $iter --k $k --initsize $initsize --initprev $initprev --seed $seed
+#sleep 2
+#PYTHONPATH='.:..' python3 plot.py $filepath 1

quapy/method/aggregative.py View File

@@ -200,9 +200,9 @@ class ACC(AggregativeQuantifier):
             # kFCV estimation of parameters
             y, y_ = [], []
             kfcv = StratifiedKFold(n_splits=val_split)
-            pbar = tqdm(kfcv.split(*data.Xy), total=val_split)
-            for k, (training_idx, validation_idx) in enumerate(pbar):
-                pbar.set_description(f'{self.__class__.__name__} fitting fold {k}')
+            # pbar = tqdm(kfcv.split(*data.Xy), total=val_split)
+            for k, (training_idx, validation_idx) in enumerate(kfcv.split(*data.Xy)):
+                # pbar.set_description(f'{self.__class__.__name__} fitting fold {k}')
                 training = data.sampling_from_index(training_idx)
                 validation = data.sampling_from_index(validation_idx)
                 learner, val_data = training_helper(self.learner, training, fit_learner, val_split=validation)
@@ -289,9 +289,8 @@ class PACC(AggregativeProbabilisticQuantifier):
             # kFCV estimation of parameters
             y, y_ = [], []
             kfcv = StratifiedKFold(n_splits=val_split)
-            pbar = tqdm(kfcv.split(*data.Xy), total=val_split)
-            for k, (training_idx, validation_idx) in enumerate(pbar):
-                pbar.set_description(f'{self.__class__.__name__} fitting fold {k}')
+            for k, (training_idx, validation_idx) in enumerate(kfcv.split(*data.Xy)):
+                # pbar.set_description(f'{self.__class__.__name__} fitting fold {k}')
                 training = data.sampling_from_index(training_idx)
                 validation = data.sampling_from_index(validation_idx)
                 learner, val_data = training_helper(
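For context on what these kFCV loops estimate: ACC measures the classifier's true and false positive rates on held-out folds and then corrects the raw classify-and-count prevalence by inverting prev_cc = tpr*p + fpr*(1-p). A minimal sketch of the binary correction:

import numpy as np

def acc_adjust(prev_cc, tpr, fpr):
    """Invert prev_cc = tpr*p + fpr*(1-p) for the true prevalence p (binary ACC)."""
    if tpr == fpr:
        return prev_cc  # degenerate classifier: no correction possible
    return float(np.clip((prev_cc - fpr) / (tpr - fpr), 0.0, 1.0))

# e.g., a classifier with tpr=0.8, fpr=0.1 observing 24% predicted positives
print(acc_adjust(0.24, 0.8, 0.1))  # -> 0.2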