going into region quantification
This commit is contained in:
parent
14dbfb567b
commit
3aed410722
|
@ -1,18 +1,34 @@
|
|||
#!/bin/bash
|
||||
set -x
|
||||
|
||||
dataset=RCV1.C4
|
||||
iter=50
|
||||
k=100
|
||||
initsize=1000
|
||||
initprev=0.5
|
||||
seed=1
|
||||
|
||||
commons="--dataset $dataset --iter $iter --k $k --initsize $initsize --initprev $initprev --seed $seed"
|
||||
|
||||
for Q in PCC ACC PACC EMQ HDy ; do
|
||||
for sampling in relevance_sampling uncertainty_sampling mix_sampling ; do
|
||||
#dataset=RCV1.C4
|
||||
#iter=50
|
||||
#k=100
|
||||
#initsize=1000
|
||||
#initprev=0.5
|
||||
#seed=1
|
||||
#
|
||||
#commons="--dataset $dataset --iter $iter --k $k --initsize $initsize --initprev $initprev --seed $seed"
|
||||
#
|
||||
#for Q in PCC ACC PACC EMQ HDy ; do
|
||||
# for sampling in relevance_sampling uncertainty_sampling mix_sampling ; do
|
||||
# PYTHONPATH='.:..' python3 main.py --quantifier $Q --sampling $sampling $commons
|
||||
PYTHONPATH='.:..' python3 plot.py "./results/$dataset"_"$sampling"_"$Q.csv" 0
|
||||
done
|
||||
# PYTHONPATH='.:..' python3 plot.py "./results/$dataset"_"$sampling"_"$Q.csv" 0
|
||||
# done
|
||||
#done
|
||||
|
||||
# Experiment configuration shared by every run of the loop below.
dataset=RCV1.C4
iter=40
k=100
initsize=500
initprev=-1
seed=1
Q=RPACC
CLS=lr

# Run main.py once per sampling policy, with all other settings fixed.
for sampling in relevance_sampling uncertainty_sampling adaptive_sampling mix_sampling ; do

    # NOTE(review): filepath is not used inside this loop; presumably it mirrors the result
    # file that main.py writes by default -- confirm before relying on it.
    # Braces are required so that e.g. ${CLS} is not parsed as the variable $CLS__dataset.
    filepath="./results/classifier:${CLS}__dataset:${dataset}__initprev:${initprev}__initsize:${initsize}__iter:${iter}__k:${k}__quantifier:${Q}__sampling:${sampling}__seed:${seed}.csv"

    # Every expansion is quoted so a value can never be word-split or glob-expanded.
    PYTHONPATH='.:..' python3 main.py --quantifier "$Q" --classifier "$CLS" --sampling "$sampling" --dataset "$dataset" --iter "$iter" --k "$k" --initsize "$initsize" --initprev "$initprev" --seed "$seed"

done
|
||||
|
|
|
@ -6,6 +6,7 @@ from sklearn.linear_model import LogisticRegression
|
|||
from sklearn.svm import LinearSVC, SVC
|
||||
|
||||
import quapy as qp
|
||||
from eDiscovery.method import RegionAdjustment, RegionProbAdjustment, RegionProbAdjustmentGlobal
|
||||
from quapy.data import LabelledCollection
|
||||
from quapy.method.aggregative import EMQ, CC, PACC, PCC, HDy, ACC
|
||||
import numpy as np
|
||||
|
@ -18,7 +19,8 @@ def NewClassifier(classifiername):
|
|||
if classifiername== 'lr':
|
||||
return LogisticRegression(class_weight='balanced')
|
||||
elif classifiername== 'svm':
|
||||
return SVC(class_weight='balanced', probability=True, kernel='linear')
|
||||
# return SVC(class_weight='balanced', probability=True, kernel='linear')
|
||||
return CalibratedClassifierCV(LinearSVC(class_weight='balanced'))
|
||||
|
||||
|
||||
def NewQuantifier(quantifiername, classifiername):
|
||||
|
@ -30,9 +32,18 @@ def NewQuantifier(quantifiername, classifiername):
|
|||
if quantifiername == 'PCC':
|
||||
return PCC(NewClassifier(classifiername))
|
||||
if quantifiername == 'ACC':
|
||||
return ACC(NewClassifier(classifiername), val_split=5)
|
||||
return ACC(NewClassifier(classifiername), val_split=0.4)
|
||||
if quantifiername == 'PACC':
|
||||
return PACC(NewClassifier(classifiername), val_split=5)
|
||||
return PACC(NewClassifier(classifiername), val_split=0.4)
|
||||
if quantifiername == 'RACC':
|
||||
return RegionAdjustment(NewClassifier(classifiername), val_split=0.4)
|
||||
if quantifiername == 'RPACC':
|
||||
return RegionProbAdjustment(NewClassifier(classifiername), val_split=0.4, k=10)
|
||||
if quantifiername == 'GRPACC':
|
||||
def newQ():
|
||||
# return PACC(NewClassifier(classifiername), val_split=0.4)
|
||||
return EMQ(CalibratedClassifierCV(NewClassifier(classifiername)))
|
||||
return RegionProbAdjustmentGlobal(newQ, k=10, clustering='kmeans')
|
||||
raise ValueError('unknown quantifier', quantifiername)
|
||||
|
||||
|
||||
|
@ -136,10 +147,7 @@ def create_dataset(datasetname):
|
|||
|
||||
def estimate_prev_CC(train, pool: LabelledCollection, classifiername:str):
    """Fit a ``CC`` quantifier on ``train`` and use it to quantify ``pool``.

    The underlying classifier is created by ``NewClassifier(classifiername)``.

    :param train: data the quantifier is fitted on
    :param pool: collection whose ``instances`` are passed to ``quantify``
    :param classifiername: name handed to ``NewClassifier``
    :return: a tuple ``(estimate, learner)`` holding the output of ``quantify`` for the pool
        instances and the ``learner`` attribute of the fitted quantifier
    """
    base_classifier = NewClassifier(classifiername)
    cc_quantifier = CC(base_classifier).fit(train)
    pool_estimate = cc_quantifier.quantify(pool.instances)
    return pool_estimate, cc_quantifier.learner
|
||||
|
||||
|
||||
def estimate_prev_Q(train, pool, quantifiername, classifiername):
|
||||
|
@ -152,21 +160,10 @@ def estimate_prev_Q(train, pool, quantifiername, classifiername):
|
|||
# n_repetitions=10)
|
||||
|
||||
q = NewQuantifier(quantifiername, classifiername)
|
||||
# q = ACC(NewClassifier())
|
||||
# borrow (supposedly negative) pool documents
|
||||
# train_pos = train.counts()[1]
|
||||
# train_negs = train.counts()[0]
|
||||
# neg_idx = negative_sampling_index(pool, classifier, max(train_pos-train_negs, 5))
|
||||
# neg_sample = pool.sampling_from_index(neg_idx)
|
||||
# train_augmented = train + LabelledCollection(neg_sample.instances, [0]*len(neg_sample))
|
||||
# q.fit(train_augmented)
|
||||
# q._find_regions((train+pool).instances)
|
||||
q.fit(train)
|
||||
# q.fit(first_train)
|
||||
# bootstrap_prev = qp.evaluation.natural_prevalence_prediction(q, pool, sample_size=len(train), n_repetitions=50)[1].mean(axis=0).flatten()
|
||||
|
||||
prev = q.quantify(pool.instances)
|
||||
return prev, q.learner
|
||||
# small_pool = pool.sampling(100, *pool.prevalence())
|
||||
# return q.quantify(small_pool.instances), q.learner
|
||||
return prev, None
|
||||
|
||||
|
||||
|
|
|
@ -1,9 +1,12 @@
|
|||
import os.path
|
||||
import pathlib
|
||||
|
||||
from sklearn.metrics import f1_score
|
||||
import functions as fn
|
||||
import quapy as qp
|
||||
import argparse
|
||||
from quapy.data import LabelledCollection
|
||||
from plot import eDiscoveryPlot
|
||||
|
||||
|
||||
def eval_classifier(learner, test:LabelledCollection):
|
||||
|
@ -22,15 +25,14 @@ def main(args):
|
|||
init_nD = args.initsize
|
||||
sampling_fn = getattr(fn, args.sampling)
|
||||
max_iterations = args.iter
|
||||
outputdir = './results'
|
||||
clf_name = args.classifier
|
||||
q_name = args.quantifier
|
||||
|
||||
qp.util.create_if_not_exist(outputdir)
|
||||
|
||||
collection = qp.util.pickled_resource(f'./dataset/{datasetname}.pkl', fn.create_dataset, datasetname)
|
||||
nD = len(collection)
|
||||
|
||||
fig = eDiscoveryPlot(args.output)
|
||||
|
||||
with qp.util.temp_seed(args.seed):
|
||||
# initial labelled data selection
|
||||
if args.initprev == -1:
|
||||
|
@ -42,7 +44,13 @@ def main(args):
|
|||
|
||||
# recall_target = 0.99
|
||||
i = 0
|
||||
with open(os.path.join(outputdir, fn.experiment_name(args)), 'wt') as foo:
|
||||
|
||||
# q = fn.NewQuantifier(q_name, clf_name)
|
||||
# print('searching regions')
|
||||
# q._find_regions((train+pool).instances)
|
||||
# print('[done]')
|
||||
|
||||
with open(args.output, 'wt') as foo:
|
||||
def tee(msg):
|
||||
foo.write(msg + '\n')
|
||||
foo.flush()
|
||||
|
@ -54,9 +62,12 @@ def main(args):
|
|||
|
||||
pool_p_hat_cc, classifier = fn.estimate_prev_CC(train, pool, clf_name)
|
||||
pool_p_hat_q, q_classifier = fn.estimate_prev_Q(train, pool, q_name, clf_name)
|
||||
# q.fit(train)
|
||||
# pool_p_hat_q = q.quantify(pool.instances)
|
||||
# q_classifier = q.learner
|
||||
|
||||
f1_clf = eval_classifier(classifier, pool)
|
||||
f1_q = eval_classifier(q_classifier, pool)
|
||||
f1_q = 0 #eval_classifier(q_classifier, pool)
|
||||
|
||||
tr_p = train.prevalence()
|
||||
te_p = pool.prevalence()
|
||||
|
@ -76,6 +87,8 @@ def main(args):
|
|||
tee(f'{i}\t{progress:.2f}\t{nDtr}\t{nDte}\t{tr_p[1]:.3f}\t{te_p[1]:.3f}\t{pool_p_hat_q[1]:.3f}\t{pool_p_hat_cc[1]:.3f}'
|
||||
f'\t{r:.3f}\t{r_hat_q:.3f}\t{r_hat_cc:.3f}\t{tr_te_shift:.5f}\t{ae_q:.4f}\t{ae_cc:.4f}\t{f1_q:.3f}\t{f1_clf:.3f}')
|
||||
|
||||
fig.plot()
|
||||
|
||||
if nDte < k:
|
||||
print('[stop] too few documents remaining')
|
||||
break
|
||||
|
@ -112,10 +125,21 @@ if __name__ == '__main__':
|
|||
parser.add_argument('--classifier', metavar='CLS', type=str,
|
||||
help='classifier type (svm, lr)',
|
||||
default='lr')
|
||||
parser.add_argument('--output', metavar='OUT', type=str,
|
||||
help="name of the file containing the results of the experiment (default is an automatic "
|
||||
"filename based on the model's parameters in the folder './results/')",
|
||||
default=None)
|
||||
args = parser.parse_args()

# --initprev must be either the special value -1 or a prevalence strictly inside (0, 1).
# Reported through the parser (usage message + exit) instead of an assert, which is
# stripped when Python runs with -O.
if not (args.initprev == -1 or 0 < args.initprev < 1):
    parser.error('wrong value for initprev; should be -1 or in (0., 1.)')
if args.initprev == -1:  # this is to clean the path, to show initprev:-1 and not initprev:-1.0
    args.initprev = int(args.initprev)

if args.output is None:
    # no explicit output file: use the automatic experiment name inside ./results
    outputdir = './results'
    args.output = os.path.join(outputdir, fn.experiment_name(args))
else:
    # Full parent directory of the requested file ('' for a bare filename, in which case
    # there is nothing to create). Taking only the last component of the parent would
    # create the wrong folder for nested paths such as 'a/b/out.csv'.
    outputdir = os.path.dirname(args.output)
if outputdir:
    qp.util.create_if_not_exist(outputdir)

main(args)
|
||||
|
|
|
@ -1,29 +1,34 @@
|
|||
import matplotlib.pyplot as plt
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
import sys, os, pathlib
|
||||
|
||||
assert len(sys.argv) == 3, f'wrong args, syntax is: python {sys.argv[0]} <result_input_path> <dynamic (0|1)>'
|
||||
|
||||
file = str(sys.argv[1])
|
||||
loop = bool(int(sys.argv[2]))
|
||||
class eDiscoveryPlot:
|
||||
|
||||
print(file)
|
||||
def __init__(self, datapath, outdir='./plots', loop=True, save=True):
|
||||
self.outdir = outdir
|
||||
self.datapath = datapath
|
||||
self.plotname = pathlib.Path(datapath).name.replace(".csv", ".png")
|
||||
self.loop = loop
|
||||
self.save = save
|
||||
|
||||
plotname = pathlib.Path(file).name.replace(".csv", ".png")
|
||||
|
||||
if not loop:
|
||||
plt.rcParams['figure.figsize'] = [12, 12]
|
||||
plt.rcParams['figure.dpi'] = 200
|
||||
|
||||
# plot the data
|
||||
fig, axs = plt.subplots(5)
|
||||
if not loop:
|
||||
plt.rcParams['figure.figsize'] = [12, 12]
|
||||
plt.rcParams['figure.dpi'] = 200
|
||||
else:
|
||||
plt.rcParams['figure.figsize'] = [17, 17]
|
||||
plt.rcParams['figure.dpi'] = 60
|
||||
|
||||
|
||||
try:
|
||||
while True:
|
||||
# plot the data
|
||||
self.fig, self.axs = plt.subplots(5)
|
||||
|
||||
def plot(self):
|
||||
fig, axs = self.fig, self.axs
|
||||
loop, save = self.loop, self.save
|
||||
|
||||
aXn = 0
|
||||
df = pd.read_csv(file, sep='\t')
|
||||
df = pd.read_csv(self.datapath, sep='\t')
|
||||
|
||||
xs = df['it']
|
||||
|
||||
|
@ -36,8 +41,8 @@ try:
|
|||
axs[aXn].legend()
|
||||
axs[aXn].grid()
|
||||
axs[aXn].set_ylabel('Recall')
|
||||
axs[aXn].set_ylim(0,1)
|
||||
aXn+=1
|
||||
axs[aXn].set_ylim(0, 1)
|
||||
aXn += 1
|
||||
|
||||
y_r = df['te-prev']
|
||||
y_rhat = df['te-estim']
|
||||
|
@ -74,15 +79,27 @@ try:
|
|||
axs[aXn].set_ylabel('Train-Test Shift')
|
||||
aXn += 1
|
||||
|
||||
os.makedirs('./plots', exist_ok=True)
|
||||
plt.savefig(f'./plots/{plotname}')
|
||||
if save:
|
||||
os.makedirs(self.outdir, exist_ok=True)
|
||||
plt.savefig(f'{self.outdir}/{self.plotname}')
|
||||
|
||||
if not loop:
|
||||
break
|
||||
else:
|
||||
if loop:
|
||||
plt.pause(.5)
|
||||
for i in range(aXn):
|
||||
axs[i].cla()
|
||||
|
||||
except KeyboardInterrupt:
|
||||
print("\n[exit]")
|
||||
|
||||
if __name__ == '__main__':
    # Command-line entry point: python plot.py <result_input_path> <dynamic (0|1)>

    if len(sys.argv) != 3:
        sys.exit(f'wrong args, syntax is: python {sys.argv[0]} <result_input_path> <dynamic (0|1)>')

    file = str(sys.argv[1])
    loop = bool(int(sys.argv[2]))

    # The dynamic flag is a constructor option; plot() itself takes no arguments,
    # so passing it to plot() would raise a TypeError.
    figure = eDiscoveryPlot(file, loop=loop)

    try:
        figure.plot()
        # In dynamic mode keep re-reading the results file and redrawing until Ctrl-C;
        # a single plot() call only draws one frame.
        while loop:
            figure.plot()
    except KeyboardInterrupt:
        print('\n[stop]')
|
||||
|
||||
|
|
|
@ -2,17 +2,17 @@
|
|||
set -x
|
||||
|
||||
dataset=RCV1.C4
|
||||
iter=50
|
||||
iter=100
|
||||
k=100
|
||||
initsize=500
|
||||
initprev=-1
|
||||
seed=1
|
||||
Q=ACC
|
||||
Q=GRPACC
|
||||
CLS=lr
|
||||
sampling=proportional_sampling
|
||||
sampling=relevance_sampling
|
||||
|
||||
filepath="./results/classifier:"$CLS"__dataset:"$dataset"__initprev:"$initprev"__initsize:"$initsize"__iter:"$iter"__k:"$k"__quantifier:"$Q"__sampling:"$sampling"__seed:"$seed".csv"
|
||||
|
||||
PYTHONPATH='.:..' python3 main.py --quantifier $Q --classifier $CLS --sampling $sampling --dataset $dataset --iter $iter --k $k --initsize $initsize --initprev $initprev --seed $seed &
|
||||
sleep 2
|
||||
PYTHONPATH='.:..' python3 plot.py $filepath 1
|
||||
PYTHONPATH='.:..' python3 main.py --quantifier $Q --classifier $CLS --sampling $sampling --dataset $dataset --iter $iter --k $k --initsize $initsize --initprev $initprev --seed $seed
|
||||
#sleep 2
|
||||
#PYTHONPATH='.:..' python3 plot.py $filepath 1
|
||||
|
|
|
@ -200,9 +200,9 @@ class ACC(AggregativeQuantifier):
|
|||
# kFCV estimation of parameters
|
||||
y, y_ = [], []
|
||||
kfcv = StratifiedKFold(n_splits=val_split)
|
||||
pbar = tqdm(kfcv.split(*data.Xy), total=val_split)
|
||||
for k, (training_idx, validation_idx) in enumerate(pbar):
|
||||
pbar.set_description(f'{self.__class__.__name__} fitting fold {k}')
|
||||
# pbar = tqdm(kfcv.split(*data.Xy), total=val_split)
|
||||
for k, (training_idx, validation_idx) in enumerate(kfcv.split(*data.Xy)):
|
||||
# pbar.set_description(f'{self.__class__.__name__} fitting fold {k}')
|
||||
training = data.sampling_from_index(training_idx)
|
||||
validation = data.sampling_from_index(validation_idx)
|
||||
learner, val_data = training_helper(self.learner, training, fit_learner, val_split=validation)
|
||||
|
@ -289,9 +289,8 @@ class PACC(AggregativeProbabilisticQuantifier):
|
|||
# kFCV estimation of parameters
|
||||
y, y_ = [], []
|
||||
kfcv = StratifiedKFold(n_splits=val_split)
|
||||
pbar = tqdm(kfcv.split(*data.Xy), total=val_split)
|
||||
for k, (training_idx, validation_idx) in enumerate(pbar):
|
||||
pbar.set_description(f'{self.__class__.__name__} fitting fold {k}')
|
||||
for k, (training_idx, validation_idx) in enumerate(kfcv.split(*data.Xy)):
|
||||
# pbar.set_description(f'{self.__class__.__name__} fitting fold {k}')
|
||||
training = data.sampling_from_index(training_idx)
|
||||
validation = data.sampling_from_index(validation_idx)
|
||||
learner, val_data = training_helper(
|
||||
|
|
Loading…
Reference in New Issue