forked from moreo/QuaPy

commit 3aed410722 (parent 14dbfb567b): going into region quantification
shell script (batch experiment runner; filename not preserved in the extract):

@@ -1,18 +1,34 @@
 #!/bin/bash
 set -x
 
-dataset=RCV1.C4
-iter=50
-k=100
-initsize=1000
-initprev=0.5
-seed=1
-
-commons="--dataset $dataset --iter $iter --k $k --initsize $initsize --initprev $initprev --seed $seed"
-
-for Q in PCC ACC PACC EMQ HDy ; do
-  for sampling in relevance_sampling uncertainty_sampling mix_sampling ; do
-#    PYTHONPATH='.:..' python3 main.py --quantifier $Q --sampling $sampling $commons
-    PYTHONPATH='.:..' python3 plot.py "./results/$dataset"_"$sampling"_"$Q.csv" 0
-  done
+#dataset=RCV1.C4
+#iter=50
+#k=100
+#initsize=1000
+#initprev=0.5
+#seed=1
+#
+#commons="--dataset $dataset --iter $iter --k $k --initsize $initsize --initprev $initprev --seed $seed"
+#
+#for Q in PCC ACC PACC EMQ HDy ; do
+#  for sampling in relevance_sampling uncertainty_sampling mix_sampling ; do
+#    PYTHONPATH='.:..' python3 main.py --quantifier $Q --sampling $sampling $commons
+#    PYTHONPATH='.:..' python3 plot.py "./results/$dataset"_"$sampling"_"$Q.csv" 0
+#  done
+#done
+
+dataset=RCV1.C4
+iter=40
+k=100
+initsize=500
+initprev=-1
+seed=1
+Q=RPACC
+CLS=lr
+
+for sampling in relevance_sampling uncertainty_sampling adaptive_sampling mix_sampling ; do
+
+  filepath="./results/classifier:"$CLS"__dataset:"$dataset"__initprev:"$initprev"__initsize:"$initsize"__iter:"$iter"__k:"$k"__quantifier:"$Q"__sampling:"$sampling"__seed:"$seed".csv"
+  PYTHONPATH='.:..' python3 main.py --quantifier $Q --classifier $CLS --sampling $sampling --dataset $dataset --iter $iter --k $k --initsize $initsize --initprev $initprev --seed $seed
+
 done
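The filepath literal above spells out the result-file naming scheme; fn.experiment_name(args) in main.py presumably builds the same string. A minimal sketch of that convention (hypothetical helper, not the repository's code):

def experiment_name(params: dict) -> str:
    # key:value pairs, keys in alphabetical order, joined with '__'
    return '__'.join(f'{k}:{v}' for k, v in sorted(params.items())) + '.csv'

# reproduces the filepath built in the script above:
print(experiment_name({
    'classifier': 'lr', 'dataset': 'RCV1.C4', 'initprev': -1, 'initsize': 500,
    'iter': 40, 'k': 100, 'quantifier': 'RPACC',
    'sampling': 'relevance_sampling', 'seed': 1,
}))
# classifier:lr__dataset:RCV1.C4__initprev:-1__initsize:500__iter:40__k:100__quantifier:RPACC__sampling:relevance_sampling__seed:1.csv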
functions.py (filename inferred from "import functions as fn" in main.py):

@@ -6,6 +6,7 @@ from sklearn.linear_model import LogisticRegression
 from sklearn.svm import LinearSVC, SVC
 
 import quapy as qp
+from eDiscovery.method import RegionAdjustment, RegionProbAdjustment, RegionProbAdjustmentGlobal
 from quapy.data import LabelledCollection
 from quapy.method.aggregative import EMQ, CC, PACC, PCC, HDy, ACC
 import numpy as np

@@ -18,7 +19,8 @@ def NewClassifier(classifiername):
     if classifiername== 'lr':
         return LogisticRegression(class_weight='balanced')
     elif classifiername== 'svm':
-        return SVC(class_weight='balanced', probability=True, kernel='linear')
+        # return SVC(class_weight='balanced', probability=True, kernel='linear')
+        return CalibratedClassifierCV(LinearSVC(class_weight='balanced'))
 
 
 def NewQuantifier(quantifiername, classifiername):

@@ -30,9 +32,18 @@ def NewQuantifier(quantifiername, classifiername):
     if quantifiername == 'PCC':
         return PCC(NewClassifier(classifiername))
     if quantifiername == 'ACC':
-        return ACC(NewClassifier(classifiername), val_split=5)
+        return ACC(NewClassifier(classifiername), val_split=0.4)
     if quantifiername == 'PACC':
-        return PACC(NewClassifier(classifiername), val_split=5)
+        return PACC(NewClassifier(classifiername), val_split=0.4)
+    if quantifiername == 'RACC':
+        return RegionAdjustment(NewClassifier(classifiername), val_split=0.4)
+    if quantifiername == 'RPACC':
+        return RegionProbAdjustment(NewClassifier(classifiername), val_split=0.4, k=10)
+    if quantifiername == 'GRPACC':
+        def newQ():
+            # return PACC(NewClassifier(classifiername), val_split=0.4)
+            return EMQ(CalibratedClassifierCV(NewClassifier(classifiername)))
+        return RegionProbAdjustmentGlobal(newQ, k=10, clustering='kmeans')
     raise ValueError('unknown quantifier', quantifiername)

@@ -136,10 +147,7 @@ def create_dataset(datasetname):
 
 def estimate_prev_CC(train, pool: LabelledCollection, classifiername:str):
     q = CC(NewClassifier(classifiername)).fit(train)
-    # q = NewQuantifier("PCC").fit(train)
     return q.quantify(pool.instances), q.learner
-    # small_pool = pool.sampling(100, *pool.prevalence())
-    # return q.quantify(small_pool.instances), q.learner
 
 
 def estimate_prev_Q(train, pool, quantifiername, classifiername):

@@ -152,21 +160,10 @@ def estimate_prev_Q(train, pool, quantifiername, classifiername):
     #                                  n_repetitions=10)
 
     q = NewQuantifier(quantifiername, classifiername)
-    # q = ACC(NewClassifier())
-    # borrow (supposedly negative) pool documents
-    # train_pos = train.counts()[1]
-    # train_negs = train.counts()[0]
-    # neg_idx = negative_sampling_index(pool, classifier, max(train_pos-train_negs, 5))
-    # neg_sample = pool.sampling_from_index(neg_idx)
-    # train_augmented = train + LabelledCollection(neg_sample.instances, [0]*len(neg_sample))
-    # q.fit(train_augmented)
+    # q._find_regions((train+pool).instances)
     q.fit(train)
-    # q.fit(first_train)
-    # bootstrap_prev = qp.evaluation.natural_prevalence_prediction(q, pool, sample_size=len(train), n_repetitions=50)[1].mean(axis=0).flatten()
 
     prev = q.quantify(pool.instances)
-    return prev, q.learner
-    # small_pool = pool.sampling(100, *pool.prevalence())
-    # return q.quantify(small_pool.instances), q.learner
+    return prev, None
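RegionAdjustment, RegionProbAdjustment, and RegionProbAdjustmentGlobal come from eDiscovery.method, whose code is not part of this diff; only their constructor arguments (val_split=0.4, k=10, clustering='kmeans') and the commented q._find_regions((train+pool).instances) call hint at how they work. A rough, hedged sketch of the idea those signatures suggest, partition train+pool into k clusters and combine per-region prevalence estimates weighted by region mass; the real methods presumably also apply an ACC/PACC-style correction within each region, which this sketch omits:

import numpy as np
from sklearn.cluster import KMeans
from sklearn.linear_model import LogisticRegression

def region_prevalence(Xtr, ytr, Xpool, k=10, seed=0):
    # regions fit on train+pool, mirroring q._find_regions((train+pool).instances)
    regions = KMeans(n_clusters=k, random_state=seed).fit(np.vstack([Xtr, Xpool]))
    clf = LogisticRegression(class_weight='balanced').fit(Xtr, ytr)
    pool_region = regions.predict(Xpool)
    prev = 0.0
    for r in range(k):
        in_r = pool_region == r
        if in_r.any():
            # per-region classify-and-count, weighted by the region's share of the pool
            prev += clf.predict(Xpool[in_r]).mean() * in_r.mean()
    return np.array([1 - prev, prev])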
main.py (filename inferred from the argument parser and the main(args) call):

@@ -1,9 +1,12 @@
 import os.path
+import pathlib
 
 from sklearn.metrics import f1_score
 import functions as fn
 import quapy as qp
 import argparse
 from quapy.data import LabelledCollection
+from plot import eDiscoveryPlot
+
 
 def eval_classifier(learner, test:LabelledCollection):

@@ -22,15 +25,14 @@ def main(args):
     init_nD = args.initsize
     sampling_fn = getattr(fn, args.sampling)
     max_iterations = args.iter
-    outputdir = './results'
     clf_name = args.classifier
     q_name = args.quantifier
 
-    qp.util.create_if_not_exist(outputdir)
-
     collection = qp.util.pickled_resource(f'./dataset/{datasetname}.pkl', fn.create_dataset, datasetname)
     nD = len(collection)
 
+    fig = eDiscoveryPlot(args.output)
+
     with qp.util.temp_seed(args.seed):
         # initial labelled data selection
         if args.initprev == -1:

@@ -42,7 +44,13 @@ def main(args):
 
     # recall_target = 0.99
     i = 0
-    with open(os.path.join(outputdir, fn.experiment_name(args)), 'wt') as foo:
+
+    # q = fn.NewQuantifier(q_name, clf_name)
+    # print('searching regions')
+    # q._find_regions((train+pool).instances)
+    # print('[done]')
+
+    with open(args.output, 'wt') as foo:
         def tee(msg):
             foo.write(msg + '\n')
             foo.flush()

@@ -54,9 +62,12 @@ def main(args):
 
             pool_p_hat_cc, classifier = fn.estimate_prev_CC(train, pool, clf_name)
             pool_p_hat_q, q_classifier = fn.estimate_prev_Q(train, pool, q_name, clf_name)
+            # q.fit(train)
+            # pool_p_hat_q = q.quantify(pool.instances)
+            # q_classifier = q.learner
 
             f1_clf = eval_classifier(classifier, pool)
-            f1_q = eval_classifier(q_classifier, pool)
+            f1_q = 0 #eval_classifier(q_classifier, pool)
 
             tr_p = train.prevalence()
             te_p = pool.prevalence()

@@ -76,6 +87,8 @@ def main(args):
             tee(f'{i}\t{progress:.2f}\t{nDtr}\t{nDte}\t{tr_p[1]:.3f}\t{te_p[1]:.3f}\t{pool_p_hat_q[1]:.3f}\t{pool_p_hat_cc[1]:.3f}'
                 f'\t{r:.3f}\t{r_hat_q:.3f}\t{r_hat_cc:.3f}\t{tr_te_shift:.5f}\t{ae_q:.4f}\t{ae_cc:.4f}\t{f1_q:.3f}\t{f1_clf:.3f}')
 
+            fig.plot()
+
             if nDte < k:
                 print('[stop] too few documents remaining')
                 break

@@ -112,10 +125,21 @@ if __name__ == '__main__':
     parser.add_argument('--classifier', metavar='CLS', type=str,
                         help='classifier type (svm, lr)',
                         default='lr')
+    parser.add_argument('--output', metavar='OUT', type=str,
+                        help="name of the file containing the results of the experiment (default is an automatic "
+                             "filename based on the model's parameters in the folder './results/')",
+                        default=None)
     args = parser.parse_args()
 
     assert args.initprev==-1.0 or (0 < args.initprev < 1), 'wrong value for initsize; should be in (0., 1.)'
     if args.initprev==-1: # this is to clean the path, to show initprev:-1 and not initprev:-1.0
         args.initprev = int(args.initprev)
+    if args.output is None:
+        outputdir = './results'
+        args.output = os.path.join(outputdir, fn.experiment_name(args))
+    else:
+        outputdir = pathlib.Path(args.output).parent.name
+    if outputdir:
+        qp.util.create_if_not_exist(outputdir)
 
     main(args)
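One detail of the new --output handling worth flagging: pathlib.Path(args.output).parent.name keeps only the last component of the parent directory. For a single-level relative path like './results/file.csv' the name coincides with the directory, so qp.util.create_if_not_exist(outputdir) still works, but a nested output path would lose its prefix. Illustrative paths only:

import pathlib

print(pathlib.Path('./results/file.csv').parent.name)  # 'results'  (works here)
print(pathlib.Path('out/deep/file.csv').parent.name)   # 'deep'     (drops 'out/')
print(str(pathlib.Path('out/deep/file.csv').parent))   # 'out/deep' (the full parent dir)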
plot.py (filename inferred from "from plot import eDiscoveryPlot" in main.py):

@@ -1,29 +1,34 @@
 import matplotlib.pyplot as plt
-import numpy as np
 import pandas as pd
 import sys, os, pathlib
 
-assert len(sys.argv) == 3, f'wrong args, syntax is: python {sys.argv[0]} <result_input_path> <dynamic (0|1)>'
 
-file = str(sys.argv[1])
-loop = bool(int(sys.argv[2]))
-
-print(file)
-
-plotname = pathlib.Path(file).name.replace(".csv", ".png")
-
-if not loop:
-    plt.rcParams['figure.figsize'] = [12, 12]
-    plt.rcParams['figure.dpi'] = 200
-
-# plot the data
-fig, axs = plt.subplots(5)
-
-try:
-    while True:
+class eDiscoveryPlot:
+
+    def __init__(self, datapath, outdir='./plots', loop=True, save=True):
+        self.outdir = outdir
+        self.datapath = datapath
+        self.plotname = pathlib.Path(datapath).name.replace(".csv", ".png")
+        self.loop = loop
+        self.save = save
+
+        if not loop:
+            plt.rcParams['figure.figsize'] = [12, 12]
+            plt.rcParams['figure.dpi'] = 200
+        else:
+            plt.rcParams['figure.figsize'] = [17, 17]
+            plt.rcParams['figure.dpi'] = 60
+
+        # plot the data
+        self.fig, self.axs = plt.subplots(5)
+
+    def plot(self):
+        fig, axs = self.fig, self.axs
+        loop, save = self.loop, self.save
+
         aXn = 0
-        df = pd.read_csv(file, sep='\t')
+        df = pd.read_csv(self.datapath, sep='\t')
 
         xs = df['it']

@@ -36,8 +41,8 @@ try:
         axs[aXn].legend()
         axs[aXn].grid()
         axs[aXn].set_ylabel('Recall')
-        axs[aXn].set_ylim(0,1)
-        aXn+=1
+        axs[aXn].set_ylim(0, 1)
+        aXn += 1
 
         y_r = df['te-prev']
         y_rhat = df['te-estim']

@@ -74,15 +79,27 @@ try:
         axs[aXn].set_ylabel('Train-Test Shift')
         aXn += 1
 
-        os.makedirs('./plots', exist_ok=True)
-        plt.savefig(f'./plots/{plotname}')
+        if save:
+            os.makedirs(self.outdir, exist_ok=True)
+            plt.savefig(f'{self.outdir}/{self.plotname}')
 
-        if not loop:
-            break
-        else:
+        if loop:
             plt.pause(.5)
             for i in range(aXn):
                 axs[i].cla()
 
-except KeyboardInterrupt:
-    print("\n[exit]")
+
+if __name__ == '__main__':
+
+    assert len(sys.argv) == 3, f'wrong args, syntax is: python {sys.argv[0]} <result_input_path> <dynamic (0|1)>'
+
+    file = str(sys.argv[1])
+    loop = bool(int(sys.argv[2]))
+
+    figure = eDiscoveryPlot(file)
+
+    try:
+        figure.plot(loop)
+    except KeyboardInterrupt:
+        print('\n[stop]')
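The refactor turns the plotting script into a class that main.py can drive in-process (fig = eDiscoveryPlot(args.output), then fig.plot() once per iteration) while keeping a CLI entry point for finished runs. One inconsistency in the committed code: the __main__ block calls figure.plot(loop), but plot(self) takes no argument, so the CLI path as written would raise a TypeError; loop already reaches the instance through the constructor. A hedged usage sketch (assumes a results CSV exists at the given path):

from plot import eDiscoveryPlot

# one-shot, high-resolution render of a finished experiment
fig = eDiscoveryPlot('./results/experiment.csv', outdir='./plots', loop=False, save=True)
fig.plot()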
shell script (single-experiment runner; filename not preserved in the extract):

@@ -2,17 +2,17 @@
 set -x
 
 dataset=RCV1.C4
-iter=50
+iter=100
 k=100
 initsize=500
 initprev=-1
 seed=1
-Q=ACC
+Q=GRPACC
 CLS=lr
-sampling=proportional_sampling
+sampling=relevance_sampling
 
 filepath="./results/classifier:"$CLS"__dataset:"$dataset"__initprev:"$initprev"__initsize:"$initsize"__iter:"$iter"__k:"$k"__quantifier:"$Q"__sampling:"$sampling"__seed:"$seed".csv"
 
-PYTHONPATH='.:..' python3 main.py --quantifier $Q --classifier $CLS --sampling $sampling --dataset $dataset --iter $iter --k $k --initsize $initsize --initprev $initprev --seed $seed &
-sleep 2
-PYTHONPATH='.:..' python3 plot.py $filepath 1
+PYTHONPATH='.:..' python3 main.py --quantifier $Q --classifier $CLS --sampling $sampling --dataset $dataset --iter $iter --k $k --initsize $initsize --initprev $initprev --seed $seed
+#sleep 2
+#PYTHONPATH='.:..' python3 plot.py $filepath 1
quapy/method/aggregative.py (filename inferred from the ACC/PACC class context):

@@ -200,9 +200,9 @@ class ACC(AggregativeQuantifier):
             # kFCV estimation of parameters
             y, y_ = [], []
             kfcv = StratifiedKFold(n_splits=val_split)
-            pbar = tqdm(kfcv.split(*data.Xy), total=val_split)
-            for k, (training_idx, validation_idx) in enumerate(pbar):
-                pbar.set_description(f'{self.__class__.__name__} fitting fold {k}')
+            # pbar = tqdm(kfcv.split(*data.Xy), total=val_split)
+            for k, (training_idx, validation_idx) in enumerate(kfcv.split(*data.Xy)):
+                # pbar.set_description(f'{self.__class__.__name__} fitting fold {k}')
                 training = data.sampling_from_index(training_idx)
                 validation = data.sampling_from_index(validation_idx)
                 learner, val_data = training_helper(self.learner, training, fit_learner, val_split=validation)

@@ -289,9 +289,8 @@ class PACC(AggregativeProbabilisticQuantifier):
             # kFCV estimation of parameters
             y, y_ = [], []
             kfcv = StratifiedKFold(n_splits=val_split)
-            pbar = tqdm(kfcv.split(*data.Xy), total=val_split)
-            for k, (training_idx, validation_idx) in enumerate(pbar):
-                pbar.set_description(f'{self.__class__.__name__} fitting fold {k}')
+            for k, (training_idx, validation_idx) in enumerate(kfcv.split(*data.Xy)):
+                # pbar.set_description(f'{self.__class__.__name__} fitting fold {k}')
                 training = data.sampling_from_index(training_idx)
                 validation = data.sampling_from_index(validation_idx)
                 learner, val_data = training_helper(
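Both hunks make the same change: the tqdm progress bar around the kFCV loop is silenced and the loop iterates kfcv.split(*data.Xy) directly, presumably because a quantifier is now refit at every active-learning iteration and per-fold progress bars would flood the output. For reference, a minimal sklearn-only sketch of the kFCV pattern these methods use to collect out-of-fold predictions (the real code works on QuaPy's LabelledCollection via training_helper):

import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold

def kfcv_out_of_fold(X, y, n_splits=5):
    y_true, y_pred = [], []
    kfcv = StratifiedKFold(n_splits=n_splits)
    for training_idx, validation_idx in kfcv.split(X, y):
        clf = LogisticRegression().fit(X[training_idx], y[training_idx])
        y_true.append(y[validation_idx])               # like y in the code above
        y_pred.append(clf.predict(X[validation_idx]))  # like y_
    return np.concatenate(y_true), np.concatenate(y_pred)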