
plotting y distributions over time

Alejandro Moreo Fernandez 2022-02-23 15:29:39 +01:00
parent 9053ae34a6
commit 29993386ae
3 changed files with 73 additions and 24 deletions

View File

@@ -162,14 +162,6 @@ def estimate_prev_CC(train, pool: LabelledCollection, classifiername:str):

 def estimate_prev_Q(train, pool, quantifiername, classifiername):
-    # q = qp.model_selection.GridSearchQ(
-    #     ACC(LogisticRegression()),
-    #     param_grid={'C':np.logspace(-3,3,7), 'class_weight':[None, 'balanced']},
-    #     sample_size=len(train),
-    #     protocol='app',
-    #     n_prevpoints=21,
-    #     n_repetitions=10)
     q = NewQuantifier(quantifiername, classifiername)
     # q._find_regions((train+pool).instances)
     q.fit(train)
@@ -181,16 +173,14 @@ def estimate_prev_Q(train, pool, quantifiername, classifiername):

 def eval_classifier(learner, test:LabelledCollection):
     predictions = learner.predict(test.instances)
     true_labels = test.labels
-    # f1 = f1_score(true_labels, predictions, average='macro')
     f1 = f1_score(true_labels, predictions, average='binary')
-    # f1 = (true_labels==predictions).mean()
     return f1


 def ideal_cost(classifier, pool):
     # returns the cost (in terms of number of documents) to review until the last relevant document
     # is processed, assuming the rank produced by this classifier. The cost is said to be "idealized" since
-    # one assumes to be able to stop reviewing when the last relevant is encountered
+    # one assumes to know the optimal stopping point (reached after the last relevant is encountered)
     prob = classifier.predict_proba(pool.instances)
     order = np.argsort(prob[:,0])  # col 0 has negative posterior prob, so the natural order is "by relevance"
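The body of ideal_cost is cut off by the hunk. For clarity, here is a minimal sketch of the computation the comment describes: rank the pool by the classifier's relevance score and count how many documents must be reviewed before the last relevant one is reached. This completion is an assumption made for illustration, not the code in the repository:

    import numpy as np

    def ideal_cost_sketch(classifier, pool):
        prob = classifier.predict_proba(pool.instances)
        order = np.argsort(prob[:, 0])   # col 0 is the negative posterior, so ascending order ranks by relevance
        ranked_labels = pool.labels[order]
        relevant_positions = np.flatnonzero(ranked_labels == 1)
        # 1-based position of the last relevant document in the ranking; 0 if the pool has no relevant documents
        return int(relevant_positions[-1]) + 1 if len(relevant_positions) > 0 else 0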

View File

@@ -6,8 +6,7 @@ import functions as fn
 import quapy as qp
 import argparse
 from quapy.data import LabelledCollection
-from plot import eDiscoveryPlot
+from plot import eDiscoveryPlot, InOutDistPlot


 def main(args):
@@ -23,9 +22,10 @@ def main(args):
     collection = qp.util.pickled_resource(f'./dataset/{datasetname}.pkl', fn.create_dataset, datasetname)
     nD = len(collection)

-    fig = eDiscoveryPlot(args.output)
+    # fig = eDiscoveryPlot(args.output)
+    fig_dist = InOutDistPlot()

-    skip_first_steps = 20
+    skip_first_steps = 1

     with qp.util.temp_seed(args.seed):
         # initial labelled data selection
@@ -34,23 +34,20 @@ def main(args):
         else:
             idx = collection.sampling_index(init_nD, *[1 - args.initprev, args.initprev])
         train, pool = fn.split_from_index(collection, idx)
-        #first_train = LabelledCollection(train.instances, train.labels)

         # recall_target = 0.99

         i = 0
-        # q = fn.NewQuantifier(q_name, clf_name)
-        # print('searching regions')
-        # q._find_regions((train+pool).instances)
-        # print('[done]')

         with open(args.output, 'wt') as foo:
             def tee(msg):
                 foo.write(msg + '\n')
                 foo.flush()
                 print(msg)

-            tee('it\t%\ttr-size\tte-size\ttr-prev\tte-prev\tte-estim\tte-estimCC\tR\tRhat\tRhatCC\tShift\tAE\tAE_CC\tMF1_Q\tMF1_Clf\tICost\tremaining')
+            tee('it\t%\ttr-size\tte-size\ttr-prev\tte-prev\tte-estim\tte-estimCC\tR\tRhat\tRhatCC\tShift\tAE\tAE_CC'
+                '\tMF1_Q\tMF1_Clf\tICost\tremaining\tba-prev\tba-estim')
+
+            batch_prev_estim, batch_prev_true, q = 0, 0, None

             while True:
@@ -85,10 +82,12 @@ def main(args):
             tee(f'{i}\t{progress:.2f}\t{nDtr}\t{nDte}\t{tr_p[1]:.3f}\t{te_p[1]:.3f}\t{pool_p_hat_q[1]:.3f}\t{pool_p_hat_cc[1]:.3f}'
                 f'\t{r:.3f}\t{r_hat_q:.3f}\t{r_hat_cc:.3f}\t{tr_te_shift:.5f}\t{ae_q:.4f}\t{ae_cc:.4f}\t{f1_q:.3f}\t{f1_clf:.3f}'
-                f'\t{ideal_cost}\t{pool.labels.sum()}')
+                f'\t{ideal_cost}\t{pool.labels.sum()}\t{batch_prev_true}\t{batch_prev_estim:.3f}')

             posteriors = classifier.predict_proba(pool.instances)
-            fig.plot(posteriors, pool.labels)
+            in_posteriors = classifier.predict_proba(train.instances)
+            # fig.plot(posteriors, pool.labels)
+            fig_dist.plot(in_posteriors, train.labels, posteriors, pool.labels)

             if nDte < k:
                 print('[stop] too few documents remaining')
@@ -98,6 +97,12 @@ def main(args):
                 break

             top_relevant_idx = sampling_fn(pool, classifier, k, progress)
+
+            if q is not None:
+                batch = pool.sampling_from_index(top_relevant_idx)
+                batch_prev_estim = q.quantify(batch.instances)[1]
+                batch_prev_true = batch.prevalence()[1]
+
             train, pool = fn.move_documents(train, pool, top_relevant_idx)

             i += 1
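The block added at the end of the loop takes the batch of documents about to be labelled and records both the quantifier's estimate of its positive prevalence (logged as ba-estim) and the true prevalence computed from the batch labels (ba-prev). A self-contained sketch of this estimated-versus-true comparison using QuaPy's public API follows; the ACC quantifier, the logistic regression, and the synthetic data are stand-ins for the script's NewQuantifier and document pool, and it assumes a QuaPy version where an aggregative quantifier can be fitted directly on a LabelledCollection:

    import numpy as np
    from sklearn.linear_model import LogisticRegression
    from quapy.data import LabelledCollection
    from quapy.method.aggregative import ACC

    # synthetic binary data standing in for the document collection
    X = np.random.rand(1000, 5)
    y = (X[:, 0] > 0.6).astype(int)
    data = LabelledCollection(X, y)
    train, pool = data.split_stratified(train_prop=0.5)

    q = ACC(LogisticRegression())
    q.fit(train)

    batch = pool.sampling(100, 0.5)                     # a batch of 100 documents drawn from the pool at prevalence 0.5
    batch_prev_estim = q.quantify(batch.instances)[1]   # estimated prevalence of the positive class
    batch_prev_true = batch.prevalence()[1]             # true prevalence of the positive class
    print(f'ba-estim={batch_prev_estim:.3f}  ba-prev={batch_prev_true:.3f}')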

View File

@@ -142,6 +142,60 @@ class eDiscoveryPlot:
         self.calls += 1


+class InOutDistPlot:
+
+    def __init__(self, refreshEach=1):
+        self.refreshEach = refreshEach
+
+        # plot the data
+        self.fig, self.axs = plt.subplots(2)
+        self.calls = 0
+
+    def _plot_dist(self, posteriors, y, aXn, title):
+        positive_posteriors = posteriors[y == 1, 1]
+        negative_posteriors = posteriors[y == 0, 1]
+        self.axs[aXn].hist(negative_posteriors, bins=50, label='$Pr(x|\ominus)$', density=False, alpha=.75)
+        self.axs[aXn].hist(positive_posteriors, bins=50, label='$Pr(x|\oplus)$', density=False, alpha=.75)
+        self.axs[aXn].legend()
+        self.axs[aXn].grid()
+        self.axs[aXn].set_xlim(0, 1)
+        self.axs[aXn].set_ylabel(title)
+
+    def plot(self, in_posteriors, in_y, out_posteriors, out_y):
+
+        if (self.calls+1) % self.refreshEach != 0:
+            self.calls += 1
+            return
+
+        fig, axs = self.fig, self.axs
+
+        aXn = 0
+
+        # in-posteriors distribution
+        self._plot_dist(in_posteriors, in_y, aXn, title='training distribution')
+        aXn += 1
+
+        # out-posteriors distribution
+        self._plot_dist(out_posteriors, out_y, aXn, title='pool distribution')
+        aXn += 1
+
+        for i in range(aXn):
+            if self.calls==0:
+                # Shrink current axis by 20%
+                box = axs[i].get_position()
+                axs[i].set_position([box.x0, box.y0, box.width * 0.8, box.height])
+                fig.tight_layout()
+
+            # Put a legend to the right of the current axis
+            axs[i].legend(loc='center left', bbox_to_anchor=(1, 0.5))
+
+        plt.pause(.5)
+
+        for i in range(aXn):
+            axs[i].cla()
+
+        self.calls += 1
+
+
 if __name__ == '__main__':

     assert len(sys.argv) == 3, f'wrong args, syntax is: python {sys.argv[0]} <result_input_path> <dynamic (0|1)>'
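Taken together, the new InOutDistPlot refreshes (every refreshEach calls) two stacked panels: histograms of the positive-class posterior for negative and positive documents, computed once on the training set (in) and once on the pool (out), which is what the commit message refers to as plotting the y distributions over time. A minimal usage sketch with synthetic posteriors is given below; the helper fake_posteriors and the random labels are invented purely for illustration, and an interactive matplotlib backend is assumed since the class relies on plt.pause to refresh the figure:

    import numpy as np
    from plot import InOutDistPlot  # the module the main script imports as `plot`

    def fake_posteriors(y):
        # positive-class posterior biased towards the true label, stacked as [Pr(-|x), Pr(+|x)]
        pos = 0.3 * np.random.rand(len(y)) + 0.6 * y
        return np.column_stack([1 - pos, pos])

    fig_dist = InOutDistPlot(refreshEach=1)
    for step in range(5):
        in_y = np.random.randint(0, 2, size=200)     # labels of the (growing) training set
        out_y = np.random.randint(0, 2, size=1000)   # labels of the remaining pool
        fig_dist.plot(fake_posteriors(in_y), in_y, fake_posteriors(out_y), out_y)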