forked from moreo/QuaPy
plotting y distributions over time
parent 9053ae34a6
commit 29993386ae
@@ -162,14 +162,6 @@ def estimate_prev_CC(train, pool: LabelledCollection, classifiername:str):

 def estimate_prev_Q(train, pool, quantifiername, classifiername):
-    # q = qp.model_selection.GridSearchQ(
-    #     ACC(LogisticRegression()),
-    #     param_grid={'C':np.logspace(-3,3,7), 'class_weight':[None, 'balanced']},
-    #     sample_size=len(train),
-    #     protocol='app',
-    #     n_prevpoints=21,
-    #     n_repetitions=10)
-
     q = NewQuantifier(quantifiername, classifiername)
-    # q._find_regions((train+pool).instances)
     q.fit(train)
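For context, estimate_prev_Q relies on QuaPy's standard fit/quantify API for aggregative quantifiers. A minimal sketch follows; NewQuantifier is project-specific and not shown in this diff, so QuaPy's built-in ACC is substituted here as a stand-in:

    # sketch of the fit/quantify pattern behind estimate_prev_Q
    # (ACC is a stand-in; the project's NewQuantifier is defined elsewhere)
    import quapy as qp
    from quapy.method.aggregative import ACC
    from sklearn.linear_model import LogisticRegression

    def estimate_prev_sketch(train, pool):
        q = ACC(LogisticRegression())      # any aggregative quantifier would do
        q.fit(train)                       # train is a LabelledCollection
        return q.quantify(pool.instances)  # estimated class prevalences in the pool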
@@ -181,16 +173,14 @@ def estimate_prev_Q(train, pool, quantifiername, classifiername):

 def eval_classifier(learner, test:LabelledCollection):
     predictions = learner.predict(test.instances)
     true_labels = test.labels
     # f1 = f1_score(true_labels, predictions, average='macro')
     f1 = f1_score(true_labels, predictions, average='binary')
     # f1 = (true_labels==predictions).mean()
     return f1


 def ideal_cost(classifier, pool):
     # returns the cost (in terms of number of documents) to review until the last relevant document
     # is processed, assuming the rank produced by this classifier. The cost is said to be "idealized" since
-    # one assumes to be able to stop reviewing when the last relevant is encountered
+    # one assumes to know the optimal stopping point (reached after the last relevant is encountered)

     prob = classifier.predict_proba(pool.instances)
     order = np.argsort(prob[:, 0])  # col 0 has the negative posterior prob, so the natural order is "by relevance"
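The hunk above truncates ideal_cost after the ranking step. A minimal sketch of how the idealized cost could be completed from that ranking (assuming binary labels with 1 marking relevant documents):

    import numpy as np

    def ideal_cost_sketch(classifier, pool):
        # rank the pool by relevance: column 0 holds the negative posterior,
        # so ascending order puts the most relevant documents first
        prob = classifier.predict_proba(pool.instances)
        order = np.argsort(prob[:, 0])
        ranked_labels = pool.labels[order]
        relevant = np.flatnonzero(ranked_labels == 1)
        # idealized cost = documents reviewed until the last relevant one
        return relevant[-1] + 1 if len(relevant) > 0 else 0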
@@ -6,8 +6,7 @@ import functions as fn
 import quapy as qp
 import argparse
 from quapy.data import LabelledCollection
-from plot import eDiscoveryPlot
+from plot import eDiscoveryPlot, InOutDistPlot


 def main(args):
@@ -23,9 +22,10 @@ def main(args):
     collection = qp.util.pickled_resource(f'./dataset/{datasetname}.pkl', fn.create_dataset, datasetname)
     nD = len(collection)

-    fig = eDiscoveryPlot(args.output)
+    # fig = eDiscoveryPlot(args.output)
+    fig_dist = InOutDistPlot()

-    skip_first_steps = 20
+    skip_first_steps = 1

     with qp.util.temp_seed(args.seed):
         # initial labelled data selection
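As an aside, qp.util.pickled_resource is a caching helper; roughly, it behaves like the following simplified approximation (not QuaPy's exact implementation):

    import os
    import pickle

    def pickled_resource_sketch(path, generator, *args):
        # load the pickle if it exists; otherwise generate and cache the resource
        if os.path.exists(path):
            with open(path, 'rb') as f:
                return pickle.load(f)
        result = generator(*args)
        with open(path, 'wb') as f:
            pickle.dump(result, f, pickle.HIGHEST_PROTOCOL)
        return result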
@@ -34,23 +34,20 @@ def main(args):
         else:
             idx = collection.sampling_index(init_nD, *[1 - args.initprev, args.initprev])
         train, pool = fn.split_from_index(collection, idx)
-        #first_train = LabelledCollection(train.instances, train.labels)

-        # recall_target = 0.99
         i = 0

-        # q = fn.NewQuantifier(q_name, clf_name)
-        # print('searching regions')
-        # q._find_regions((train+pool).instances)
-        # print('[done]')

         with open(args.output, 'wt') as foo:
             def tee(msg):
                 foo.write(msg + '\n')
                 foo.flush()
                 print(msg)

-            tee('it\t%\ttr-size\tte-size\ttr-prev\tte-prev\tte-estim\tte-estimCC\tR\tRhat\tRhatCC\tShift\tAE\tAE_CC\tMF1_Q\tMF1_Clf\tICost\tremaining')
+            tee('it\t%\ttr-size\tte-size\ttr-prev\tte-prev\tte-estim\tte-estimCC\tR\tRhat\tRhatCC\tShift\tAE\tAE_CC'
+                '\tMF1_Q\tMF1_Clf\tICost\tremaining\tba-prev\tba-estim')

+            batch_prev_estim, batch_prev_true, q = 0, 0, None

             while True:
@@ -85,10 +82,12 @@ def main(args):

                 tee(f'{i}\t{progress:.2f}\t{nDtr}\t{nDte}\t{tr_p[1]:.3f}\t{te_p[1]:.3f}\t{pool_p_hat_q[1]:.3f}\t{pool_p_hat_cc[1]:.3f}'
                     f'\t{r:.3f}\t{r_hat_q:.3f}\t{r_hat_cc:.3f}\t{tr_te_shift:.5f}\t{ae_q:.4f}\t{ae_cc:.4f}\t{f1_q:.3f}\t{f1_clf:.3f}'
-                    f'\t{ideal_cost}\t{pool.labels.sum()}')
+                    f'\t{ideal_cost}\t{pool.labels.sum()}\t{batch_prev_true}\t{batch_prev_estim:.3f}')

                 posteriors = classifier.predict_proba(pool.instances)
-                fig.plot(posteriors, pool.labels)
+                in_posteriors = classifier.predict_proba(train.instances)
+                # fig.plot(posteriors, pool.labels)
+                fig_dist.plot(in_posteriors, train.labels, posteriors, pool.labels)

                 if nDte < k:
                     print('[stop] too few documents remaining')
@@ -98,6 +97,12 @@ def main(args):
                     break

                 top_relevant_idx = sampling_fn(pool, classifier, k, progress)

+                if q is not None:
+                    batch = pool.sampling_from_index(top_relevant_idx)
+                    batch_prev_estim = q.quantify(batch.instances)[1]
+                    batch_prev_true = batch.prevalence()[1]
+
                 train, pool = fn.move_documents(train, pool, top_relevant_idx)

                 i += 1
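The batch bookkeeping added in this hunk compares, at every active-learning step, the quantifier's estimate of the batch prevalence against the true one. In isolation, the logic amounts to this sketch (names follow the diff; sampling_from_index and prevalence are standard LabelledCollection methods):

    def batch_prevalence_check(q, pool, top_relevant_idx):
        # quantify the batch about to be annotated and record estimated
        # vs. true positive prevalence (the truth is known in simulation)
        batch = pool.sampling_from_index(top_relevant_idx)
        batch_prev_estim = q.quantify(batch.instances)[1]
        batch_prev_true = batch.prevalence()[1]
        return batch_prev_estim, batch_prev_true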
@@ -142,6 +142,60 @@ class eDiscoveryPlot:
         self.calls += 1


+class InOutDistPlot:
+
+    def __init__(self, refreshEach=1):
+        self.refreshEach = refreshEach
+
+        # plot the data
+        self.fig, self.axs = plt.subplots(2)
+        self.calls = 0
+
+    def _plot_dist(self, posteriors, y, aXn, title):
+        positive_posteriors = posteriors[y == 1, 1]
+        negative_posteriors = posteriors[y == 0, 1]
+        self.axs[aXn].hist(negative_posteriors, bins=50, label=r'$Pr(x|\ominus)$', density=False, alpha=.75)
+        self.axs[aXn].hist(positive_posteriors, bins=50, label=r'$Pr(x|\oplus)$', density=False, alpha=.75)
+        self.axs[aXn].legend()
+        self.axs[aXn].grid()
+        self.axs[aXn].set_xlim(0, 1)
+        self.axs[aXn].set_ylabel(title)
+
+    def plot(self, in_posteriors, in_y, out_posteriors, out_y):
+
+        if (self.calls + 1) % self.refreshEach != 0:
+            self.calls += 1
+            return
+
+        fig, axs = self.fig, self.axs
+
+        aXn = 0
+
+        # in-posteriors distribution
+        self._plot_dist(in_posteriors, in_y, aXn, title='training distribution')
+        aXn += 1
+
+        # out-posteriors distribution
+        self._plot_dist(out_posteriors, out_y, aXn, title='pool distribution')
+        aXn += 1
+
+        for i in range(aXn):
+            if self.calls == 0:
+                # shrink current axis by 20% to make room for the legend
+                box = axs[i].get_position()
+                axs[i].set_position([box.x0, box.y0, box.width * 0.8, box.height])
+                fig.tight_layout()
+
+            # put the legend to the right of the current axis
+            axs[i].legend(loc='center left', bbox_to_anchor=(1, 0.5))
+
+        plt.pause(.5)
+        for i in range(aXn):
+            axs[i].cla()
+
+        self.calls += 1
 if __name__ == '__main__':

     assert len(sys.argv) == 3, f'wrong args, syntax is: python {sys.argv[0]} <result_input_path> <dynamic (0|1)>'
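A hypothetical usage sketch for the new InOutDistPlot, mirroring the call added to the main loop (train_classifier and n_steps are placeholders, not names from this repository):

    fig_dist = InOutDistPlot(refreshEach=1)

    for step in range(n_steps):
        classifier = train_classifier(train)  # placeholder for the script's learner
        # posterior distributions of the labelled set (in) vs. the pool (out)
        in_posteriors = classifier.predict_proba(train.instances)
        out_posteriors = classifier.predict_proba(pool.instances)
        fig_dist.plot(in_posteriors, train.labels, out_posteriors, pool.labels)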