forked from moreo/QuaPy
trying to create a quantifier that applies a local adjustment conditioned on the posterior probabilities (the idea is sketched just below); it is not working yet
parent 31fb154e01
commit 8a48f679a7
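The new quantifier recorded in the diff below (PosteriorConditionalAdjustemnt) buckets the cross-validated posteriors, learns one soft confusion matrix per bucket, corrects each bucket's classify-and-count estimate with that matrix, and recombines the buckets weighted by their mass. The following is a minimal, self-contained sketch of that idea using only numpy and scikit-learn; every name in it (bucket_confusion, fit_buckets, posterior_conditional_prevalence) is illustrative and not part of this repository, and the quantile-based bucket edges are a simplification of the equal-sized slices used in the committed code.

import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_predict


def bucket_confusion(y, proba):
    # soft confusion matrix for one bucket: column j is the mean posterior vector
    # of the validation items whose true class is j (identity column if class j is absent)
    M = np.eye(2)
    for c in (0, 1):
        mask = (y == c)
        if mask.any():
            M[:, c] = proba[mask].mean(axis=0)
    return M


def fit_buckets(X, y, k=3):
    clf = LogisticRegression(max_iter=1000)
    proba = cross_val_predict(clf, X, y, method='predict_proba', n_jobs=-1)
    # bucket boundaries taken as quantiles of the positive posterior
    edges = np.quantile(proba[:, 1], np.linspace(0, 1, k + 1))
    edges[0], edges[-1] = 0.0, 1.0
    matrices = []
    for b in range(k):
        in_b = (proba[:, 1] >= edges[b]) & (proba[:, 1] <= edges[b + 1])
        matrices.append(bucket_confusion(y[in_b], proba[in_b]))
    clf.fit(X, y)
    return clf, edges, matrices


def posterior_conditional_prevalence(clf, edges, matrices, X_test):
    proba = clf.predict_proba(X_test)
    prev = np.zeros(2)
    for b, M in enumerate(matrices):
        upper = proba[:, 1] <= edges[b + 1] if b == len(matrices) - 1 else proba[:, 1] < edges[b + 1]
        in_b = (proba[:, 1] >= edges[b]) & upper
        if not in_b.any():
            continue
        pcc_b = proba[in_b].mean(axis=0)                    # PCC estimate inside the bucket
        adj_b, *_ = np.linalg.lstsq(M, pcc_b, rcond=None)   # ACC/PACC-style correction: solve M p = pcc
        adj_b = np.clip(adj_b, 0, None)
        adj_b = adj_b / adj_b.sum() if adj_b.sum() > 0 else pcc_b
        prev += in_b.mean() * adj_b                         # weight by the bucket's share of the test set
    return prev / prev.sum()

The committed class instead sorts the validation posteriors and cuts them into k equal-sized slices, recording the posterior at each slice boundary as the bucket separator.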
@@ -3,11 +3,12 @@ import sklearn
 from sklearn.base import BaseEstimator
 from sklearn.calibration import CalibratedClassifierCV
 from sklearn.linear_model import LogisticRegression
+from sklearn.metrics import f1_score
 from sklearn.svm import LinearSVC, SVC
 
 import quapy as qp
 from eDiscovery.method import RegionAdjustment, RegionProbAdjustment, RegionProbAdjustmentGlobal, RegionAdjustmentQ, \
-    ClassWeightPCC
+    ClassWeightPCC, PosteriorConditionalAdjustemnt
 from quapy.data import LabelledCollection
 from quapy.method.aggregative import EMQ, CC, PACC, PCC, HDy, ACC
 import numpy as np
@@ -51,6 +52,8 @@ def NewQuantifier(quantifiername, classifiername):
         # return CC(CalibratedClassifierCV(NewClassifier(classifiername)))
         return ClassWeightPCC()
         return RegionProbAdjustmentGlobal(newQ, k=10, clustering='kmeans')
+    if quantifiername == 'PCAD':  # posterior-conditional adjustment
+        return PosteriorConditionalAdjustemnt()
     raise ValueError('unknown quantifier', quantifiername)
 
 
@@ -171,6 +174,26 @@ def estimate_prev_Q(train, pool, quantifiername, classifiername):
     q.fit(train)
 
     prev = q.quantify(pool.instances)
-    return prev, None
+    return prev, q
 
 
+def eval_classifier(learner, test:LabelledCollection):
+    predictions = learner.predict(test.instances)
+    true_labels = test.labels
+    # f1 = f1_score(true_labels, predictions, average='macro')
+    f1 = f1_score(true_labels, predictions, average='binary')
+    # f1 = (true_labels==predictions).mean()
+    return f1
+
+
+def ideal_cost(classifier, pool):
+    # returns the cost (in terms of number of documents) to review until the last relevant document
+    # is processed, assuming the rank produced by this classifier. The cost is said to be "idealized" since
+    # one assumes to be able to stop reviewing when the last relevant is encountered
+
+    prob = classifier.predict_proba(pool.instances)
+    order = np.argsort(prob[:,0])  # col 0 has negative posterior prob, so the natural order is "by relevance"
+    ranked_labels = pool.labels[order]
+    num_relevant = np.sum(pool.labels)
+    idealized_cost = np.argwhere(np.cumsum(ranked_labels)==num_relevant).min()
+    return idealized_cost
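As a quick sanity check of ideal_cost above: the idealized cost is the 0-based rank position at which the cumulative count of relevant documents reaches its total. The ranking below is made up for illustration.

import numpy as np

# made-up pool, already sorted by decreasing relevance score
ranked_labels = np.array([1, 1, 0, 1, 0, 0, 0])
num_relevant = ranked_labels.sum()  # 3 relevant documents
idealized_cost = np.argwhere(np.cumsum(ranked_labels) == num_relevant).min()
print(idealized_cost)  # 3: the last relevant document sits at position 3, i.e. after reviewing 4 documents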
@@ -9,14 +9,6 @@ from quapy.data import LabelledCollection
 from plot import eDiscoveryPlot
 
 
-def eval_classifier(learner, test:LabelledCollection):
-    predictions = learner.predict(test.instances)
-    true_labels = test.labels
-    # f1 = f1_score(true_labels, predictions, average='macro')
-    f1 = f1_score(true_labels, predictions, average='binary')
-    # f1 = (true_labels==predictions).mean()
-    return f1
-
 def main(args):
 
@@ -33,7 +25,7 @@ def main(args):
 
     fig = eDiscoveryPlot(args.output)
 
-    skip_first_steps = 0
+    skip_first_steps = 20
 
     with qp.util.temp_seed(args.seed):
         # initial labelled data selection
@@ -58,28 +50,31 @@ def main(args):
             foo.flush()
             print(msg)
 
-        tee('it\t%\ttr-size\tte-size\ttr-prev\tte-prev\tte-estim\tte-estimCC\tR\tRhat\tRhatCC\tShift\tAE\tAE_CC\tMF1_Q\tMF1_Clf')
+        tee('it\t%\ttr-size\tte-size\ttr-prev\tte-prev\tte-estim\tte-estimCC\tR\tRhat\tRhatCC\tShift\tAE\tAE_CC\tMF1_Q\tMF1_Clf\tICost\tremaining')
 
         while True:
 
             pool_p_hat_cc, classifier = fn.estimate_prev_CC(train, pool, clf_name)
+            ideal_cost = fn.ideal_cost(classifier, pool)
 
             nDtr = len(train)
             nDte = len(pool)
             progress = 100 * nDtr / nD
 
             if i >= skip_first_steps:
-                pool_p_hat_q, q_classifier = fn.estimate_prev_Q(train, pool, q_name, clf_name)
-                # q.fit(train)
-                # pool_p_hat_q = q.quantify(pool.instances)
-                # q_classifier = q.learner
+                pool_p_hat_q, q = fn.estimate_prev_Q(train, pool, q_name, clf_name)
 
-                f1_clf = eval_classifier(classifier, pool)
+                f1_clf = 0 # eval_classifier(classifier, pool)
                 f1_q = 0 #eval_classifier(q_classifier, pool)
 
                 tr_p = train.prevalence()
                 te_p = pool.prevalence()
 
+                # this is based on an observation by D.Lewis "it is convenient to have the same kind of systematic"
+                # error both in the numerator and in the denominator
+                #tr_p_hat = q.quantify(train.instances)
+                #r_hat_q = fn.recall(tr_p_hat, pool_p_hat_q, nDtr, nDte)
+
                 r_hat_cc = fn.recall(tr_p, pool_p_hat_cc, nDtr, nDte)
                 r_hat_q = fn.recall(tr_p, pool_p_hat_q, nDtr, nDte)
                 r = fn.recall(tr_p, te_p, nDtr, nDte)
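The recall tracked in this loop compares the labelled training set against the still-unreviewed pool. fn.recall is not part of this diff, so the sketch below is an assumption about its form, included only to make the commented-out D. Lewis remark above concrete: plugging the quantifier's estimate of the training prevalence into the numerator would put the same systematic error in both numerator and denominator.

def recall(train_prev, pool_prev, n_train, n_pool):
    # assumed form of fn.recall: relevant documents already reviewed divided by
    # (reviewed relevant + relevant still estimated to remain in the pool)
    found = train_prev[1] * n_train
    still_in_pool = pool_prev[1] * n_pool
    return found / (found + still_in_pool)

Under this reading, r_hat_q uses the true training prevalence together with the quantifier's pool estimate, while the commented-out variant would replace the former with q.quantify(train.instances).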
@@ -89,9 +84,8 @@ def main(args):
                 ae_cc = qp.error.ae(te_p, pool_p_hat_cc)
 
                 tee(f'{i}\t{progress:.2f}\t{nDtr}\t{nDte}\t{tr_p[1]:.3f}\t{te_p[1]:.3f}\t{pool_p_hat_q[1]:.3f}\t{pool_p_hat_cc[1]:.3f}'
-                    f'\t{r:.3f}\t{r_hat_q:.3f}\t{r_hat_cc:.3f}\t{tr_te_shift:.5f}\t{ae_q:.4f}\t{ae_cc:.4f}\t{f1_q:.3f}\t{f1_clf:.3f}')
-                raise Exception('add idealized costs for each iteration and plots')
-
+                    f'\t{r:.3f}\t{r_hat_q:.3f}\t{r_hat_cc:.3f}\t{tr_te_shift:.5f}\t{ae_q:.4f}\t{ae_cc:.4f}\t{f1_q:.3f}\t{f1_clf:.3f}'
+                    f'\t{ideal_cost}\t{pool.labels.sum()}')
 
             posteriors = classifier.predict_proba(pool.instances)
             fig.plot(posteriors, pool.labels)
@@ -124,7 +118,7 @@ if __name__ == '__main__':
     parser.add_argument('--initsize', metavar='SIZE', type=int, help='number of labelled documents at the beginning',
                         default=2)
     parser.add_argument('--initprev', metavar='PREV', type=float,
-                        help='prevalence of the initial sample (-1 for uniform sampling, default)',
+                        help='prevalence of the initial sample (-1 for uniform sampling)',
                         default=0.5)
     parser.add_argument('--seed', metavar='SEED', type=int,
                         help='random seed',
@@ -5,9 +5,12 @@ from sklearn.cluster import KMeans, OPTICS
 from sklearn.decomposition import TruncatedSVD
 from sklearn.linear_model import LogisticRegression
 from sklearn.mixture import GaussianMixture
+from sklearn.model_selection import cross_val_predict
 
 from quapy.method.base import BaseQuantifier, BinaryQuantifier
 from quapy.data import LabelledCollection
 from quapy.method.aggregative import ACC, PACC, PCC
+import quapy.functional as F
+
 
 class RegionAdjustmentQ(BaseQuantifier):
@@ -319,6 +322,89 @@ class ClassWeightPCC(BaseQuantifier):
     def get_params(self, deep=True):
         return self.learner.get_params()
 
+    @property
+    def classes_(self):
+        return self.train.classes_
+
+
+class PosteriorConditionalAdjustemnt(BaseQuantifier):
+
+    def __init__(self):
+        self.estimator = LogisticRegression()
+        self.k = 3
+
+    def get_adjustment_matrix(self, y, prob):
+        n_classes = 2
+        classes = [0, 1]
+        confusion = np.empty(shape=(n_classes, n_classes))
+        for i, class_ in enumerate(classes):
+            index = y == class_
+            if any(index):
+                confusion[i] = prob[index].mean(axis=0)
+            else:
+                if i == 0:
+                    confusion[i] = np.asarray([1,0])
+                else:
+                    confusion[i] = np.asarray([0, 1])
+
+        confusion = confusion.T
+        return confusion
+
+    def fit(self, data: LabelledCollection, fit_learner=True):
+        X, y = data.Xy
+        proba = cross_val_predict(self.estimator, X, y, n_jobs=-1, method='predict_proba')
+
+        order = np.argsort(proba[:,1])
+        proba = proba[order]
+        y = y[order]
+        X = X[order]  # to keep the alignment for the final classifier
+        n = len(data)
+        bucket_size = n // self.k
+        bucket_remainder = n % bucket_size
+        self.buckets = {}
+        self.prob_separations = []
+        for bucket in range(self.k):
+            from_pos = bucket*bucket_size
+            to_pos = (bucket+1)*bucket_size + (bucket_remainder if bucket==self.k-1 else 0)
+            slice_b = slice(from_pos, to_pos)
+            y_b = y[slice_b]
+            proba_b = proba[slice_b]
+            self.buckets[bucket] = self.get_adjustment_matrix(y_b, proba_b)
+            self.prob_separations.append(proba_b[-1,1])
+        self.prob_separations[-1] = 1  # the last one should account for the entire prob
+
+        self.estimator.fit(X,y)
+        return self
+
+    def quantify(self, instances):
+        proba = self.estimator.predict_proba(instances)
+        #proba = sorted(proba, key=lambda p:p[1])
+
+        prev = np.zeros(shape=2, dtype=np.float)
+        n = proba.shape[0]
+        last_prob_sep = 0
+        for b, prob_sep in enumerate(self.prob_separations):
+            proba_b = proba[np.logical_and(proba[:,1] >= last_prob_sep, proba[:,1] < prob_sep)]
+            last_prob_sep=prob_sep
+            if proba_b.size > 0:
+                pcc_b = F.prevalence_from_probabilities(proba_b, binarize=False)
+                adj_matrix = self.buckets[b]
+                pacc_b = ACC.solve_adjustment(adj_matrix, pcc_b)
+                bucket_prev = proba_b.shape[0] / n
+                print(f'bucket {b} -> {F.strprev(pacc_b)} with prop {bucket_prev:.4f}')
+                prev += (pacc_b*bucket_prev)
+
+        print(F.strprev(prev))
+        return prev
+
+    def set_params(self, **parameters):
+        # parameters = {p:v for p,v in parameters.items()}
+        # print(parameters)
+        self.learner.set_params(**parameters)
+
+    def get_params(self, deep=True):
+        return self.learner.get_params()
+
     @property
     def classes_(self):
         return self.train.classes_
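A hypothetical smoke test for the new class. The dataset choice and the comparison against plain PCC are assumptions: it presumes this fork still exposes QuaPy's fetch_reviews loader and that eDiscovery.method is importable as in the imports above.

import quapy as qp
import quapy.functional as F
from sklearn.linear_model import LogisticRegression
from quapy.method.aggregative import PCC
from eDiscovery.method import PosteriorConditionalAdjustemnt

# binary sentiment dataset shipped with QuaPy, used here only as a convenient testbed
data = qp.datasets.fetch_reviews('hp', tfidf=True, min_df=5)

pcad = PosteriorConditionalAdjustemnt().fit(data.training)
pcc = PCC(LogisticRegression(max_iter=1000)).fit(data.training)

print('true prevalence:', F.strprev(data.test.prevalence()))
print('PCAD estimate  :', F.strprev(pcad.quantify(data.test.instances)))
print('PCC estimate   :', F.strprev(pcc.quantify(data.test.instances)))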
@@ -5,12 +5,21 @@ import sys, os, pathlib
 
 class eDiscoveryPlot:
 
-    def __init__(self, datapath, outdir='./plots', loop=True, save=True):
+    def __init__(self, datapath, outdir='./plots', loop=True, save=True, showYdist=False, showCost=True, refreshEach=10):
         self.outdir = outdir
         self.datapath = datapath
         self.plotname = pathlib.Path(datapath).name.replace(".csv", ".png")
         self.loop = loop
         self.save = save
+        self.showYdist = showYdist
+        self.showCost = showCost
+        self.refreshEach = refreshEach
+
+        nPlots = 4
+        if showYdist:
+            nPlots+=1
+        if showCost:
+            nPlots += 1
 
         if not loop:
             plt.rcParams['figure.figsize'] = [12, 12]
@@ -21,11 +30,15 @@ class eDiscoveryPlot:
         plt.rcParams.update({'font.size': 15})
 
         # plot the data
-        self.fig, self.axs = plt.subplots(5)
+        self.fig, self.axs = plt.subplots(nPlots)
         self.calls=0
 
     def plot(self, posteriors, y):
 
+        if (self.calls+1) % self.refreshEach != 0:
+            self.calls+=1
+            return
+
         fig, axs = self.fig, self.axs
         loop, save = self.loop, self.save
 
@@ -73,18 +86,31 @@ class eDiscoveryPlot:
         #axs[aXn].set_ylabel('Classifiers performance')
         #aXn += 1
 
+        if self.showCost:
+            cost = df['tr-size']
+            idealcost = df['ICost']
+            totalcost = cost + idealcost
+            axs[aXn].plot(xs, cost, label='Cost')
+            axs[aXn].plot(xs, idealcost, label='IdealCost')
+            axs[aXn].plot(xs, totalcost, label='TotalCost')
+            axs[aXn].legend()
+            axs[aXn].grid()
+            axs[aXn].set_ylabel('Cost')
+            aXn += 1
+
         # distribution of posterior probabilities in the pool
-        positive_posteriors = posteriors[y==1,1]
-        negative_posteriors = posteriors[y==0,1]
-        #axs[aXn].hist([negative_posteriors, positive_posteriors], bins=50,
-        #              label=['negative', 'positive'])
-        axs[aXn].hist(negative_posteriors, bins=50, label='negative', density=True, alpha=.75)
-        axs[aXn].hist(positive_posteriors, bins=50, label='positive', density=True, alpha=.75)
-        axs[aXn].legend()
-        axs[aXn].grid()
-        axs[aXn].set_xlim(0, 1)
-        axs[aXn].set_ylabel('te-$Pr(\oplus)$ distribution')
-        aXn += 1
+        if self.showYdist:
+            positive_posteriors = posteriors[y==1,1]
+            negative_posteriors = posteriors[y==0,1]
+            #axs[aXn].hist([negative_posteriors, positive_posteriors], bins=50,
+            #              label=['negative', 'positive'])
+            axs[aXn].hist(negative_posteriors, bins=50, label='negative', density=True, alpha=.75)
+            axs[aXn].hist(positive_posteriors, bins=50, label='positive', density=True, alpha=.75)
+            axs[aXn].legend()
+            axs[aXn].grid()
+            axs[aXn].set_xlim(0, 1)
+            axs[aXn].set_ylabel('te-$Pr(\oplus)$ distribution')
+            aXn += 1
 
         axs[aXn].plot(xs, df['Shift'], '--k', label='shift (AE)')
         axs[aXn].plot(xs, df['tr-prev'], 'y', label='tr-$Pr(\oplus)$')
@@ -226,6 +226,7 @@ class ACC(AggregativeQuantifier):
 
         # estimate the matrix with entry (i,j) being the estimate of P(yi|yj), that is, the probability that a
        # document that belongs to yj ends up being classified as belonging to yi
+        self.conf_matrix_ = confusion_matrix(y, y_).T
         self.Pte_cond_estim_ = confusion_matrix(y, y_).T / class_count
 
         return self
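To make the stored matrix concrete: the adjustment in ACC treats the observed classify-and-count prevalence as the true prevalence pushed through the misclassification rates described in the comment above, and inverts that relation. A toy numerical example (all rates invented for illustration):

import numpy as np

# column j holds P(predicted=i | true=j); here TPR=0.8 and FPR=0.1
M = np.array([[0.9, 0.2],
              [0.1, 0.8]])
true_prev = np.array([0.7, 0.3])
observed_cc = M @ true_prev                  # what classify-and-count would report: [0.69, 0.31]
recovered = np.linalg.solve(M, observed_cc)  # invert the misclassification rates
print(recovered)                             # [0.7, 0.3] -> the adjusted (ACC) estimate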