1
0
Fork 0

trying to create a quantifier that applies local adjustment conditioned on the posterior probabilities, which is not working

This commit is contained in:
Alejandro Moreo Fernandez 2022-02-15 18:24:45 +01:00
parent 31fb154e01
commit 8a48f679a7
5 changed files with 164 additions and 34 deletions

View File

@ -3,11 +3,12 @@ import sklearn
from sklearn.base import BaseEstimator from sklearn.base import BaseEstimator
from sklearn.calibration import CalibratedClassifierCV from sklearn.calibration import CalibratedClassifierCV
from sklearn.linear_model import LogisticRegression from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.svm import LinearSVC, SVC from sklearn.svm import LinearSVC, SVC
import quapy as qp import quapy as qp
from eDiscovery.method import RegionAdjustment, RegionProbAdjustment, RegionProbAdjustmentGlobal, RegionAdjustmentQ, \ from eDiscovery.method import RegionAdjustment, RegionProbAdjustment, RegionProbAdjustmentGlobal, RegionAdjustmentQ, \
ClassWeightPCC ClassWeightPCC, PosteriorConditionalAdjustemnt
from quapy.data import LabelledCollection from quapy.data import LabelledCollection
from quapy.method.aggregative import EMQ, CC, PACC, PCC, HDy, ACC from quapy.method.aggregative import EMQ, CC, PACC, PCC, HDy, ACC
import numpy as np import numpy as np
@ -51,6 +52,8 @@ def NewQuantifier(quantifiername, classifiername):
# return CC(CalibratedClassifierCV(NewClassifier(classifiername))) # return CC(CalibratedClassifierCV(NewClassifier(classifiername)))
return ClassWeightPCC() return ClassWeightPCC()
return RegionProbAdjustmentGlobal(newQ, k=10, clustering='kmeans') return RegionProbAdjustmentGlobal(newQ, k=10, clustering='kmeans')
if quantifiername == 'PCAD': # posterior-conditional adjustment
return PosteriorConditionalAdjustemnt()
raise ValueError('unknown quantifier', quantifiername) raise ValueError('unknown quantifier', quantifiername)
@ -171,6 +174,26 @@ def estimate_prev_Q(train, pool, quantifiername, classifiername):
q.fit(train) q.fit(train)
prev = q.quantify(pool.instances) prev = q.quantify(pool.instances)
return prev, None return prev, q
def eval_classifier(learner, test:LabelledCollection):
predictions = learner.predict(test.instances)
true_labels = test.labels
# f1 = f1_score(true_labels, predictions, average='macro')
f1 = f1_score(true_labels, predictions, average='binary')
# f1 = (true_labels==predictions).mean()
return f1
def ideal_cost(classifier, pool):
# returns the cost (in terms of number of documents) to review until the last relevant document
# is processed, assuming the rank produced by this classifier. The cost is said to be "idealized" since
# one assumes to be able to stop reviewing when the last relevant is encountered
prob = classifier.predict_proba(pool.instances)
order = np.argsort(prob[:,0]) # col 0 has negative posterior prob, so the natural order is "by relevance"
ranked_labels = pool.labels[order]
num_relevant = np.sum(pool.labels)
idealized_cost = np.argwhere(np.cumsum(ranked_labels)==num_relevant).min()
return idealized_cost

View File

@ -9,14 +9,6 @@ from quapy.data import LabelledCollection
from plot import eDiscoveryPlot from plot import eDiscoveryPlot
def eval_classifier(learner, test:LabelledCollection):
predictions = learner.predict(test.instances)
true_labels = test.labels
# f1 = f1_score(true_labels, predictions, average='macro')
f1 = f1_score(true_labels, predictions, average='binary')
# f1 = (true_labels==predictions).mean()
return f1
def main(args): def main(args):
@ -33,7 +25,7 @@ def main(args):
fig = eDiscoveryPlot(args.output) fig = eDiscoveryPlot(args.output)
skip_first_steps = 0 skip_first_steps = 20
with qp.util.temp_seed(args.seed): with qp.util.temp_seed(args.seed):
# initial labelled data selection # initial labelled data selection
@ -58,28 +50,31 @@ def main(args):
foo.flush() foo.flush()
print(msg) print(msg)
tee('it\t%\ttr-size\tte-size\ttr-prev\tte-prev\tte-estim\tte-estimCC\tR\tRhat\tRhatCC\tShift\tAE\tAE_CC\tMF1_Q\tMF1_Clf') tee('it\t%\ttr-size\tte-size\ttr-prev\tte-prev\tte-estim\tte-estimCC\tR\tRhat\tRhatCC\tShift\tAE\tAE_CC\tMF1_Q\tMF1_Clf\tICost\tremaining')
while True: while True:
pool_p_hat_cc, classifier = fn.estimate_prev_CC(train, pool, clf_name) pool_p_hat_cc, classifier = fn.estimate_prev_CC(train, pool, clf_name)
ideal_cost = fn.ideal_cost(classifier, pool)
nDtr = len(train) nDtr = len(train)
nDte = len(pool) nDte = len(pool)
progress = 100 * nDtr / nD progress = 100 * nDtr / nD
if i >= skip_first_steps: if i >= skip_first_steps:
pool_p_hat_q, q_classifier = fn.estimate_prev_Q(train, pool, q_name, clf_name) pool_p_hat_q, q = fn.estimate_prev_Q(train, pool, q_name, clf_name)
# q.fit(train)
# pool_p_hat_q = q.quantify(pool.instances)
# q_classifier = q.learner
f1_clf = eval_classifier(classifier, pool) f1_clf = 0 # eval_classifier(classifier, pool)
f1_q = 0 #eval_classifier(q_classifier, pool) f1_q = 0 #eval_classifier(q_classifier, pool)
tr_p = train.prevalence() tr_p = train.prevalence()
te_p = pool.prevalence() te_p = pool.prevalence()
# this is based on an observation by D.Lewis "it is convenient to have the same kind of systematic"
# error both in the numerator and in the denominator
#tr_p_hat = q.quantify(train.instances)
#r_hat_q = fn.recall(tr_p_hat, pool_p_hat_q, nDtr, nDte)
r_hat_cc = fn.recall(tr_p, pool_p_hat_cc, nDtr, nDte) r_hat_cc = fn.recall(tr_p, pool_p_hat_cc, nDtr, nDte)
r_hat_q = fn.recall(tr_p, pool_p_hat_q, nDtr, nDte) r_hat_q = fn.recall(tr_p, pool_p_hat_q, nDtr, nDte)
r = fn.recall(tr_p, te_p, nDtr, nDte) r = fn.recall(tr_p, te_p, nDtr, nDte)
@ -89,9 +84,8 @@ def main(args):
ae_cc = qp.error.ae(te_p, pool_p_hat_cc) ae_cc = qp.error.ae(te_p, pool_p_hat_cc)
tee(f'{i}\t{progress:.2f}\t{nDtr}\t{nDte}\t{tr_p[1]:.3f}\t{te_p[1]:.3f}\t{pool_p_hat_q[1]:.3f}\t{pool_p_hat_cc[1]:.3f}' tee(f'{i}\t{progress:.2f}\t{nDtr}\t{nDte}\t{tr_p[1]:.3f}\t{te_p[1]:.3f}\t{pool_p_hat_q[1]:.3f}\t{pool_p_hat_cc[1]:.3f}'
f'\t{r:.3f}\t{r_hat_q:.3f}\t{r_hat_cc:.3f}\t{tr_te_shift:.5f}\t{ae_q:.4f}\t{ae_cc:.4f}\t{f1_q:.3f}\t{f1_clf:.3f}') f'\t{r:.3f}\t{r_hat_q:.3f}\t{r_hat_cc:.3f}\t{tr_te_shift:.5f}\t{ae_q:.4f}\t{ae_cc:.4f}\t{f1_q:.3f}\t{f1_clf:.3f}'
f'\t{ideal_cost}\t{pool.labels.sum()}')
raise Exception('add idealized costs for each iteration and plots')
posteriors = classifier.predict_proba(pool.instances) posteriors = classifier.predict_proba(pool.instances)
fig.plot(posteriors, pool.labels) fig.plot(posteriors, pool.labels)
@ -124,7 +118,7 @@ if __name__ == '__main__':
parser.add_argument('--initsize', metavar='SIZE', type=int, help='number of labelled documents at the beginning', parser.add_argument('--initsize', metavar='SIZE', type=int, help='number of labelled documents at the beginning',
default=2) default=2)
parser.add_argument('--initprev', metavar='PREV', type=float, parser.add_argument('--initprev', metavar='PREV', type=float,
help='prevalence of the initial sample (-1 for uniform sampling, default)', help='prevalence of the initial sample (-1 for uniform sampling)',
default=0.5) default=0.5)
parser.add_argument('--seed', metavar='SEED', type=int, parser.add_argument('--seed', metavar='SEED', type=int,
help='random seed', help='random seed',

View File

@ -5,9 +5,12 @@ from sklearn.cluster import KMeans, OPTICS
from sklearn.decomposition import TruncatedSVD from sklearn.decomposition import TruncatedSVD
from sklearn.linear_model import LogisticRegression from sklearn.linear_model import LogisticRegression
from sklearn.mixture import GaussianMixture from sklearn.mixture import GaussianMixture
from sklearn.model_selection import cross_val_predict
from quapy.method.base import BaseQuantifier, BinaryQuantifier from quapy.method.base import BaseQuantifier, BinaryQuantifier
from quapy.data import LabelledCollection from quapy.data import LabelledCollection
from quapy.method.aggregative import ACC, PACC, PCC from quapy.method.aggregative import ACC, PACC, PCC
import quapy.functional as F
class RegionAdjustmentQ(BaseQuantifier): class RegionAdjustmentQ(BaseQuantifier):
@ -319,6 +322,89 @@ class ClassWeightPCC(BaseQuantifier):
def get_params(self, deep=True): def get_params(self, deep=True):
return self.learner.get_params() return self.learner.get_params()
@property
def classes_(self):
return self.train.classes_
class PosteriorConditionalAdjustemnt(BaseQuantifier):
def __init__(self):
self.estimator = LogisticRegression()
self.k = 3
def get_adjustment_matrix(self, y, prob):
n_classes = 2
classes = [0, 1]
confusion = np.empty(shape=(n_classes, n_classes))
for i, class_ in enumerate(classes):
index = y == class_
if any(index):
confusion[i] = prob[index].mean(axis=0)
else:
if i == 0:
confusion[i] = np.asarray([1,0])
else:
confusion[i] = np.asarray([0, 1])
confusion = confusion.T
return confusion
def fit(self, data: LabelledCollection, fit_learner=True):
X, y = data.Xy
proba = cross_val_predict(self.estimator, X, y, n_jobs=-1, method='predict_proba')
order = np.argsort(proba[:,1])
proba = proba[order]
y = y[order]
X = X[order] # to keep the alignment for the final classifier
n = len(data)
bucket_size = n // self.k
bucket_remainder = n % bucket_size
self.buckets = {}
self.prob_separations = []
for bucket in range(self.k):
from_pos = bucket*bucket_size
to_pos = (bucket+1)*bucket_size + (bucket_remainder if bucket==self.k-1 else 0)
slice_b = slice(from_pos, to_pos)
y_b = y[slice_b]
proba_b = proba[slice_b]
self.buckets[bucket] = self.get_adjustment_matrix(y_b, proba_b)
self.prob_separations.append(proba_b[-1,1])
self.prob_separations[-1] = 1 # the last one should account for the entire prob
self.estimator.fit(X,y)
return self
def quantify(self, instances):
proba = self.estimator.predict_proba(instances)
#proba = sorted(proba, key=lambda p:p[1])
prev = np.zeros(shape=2, dtype=np.float)
n = proba.shape[0]
last_prob_sep = 0
for b, prob_sep in enumerate(self.prob_separations):
proba_b = proba[np.logical_and(proba[:,1] >= last_prob_sep, proba[:,1] < prob_sep)]
last_prob_sep=prob_sep
if proba_b.size > 0:
pcc_b = F.prevalence_from_probabilities(proba_b, binarize=False)
adj_matrix = self.buckets[b]
pacc_b = ACC.solve_adjustment(adj_matrix, pcc_b)
bucket_prev = proba_b.shape[0] / n
print(f'bucket {b} -> {F.strprev(pacc_b)} with prop {bucket_prev:.4f}')
prev += (pacc_b*bucket_prev)
print(F.strprev(prev))
return prev
def set_params(self, **parameters):
# parameters = {p:v for p,v in parameters.items()}
# print(parameters)
self.learner.set_params(**parameters)
def get_params(self, deep=True):
return self.learner.get_params()
@property @property
def classes_(self): def classes_(self):
return self.train.classes_ return self.train.classes_

View File

@ -5,12 +5,21 @@ import sys, os, pathlib
class eDiscoveryPlot: class eDiscoveryPlot:
def __init__(self, datapath, outdir='./plots', loop=True, save=True): def __init__(self, datapath, outdir='./plots', loop=True, save=True, showYdist=False, showCost=True, refreshEach=10):
self.outdir = outdir self.outdir = outdir
self.datapath = datapath self.datapath = datapath
self.plotname = pathlib.Path(datapath).name.replace(".csv", ".png") self.plotname = pathlib.Path(datapath).name.replace(".csv", ".png")
self.loop = loop self.loop = loop
self.save = save self.save = save
self.showYdist = showYdist
self.showCost = showCost
self.refreshEach = refreshEach
nPlots = 4
if showYdist:
nPlots+=1
if showCost:
nPlots += 1
if not loop: if not loop:
plt.rcParams['figure.figsize'] = [12, 12] plt.rcParams['figure.figsize'] = [12, 12]
@ -21,11 +30,15 @@ class eDiscoveryPlot:
plt.rcParams.update({'font.size': 15}) plt.rcParams.update({'font.size': 15})
# plot the data # plot the data
self.fig, self.axs = plt.subplots(5) self.fig, self.axs = plt.subplots(nPlots)
self.calls=0 self.calls=0
def plot(self, posteriors, y): def plot(self, posteriors, y):
if (self.calls+1) % self.refreshEach != 0:
self.calls+=1
return
fig, axs = self.fig, self.axs fig, axs = self.fig, self.axs
loop, save = self.loop, self.save loop, save = self.loop, self.save
@ -73,18 +86,31 @@ class eDiscoveryPlot:
#axs[aXn].set_ylabel('Classifiers performance') #axs[aXn].set_ylabel('Classifiers performance')
#aXn += 1 #aXn += 1
if self.showCost:
cost = df['tr-size']
idealcost = df['ICost']
totalcost = cost + idealcost
axs[aXn].plot(xs, cost, label='Cost')
axs[aXn].plot(xs, idealcost, label='IdealCost')
axs[aXn].plot(xs, totalcost, label='TotalCost')
axs[aXn].legend()
axs[aXn].grid()
axs[aXn].set_ylabel('Cost')
aXn += 1
# distribution of posterior probabilities in the pool # distribution of posterior probabilities in the pool
positive_posteriors = posteriors[y==1,1] if self.showYdist:
negative_posteriors = posteriors[y==0,1] positive_posteriors = posteriors[y==1,1]
#axs[aXn].hist([negative_posteriors, positive_posteriors], bins=50, negative_posteriors = posteriors[y==0,1]
# label=['negative', 'positive']) #axs[aXn].hist([negative_posteriors, positive_posteriors], bins=50,
axs[aXn].hist(negative_posteriors, bins=50, label='negative', density=True, alpha=.75) # label=['negative', 'positive'])
axs[aXn].hist(positive_posteriors, bins=50, label='positive', density=True, alpha=.75) axs[aXn].hist(negative_posteriors, bins=50, label='negative', density=True, alpha=.75)
axs[aXn].legend() axs[aXn].hist(positive_posteriors, bins=50, label='positive', density=True, alpha=.75)
axs[aXn].grid() axs[aXn].legend()
axs[aXn].set_xlim(0, 1) axs[aXn].grid()
axs[aXn].set_ylabel('te-$Pr(\oplus)$ distribution') axs[aXn].set_xlim(0, 1)
aXn += 1 axs[aXn].set_ylabel('te-$Pr(\oplus)$ distribution')
aXn += 1
axs[aXn].plot(xs, df['Shift'], '--k', label='shift (AE)') axs[aXn].plot(xs, df['Shift'], '--k', label='shift (AE)')
axs[aXn].plot(xs, df['tr-prev'], 'y', label='tr-$Pr(\oplus)$') axs[aXn].plot(xs, df['tr-prev'], 'y', label='tr-$Pr(\oplus)$')

View File

@ -226,6 +226,7 @@ class ACC(AggregativeQuantifier):
# estimate the matrix with entry (i,j) being the estimate of P(yi|yj), that is, the probability that a # estimate the matrix with entry (i,j) being the estimate of P(yi|yj), that is, the probability that a
# document that belongs to yj ends up being classified as belonging to yi # document that belongs to yj ends up being classified as belonging to yi
self.conf_matrix_ = confusion_matrix(y, y_).T
self.Pte_cond_estim_ = confusion_matrix(y, y_).T / class_count self.Pte_cond_estim_ = confusion_matrix(y, y_).T / class_count
return self return self