diff --git a/eDiscovery/functions.py b/eDiscovery/functions.py index 32a41a0..45e68f9 100644 --- a/eDiscovery/functions.py +++ b/eDiscovery/functions.py @@ -3,11 +3,12 @@ import sklearn from sklearn.base import BaseEstimator from sklearn.calibration import CalibratedClassifierCV from sklearn.linear_model import LogisticRegression +from sklearn.metrics import f1_score from sklearn.svm import LinearSVC, SVC import quapy as qp from eDiscovery.method import RegionAdjustment, RegionProbAdjustment, RegionProbAdjustmentGlobal, RegionAdjustmentQ, \ - ClassWeightPCC + ClassWeightPCC, PosteriorConditionalAdjustemnt from quapy.data import LabelledCollection from quapy.method.aggregative import EMQ, CC, PACC, PCC, HDy, ACC import numpy as np @@ -49,9 +50,11 @@ def NewQuantifier(quantifiername, classifiername): def newQ(): # return PACC(NewClassifier(classifiername), val_split=0.4) # return CC(CalibratedClassifierCV(NewClassifier(classifiername))) - # return ClassWeightPCC() - return CC(NewClassifier(classifiername)) - return RegionProbAdjustmentGlobal(newQ, k=20, clustering='kmeans') + return ClassWeightPCC() + return RegionProbAdjustmentGlobal(newQ, k=10, clustering='kmeans') + if quantifiername == 'PCAD': # posterior-conditional adjustment + return PosteriorConditionalAdjustemnt() + raise ValueError('unknown quantifier', quantifiername) @@ -172,6 +175,26 @@ def estimate_prev_Q(train, pool, quantifiername, classifiername): q.fit(train) prev = q.quantify(pool.instances) - return prev, None + return prev, q +def eval_classifier(learner, test:LabelledCollection): + predictions = learner.predict(test.instances) + true_labels = test.labels + # f1 = f1_score(true_labels, predictions, average='macro') + f1 = f1_score(true_labels, predictions, average='binary') + # f1 = (true_labels==predictions).mean() + return f1 + + +def ideal_cost(classifier, pool): + # returns the cost (in terms of number of documents) to review until the last relevant document + # is processed, assuming the rank produced by this classifier. The cost is said to be "idealized" since + # one assumes to be able to stop reviewing when the last relevant is encountered + + prob = classifier.predict_proba(pool.instances) + order = np.argsort(prob[:,0]) # col 0 has negative posterior prob, so the natural order is "by relevance" + ranked_labels = pool.labels[order] + num_relevant = np.sum(pool.labels) + idealized_cost = np.argwhere(np.cumsum(ranked_labels)==num_relevant).min() + return idealized_cost \ No newline at end of file diff --git a/eDiscovery/main.py b/eDiscovery/main.py index ded1f62..0a740c1 100644 --- a/eDiscovery/main.py +++ b/eDiscovery/main.py @@ -9,14 +9,6 @@ from quapy.data import LabelledCollection from plot import eDiscoveryPlot -def eval_classifier(learner, test:LabelledCollection): - predictions = learner.predict(test.instances) - true_labels = test.labels - # f1 = f1_score(true_labels, predictions, average='macro') - f1 = f1_score(true_labels, predictions, average='binary') - # f1 = (true_labels==predictions).mean() - return f1 - def main(args): @@ -33,7 +25,7 @@ def main(args): fig = eDiscoveryPlot(args.output) - skip_first_steps = 0 + skip_first_steps = 20 with qp.util.temp_seed(args.seed): # initial labelled data selection @@ -58,28 +50,31 @@ def main(args): foo.flush() print(msg) - tee('it\t%\ttr-size\tte-size\ttr-prev\tte-prev\tte-estim\tte-estimCC\tR\tRhat\tRhatCC\tShift\tAE\tAE_CC\tMF1_Q\tMF1_Clf') + tee('it\t%\ttr-size\tte-size\ttr-prev\tte-prev\tte-estim\tte-estimCC\tR\tRhat\tRhatCC\tShift\tAE\tAE_CC\tMF1_Q\tMF1_Clf\tICost\tremaining') while True: pool_p_hat_cc, classifier = fn.estimate_prev_CC(train, pool, clf_name) + ideal_cost = fn.ideal_cost(classifier, pool) nDtr = len(train) nDte = len(pool) progress = 100 * nDtr / nD if i >= skip_first_steps: - pool_p_hat_q, q_classifier = fn.estimate_prev_Q(train, pool, q_name, clf_name) - # q.fit(train) - # pool_p_hat_q = q.quantify(pool.instances) - # q_classifier = q.learner + pool_p_hat_q, q = fn.estimate_prev_Q(train, pool, q_name, clf_name) - f1_clf = eval_classifier(classifier, pool) + f1_clf = 0 # eval_classifier(classifier, pool) f1_q = 0 #eval_classifier(q_classifier, pool) tr_p = train.prevalence() te_p = pool.prevalence() + # this is based on an observation by D.Lewis "it is convenient to have the same kind of systematic" + # error both in the numerator and in the denominator + #tr_p_hat = q.quantify(train.instances) + #r_hat_q = fn.recall(tr_p_hat, pool_p_hat_q, nDtr, nDte) + r_hat_cc = fn.recall(tr_p, pool_p_hat_cc, nDtr, nDte) r_hat_q = fn.recall(tr_p, pool_p_hat_q, nDtr, nDte) r = fn.recall(tr_p, te_p, nDtr, nDte) @@ -89,7 +84,8 @@ def main(args): ae_cc = qp.error.ae(te_p, pool_p_hat_cc) tee(f'{i}\t{progress:.2f}\t{nDtr}\t{nDte}\t{tr_p[1]:.3f}\t{te_p[1]:.3f}\t{pool_p_hat_q[1]:.3f}\t{pool_p_hat_cc[1]:.3f}' - f'\t{r:.3f}\t{r_hat_q:.3f}\t{r_hat_cc:.3f}\t{tr_te_shift:.5f}\t{ae_q:.4f}\t{ae_cc:.4f}\t{f1_q:.3f}\t{f1_clf:.3f}') + f'\t{r:.3f}\t{r_hat_q:.3f}\t{r_hat_cc:.3f}\t{tr_te_shift:.5f}\t{ae_q:.4f}\t{ae_cc:.4f}\t{f1_q:.3f}\t{f1_clf:.3f}' + f'\t{ideal_cost}\t{pool.labels.sum()}') posteriors = classifier.predict_proba(pool.instances) fig.plot(posteriors, pool.labels) @@ -120,10 +116,10 @@ if __name__ == '__main__': parser.add_argument('--k', metavar='BATCH', type=int, help='number of documents in a batch', default=100) parser.add_argument('--initsize', metavar='SIZE', type=int, help='number of labelled documents at the beginning', - default=1000) + default=2) parser.add_argument('--initprev', metavar='PREV', type=float, - help='prevalence of the initial sample (-1 for uniform sampling, default)', - default=-1) + help='prevalence of the initial sample (-1 for uniform sampling)', + default=0.5) parser.add_argument('--seed', metavar='SEED', type=int, help='random seed', default=1) diff --git a/eDiscovery/method.py b/eDiscovery/method.py index e781fb4..e611182 100644 --- a/eDiscovery/method.py +++ b/eDiscovery/method.py @@ -5,9 +5,12 @@ from sklearn.cluster import KMeans, OPTICS from sklearn.decomposition import TruncatedSVD from sklearn.linear_model import LogisticRegression from sklearn.mixture import GaussianMixture +from sklearn.model_selection import cross_val_predict + from quapy.method.base import BaseQuantifier, BinaryQuantifier from quapy.data import LabelledCollection from quapy.method.aggregative import ACC, PACC, PCC +import quapy.functional as F class RegionAdjustmentQ(BaseQuantifier): @@ -321,6 +324,89 @@ class ClassWeightPCC(BaseQuantifier): def get_params(self, deep=True): return self.learner.get_params() + @property + def classes_(self): + return self.train.classes_ + + +class PosteriorConditionalAdjustemnt(BaseQuantifier): + + def __init__(self): + self.estimator = LogisticRegression() + self.k = 3 + + def get_adjustment_matrix(self, y, prob): + n_classes = 2 + classes = [0, 1] + confusion = np.empty(shape=(n_classes, n_classes)) + for i, class_ in enumerate(classes): + index = y == class_ + if any(index): + confusion[i] = prob[index].mean(axis=0) + else: + if i == 0: + confusion[i] = np.asarray([1,0]) + else: + confusion[i] = np.asarray([0, 1]) + + confusion = confusion.T + return confusion + + def fit(self, data: LabelledCollection, fit_learner=True): + X, y = data.Xy + proba = cross_val_predict(self.estimator, X, y, n_jobs=-1, method='predict_proba') + + order = np.argsort(proba[:,1]) + proba = proba[order] + y = y[order] + X = X[order] # to keep the alignment for the final classifier + n = len(data) + bucket_size = n // self.k + bucket_remainder = n % bucket_size + self.buckets = {} + self.prob_separations = [] + for bucket in range(self.k): + from_pos = bucket*bucket_size + to_pos = (bucket+1)*bucket_size + (bucket_remainder if bucket==self.k-1 else 0) + slice_b = slice(from_pos, to_pos) + y_b = y[slice_b] + proba_b = proba[slice_b] + self.buckets[bucket] = self.get_adjustment_matrix(y_b, proba_b) + self.prob_separations.append(proba_b[-1,1]) + self.prob_separations[-1] = 1 # the last one should account for the entire prob + + self.estimator.fit(X,y) + return self + + def quantify(self, instances): + proba = self.estimator.predict_proba(instances) + #proba = sorted(proba, key=lambda p:p[1]) + + prev = np.zeros(shape=2, dtype=np.float) + n = proba.shape[0] + last_prob_sep = 0 + for b, prob_sep in enumerate(self.prob_separations): + proba_b = proba[np.logical_and(proba[:,1] >= last_prob_sep, proba[:,1] < prob_sep)] + last_prob_sep=prob_sep + if proba_b.size > 0: + pcc_b = F.prevalence_from_probabilities(proba_b, binarize=False) + adj_matrix = self.buckets[b] + pacc_b = ACC.solve_adjustment(adj_matrix, pcc_b) + bucket_prev = proba_b.shape[0] / n + print(f'bucket {b} -> {F.strprev(pacc_b)} with prop {bucket_prev:.4f}') + prev += (pacc_b*bucket_prev) + + print(F.strprev(prev)) + return prev + + def set_params(self, **parameters): + # parameters = {p:v for p,v in parameters.items()} + # print(parameters) + self.learner.set_params(**parameters) + + def get_params(self, deep=True): + return self.learner.get_params() + @property def classes_(self): return self.train.classes_ \ No newline at end of file diff --git a/eDiscovery/plot.py b/eDiscovery/plot.py index 0d1c4ed..67f33ca 100644 --- a/eDiscovery/plot.py +++ b/eDiscovery/plot.py @@ -5,12 +5,21 @@ import sys, os, pathlib class eDiscoveryPlot: - def __init__(self, datapath, outdir='./plots', loop=True, save=True): + def __init__(self, datapath, outdir='./plots', loop=True, save=True, showYdist=False, showCost=True, refreshEach=10): self.outdir = outdir self.datapath = datapath self.plotname = pathlib.Path(datapath).name.replace(".csv", ".png") self.loop = loop self.save = save + self.showYdist = showYdist + self.showCost = showCost + self.refreshEach = refreshEach + + nPlots = 4 + if showYdist: + nPlots+=1 + if showCost: + nPlots += 1 if not loop: plt.rcParams['figure.figsize'] = [12, 12] @@ -21,11 +30,15 @@ class eDiscoveryPlot: plt.rcParams.update({'font.size': 15}) # plot the data - self.fig, self.axs = plt.subplots(5) + self.fig, self.axs = plt.subplots(nPlots) self.calls=0 def plot(self, posteriors, y): + if (self.calls+1) % self.refreshEach != 0: + self.calls+=1 + return + fig, axs = self.fig, self.axs loop, save = self.loop, self.save @@ -73,18 +86,31 @@ class eDiscoveryPlot: #axs[aXn].set_ylabel('Classifiers performance') #aXn += 1 + if self.showCost: + cost = df['tr-size'] + idealcost = df['ICost'] + totalcost = cost + idealcost + axs[aXn].plot(xs, cost, label='Cost') + axs[aXn].plot(xs, idealcost, label='IdealCost') + axs[aXn].plot(xs, totalcost, label='TotalCost') + axs[aXn].legend() + axs[aXn].grid() + axs[aXn].set_ylabel('Cost') + aXn += 1 + # distribution of posterior probabilities in the pool - positive_posteriors = posteriors[y==1,1] - negative_posteriors = posteriors[y==0,1] - #axs[aXn].hist([negative_posteriors, positive_posteriors], bins=50, - # label=['negative', 'positive']) - axs[aXn].hist(negative_posteriors, bins=50, label='negative', density=True, alpha=.75) - axs[aXn].hist(positive_posteriors, bins=50, label='positive', density=True, alpha=.75) - axs[aXn].legend() - axs[aXn].grid() - axs[aXn].set_xlim(0, 1) - axs[aXn].set_ylabel('te-$Pr(\oplus)$ distribution') - aXn += 1 + if self.showYdist: + positive_posteriors = posteriors[y==1,1] + negative_posteriors = posteriors[y==0,1] + #axs[aXn].hist([negative_posteriors, positive_posteriors], bins=50, + # label=['negative', 'positive']) + axs[aXn].hist(negative_posteriors, bins=50, label='negative', density=True, alpha=.75) + axs[aXn].hist(positive_posteriors, bins=50, label='positive', density=True, alpha=.75) + axs[aXn].legend() + axs[aXn].grid() + axs[aXn].set_xlim(0, 1) + axs[aXn].set_ylabel('te-$Pr(\oplus)$ distribution') + aXn += 1 axs[aXn].plot(xs, df['Shift'], '--k', label='shift (AE)') axs[aXn].plot(xs, df['tr-prev'], 'y', label='tr-$Pr(\oplus)$') diff --git a/quapy/method/aggregative.py b/quapy/method/aggregative.py index 08ab4c3..ce51998 100644 --- a/quapy/method/aggregative.py +++ b/quapy/method/aggregative.py @@ -226,6 +226,7 @@ class ACC(AggregativeQuantifier): # estimate the matrix with entry (i,j) being the estimate of P(yi|yj), that is, the probability that a # document that belongs to yj ends up being classified as belonging to yi + self.conf_matrix_ = confusion_matrix(y, y_).T self.Pte_cond_estim_ = confusion_matrix(y, y_).T / class_count return self