From 6ea627449ce69da85a203dea518f7cd41a38bfef Mon Sep 17 00:00:00 2001 From: Alex Moreo Date: Wed, 26 Jan 2022 19:04:13 +0100 Subject: [PATCH] adding hist plot --- eDiscovery/functions.py | 21 ++++--- eDiscovery/main.py | 60 +++++++++++--------- eDiscovery/method.py | 119 +++++++++++++++++++++++++++++++++++++--- eDiscovery/plot.py | 47 ++++++++++++---- 4 files changed, 196 insertions(+), 51 deletions(-) diff --git a/eDiscovery/functions.py b/eDiscovery/functions.py index 7d4a3fe..170dfa8 100644 --- a/eDiscovery/functions.py +++ b/eDiscovery/functions.py @@ -6,7 +6,8 @@ from sklearn.linear_model import LogisticRegression from sklearn.svm import LinearSVC, SVC import quapy as qp -from eDiscovery.method import RegionAdjustment, RegionProbAdjustment, RegionProbAdjustmentGlobal +from eDiscovery.method import RegionAdjustment, RegionProbAdjustment, RegionProbAdjustmentGlobal, RegionAdjustmentQ, \ + ClassWeightPCC from quapy.data import LabelledCollection from quapy.method.aggregative import EMQ, CC, PACC, PCC, HDy, ACC import numpy as np @@ -27,6 +28,8 @@ def NewQuantifier(quantifiername, classifiername): if quantifiername == 'EMQ': return EMQ(CalibratedClassifierCV(NewClassifier(classifiername))) # return EMQ(NewClassifier(classifier)) + if quantifiername == 'CC': + return CC(NewClassifier(classifiername)) if quantifiername == 'HDy': return HDy(NewClassifier(classifiername)) if quantifiername == 'PCC': @@ -35,14 +38,18 @@ def NewQuantifier(quantifiername, classifiername): return ACC(NewClassifier(classifiername), val_split=0.4) if quantifiername == 'PACC': return PACC(NewClassifier(classifiername), val_split=0.4) - if quantifiername == 'RACC': - return RegionAdjustment(NewClassifier(classifiername), val_split=0.4) - if quantifiername == 'RPACC': - return RegionProbAdjustment(NewClassifier(classifiername), val_split=0.4, k=10) - if quantifiername == 'GRPACC': + if quantifiername == 'CW': + return ClassWeightPCC() + if quantifiername == 'SRSQ': # supervised regions, then single-label quantification + #q = EMQ(CalibratedClassifierCV(NewClassifier(classifiername))) + #q = PACC(NewClassifier(classifiername), val_split=0.4) + q = ACC(NewClassifier(classifiername)) + return RegionAdjustmentQ(q, k=4) + if quantifiername == 'URBQ': # unsupervised regions, then binary quantifications def newQ(): # return PACC(NewClassifier(classifiername), val_split=0.4) - return EMQ(CalibratedClassifierCV(NewClassifier(classifiername))) + # return CC(CalibratedClassifierCV(NewClassifier(classifiername))) + return ClassWeightPCC() return RegionProbAdjustmentGlobal(newQ, k=10, clustering='kmeans') raise ValueError('unknown quantifier', quantifiername) diff --git a/eDiscovery/main.py b/eDiscovery/main.py index 6581b3d..ded1f62 100644 --- a/eDiscovery/main.py +++ b/eDiscovery/main.py @@ -33,6 +33,8 @@ def main(args): fig = eDiscoveryPlot(args.output) + skip_first_steps = 0 + with qp.util.temp_seed(args.seed): # initial labelled data selection if args.initprev == -1: @@ -61,40 +63,43 @@ def main(args): while True: pool_p_hat_cc, classifier = fn.estimate_prev_CC(train, pool, clf_name) - pool_p_hat_q, q_classifier = fn.estimate_prev_Q(train, pool, q_name, clf_name) - # q.fit(train) - # pool_p_hat_q = q.quantify(pool.instances) - # q_classifier = q.learner - f1_clf = eval_classifier(classifier, pool) - f1_q = 0 #eval_classifier(q_classifier, pool) - - tr_p = train.prevalence() - te_p = pool.prevalence() nDtr = len(train) nDte = len(pool) - - r_hat_cc = fn.recall(tr_p, pool_p_hat_cc, nDtr, nDte) - r_hat_q = fn.recall(tr_p, 
pool_p_hat_q, nDtr, nDte) - r = fn.recall(tr_p, te_p, nDtr, nDte) - tr_te_shift = qp.error.ae(tr_p, te_p) - progress = 100 * nDtr / nD - ae_q = qp.error.ae(te_p, pool_p_hat_q) - ae_cc = qp.error.ae(te_p, pool_p_hat_cc) + if i >= skip_first_steps: + pool_p_hat_q, q_classifier = fn.estimate_prev_Q(train, pool, q_name, clf_name) + # q.fit(train) + # pool_p_hat_q = q.quantify(pool.instances) + # q_classifier = q.learner - tee(f'{i}\t{progress:.2f}\t{nDtr}\t{nDte}\t{tr_p[1]:.3f}\t{te_p[1]:.3f}\t{pool_p_hat_q[1]:.3f}\t{pool_p_hat_cc[1]:.3f}' - f'\t{r:.3f}\t{r_hat_q:.3f}\t{r_hat_cc:.3f}\t{tr_te_shift:.5f}\t{ae_q:.4f}\t{ae_cc:.4f}\t{f1_q:.3f}\t{f1_clf:.3f}') + f1_clf = eval_classifier(classifier, pool) + f1_q = 0 #eval_classifier(q_classifier, pool) - fig.plot() + tr_p = train.prevalence() + te_p = pool.prevalence() - if nDte < k: - print('[stop] too few documents remaining') - break - elif i+1 == max_iterations: - print('[stop] maximum number of iterations reached') - break + r_hat_cc = fn.recall(tr_p, pool_p_hat_cc, nDtr, nDte) + r_hat_q = fn.recall(tr_p, pool_p_hat_q, nDtr, nDte) + r = fn.recall(tr_p, te_p, nDtr, nDte) + tr_te_shift = qp.error.ae(tr_p, te_p) + + ae_q = qp.error.ae(te_p, pool_p_hat_q) + ae_cc = qp.error.ae(te_p, pool_p_hat_cc) + + tee(f'{i}\t{progress:.2f}\t{nDtr}\t{nDte}\t{tr_p[1]:.3f}\t{te_p[1]:.3f}\t{pool_p_hat_q[1]:.3f}\t{pool_p_hat_cc[1]:.3f}' + f'\t{r:.3f}\t{r_hat_q:.3f}\t{r_hat_cc:.3f}\t{tr_te_shift:.5f}\t{ae_q:.4f}\t{ae_cc:.4f}\t{f1_q:.3f}\t{f1_clf:.3f}') + + posteriors = classifier.predict_proba(pool.instances) + fig.plot(posteriors, pool.labels) + + if nDte < k: + print('[stop] too few documents remaining') + break + elif i+1 == max_iterations: + print('[stop] maximum number of iterations reached') + break top_relevant_idx = sampling_fn(pool, classifier, k, progress) train, pool = fn.move_documents(train, pool, top_relevant_idx) @@ -142,4 +147,7 @@ if __name__ == '__main__': if outputdir: qp.util.create_if_not_exist(outputdir) + for k,v in args.__dict__.items(): + print(f'{k}={v}') + main(args) diff --git a/eDiscovery/method.py b/eDiscovery/method.py index 3278e9c..5b74708 100644 --- a/eDiscovery/method.py +++ b/eDiscovery/method.py @@ -3,10 +3,59 @@ import numpy as np from sklearn.base import BaseEstimator, clone from sklearn.cluster import KMeans, OPTICS from sklearn.decomposition import TruncatedSVD +from sklearn.linear_model import LogisticRegression from sklearn.mixture import GaussianMixture from quapy.method.base import BaseQuantifier, BinaryQuantifier from quapy.data import LabelledCollection -from quapy.method.aggregative import ACC, PACC +from quapy.method.aggregative import ACC, PACC, PCC + + +class RegionAdjustmentQ(BaseQuantifier): + + def __init__(self, quantifier: BaseQuantifier, k=10): + self.quantifier = quantifier + self.k = k # number of regions + + def fit(self, data: LabelledCollection): + X, y = data.Xy + Xp, Xn = X[y==1], X[y==0] + + nk_per_class = (data.prevalence() * self.k).round().astype(int) + print(f'number of regions per class {nk_per_class}') + + kmeans_neg = KMeans(n_clusters=nk_per_class[0]) + rn = kmeans_neg.fit_predict(Xn) # regions negative + + kmeans_pos = KMeans(n_clusters=nk_per_class[1]) + rp = kmeans_pos.fit_predict(Xp) + nk_per_class[0] # regions positive + + classes = np.arange(self.k) + pos = LabelledCollection(Xp, rp, classes_=classes) + neg = LabelledCollection(Xn, rn, classes_=classes) + + region_data = pos + neg + self.quantifier.fit(region_data) + + self.reg2class = {r: (0 if r < nk_per_class[0] else 1) for r in range(2 
* self.k)} + + return self + + def quantify(self, instances): + region_prevalence = self.quantifier.quantify(instances) + bin_prevalence = np.zeros(shape=2, dtype=np.float) + for r, prev in enumerate(region_prevalence): + bin_prevalence[self.reg2class[r]] += prev + return bin_prevalence + + def set_params(self, **parameters): + pass + + def get_params(self, deep=True): + pass + + @property + def classes_(self): + return np.asarray([0,1]) class RegionAdjustment(ACC): @@ -20,15 +69,25 @@ class RegionAdjustment(ACC): def fit(self, data: LabelledCollection, fit_learner=True, val_split: Union[float, int, LabelledCollection] = None): X, y = data.Xy Xp, Xn = X[y==1], X[y==0] - kmeans = KMeans(n_clusters=self.k) - rn = kmeans.fit_predict(Xn) # regions negative - rp = kmeans.fit_predict(Xp)+self.k # regions positive - classes = np.arange(self.k*2) + + nk_per_class = (data.prevalence() * self.k).round().astype(int) + print(f'number of clusters per class {nk_per_class}') + + kmeans_neg = KMeans(n_clusters=nk_per_class[0]) + rn = kmeans_neg.fit_predict(Xn) # regions negative + + kmeans_pos = KMeans(n_clusters=nk_per_class[1]) + rp = kmeans_pos.fit_predict(Xp) + nk_per_class[0] # regions positive + + classes = np.arange(self.k) pos = LabelledCollection(Xp, rp, classes_=classes) neg = LabelledCollection(Xn, rn, classes_=classes) + region_data = pos + neg - super(RegionAdjustment, self).fit(region_data, fit_learner, val_split) - self.reg2class = {r:(0 if r < self.k else 1) for r in range(2*self.k)} + super(RegionProbAdjustment, self).fit(region_data, fit_learner, val_split) + + self.reg2class = {r: (0 if r < nk_per_class[0] else 1) for r in range(2 * self.k)} + return self def classify(self, data): @@ -218,4 +277,48 @@ class TrivialAcceptorQuantifier(BinaryQuantifier): @property def classes_(self): - return np.asarray([0,1]) \ No newline at end of file + return np.asarray([0,1]) + + +class ClassWeightPCC(BaseQuantifier): + + def __init__(self, estimator=LogisticRegression): + self.estimator = estimator + self.learner = PACC(self.estimator()) + self.deployed = False + + def fit(self, data: LabelledCollection, fit_learner=True): + self.train = data + self.learner.fit(self.train) + return self + + def quantify(self, instances): + guessed_prevalence = self.learner.quantify(instances) + class_weight = self._get_class_weight(guessed_prevalence) + base_estimator = clone(self.learner.learner) + base_estimator.set_params(class_weight=class_weight) + pcc = PCC(base_estimator) + return pcc.fit(self.train).quantify(instances) + + def _get_class_weight(self, prevalence): + # class_weight = compute_class_weight('balanced', classes=[0, 1], y=mock_y(prevalence)) + # return {0: class_weight[1], 1: class_weight[0]} + # weights = prevalence/prevalence.min() + weights = prevalence / self.train.prevalence() + normfactor = weights.min() + if normfactor <= 0: + normfactor = 1E-3 + weights /= normfactor + return {0:weights[0], 1:weights[1]} + + def set_params(self, **parameters): + # parameters = {p:v for p,v in parameters.items()} + # print(parameters) + self.learner.set_params(**parameters) + + def get_params(self, deep=True): + return self.learner.get_params() + + @property + def classes_(self): + return self.train.classes_ \ No newline at end of file diff --git a/eDiscovery/plot.py b/eDiscovery/plot.py index 064d98f..0d1c4ed 100644 --- a/eDiscovery/plot.py +++ b/eDiscovery/plot.py @@ -16,14 +16,16 @@ class eDiscoveryPlot: plt.rcParams['figure.figsize'] = [12, 12] plt.rcParams['figure.dpi'] = 200 else: - 
plt.rcParams['figure.figsize'] = [17, 17] - plt.rcParams['figure.dpi'] = 60 - + plt.rcParams['figure.figsize'] = [14, 18] + plt.rcParams['figure.dpi'] = 50 + plt.rcParams.update({'font.size': 15}) # plot the data self.fig, self.axs = plt.subplots(5) + self.calls=0 + + def plot(self, posteriors, y): - def plot(self): fig, axs = self.fig, self.axs loop, save = self.loop, self.save @@ -38,7 +40,6 @@ class eDiscoveryPlot: axs[aXn].plot(xs, y_rhat, label='$\hat{R}_{Q}$') axs[aXn].plot(xs, y_rhatCC, label='$\hat{R}_{CC}$') axs[aXn].plot(xs, y_r, label='$R$') - axs[aXn].legend() axs[aXn].grid() axs[aXn].set_ylabel('Recall') axs[aXn].set_ylim(0, 1) @@ -52,7 +53,7 @@ class eDiscoveryPlot: axs[aXn].plot(xs, y_r, label='te-$Pr(\oplus)$') axs[aXn].legend() axs[aXn].grid() - axs[aXn].set_ylabel('Prevalence') + axs[aXn].set_ylabel('Pool prevalence') aXn += 1 y_ae = df['AE'] @@ -64,14 +65,28 @@ class eDiscoveryPlot: axs[aXn].set_ylabel('Quantification error') aXn += 1 - axs[aXn].plot(xs, df['MF1_Q'], label='$F_1(clf(Q))$') - axs[aXn].plot(xs, df['MF1_Clf'], label='$F_1(clf(CC))$') + # classifier performance (not very reliable) + #axs[aXn].plot(xs, df['MF1_Q'], label='$F_1(clf(Q))$') + #axs[aXn].plot(xs, df['MF1_Clf'], label='$F_1(clf(CC))$') + #axs[aXn].legend() + #axs[aXn].grid() + #axs[aXn].set_ylabel('Classifiers performance') + #aXn += 1 + + # distribution of posterior probabilities in the pool + positive_posteriors = posteriors[y==1,1] + negative_posteriors = posteriors[y==0,1] + #axs[aXn].hist([negative_posteriors, positive_posteriors], bins=50, + # label=['negative', 'positive']) + axs[aXn].hist(negative_posteriors, bins=50, label='negative', density=True, alpha=.75) + axs[aXn].hist(positive_posteriors, bins=50, label='positive', density=True, alpha=.75) axs[aXn].legend() axs[aXn].grid() - axs[aXn].set_ylabel('Classifiers performance') + axs[aXn].set_xlim(0, 1) + axs[aXn].set_ylabel('te-$Pr(\oplus)$ distribution') aXn += 1 - axs[aXn].plot(xs, df['Shift'], '--k', label='tr-te shift (AE)') + axs[aXn].plot(xs, df['Shift'], '--k', label='shift (AE)') axs[aXn].plot(xs, df['tr-prev'], 'y', label='tr-$Pr(\oplus)$') axs[aXn].plot(xs, df['te-prev'], 'r', label='te-$Pr(\oplus)$') axs[aXn].legend() @@ -79,6 +94,16 @@ class eDiscoveryPlot: axs[aXn].set_ylabel('Train-Test Shift') aXn += 1 + for i in range(aXn): + if self.calls==0: + # Shrink current axis by 20% + box = axs[i].get_position() + axs[i].set_position([box.x0, box.y0, box.width * 0.8, box.height]) + fig.tight_layout() + + # Put a legend to the right of the current axis + axs[i].legend(loc='center left', bbox_to_anchor=(1, 0.5)) + if save: os.makedirs(self.outdir, exist_ok=True) plt.savefig(f'{self.outdir}/{self.plotname}') @@ -88,6 +113,8 @@ class eDiscoveryPlot: for i in range(aXn): axs[i].cla() + self.calls += 1 + if __name__ == '__main__':
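
Note on the ClassWeightPCC quantifier introduced in eDiscovery/method.py (exposed as the 'CW' option in NewQuantifier): it first obtains a prevalence estimate with PACC, turns the ratio between that estimate and the training prevalence into per-class weights (normalised by their minimum, as in _get_class_weight), retrains the base classifier with those class weights, and quantifies again with PCC. Below is a minimal, self-contained sketch of that idea using plain scikit-learn only; it is not the patched code: the first guess is obtained with an unweighted PCC rather than PACC, and the dataset, helper names (class_weights, guessed_prev) and the induced prior shift are illustrative assumptions.

import numpy as np
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split


def class_weights(guessed_prev, train_prev, eps=1e-3):
    # mirrors ClassWeightPCC._get_class_weight: ratio of estimated test
    # prevalence to training prevalence, normalised by its minimum
    w = guessed_prev / train_prev
    norm = w.min()
    if norm <= 0:
        norm = eps
    w = w / norm
    return {0: w[0], 1: w[1]}


X, y = make_classification(n_samples=2000, weights=[0.8, 0.2], random_state=0)
Xtr, Xte, ytr, yte = train_test_split(X, y, test_size=0.5, random_state=0)

# simulate prior shift in the "pool" by dropping half of the positive test examples
# (in the eDiscovery loop the pool prevalence drifts as relevant documents are moved to training)
pos = np.where(yte == 1)[0]
keep = np.setdiff1d(np.arange(len(yte)), pos[: len(pos) // 2])
Xte, yte = Xte[keep], yte[keep]

train_prev = np.bincount(ytr, minlength=2) / len(ytr)

# step 1: a first prevalence guess for the test set (here PCC with an unweighted
# model, i.e. the mean of the posteriors; the patch uses PACC instead)
clf = LogisticRegression(max_iter=1000).fit(Xtr, ytr)
guessed_prev = clf.predict_proba(Xte).mean(axis=0)

# step 2: retrain with class weights proportional to the estimated shift and
# quantify again with PCC (expected value of the re-weighted posteriors)
weighted = LogisticRegression(max_iter=1000,
                              class_weight=class_weights(guessed_prev, train_prev))
weighted.fit(Xtr, ytr)
final_prev = weighted.predict_proba(Xte).mean(axis=0)

print('train prevalence :', train_prev)
print('first guess (PCC):', guessed_prev)
print('re-weighted PCC  :', final_prev)

Because the re-weighted model shifts its posteriors toward the estimated pool prior, the second PCC pass tends to track the pool prevalence more closely than the unweighted first guess, which is the behaviour the 'CW' and 'URBQ' options in NewQuantifier rely on.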