
adding hist plot (histogram of posterior probabilities in the pool)

Author: Alejandro Moreo Fernandez, 2022-01-26 19:04:13 +01:00
parent 4d4cf6eb3f
commit 6ea627449c
4 changed files with 196 additions and 51 deletions

File 1 of 4: quantifier factory (NewQuantifier)

@@ -6,7 +6,8 @@
 from sklearn.linear_model import LogisticRegression
 from sklearn.svm import LinearSVC, SVC
 import quapy as qp
-from eDiscovery.method import RegionAdjustment, RegionProbAdjustment, RegionProbAdjustmentGlobal
+from eDiscovery.method import RegionAdjustment, RegionProbAdjustment, RegionProbAdjustmentGlobal, RegionAdjustmentQ, \
+    ClassWeightPCC
 from quapy.data import LabelledCollection
 from quapy.method.aggregative import EMQ, CC, PACC, PCC, HDy, ACC
 import numpy as np
@@ -27,6 +28,8 @@ def NewQuantifier(quantifiername, classifiername):
     if quantifiername == 'EMQ':
         return EMQ(CalibratedClassifierCV(NewClassifier(classifiername)))
         # return EMQ(NewClassifier(classifier))
+    if quantifiername == 'CC':
+        return CC(NewClassifier(classifiername))
     if quantifiername == 'HDy':
         return HDy(NewClassifier(classifiername))
     if quantifiername == 'PCC':
@@ -35,14 +38,18 @@ def NewQuantifier(quantifiername, classifiername):
         return ACC(NewClassifier(classifiername), val_split=0.4)
     if quantifiername == 'PACC':
         return PACC(NewClassifier(classifiername), val_split=0.4)
-    if quantifiername == 'RACC':
-        return RegionAdjustment(NewClassifier(classifiername), val_split=0.4)
-    if quantifiername == 'RPACC':
-        return RegionProbAdjustment(NewClassifier(classifiername), val_split=0.4, k=10)
-    if quantifiername == 'GRPACC':
+    if quantifiername == 'CW':
+        return ClassWeightPCC()
+    if quantifiername == 'SRSQ':  # supervised regions, then single-label quantification
+        # q = EMQ(CalibratedClassifierCV(NewClassifier(classifiername)))
+        # q = PACC(NewClassifier(classifiername), val_split=0.4)
+        q = ACC(NewClassifier(classifiername))
+        return RegionAdjustmentQ(q, k=4)
+    if quantifiername == 'URBQ':  # unsupervised regions, then binary quantifications
         def newQ():
             # return PACC(NewClassifier(classifiername), val_split=0.4)
-            return EMQ(CalibratedClassifierCV(NewClassifier(classifiername)))
+            # return CC(CalibratedClassifierCV(NewClassifier(classifiername)))
+            return ClassWeightPCC()
         return RegionProbAdjustmentGlobal(newQ, k=10, clustering='kmeans')
     raise ValueError('unknown quantifier', quantifiername)
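The new 'CW' branch wires ClassWeightPCC into the factory. A minimal usage sketch, with the caveat that 'lr' is only a placeholder classifier name and that `train`/`pool` stand for any binary LabelledCollection split (neither appears in this diff):

    quantifier = NewQuantifier('CW', 'lr')       # returns ClassWeightPCC()
    quantifier.fit(train)                        # fits the internal PACC learner
    estim_prev = quantifier.quantify(pool.instances)  # reweighted PCC estimate
    print(estim_prev)                            # e.g. something like [0.93, 0.07]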

File 2 of 4: main iterative-review loop (main)

@@ -33,6 +33,8 @@ def main(args):
     fig = eDiscoveryPlot(args.output)
 
+    skip_first_steps = 0
+
     with qp.util.temp_seed(args.seed):
         # initial labelled data selection
         if args.initprev == -1:
@@ -61,6 +63,12 @@
         while True:
 
             pool_p_hat_cc, classifier = fn.estimate_prev_CC(train, pool, clf_name)
-            pool_p_hat_q, q_classifier = fn.estimate_prev_Q(train, pool, q_name, clf_name)
-            # q.fit(train)
-            # pool_p_hat_q = q.quantify(pool.instances)
+
+            nDtr = len(train)
+            nDte = len(pool)
+            progress = 100 * nDtr / nD
+
+            if i >= skip_first_steps:
+                pool_p_hat_q, q_classifier = fn.estimate_prev_Q(train, pool, q_name, clf_name)
+                # q.fit(train)
+                # pool_p_hat_q = q.quantify(pool.instances)
@@ -71,23 +79,20 @@
             tr_p = train.prevalence()
             te_p = pool.prevalence()
-            nDtr = len(train)
-            nDte = len(pool)
 
             r_hat_cc = fn.recall(tr_p, pool_p_hat_cc, nDtr, nDte)
             r_hat_q = fn.recall(tr_p, pool_p_hat_q, nDtr, nDte)
             r = fn.recall(tr_p, te_p, nDtr, nDte)
 
             tr_te_shift = qp.error.ae(tr_p, te_p)
-            progress = 100 * nDtr / nD
 
             ae_q = qp.error.ae(te_p, pool_p_hat_q)
             ae_cc = qp.error.ae(te_p, pool_p_hat_cc)
 
             tee(f'{i}\t{progress:.2f}\t{nDtr}\t{nDte}\t{tr_p[1]:.3f}\t{te_p[1]:.3f}\t{pool_p_hat_q[1]:.3f}\t{pool_p_hat_cc[1]:.3f}'
                 f'\t{r:.3f}\t{r_hat_q:.3f}\t{r_hat_cc:.3f}\t{tr_te_shift:.5f}\t{ae_q:.4f}\t{ae_cc:.4f}\t{f1_q:.3f}\t{f1_clf:.3f}')
 
-            fig.plot()
+            posteriors = classifier.predict_proba(pool.instances)
+            fig.plot(posteriors, pool.labels)
 
             if nDte < k:
                 print('[stop] too few documents remaining')
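fn.recall is not part of this diff. A sketch consistent with how it is called here (training prevalence, estimated pool prevalence, and the two set sizes) would estimate recall as the positives already reviewed over the total estimated positives; this is an assumption, not the definition from the repository:

    def recall(tr_prev, pool_prev_hat, nDtr, nDte):
        # assumed definition: labelled positives / (labelled + estimated pool positives)
        tr_pos = tr_prev[1] * nDtr
        pool_pos = pool_prev_hat[1] * nDte
        return tr_pos / (tr_pos + pool_pos)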
@@ -142,4 +147,7 @@ if __name__ == '__main__':
     if outputdir:
         qp.util.create_if_not_exist(outputdir)
 
+    for k,v in args.__dict__.items():
+        print(f'{k}={v}')
+
     main(args)

File 3 of 4: quantifier classes (RegionAdjustmentQ, RegionAdjustment, ClassWeightPCC)

@@ -3,10 +3,59 @@ import numpy as np
 from sklearn.base import BaseEstimator, clone
 from sklearn.cluster import KMeans, OPTICS
 from sklearn.decomposition import TruncatedSVD
+from sklearn.linear_model import LogisticRegression
 from sklearn.mixture import GaussianMixture
 from quapy.method.base import BaseQuantifier, BinaryQuantifier
 from quapy.data import LabelledCollection
-from quapy.method.aggregative import ACC, PACC
+from quapy.method.aggregative import ACC, PACC, PCC
+
+
+class RegionAdjustmentQ(BaseQuantifier):
+
+    def __init__(self, quantifier: BaseQuantifier, k=10):
+        self.quantifier = quantifier
+        self.k = k  # number of regions
+
+    def fit(self, data: LabelledCollection):
+        X, y = data.Xy
+        Xp, Xn = X[y==1], X[y==0]
+
+        nk_per_class = (data.prevalence() * self.k).round().astype(int)
+        print(f'number of regions per class {nk_per_class}')
+
+        kmeans_neg = KMeans(n_clusters=nk_per_class[0])
+        rn = kmeans_neg.fit_predict(Xn)  # regions negative
+
+        kmeans_pos = KMeans(n_clusters=nk_per_class[1])
+        rp = kmeans_pos.fit_predict(Xp) + nk_per_class[0]  # regions positive
+
+        classes = np.arange(self.k)
+        pos = LabelledCollection(Xp, rp, classes_=classes)
+        neg = LabelledCollection(Xn, rn, classes_=classes)
+        region_data = pos + neg
+        self.quantifier.fit(region_data)
+
+        self.reg2class = {r: (0 if r < nk_per_class[0] else 1) for r in range(2 * self.k)}
+
+        return self
+
+    def quantify(self, instances):
+        region_prevalence = self.quantifier.quantify(instances)
+        bin_prevalence = np.zeros(shape=2, dtype=np.float)
+        for r, prev in enumerate(region_prevalence):
+            bin_prevalence[self.reg2class[r]] += prev
+        return bin_prevalence
+
+    def set_params(self, **parameters):
+        pass
+
+    def get_params(self, deep=True):
+        pass
+
+    @property
+    def classes_(self):
+        return np.asarray([0,1])
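RegionAdjustmentQ splits the k regions between the two classes in proportion to the training prevalence (e.g. k=10 and prevalence [0.8, 0.2] gives nk_per_class=[8, 2]), clusters each class separately with KMeans, fits the wrapped quantifier on the region labels, and finally folds the estimated region prevalences back into a binary prevalence. A self-contained sketch of that final collapse, with illustrative numbers assuming k=4 and nk_per_class=[3, 1]:

    import numpy as np

    region_prevalence = np.asarray([0.2, 0.3, 0.4, 0.1])  # wrapped quantifier output
    reg2class = {0: 0, 1: 0, 2: 0, 3: 1}                  # regions 0-2 negative, region 3 positive
    bin_prevalence = np.zeros(2)
    for r, prev in enumerate(region_prevalence):
        bin_prevalence[reg2class[r]] += prev
    print(bin_prevalence)                                 # [0.9 0.1]

One caveat: np.float, used in quantify() above, is deprecated in recent NumPy releases; plain float or np.float64 behaves identically.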
@@ -20,15 +69,25 @@ class RegionAdjustment(ACC):
     def fit(self, data: LabelledCollection, fit_learner=True, val_split: Union[float, int, LabelledCollection] = None):
         X, y = data.Xy
         Xp, Xn = X[y==1], X[y==0]
-        kmeans = KMeans(n_clusters=self.k)
-        rn = kmeans.fit_predict(Xn)  # regions negative
-        rp = kmeans.fit_predict(Xp)+self.k  # regions positive
-        classes = np.arange(self.k*2)
+
+        nk_per_class = (data.prevalence() * self.k).round().astype(int)
+        print(f'number of clusters per class {nk_per_class}')
+
+        kmeans_neg = KMeans(n_clusters=nk_per_class[0])
+        rn = kmeans_neg.fit_predict(Xn)  # regions negative
+
+        kmeans_pos = KMeans(n_clusters=nk_per_class[1])
+        rp = kmeans_pos.fit_predict(Xp) + nk_per_class[0]  # regions positive
+
+        classes = np.arange(self.k)
         pos = LabelledCollection(Xp, rp, classes_=classes)
         neg = LabelledCollection(Xn, rn, classes_=classes)
         region_data = pos + neg
-        super(RegionAdjustment, self).fit(region_data, fit_learner, val_split)
-        self.reg2class = {r:(0 if r < self.k else 1) for r in range(2*self.k)}
+        super(RegionProbAdjustment, self).fit(region_data, fit_learner, val_split)
+        self.reg2class = {r: (0 if r < nk_per_class[0] else 1) for r in range(2 * self.k)}
+
         return self
 
     def classify(self, data):
@@ -219,3 +278,47 @@ class TrivialAcceptorQuantifier(BinaryQuantifier):
     @property
     def classes_(self):
         return np.asarray([0,1])
+
+
+class ClassWeightPCC(BaseQuantifier):
+
+    def __init__(self, estimator=LogisticRegression):
+        self.estimator = estimator
+        self.learner = PACC(self.estimator())
+        self.deployed = False
+
+    def fit(self, data: LabelledCollection, fit_learner=True):
+        self.train = data
+        self.learner.fit(self.train)
+        return self
+
+    def quantify(self, instances):
+        guessed_prevalence = self.learner.quantify(instances)
+        class_weight = self._get_class_weight(guessed_prevalence)
+        base_estimator = clone(self.learner.learner)
+        base_estimator.set_params(class_weight=class_weight)
+        pcc = PCC(base_estimator)
+        return pcc.fit(self.train).quantify(instances)
+
+    def _get_class_weight(self, prevalence):
+        # class_weight = compute_class_weight('balanced', classes=[0, 1], y=mock_y(prevalence))
+        # return {0: class_weight[1], 1: class_weight[0]}
+        # weights = prevalence/prevalence.min()
+        weights = prevalence / self.train.prevalence()
+        normfactor = weights.min()
+        if normfactor <= 0:
+            normfactor = 1E-3
+        weights /= normfactor
+        return {0:weights[0], 1:weights[1]}
+
+    def set_params(self, **parameters):
+        # parameters = {p:v for p,v in parameters.items()}
+        # print(parameters)
+        self.learner.set_params(**parameters)
+
+    def get_params(self, deep=True):
+        return self.learner.get_params()
+
+    @property
+    def classes_(self):
+        return self.train.classes_
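ClassWeightPCC first guesses the pool prevalence with PACC, turns the estimated shift into class weights (estimated prevalence over training prevalence, normalized so the smallest weight is 1), refits a clone of the base classifier with those weights, and quantifies with PCC. A numeric walk-through of _get_class_weight, with illustrative values:

    import numpy as np

    train_prev = np.asarray([0.9, 0.1])    # prevalence seen at fit time
    guessed_prev = np.asarray([0.6, 0.4])  # PACC estimate on the pool

    weights = guessed_prev / train_prev    # [0.667, 4.0]
    normfactor = weights.min()
    if normfactor <= 0:
        normfactor = 1E-3                  # guard against a zero estimate
    weights /= normfactor
    print({0: weights[0], 1: weights[1]})  # roughly {0: 1.0, 1: 6.0} -> positives upweighted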

File 4 of 4: plotting (eDiscoveryPlot)

@@ -16,14 +16,16 @@ class eDiscoveryPlot:
             plt.rcParams['figure.figsize'] = [12, 12]
             plt.rcParams['figure.dpi'] = 200
         else:
-            plt.rcParams['figure.figsize'] = [17, 17]
-            plt.rcParams['figure.dpi'] = 60
+            plt.rcParams['figure.figsize'] = [14, 18]
+            plt.rcParams['figure.dpi'] = 50
+            plt.rcParams.update({'font.size': 15})
 
         # plot the data
         self.fig, self.axs = plt.subplots(5)
+        self.calls = 0
 
-    def plot(self):
+    def plot(self, posteriors, y):
         fig, axs = self.fig, self.axs
         loop, save = self.loop, self.save
@@ -38,7 +40,6 @@
         axs[aXn].plot(xs, y_rhat, label='$\hat{R}_{Q}$')
         axs[aXn].plot(xs, y_rhatCC, label='$\hat{R}_{CC}$')
         axs[aXn].plot(xs, y_r, label='$R$')
-        axs[aXn].legend()
         axs[aXn].grid()
         axs[aXn].set_ylabel('Recall')
         axs[aXn].set_ylim(0, 1)
@@ -52,7 +53,7 @@
         axs[aXn].plot(xs, y_r, label='te-$Pr(\oplus)$')
         axs[aXn].legend()
         axs[aXn].grid()
-        axs[aXn].set_ylabel('Prevalence')
+        axs[aXn].set_ylabel('Pool prevalence')
         aXn += 1
 
         y_ae = df['AE']
@@ -64,14 +65,28 @@
         axs[aXn].set_ylabel('Quantification error')
         aXn += 1
 
-        axs[aXn].plot(xs, df['MF1_Q'], label='$F_1(clf(Q))$')
-        axs[aXn].plot(xs, df['MF1_Clf'], label='$F_1(clf(CC))$')
+        # classifier performance (not very reliable)
+        #axs[aXn].plot(xs, df['MF1_Q'], label='$F_1(clf(Q))$')
+        #axs[aXn].plot(xs, df['MF1_Clf'], label='$F_1(clf(CC))$')
+        #axs[aXn].legend()
+        #axs[aXn].grid()
+        #axs[aXn].set_ylabel('Classifiers performance')
+        #aXn += 1
+
+        # distribution of posterior probabilities in the pool
+        positive_posteriors = posteriors[y==1,1]
+        negative_posteriors = posteriors[y==0,1]
+        #axs[aXn].hist([negative_posteriors, positive_posteriors], bins=50,
+        #              label=['negative', 'positive'])
+        axs[aXn].hist(negative_posteriors, bins=50, label='negative', density=True, alpha=.75)
+        axs[aXn].hist(positive_posteriors, bins=50, label='positive', density=True, alpha=.75)
         axs[aXn].legend()
         axs[aXn].grid()
-        axs[aXn].set_ylabel('Classifiers performance')
+        axs[aXn].set_xlim(0, 1)
+        axs[aXn].set_ylabel('te-$Pr(\oplus)$ distribution')
         aXn += 1
 
-        axs[aXn].plot(xs, df['Shift'], '--k', label='tr-te shift (AE)')
+        axs[aXn].plot(xs, df['Shift'], '--k', label='shift (AE)')
         axs[aXn].plot(xs, df['tr-prev'], 'y', label='tr-$Pr(\oplus)$')
         axs[aXn].plot(xs, df['te-prev'], 'r', label='te-$Pr(\oplus)$')
         axs[aXn].legend()
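This is the histogram panel the commit adds: it replaces the F1 curves with overlaid histograms of the posterior probabilities of positive and negative pool documents. density=True normalizes each class separately, so the two shapes stay comparable under heavy class imbalance, and alpha=.75 keeps the overlap visible. A self-contained sketch of the idea with synthetic posteriors (the Beta draws are purely illustrative):

    import numpy as np
    import matplotlib.pyplot as plt

    rng = np.random.default_rng(0)
    negative_posteriors = rng.beta(2, 5, size=900)  # mass near 0
    positive_posteriors = rng.beta(5, 2, size=100)  # mass near 1

    fig, ax = plt.subplots()
    ax.hist(negative_posteriors, bins=50, label='negative', density=True, alpha=.75)
    ax.hist(positive_posteriors, bins=50, label='positive', density=True, alpha=.75)
    ax.set_xlim(0, 1)
    ax.legend()
    plt.show()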
@@ -79,6 +94,16 @@
         axs[aXn].set_ylabel('Train-Test Shift')
         aXn += 1
 
+        for i in range(aXn):
+            if self.calls==0:
+                # Shrink current axis by 20%
+                box = axs[i].get_position()
+                axs[i].set_position([box.x0, box.y0, box.width * 0.8, box.height])
+                fig.tight_layout()
+            # Put a legend to the right of the current axis
+            axs[i].legend(loc='center left', bbox_to_anchor=(1, 0.5))
+
         if save:
             os.makedirs(self.outdir, exist_ok=True)
             plt.savefig(f'{self.outdir}/{self.plotname}')
@@ -88,6 +113,8 @@
         for i in range(aXn):
             axs[i].cla()
 
+        self.calls += 1
+
 
 if __name__ == '__main__':