forked from moreo/QuaPy

commit 6ea627449c (parent 4d4cf6eb3f)

adding hist plot
@@ -6,7 +6,8 @@ from sklearn.linear_model import LogisticRegression
 from sklearn.svm import LinearSVC, SVC
 
 import quapy as qp
-from eDiscovery.method import RegionAdjustment, RegionProbAdjustment, RegionProbAdjustmentGlobal
+from eDiscovery.method import RegionAdjustment, RegionProbAdjustment, RegionProbAdjustmentGlobal, RegionAdjustmentQ, \
+    ClassWeightPCC
 from quapy.data import LabelledCollection
 from quapy.method.aggregative import EMQ, CC, PACC, PCC, HDy, ACC
 import numpy as np
@@ -27,6 +28,8 @@ def NewQuantifier(quantifiername, classifiername):
     if quantifiername == 'EMQ':
         return EMQ(CalibratedClassifierCV(NewClassifier(classifiername)))
         # return EMQ(NewClassifier(classifier))
+    if quantifiername == 'CC':
+        return CC(NewClassifier(classifiername))
     if quantifiername == 'HDy':
         return HDy(NewClassifier(classifiername))
     if quantifiername == 'PCC':
@@ -35,14 +38,18 @@ def NewQuantifier(quantifiername, classifiername):
         return ACC(NewClassifier(classifiername), val_split=0.4)
     if quantifiername == 'PACC':
         return PACC(NewClassifier(classifiername), val_split=0.4)
-    if quantifiername == 'RACC':
-        return RegionAdjustment(NewClassifier(classifiername), val_split=0.4)
-    if quantifiername == 'RPACC':
-        return RegionProbAdjustment(NewClassifier(classifiername), val_split=0.4, k=10)
-    if quantifiername == 'GRPACC':
+    if quantifiername == 'CW':
+        return ClassWeightPCC()
+    if quantifiername == 'SRSQ':  # supervised regions, then single-label quantification
+        # q = EMQ(CalibratedClassifierCV(NewClassifier(classifiername)))
+        # q = PACC(NewClassifier(classifiername), val_split=0.4)
+        q = ACC(NewClassifier(classifiername))
+        return RegionAdjustmentQ(q, k=4)
+    if quantifiername == 'URBQ':  # unsupervised regions, then binary quantifications
         def newQ():
             # return PACC(NewClassifier(classifiername), val_split=0.4)
-            return EMQ(CalibratedClassifierCV(NewClassifier(classifiername)))
+            # return CC(CalibratedClassifierCV(NewClassifier(classifiername)))
+            return ClassWeightPCC()
         return RegionProbAdjustmentGlobal(newQ, k=10, clustering='kmeans')
     raise ValueError('unknown quantifier', quantifiername)
 
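Not part of the diff: the factory above dispatches on short quantifier names. A minimal usage sketch of the two new options, assuming 'lr' is a valid classifier name for NewClassifier (a hypothetical value; the accepted names are defined elsewhere in this file):

    # illustrative only; 'lr' stands in for whatever NewClassifier accepts
    cw = NewQuantifier('CW', 'lr')      # PCC refit with prevalence-derived class weights
    srsq = NewQuantifier('SRSQ', 'lr')  # ACC over supervised regions, collapsed to binary (k=4)
    urbq = NewQuantifier('URBQ', 'lr')  # k-means regions, one ClassWeightPCC per region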
@@ -33,6 +33,8 @@ def main(args):
 
     fig = eDiscoveryPlot(args.output)
 
+    skip_first_steps = 0
+
     with qp.util.temp_seed(args.seed):
         # initial labelled data selection
         if args.initprev == -1:
@@ -61,6 +63,12 @@ def main(args):
     while True:
 
         pool_p_hat_cc, classifier = fn.estimate_prev_CC(train, pool, clf_name)
+
+        nDtr = len(train)
+        nDte = len(pool)
+        progress = 100 * nDtr / nD
+
+        if i >= skip_first_steps:
         pool_p_hat_q, q_classifier = fn.estimate_prev_Q(train, pool, q_name, clf_name)
         # q.fit(train)
         # pool_p_hat_q = q.quantify(pool.instances)
@@ -71,23 +79,20 @@ def main(args):
 
         tr_p = train.prevalence()
         te_p = pool.prevalence()
-        nDtr = len(train)
-        nDte = len(pool)
 
         r_hat_cc = fn.recall(tr_p, pool_p_hat_cc, nDtr, nDte)
         r_hat_q = fn.recall(tr_p, pool_p_hat_q, nDtr, nDte)
         r = fn.recall(tr_p, te_p, nDtr, nDte)
         tr_te_shift = qp.error.ae(tr_p, te_p)
 
-        progress = 100 * nDtr / nD
-
         ae_q = qp.error.ae(te_p, pool_p_hat_q)
         ae_cc = qp.error.ae(te_p, pool_p_hat_cc)
 
         tee(f'{i}\t{progress:.2f}\t{nDtr}\t{nDte}\t{tr_p[1]:.3f}\t{te_p[1]:.3f}\t{pool_p_hat_q[1]:.3f}\t{pool_p_hat_cc[1]:.3f}'
             f'\t{r:.3f}\t{r_hat_q:.3f}\t{r_hat_cc:.3f}\t{tr_te_shift:.5f}\t{ae_q:.4f}\t{ae_cc:.4f}\t{f1_q:.3f}\t{f1_clf:.3f}')
 
-        fig.plot()
+        posteriors = classifier.predict_proba(pool.instances)
+        fig.plot(posteriors, pool.labels)
 
         if nDte < k:
             print('[stop] too few documents remaining')
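fn.recall is defined elsewhere in the repository and not shown in this diff. Judging from its arguments (train prevalence, a pool-prevalence estimate, and the two set sizes), a plausible definition is the fraction of all relevant documents already found in the training set; a sketch under that assumption:

    # assumption: inferred semantics, not the repository's actual implementation
    def recall(tr_prev, pool_prev, nDtr, nDte):
        found = tr_prev[1] * nDtr        # positives already labelled
        remaining = pool_prev[1] * nDte  # positives estimated to remain in the pool
        return found / (found + remaining)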
@@ -142,4 +147,7 @@ if __name__ == '__main__':
     if outputdir:
         qp.util.create_if_not_exist(outputdir)
 
+    for k,v in args.__dict__.items():
+        print(f'{k}={v}')
+
     main(args)
@@ -3,10 +3,59 @@ import numpy as np
 from sklearn.base import BaseEstimator, clone
 from sklearn.cluster import KMeans, OPTICS
 from sklearn.decomposition import TruncatedSVD
+from sklearn.linear_model import LogisticRegression
 from sklearn.mixture import GaussianMixture
 from quapy.method.base import BaseQuantifier, BinaryQuantifier
 from quapy.data import LabelledCollection
-from quapy.method.aggregative import ACC, PACC
+from quapy.method.aggregative import ACC, PACC, PCC
 
 
+class RegionAdjustmentQ(BaseQuantifier):
+
+    def __init__(self, quantifier: BaseQuantifier, k=10):
+        self.quantifier = quantifier
+        self.k = k  # number of regions
+
+    def fit(self, data: LabelledCollection):
+        X, y = data.Xy
+        Xp, Xn = X[y==1], X[y==0]
+
+        nk_per_class = (data.prevalence() * self.k).round().astype(int)
+        print(f'number of regions per class {nk_per_class}')
+
+        kmeans_neg = KMeans(n_clusters=nk_per_class[0])
+        rn = kmeans_neg.fit_predict(Xn)  # regions negative
+
+        kmeans_pos = KMeans(n_clusters=nk_per_class[1])
+        rp = kmeans_pos.fit_predict(Xp) + nk_per_class[0]  # regions positive
+
+        classes = np.arange(self.k)
+        pos = LabelledCollection(Xp, rp, classes_=classes)
+        neg = LabelledCollection(Xn, rn, classes_=classes)
+
+        region_data = pos + neg
+        self.quantifier.fit(region_data)
+
+        self.reg2class = {r: (0 if r < nk_per_class[0] else 1) for r in range(2 * self.k)}
+
+        return self
+
+    def quantify(self, instances):
+        region_prevalence = self.quantifier.quantify(instances)
+        bin_prevalence = np.zeros(shape=2, dtype=np.float)
+        for r, prev in enumerate(region_prevalence):
+            bin_prevalence[self.reg2class[r]] += prev
+        return bin_prevalence
+
+    def set_params(self, **parameters):
+        pass
+
+    def get_params(self, deep=True):
+        pass
+
+    @property
+    def classes_(self):
+        return np.asarray([0,1])
+
+
 class RegionAdjustment(ACC):
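The new RegionAdjustmentQ wraps any quantifier: it re-labels the training data into k cluster "regions" (split across the two classes in proportion to prevalence), quantifies at region level, and sums region prevalences back into a binary estimate via reg2class. A small worked sketch of that collapse, with hypothetical numbers; note also that np.float, used in quantify above, is deprecated since NumPy 1.20 and removed in 1.24, so plain float or np.float64 is the safe spelling:

    import numpy as np

    # hypothetical run with k=4 regions: regions 0-1 negative, 2-3 positive
    region_prevalence = np.asarray([0.35, 0.25, 0.15, 0.25])
    reg2class = {0: 0, 1: 0, 2: 1, 3: 1}

    bin_prevalence = np.zeros(2, dtype=float)  # np.float is deprecated; use float
    for r, prev in enumerate(region_prevalence):
        bin_prevalence[reg2class[r]] += prev
    print(bin_prevalence)  # [0.6 0.4]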
@@ -20,15 +69,25 @@ class RegionAdjustment(ACC):
     def fit(self, data: LabelledCollection, fit_learner=True, val_split: Union[float, int, LabelledCollection] = None):
         X, y = data.Xy
         Xp, Xn = X[y==1], X[y==0]
-        kmeans = KMeans(n_clusters=self.k)
-        rn = kmeans.fit_predict(Xn)  # regions negative
-        rp = kmeans.fit_predict(Xp)+self.k  # regions positive
-        classes = np.arange(self.k*2)
+
+        nk_per_class = (data.prevalence() * self.k).round().astype(int)
+        print(f'number of clusters per class {nk_per_class}')
+
+        kmeans_neg = KMeans(n_clusters=nk_per_class[0])
+        rn = kmeans_neg.fit_predict(Xn)  # regions negative
+
+        kmeans_pos = KMeans(n_clusters=nk_per_class[1])
+        rp = kmeans_pos.fit_predict(Xp) + nk_per_class[0]  # regions positive
+
+        classes = np.arange(self.k)
         pos = LabelledCollection(Xp, rp, classes_=classes)
         neg = LabelledCollection(Xn, rn, classes_=classes)
 
         region_data = pos + neg
-        super(RegionAdjustment, self).fit(region_data, fit_learner, val_split)
-        self.reg2class = {r:(0 if r < self.k else 1) for r in range(2*self.k)}
+        super(RegionProbAdjustment, self).fit(region_data, fit_learner, val_split)
+
+        self.reg2class = {r: (0 if r < nk_per_class[0] else 1) for r in range(2 * self.k)}
+
         return self
 
     def classify(self, data):
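Both region-based fit methods now split the k-cluster budget across the two classes in proportion to class prevalence, instead of running one k-cluster KMeans per class. A worked example with hypothetical numbers:

    import numpy as np

    # hypothetical: k=10 regions, 30% positive prevalence
    prevalence = np.asarray([0.7, 0.3])
    nk_per_class = (prevalence * 10).round().astype(int)
    print(nk_per_class)  # [7 3] -> 7 negative clusters, 3 positive clusters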
@@ -219,3 +278,47 @@ class TrivialAcceptorQuantifier(BinaryQuantifier):
     @property
     def classes_(self):
         return np.asarray([0,1])
+
+
+class ClassWeightPCC(BaseQuantifier):
+
+    def __init__(self, estimator=LogisticRegression):
+        self.estimator = estimator
+        self.learner = PACC(self.estimator())
+        self.deployed = False
+
+    def fit(self, data: LabelledCollection, fit_learner=True):
+        self.train = data
+        self.learner.fit(self.train)
+        return self
+
+    def quantify(self, instances):
+        guessed_prevalence = self.learner.quantify(instances)
+        class_weight = self._get_class_weight(guessed_prevalence)
+        base_estimator = clone(self.learner.learner)
+        base_estimator.set_params(class_weight=class_weight)
+        pcc = PCC(base_estimator)
+        return pcc.fit(self.train).quantify(instances)
+
+    def _get_class_weight(self, prevalence):
+        # class_weight = compute_class_weight('balanced', classes=[0, 1], y=mock_y(prevalence))
+        # return {0: class_weight[1], 1: class_weight[0]}
+        # weights = prevalence/prevalence.min()
+        weights = prevalence / self.train.prevalence()
+        normfactor = weights.min()
+        if normfactor <= 0:
+            normfactor = 1E-3
+        weights /= normfactor
+        return {0:weights[0], 1:weights[1]}
+
+    def set_params(self, **parameters):
+        # parameters = {p:v for p,v in parameters.items()}
+        # print(parameters)
+        self.learner.set_params(**parameters)
+
+    def get_params(self, deep=True):
+        return self.learner.get_params()
+
+    @property
+    def classes_(self):
+        return self.train.classes_
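ClassWeightPCC first quantifies with PACC to guess the test prevalence, then refits the classifier with class weights proportional to the estimated train-to-test prevalence shift, and finally quantifies with PCC. The weight dict feeds sklearn's standard class_weight parameter; a minimal sketch of the idea with hypothetical prevalences:

    import numpy as np
    from sklearn.linear_model import LogisticRegression

    # hypothetical: train is 50/50, test prevalence is estimated as 20/80
    train_prev = np.asarray([0.5, 0.5])
    guessed_prev = np.asarray([0.2, 0.8])

    weights = guessed_prev / train_prev  # [0.4, 1.6]
    weights /= weights.min()             # [1.0, 4.0]: smallest weight normalized to 1
    class_weight = {0: weights[0], 1: weights[1]}

    # sklearn accepts a per-class weight dict; the reweighted classifier
    # is then used by PCC (averaging posteriors) on the training data
    clf = LogisticRegression(class_weight=class_weight)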
@@ -16,14 +16,16 @@ class eDiscoveryPlot:
             plt.rcParams['figure.figsize'] = [12, 12]
             plt.rcParams['figure.dpi'] = 200
         else:
-            plt.rcParams['figure.figsize'] = [17, 17]
-            plt.rcParams['figure.dpi'] = 60
+            plt.rcParams['figure.figsize'] = [14, 18]
+            plt.rcParams['figure.dpi'] = 50
+            plt.rcParams.update({'font.size': 15})
 
         # plot the data
         self.fig, self.axs = plt.subplots(5)
+        self.calls=0
 
-    def plot(self):
+    def plot(self, posteriors, y):
         fig, axs = self.fig, self.axs
         loop, save = self.loop, self.save
 
@@ -38,7 +40,6 @@ class eDiscoveryPlot:
         axs[aXn].plot(xs, y_rhat, label='$\hat{R}_{Q}$')
         axs[aXn].plot(xs, y_rhatCC, label='$\hat{R}_{CC}$')
         axs[aXn].plot(xs, y_r, label='$R$')
-        axs[aXn].legend()
         axs[aXn].grid()
         axs[aXn].set_ylabel('Recall')
         axs[aXn].set_ylim(0, 1)
@@ -52,7 +53,7 @@ class eDiscoveryPlot:
         axs[aXn].plot(xs, y_r, label='te-$Pr(\oplus)$')
         axs[aXn].legend()
         axs[aXn].grid()
-        axs[aXn].set_ylabel('Prevalence')
+        axs[aXn].set_ylabel('Pool prevalence')
         aXn += 1
 
         y_ae = df['AE']
@@ -64,14 +65,28 @@ class eDiscoveryPlot:
         axs[aXn].set_ylabel('Quantification error')
         aXn += 1
 
-        axs[aXn].plot(xs, df['MF1_Q'], label='$F_1(clf(Q))$')
-        axs[aXn].plot(xs, df['MF1_Clf'], label='$F_1(clf(CC))$')
+        # classifier performance (not very reliable)
+        #axs[aXn].plot(xs, df['MF1_Q'], label='$F_1(clf(Q))$')
+        #axs[aXn].plot(xs, df['MF1_Clf'], label='$F_1(clf(CC))$')
+        #axs[aXn].legend()
+        #axs[aXn].grid()
+        #axs[aXn].set_ylabel('Classifiers performance')
+        #aXn += 1
+
+        # distribution of posterior probabilities in the pool
+        positive_posteriors = posteriors[y==1,1]
+        negative_posteriors = posteriors[y==0,1]
+        #axs[aXn].hist([negative_posteriors, positive_posteriors], bins=50,
+        #              label=['negative', 'positive'])
+        axs[aXn].hist(negative_posteriors, bins=50, label='negative', density=True, alpha=.75)
+        axs[aXn].hist(positive_posteriors, bins=50, label='positive', density=True, alpha=.75)
         axs[aXn].legend()
         axs[aXn].grid()
-        axs[aXn].set_ylabel('Classifiers performance')
+        axs[aXn].set_xlim(0, 1)
+        axs[aXn].set_ylabel('te-$Pr(\oplus)$ distribution')
         aXn += 1
 
-        axs[aXn].plot(xs, df['Shift'], '--k', label='tr-te shift (AE)')
+        axs[aXn].plot(xs, df['Shift'], '--k', label='shift (AE)')
         axs[aXn].plot(xs, df['tr-prev'], 'y', label='tr-$Pr(\oplus)$')
         axs[aXn].plot(xs, df['te-prev'], 'r', label='te-$Pr(\oplus)$')
         axs[aXn].legend()
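This is the histogram panel the commit message refers to: at each step it overlays the density-normalized distributions of the classifier's positive-class posteriors for negative vs. positive pool documents, replacing the commented-out F1 panel. A standalone sketch of the same pattern, with synthetic posteriors standing in for classifier.predict_proba(pool.instances)[:, 1]:

    import numpy as np
    import matplotlib.pyplot as plt

    rng = np.random.default_rng(0)
    negative_posteriors = rng.beta(2, 5, size=1000)  # mass near 0
    positive_posteriors = rng.beta(5, 2, size=200)   # mass near 1

    plt.hist(negative_posteriors, bins=50, label='negative', density=True, alpha=.75)
    plt.hist(positive_posteriors, bins=50, label='positive', density=True, alpha=.75)
    plt.xlim(0, 1)
    plt.legend()
    plt.show()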
@@ -79,6 +94,16 @@ class eDiscoveryPlot:
         axs[aXn].set_ylabel('Train-Test Shift')
         aXn += 1
 
+        for i in range(aXn):
+            if self.calls==0:
+                # Shrink current axis by 20%
+                box = axs[i].get_position()
+                axs[i].set_position([box.x0, box.y0, box.width * 0.8, box.height])
+                fig.tight_layout()
+
+            # Put a legend to the right of the current axis
+            axs[i].legend(loc='center left', bbox_to_anchor=(1, 0.5))
+
         if save:
             os.makedirs(self.outdir, exist_ok=True)
             plt.savefig(f'{self.outdir}/{self.plotname}')
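The block above applies a standard matplotlib idiom: shrink each axis once to free a right-hand margin, then anchor the legend just outside the axis. The self.calls==0 guard matters because plot() is called on every active-learning iteration, and shrinking on each call would keep narrowing the axes. A minimal standalone version of the idiom:

    import matplotlib.pyplot as plt

    fig, ax = plt.subplots()
    ax.plot([0, 1], [0, 1], label='demo')

    # shrink the axis by 20% once, then place the legend in the freed margin
    box = ax.get_position()
    ax.set_position([box.x0, box.y0, box.width * 0.8, box.height])
    ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))
    plt.show()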
@@ -88,6 +113,8 @@ class eDiscoveryPlot:
         for i in range(aXn):
             axs[i].cla()
 
+        self.calls += 1
+
 
 if __name__ == '__main__':