forked from moreo/QuaPy
Compare commits
18 Commits
master
...
ediscovery
|
@ -0,0 +1,34 @@
|
|||
#!/bin/bash
# Runs main.py once per sampling policy with a fixed dataset, quantifier and
# classifier configuration.
set -x  # echo every command before it is executed

# --- previous experiment grid, kept (disabled) for reference ---------------
#dataset=RCV1.C4
#iter=50
#k=100
#initsize=1000
#initprev=0.5
#seed=1
#
#commons="--dataset $dataset --iter $iter --k $k --initsize $initsize --initprev $initprev --seed $seed"
#
#for Q in PCC ACC PACC EMQ HDy ; do
#    for sampling in relevance_sampling uncertainty_sampling mix_sampling ; do
#        PYTHONPATH='.:..' python3 main.py --quantifier $Q --sampling $sampling $commons
#        PYTHONPATH='.:..' python3 plot.py "./results/$dataset"_"$sampling"_"$Q.csv" 0
#    done
#done

# --- current configuration ------------------------------------------------
dataset=RCV1.C4
iter=40
k=100
initsize=500
initprev=-1   # -1 asks main.py for an initial sample without a forced prevalence
seed=1
# NOTE(review): RPACC is not among the names handled by NewQuantifier in the
# Python code of this changeset -- confirm it is defined elsewhere.
Q=RPACC
CLS=lr

for sampling in relevance_sampling uncertainty_sampling adaptive_sampling mix_sampling ; do

    # NOTE(review): filepath is computed but never used below; main.py derives
    # its own output name when --output is not given.
    filepath="./results/classifier:${CLS}__dataset:${dataset}__initprev:${initprev}__initsize:${initsize}__iter:${iter}__k:${k}__quantifier:${Q}__sampling:${sampling}__seed:${seed}.csv"

    # Every expansion is quoted so a value containing spaces or glob characters
    # is passed to main.py as a single argument.
    PYTHONPATH='.:..' python3 main.py --quantifier "$Q" --classifier "$CLS" --sampling "$sampling" --dataset "$dataset" --iter "$iter" --k "$k" --initsize "$initsize" --initprev "$initprev" --seed "$seed"

done
|
|
@ -0,0 +1,190 @@
|
|||
import sys
|
||||
import sklearn
|
||||
from sklearn.base import BaseEstimator
|
||||
from sklearn.calibration import CalibratedClassifierCV
|
||||
from sklearn.linear_model import LogisticRegression
|
||||
from sklearn.metrics import f1_score
|
||||
from sklearn.svm import LinearSVC, SVC
|
||||
|
||||
import quapy as qp
|
||||
from eDiscovery.method import RegionAdjustment, RegionProbAdjustment, RegionProbAdjustmentGlobal, RegionAdjustmentQ, \
|
||||
ClassWeightPCC, PosteriorConditionalAdjustemnt
|
||||
from quapy.data import LabelledCollection
|
||||
from quapy.method.aggregative import EMQ, CC, PACC, PCC, HDy, ACC
|
||||
import numpy as np
|
||||
from itertools import chain
|
||||
import argparse
|
||||
|
||||
|
||||
|
||||
def NewClassifier(classifiername):
    """Return a new, unfitted classifier for the given short name.

    Supported names:
      'lr'  -- LogisticRegression(class_weight='balanced')
      'svm' -- LinearSVC(class_weight='balanced') wrapped in CalibratedClassifierCV

    Raises:
        ValueError: if `classifiername` is not one of the supported names
            (previously the function fell through and returned None, which only
            failed later, far away from the actual mistake).
    """
    if classifiername == 'lr':
        return LogisticRegression(class_weight='balanced')
    if classifiername == 'svm':
        # alternative that was tried before:
        # SVC(class_weight='balanced', probability=True, kernel='linear')
        return CalibratedClassifierCV(LinearSVC(class_weight='balanced'))
    raise ValueError('unknown classifier', classifiername)
|
||||
|
||||
|
||||
def NewQuantifier(quantifiername, classifiername):
    """Instantiate the (unfitted) quantifier registered under `quantifiername`.

    `classifiername` is forwarded to NewClassifier for the methods that wrap a
    classifier. Raises ValueError('unknown quantifier', quantifiername) when the
    name is not registered.
    """

    def new_learner():
        return NewClassifier(classifiername)

    def build_srsq():
        # supervised regions, then single-label quantification
        # (EMQ over a calibrated classifier and PACC(val_split=0.4) were also tried)
        base_quantifier = ACC(new_learner())
        return RegionAdjustmentQ(base_quantifier, k=4)

    def build_urbq():
        # unsupervised regions, then binary quantifications
        def newQ():
            # PACC(val_split=0.4) and CC over a calibrated classifier were also tried
            return ClassWeightPCC()
        return RegionProbAdjustmentGlobal(newQ, k=10, clustering='kmeans')

    # (name, factory) pairs, checked in the same order as the original if-chain
    registry = (
        ('EMQ', lambda: EMQ(CalibratedClassifierCV(new_learner()))),
        ('CC', lambda: CC(new_learner())),
        ('HDy', lambda: HDy(new_learner())),
        ('PCC', lambda: PCC(new_learner())),
        ('ACC', lambda: ACC(new_learner(), val_split=0.4)),
        ('PACC', lambda: PACC(new_learner(), val_split=0.4)),
        ('CW', lambda: ClassWeightPCC()),
        ('SRSQ', build_srsq),
        ('URBQ', build_urbq),
        ('PCAD', lambda: PosteriorConditionalAdjustemnt()),  # posterior-conditional adjustment
    )
    for name, factory in registry:
        if quantifiername == name:
            return factory()

    raise ValueError('unknown quantifier', quantifiername)
|
||||
|
||||
|
||||
def experiment_name(args: argparse.Namespace):
    """Build a csv file name encoding every attribute of `args`.

    Attributes are listed in alphabetical order as 'name:value' fragments joined
    by a double underscore, followed by the '.csv' extension.
    """
    fragments = []
    for attribute in sorted(vars(args).keys()):
        value = getattr(args, attribute)
        fragments.append(f'{attribute}:{value}')
    return '__'.join(fragments) + '.csv'
|
||||
|
||||
|
||||
def split_from_index(collection: LabelledCollection, index: np.ndarray):
    """Split `collection` in two: the documents at `index` and all the others.

    Returns a pair (selected, rest), both obtained through
    collection.sampling_from_index; `rest` follows ascending position order.
    """
    every_position = np.arange(len(collection))
    # sorted positions that do not appear in `index`
    complement = np.asarray(np.setdiff1d(every_position, index), dtype=int)
    selected = collection.sampling_from_index(index)
    rest = collection.sampling_from_index(complement)
    return selected, rest
|
||||
|
||||
|
||||
def move_documents(target: LabelledCollection, origin: LabelledCollection, idx_origin: np.ndarray):
    """Move the documents at positions `idx_origin` from `origin` into `target`.

    Returns (target plus the moved documents, origin without them); both are
    new collections.
    """
    moved, remaining = split_from_index(origin, idx_origin)
    return target + moved, remaining
|
||||
|
||||
|
||||
def uniform_sampling(pool: LabelledCollection, classifier: BaseEstimator, k: int, *args):
    """Draw `k` distinct pool positions uniformly at random (classifier is ignored)."""
    pool_size = len(pool)
    return np.random.choice(pool_size, size=k, replace=False)
|
||||
|
||||
|
||||
def proportional_sampling(pool: LabelledCollection, classifier: BaseEstimator, k: int, *args):
    """Draw `k` distinct pool positions with probability proportional to the
    posterior found in column 1 of classifier.predict_proba."""
    positive_posterior = classifier.predict_proba(pool.instances)[:, 1].flatten()
    draw_probability = positive_posterior / positive_posterior.sum()
    return np.random.choice(len(pool), k, replace=False, p=draw_probability)
|
||||
|
||||
|
||||
def relevance_sampling(pool: LabelledCollection, classifier: BaseEstimator, k: int, *args):
    """Return the positions of the `k` pool documents with the highest posterior
    in column 1 of classifier.predict_proba, best first."""
    positive_posterior = classifier.predict_proba(pool.instances)[:, 1].flatten()
    ranking = np.argsort(-positive_posterior)  # descending posterior
    return ranking[:k]
|
||||
|
||||
|
||||
def uncertainty_sampling(pool: LabelledCollection, classifier: BaseEstimator, k: int, *args):
    """Return the positions of the `k` pool documents whose posterior (column 1
    of classifier.predict_proba) lies closest to 0.5, closest first."""
    positive_posterior = classifier.predict_proba(pool.instances)[:, 1].flatten()
    distance_to_boundary = np.abs(positive_posterior - 0.5)
    ranking = np.argsort(distance_to_boundary)
    return ranking[:k]
|
||||
|
||||
|
||||
def mix_sampling(pool: LabelledCollection, classifier: BaseEstimator, k: int, *args):
    """Alternate between the relevance and the uncertainty rankings.

    Both rankings are interleaved (relevance first), repeated positions are
    dropped keeping their first occurrence, and the first `k` survivors are
    returned.
    """
    by_relevance = relevance_sampling(pool, classifier, k)
    by_uncertainty = uncertainty_sampling(pool, classifier, k)
    pairs = zip(by_relevance, by_uncertainty)
    interleaved = np.asarray(list(chain.from_iterable(pairs)))
    # np.unique reports where each distinct value first appears; sorting those
    # places restores the interleaved order without the duplicates
    first_seen_at = np.unique(interleaved, return_index=True)[1]
    keep = np.sort(first_seen_at)
    return interleaved[keep][:k]
|
||||
|
||||
|
||||
def adaptive_sampling(pool: LabelledCollection, classifier: BaseEstimator, k: int, progress: float):
    """Split the batch between relevance and uncertainty sampling by `progress`.

    `progress` is read as a percentage: int(k*progress/100) positions come from
    relevance sampling and the remainder from uncertainty sampling. The result
    is the sorted set of distinct positions, so it can hold fewer than `k`
    entries when both criteria pick the same document.
    """
    n_by_relevance = int(k*progress/100)
    n_by_uncertainty = k - n_by_relevance
    picked = np.concatenate([
        relevance_sampling(pool, classifier, n_by_relevance),
        uncertainty_sampling(pool, classifier, n_by_uncertainty),
    ])
    return np.unique(picked)
|
||||
|
||||
|
||||
def negative_sampling_index(pool: LabelledCollection, classifier: BaseEstimator, k: int):
    """Return the positions of the `k` pool documents with the highest posterior
    in column 0 of classifier.predict_proba, highest first."""
    negative_posterior = classifier.predict_proba(pool.instances)[:, 0].flatten()
    ranking = np.argsort(-negative_posterior)  # descending posterior
    return ranking[:k]
|
||||
|
||||
|
||||
def recall(train_prev, pool_prev, train_size, pool_size):
    """Fraction of all positives that already sit in the training set.

    `train_prev` and `pool_prev` are prevalence vectors whose entry 1 is the
    positive prevalence; the sizes are the number of documents in each set.
    """
    positives_in_train = train_prev[1] * train_size
    positives_in_pool = pool_prev[1] * pool_size
    return positives_in_train / (positives_in_train + positives_in_pool)
|
||||
|
||||
|
||||
def create_dataset(datasetname):
    """Build the LabelledCollection registered under `datasetname`.

    Supported names:
      'imdb.10K.75p' -- 10000 documents sampled from the IMDB training set via
                        data.training.sampling(10000, 0.75)
      'RCV1.C4'      -- RCV1 training subset, binarized on the first category
                        whose prevalence lies strictly between 0.1 and 0.2
      'hp'           -- the 'hp' reviews dataset (training + test) with its
                        binary labels flipped

    For any other name a message is printed and the process exits with a
    non-zero status (it used to exit with status 0, which told callers the run
    had succeeded).
    """
    if datasetname == 'imdb.10K.75p':
        data = qp.datasets.fetch_reviews('imdb', tfidf=True, min_df=5)
        collection = data.training.sampling(10000, 0.75)
        return collection

    elif datasetname == 'RCV1.C4':
        # explicit submodule import: `import sklearn` alone does not guarantee
        # that the `sklearn.datasets` attribute has been loaded
        from sklearn.datasets import fetch_rcv1
        X, y = fetch_rcv1(subset='train', return_X_y=True)
        y = y.toarray()
        prev = y.mean(axis=0).flatten()
        # choose the first category having a positive prevalence between [0.1,0.2] (realistic scenario for e-Discovery)
        # this category happens to be the cat with id 4
        target_cat = np.argwhere(np.logical_and(prev > 0.1, prev < 0.2)).flatten()[0]
        print('chosen cat', target_cat)
        y = y[:, target_cat].flatten()
        return LabelledCollection(X, y)

    elif datasetname == 'hp':
        data = qp.datasets.fetch_reviews('hp', tfidf=True, min_df=5)
        collection = data.training + data.test
        # flip the two labels (0 <-> 1)
        collection = LabelledCollection(instances=collection.instances, labels=1-collection.labels)
        return collection

    print(f'unknown dataset {datasetname}. Abort')
    sys.exit(1)
|
||||
|
||||
|
||||
def estimate_prev_CC(train, pool: LabelledCollection, classifiername:str):
    """Fit CC on `train` and estimate the prevalence of `pool`.

    Returns (estimated prevalence of pool.instances, the quantifier's learner).
    """
    quantifier = CC(NewClassifier(classifiername))
    quantifier = quantifier.fit(train)
    pool_prevalence = quantifier.quantify(pool.instances)
    return pool_prevalence, quantifier.learner
|
||||
|
||||
|
||||
def estimate_prev_Q(train, pool, quantifiername, classifiername):
    """Fit the quantifier named `quantifiername` on `train` and apply it to `pool`.

    Returns (estimated prevalence of pool.instances, the fitted quantifier).
    """
    quantifier = NewQuantifier(quantifiername, classifiername)
    # quantifier._find_regions((train+pool).instances)
    quantifier.fit(train)
    pool_prevalence = quantifier.quantify(pool.instances)
    return pool_prevalence, quantifier
|
||||
|
||||
|
||||
def eval_classifier(learner, test:LabelledCollection):
    """Return the binary F1 of `learner` on the labelled collection `test`."""
    predicted = learner.predict(test.instances)
    return f1_score(test.labels, predicted, average='binary')
|
||||
|
||||
|
||||
def ideal_cost(classifier, pool):
    """Number of pool documents to review until the last relevant one is processed.

    The pool is ranked by ascending posterior of column 0 of
    classifier.predict_proba (the negative class), i.e., by decreasing
    relevance. The cost is "idealized" because it assumes the optimal stopping
    point (right after the last relevant document) is known.

    Returns:
        int: how many documents, counted from the top of the ranking, must be
        reviewed to reach the last relevant one; 0 when the pool holds no
        relevant document (an empty pool included).

    The previous implementation returned the 0-based rank of the last relevant
    document, one less than the number of reviewed documents, and raised on an
    empty pool.
    """
    prob = classifier.predict_proba(pool.instances)
    order = np.argsort(prob[:, 0])  # col 0 has negative posterior prob, so the natural order is "by relevance"
    ranked_labels = pool.labels[order]
    relevant_ranks = np.flatnonzero(ranked_labels)  # 0-based ranks holding a relevant document
    if relevant_ranks.size == 0:
        return 0
    return int(relevant_ranks[-1]) + 1
|
|
@ -0,0 +1,154 @@
|
|||
import os.path
|
||||
import pathlib
|
||||
|
||||
from sklearn.metrics import f1_score
|
||||
import functions as fn
|
||||
import quapy as qp
|
||||
import argparse
|
||||
from quapy.data import LabelledCollection
|
||||
from plot import eDiscoveryPlot, InOutDistPlot
|
||||
|
||||
|
||||
def main(args):
    """Simulate an active-learning review and log one TSV row per iteration.

    Reads from `args`: dataset, k (batch size), initsize, sampling (name of a
    function in `fn`), iter (maximum number of iterations), classifier,
    quantifier, seed, initprev and output (path of the TSV file to write).

    At every iteration a CC quantifier is fit on the labelled documents; from
    iteration `skip_first_steps` on, the chosen quantifier is fit as well and a
    row of statistics is written. The loop then asks the sampling function for
    a batch of pool documents and moves it into the training set, stopping when
    fewer than k documents remain in the pool or when i+1 equals the maximum
    number of iterations.

    NOTE(review): the indentation of this function was reconstructed (it is not
    preserved in the reviewed copy). The statements up to fig_dist.plot(...)
    are placed inside `if i >= skip_first_steps` because they read
    pool_p_hat_q, which is only assigned in that branch -- confirm against the
    original file.
    """

    datasetname = args.dataset
    k = args.k
    init_nD = args.initsize
    sampling_fn = getattr(fn, args.sampling)  # sampling policy looked up by name in the functions module
    max_iterations = args.iter
    clf_name = args.classifier
    q_name = args.quantifier

    # the dataset is built by fn.create_dataset and handled through qp.util.pickled_resource
    collection = qp.util.pickled_resource(f'./dataset/{datasetname}.pkl', fn.create_dataset, datasetname)
    nD = len(collection)

    # fig = eDiscoveryPlot(args.output)
    fig_dist = InOutDistPlot()

    # the quantifier is only fit (and rows are only logged) from this iteration on
    skip_first_steps = 1

    with qp.util.temp_seed(args.seed):
        # initial labelled data selection
        if args.initprev == -1:
            idx = collection.sampling_index(init_nD)
        else:
            idx = collection.sampling_index(init_nD, *[1 - args.initprev, args.initprev])
        train, pool = fn.split_from_index(collection, idx)

        # recall_target = 0.99
        i = 0

        with open(args.output, 'wt') as foo:
            def tee(msg):
                # write the line to the results file (flushed immediately) and echo it on stdout
                foo.write(msg + '\n')
                foo.flush()
                print(msg)

            # header of the tab-separated results file
            tee('it\t%\ttr-size\tte-size\ttr-prev\tte-prev\tte-estim\tte-estimCC\tR\tRhat\tRhatCC\tShift\tAE\tAE_CC'
                '\tMF1_Q\tMF1_Clf\tICost\tremaining\tba-prev\tba-estim')

            # statistics of the last selected batch; q stays None until the quantifier is first fit
            batch_prev_estim, batch_prev_true, q = 0, 0, None

            while True:

                pool_p_hat_cc, classifier = fn.estimate_prev_CC(train, pool, clf_name)
                ideal_cost = fn.ideal_cost(classifier, pool)

                nDtr = len(train)
                nDte = len(pool)
                progress = 100 * nDtr / nD  # percentage of the collection already labelled

                if i >= skip_first_steps:
                    pool_p_hat_q, q = fn.estimate_prev_Q(train, pool, q_name, clf_name)

                    # classifier evaluation is disabled; the two columns are logged as 0
                    f1_clf = 0 # eval_classifier(classifier, pool)
                    f1_q = 0 #eval_classifier(q_classifier, pool)

                    tr_p = train.prevalence()
                    te_p = pool.prevalence()

                    # this is based on an observation by D.Lewis "it is convenient to have the same kind of systematic"
                    # error both in the numerator and in the denominator
                    #tr_p_hat = q.quantify(train.instances)
                    #r_hat_q = fn.recall(tr_p_hat, pool_p_hat_q, nDtr, nDte)

                    # recall estimated with CC, estimated with the quantifier, and true recall
                    r_hat_cc = fn.recall(tr_p, pool_p_hat_cc, nDtr, nDte)
                    r_hat_q = fn.recall(tr_p, pool_p_hat_q, nDtr, nDte)
                    r = fn.recall(tr_p, te_p, nDtr, nDte)
                    tr_te_shift = qp.error.ae(tr_p, te_p)

                    ae_q = qp.error.ae(te_p, pool_p_hat_q)
                    ae_cc = qp.error.ae(te_p, pool_p_hat_cc)

                    tee(f'{i}\t{progress:.2f}\t{nDtr}\t{nDte}\t{tr_p[1]:.3f}\t{te_p[1]:.3f}\t{pool_p_hat_q[1]:.3f}\t{pool_p_hat_cc[1]:.3f}'
                        f'\t{r:.3f}\t{r_hat_q:.3f}\t{r_hat_cc:.3f}\t{tr_te_shift:.5f}\t{ae_q:.4f}\t{ae_cc:.4f}\t{f1_q:.3f}\t{f1_clf:.3f}'
                        f'\t{ideal_cost}\t{pool.labels.sum()}\t{batch_prev_true}\t{batch_prev_estim:.3f}')

                    posteriors = classifier.predict_proba(pool.instances)
                    in_posteriors = classifier.predict_proba(train.instances)
                    # fig.plot(posteriors, pool.labels)
                    fig_dist.plot(in_posteriors, train.labels, posteriors, pool.labels)

                if nDte < k:
                    print('[stop] too few documents remaining')
                    break
                elif i+1 == max_iterations:
                    print('[stop] maximum number of iterations reached')
                    break

                # positions (within the pool) of the next batch to be labelled
                top_relevant_idx = sampling_fn(pool, classifier, k, progress)

                if q is not None:
                    # true and estimated positive prevalence of the batch, logged in the next row
                    batch = pool.sampling_from_index(top_relevant_idx)
                    batch_prev_estim = q.quantify(batch.instances)[1]
                    batch_prev_true = batch.prevalence()[1]

                train, pool = fn.move_documents(train, pool, top_relevant_idx)

                i += 1
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Command-line entry point: parse the experiment settings, resolve the
    # output file, make sure its directory exists, echo the settings and run.
    parser = argparse.ArgumentParser(description='e-Discovery')
    parser.add_argument('--dataset', metavar='DATASET', type=str, help='Dataset name',
                        default='RCV1.C4')
    parser.add_argument('--quantifier', metavar='METHOD', type=str, help='Quantification method',
                        default='EMQ')
    parser.add_argument('--sampling', metavar='SAMPLING', type=str, help='Sampling criterion',
                        default='relevance_sampling')
    # The default used to be 0.5, which is not an int and can never equal an
    # iteration count, so it already meant "no limit"; -1 is the value the help
    # text documents for that.
    parser.add_argument('--iter', metavar='INT', type=int, help='number of iterations (-1 to set no limit)',
                        default=-1)
    parser.add_argument('--k', metavar='BATCH', type=int, help='number of documents in a batch',
                        default=100)
    parser.add_argument('--initsize', metavar='SIZE', type=int, help='number of labelled documents at the beginning',
                        default=10)
    parser.add_argument('--initprev', metavar='PREV', type=float,
                        help='prevalence of the initial sample (-1 for uniform sampling)',
                        default=-1)
    parser.add_argument('--seed', metavar='SEED', type=int,
                        help='random seed',
                        default=1)
    parser.add_argument('--classifier', metavar='CLS', type=str,
                        help='classifier type (svm, lr)',
                        default='lr')
    parser.add_argument('--output', metavar='OUT', type=str,
                        help="name of the file containing the results of the experiment (default is an automatic "
                             "filename based on the model's parameters in the folder './results/')",
                        default=None)
    args = parser.parse_args()

    # Validate with parser.error rather than assert: assertions are stripped
    # under `python -O`, and the old message blamed initsize instead of initprev.
    if not (args.initprev == -1.0 or (0 < args.initprev < 1)):
        parser.error('wrong value for initprev; should be -1 or in (0., 1.)')
    if args.initprev==-1:  # this is to clean the path, to show initprev:-1 and not initprev:-1.0
        args.initprev = int(args.initprev)

    if args.output is None:
        outputdir = './results'
        # args.output is still None here, so it is part of the generated name
        args.output = os.path.join(outputdir, fn.experiment_name(args))
    else:
        # Use the complete parent path; `.parent.name` kept only its last
        # component, so 'a/b/out.csv' created './b' instead of 'a/b'.
        parent = pathlib.Path(args.output).parent
        outputdir = '' if parent == pathlib.Path('.') else str(parent)
    if outputdir:
        qp.util.create_if_not_exist(outputdir)

    for k,v in args.__dict__.items():
        print(f'{k}={v}')

    main(args)
|
|
@ -0,0 +1,412 @@
|
|||
from typing import Union
|
||||
import numpy as np
|
||||
from sklearn.base import BaseEstimator, clone
|
||||
from sklearn.cluster import KMeans, OPTICS
|
||||
from sklearn.decomposition import TruncatedSVD
|
||||
from sklearn.linear_model import LogisticRegression
|
||||
from sklearn.mixture import GaussianMixture
|
||||
from sklearn.model_selection import cross_val_predict
|
||||
|
||||
from quapy.method.base import BaseQuantifier, BinaryQuantifier
|
||||
from quapy.data import LabelledCollection
|
||||
from quapy.method.aggregative import ACC, PACC, PCC
|
||||
import quapy.functional as F
|
||||
|
||||
|
||||
class RegionAdjustmentQ(BaseQuantifier):
    """Binary quantifier that delegates to a quantifier trained on k-means regions.

    Positives and negatives are clustered separately; the cluster ids become the
    classes of an auxiliary problem handed to the wrapped `quantifier`. At
    quantification time the estimated region prevalences are added up per
    original class.
    """

    def __init__(self, quantifier: BaseQuantifier, k=10):
        self.quantifier = quantifier
        self.k = k  # number of regions

    def fit(self, data: LabelledCollection):
        """Cluster each class of `data` into regions and fit the wrapped quantifier on them."""
        X, y = data.Xy
        Xp, Xn = X[y==1], X[y==0]

        # the k regions are shared out among the two classes proportionally to their prevalence
        nk_per_class = (data.prevalence() * self.k).round().astype(int)
        print(f'number of regions per class {nk_per_class}')

        kmeans_neg = KMeans(n_clusters=nk_per_class[0])
        rn = kmeans_neg.fit_predict(Xn)  # regions negative

        kmeans_pos = KMeans(n_clusters=nk_per_class[1])
        # positive region ids are shifted so that they come after the negative ones
        rp = kmeans_pos.fit_predict(Xp) + nk_per_class[0]  # regions positive

        classes = np.arange(self.k)
        pos = LabelledCollection(Xp, rp, classes_=classes)
        neg = LabelledCollection(Xn, rn, classes_=classes)

        region_data = pos + neg
        self.quantifier.fit(region_data)

        # region id -> original class (ids below nk_per_class[0] are negative regions)
        self.reg2class = {r: (0 if r < nk_per_class[0] else 1) for r in range(2 * self.k)}

        return self

    def quantify(self, instances):
        """Return the [negative, positive] prevalence obtained by summing the region prevalences."""
        region_prevalence = self.quantifier.quantify(instances)
        # builtin float replaces the `np.float` alias, which NumPy no longer provides
        bin_prevalence = np.zeros(shape=2, dtype=float)
        for r, prev in enumerate(region_prevalence):
            bin_prevalence[self.reg2class[r]] += prev
        return bin_prevalence

    def set_params(self, **parameters):
        # not implemented: parameters are ignored
        pass

    def get_params(self, deep=True):
        # not implemented: returns None
        pass

    @property
    def classes_(self):
        return np.asarray([0,1])
|
||||
|
||||
|
||||
class RegionAdjustment(ACC):
    """ACC applied to k-means regions computed separately for each class.

    The learner is trained to predict the region of a document; the estimated
    region prevalences are then added up per original class.
    """

    def __init__(self, learner: BaseEstimator, val_split=0.4, k=2):
        self.learner = learner
        self.val_split = val_split
        # k is the total number of regions (clusters of k-means); fit() shares
        # them out among the two classes proportionally to their prevalence
        self.k = k

    def fit(self, data: LabelledCollection, fit_learner=True, val_split: Union[float, int, LabelledCollection] = None):
        """Cluster each class of `data` into regions and fit ACC on the region labels."""
        X, y = data.Xy
        Xp, Xn = X[y==1], X[y==0]

        nk_per_class = (data.prevalence() * self.k).round().astype(int)
        print(f'number of clusters per class {nk_per_class}')

        kmeans_neg = KMeans(n_clusters=nk_per_class[0])
        rn = kmeans_neg.fit_predict(Xn)  # regions negative

        kmeans_pos = KMeans(n_clusters=nk_per_class[1])
        # positive region ids are shifted so that they come after the negative ones
        rp = kmeans_pos.fit_predict(Xp) + nk_per_class[0]  # regions positive

        classes = np.arange(self.k)
        pos = LabelledCollection(Xp, rp, classes_=classes)
        neg = LabelledCollection(Xn, rn, classes_=classes)

        region_data = pos + neg
        # This call used to be super(RegionProbAdjustment, self).fit(...); self is
        # not an instance of that class, so it raised TypeError on every fit.
        super().fit(region_data, fit_learner, val_split)

        # region id -> original class (ids below nk_per_class[0] are negative regions)
        self.reg2class = {r: (0 if r < nk_per_class[0] else 1) for r in range(2 * self.k)}

        return self

    def classify(self, data):
        """Return the regions predicted by the parent class for `data`."""
        regions = super().classify(data)
        return regions

    def aggregate(self, classif_predictions):
        """Return the [negative, positive] prevalence obtained by summing the region prevalences."""
        region_prevalence = super().aggregate(classif_predictions)
        # builtin float replaces the `np.float` alias, which NumPy no longer provides
        bin_prevalence = np.zeros(shape=2, dtype=float)
        for r, prev in enumerate(region_prevalence):
            bin_prevalence[self.reg2class[r]] += prev
        return bin_prevalence
|
||||
|
||||
|
||||
class RegionProbAdjustment(PACC):
    """PACC applied to k-means regions computed separately for each class.

    The learner is trained to predict the region of a document; the estimated
    region prevalences are then added up per original class.
    """

    def __init__(self, learner: BaseEstimator, val_split=0.4, k=2):
        self.learner = learner
        self.val_split = val_split
        # lets say k is the number of regions (here: clusters of k-means) for all classes
        self.k = k

    def fit(self, data: LabelledCollection, fit_learner=True, val_split: Union[float, int, LabelledCollection] = None):
        """Cluster each class of `data` into regions and fit PACC on the region labels."""
        X, y = data.Xy
        Xp, Xn = X[y==1], X[y==0]
        # the k regions are shared out among the two classes proportionally to their prevalence
        nk_per_class = (data.prevalence()*self.k).round().astype(int)
        print(f'number of clusters per class {nk_per_class}')

        kmeans_neg = KMeans(n_clusters=nk_per_class[0])
        rn = kmeans_neg.fit_predict(Xn)  # regions negative

        kmeans_pos = KMeans(n_clusters=nk_per_class[1])
        # positive region ids are shifted so that they come after the negative ones
        rp = kmeans_pos.fit_predict(Xp)+nk_per_class[0]  # regions positive

        classes = np.arange(self.k)
        pos = LabelledCollection(Xp, rp, classes_=classes)
        neg = LabelledCollection(Xn, rn, classes_=classes)

        region_data = pos + neg
        super(RegionProbAdjustment, self).fit(region_data, fit_learner, val_split)

        # region id -> original class (ids below nk_per_class[0] are negative regions)
        self.reg2class = {r:(0 if r < nk_per_class[0] else 1) for r in range(2*self.k)}

        return self

    def classify(self, data):
        """Return the regions predicted by the parent class for `data`."""
        regions = super(RegionProbAdjustment, self).classify(data)
        return regions

    def aggregate(self, classif_predictions):
        """Return the [negative, positive] prevalence obtained by summing the region prevalences."""
        region_prevalence = super(RegionProbAdjustment, self).aggregate(classif_predictions)
        # builtin float replaces the `np.float` alias, which NumPy no longer provides
        bin_prevalence = np.zeros(shape=2, dtype=float)
        for r, prev in enumerate(region_prevalence):
            bin_prevalence[self.reg2class[r]] += prev
        return bin_prevalence
|
||||
|
||||
|
||||
class RegionProbAdjustmentGlobal(BaseQuantifier):
    """Quantifier that partitions the feature space and quantifies each region on its own.

    The training instances (all classes together) are clustered into regions and
    one quantifier is fit per region. The prevalence of a sample is the average
    of the per-region estimates weighted by the share of the sample that falls
    in each region.

    `quantifier_fn` is invoked with no arguments (quantifier_fn()) to create the
    quantifier of each region; `clustering` is one of 'gmm', 'kmeans', 'optics'.
    """

    def __init__(self, quantifier_fn: BaseQuantifier, k=5, clustering='gmm'):
        self.quantifier_fn = quantifier_fn
        self.k = k  # number of clusters (only read by the 'kmeans' option)
        self.clustering = clustering

    def _find_regions(self, X):
        """Fit the clustering model on X and return the region id of every row."""
        if self.clustering == 'gmm':
            self.svd = TruncatedSVD(n_components=500)
            X = self.svd.fit_transform(X)

            # model selection: keep the mixture (3 to 7 components) with the lowest BIC
            lowest_bic = np.inf  # np.inf replaces the `np.infty` alias, which NumPy no longer provides
            bic = []
            for n_components in range(3, 8):
                # Fit a Gaussian mixture with EM
                gmm = GaussianMixture(n_components).fit(X)
                bic.append(gmm.bic(X))
                print(bic)
                if bic[-1] < lowest_bic:
                    lowest_bic = bic[-1]
                    best_gmm = gmm
            print(f'choosen GMM with {len(best_gmm.weights_)} components')
            self.cluster = best_gmm
            regions = self.cluster.predict(X)
        elif self.clustering == 'kmeans':
            print(f'kmeans with k={self.k}')
            self.cluster = KMeans(n_clusters=self.k)
            regions = self.cluster.fit_predict(X)
        elif self.clustering == 'optics':
            print('optics')
            self.svd = TruncatedSVD(n_components=500)
            X = self.svd.fit_transform(X)
            self.cluster = OPTICS()
            regions = self.cluster.fit_predict(X)
        else:
            raise NotImplementedError
        return regions

    def _get_regions(self, X):
        """Return the region id of every row of X according to the fitted clustering model."""
        if self.clustering == 'gmm':
            return self.cluster.predict(self.svd.transform(X))
        elif self.clustering == 'kmeans':
            return self.cluster.predict(X)
        elif self.clustering == 'optics':
            # NOTE(review): scikit-learn's OPTICS does not seem to offer a
            # predict() method -- confirm; if so this branch cannot work as is.
            return self.cluster.predict(self.svd.transform(X))
        else:
            raise NotImplementedError

    def fit(self, data: LabelledCollection, fit_learner=True, val_split: Union[float, int, LabelledCollection] = None):
        """Find the regions of `data` and fit one quantifier per region.

        `fit_learner` and `val_split` are accepted but not used.
        """
        self.classes = data.classes_

        # first k-means (all classes involved), then PACC local to each cluster
        g = self._find_regions(data.instances)
        # g = self._get_regions(data.instances)
        X, y = data.Xy
        self.g_quantifiers = {}
        trivial, trivial_data = 0, 0
        for gi in np.unique(g):
            qi_data = LabelledCollection(X[g==gi], y[g==gi], classes_=data.classes_)
            if qi_data.counts()[1] <= 1:
                # check for <= 1 instead of prevalence==0, since PACC requires at least two
                # examples for performing stratified split
                # some class is (almost) empty
                # if qi_data.prevalence()[0] == 1: # all negatives
                self.g_quantifiers[gi] = TrivialRejectorQuantifier()
                trivial+=1
                trivial_data += len(qi_data)
            elif qi_data.counts()[0] <= 1:  # (almost) all positives
                self.g_quantifiers[gi] = TrivialAcceptorQuantifier()
                trivial += 1
                trivial_data += len(qi_data)
            else:
                self.g_quantifiers[gi] = self.quantifier_fn().fit(qi_data)
        print(f'trivials={trivial} amounting to {trivial_data*100.0/len(data):.2f}% of the data')

        return self

    @property
    def classes_(self):
        return self.classes

    def quantify(self, instances):
        """Return the prevalence of `instances` as the weighted average of the per-region estimates."""
        # g = self.cluster.predict(instances)
        g = self._get_regions(instances)
        # builtin float replaces the `np.float` alias, which NumPy no longer provides
        prevalence = np.zeros(len(self.classes_), dtype=float)
        for gi in np.unique(g):
            proportion_gi = (g==gi).mean()  # share of the sample falling in region gi
            prev_gi = self.g_quantifiers[gi].quantify(instances[g==gi])
            prevalence += prev_gi * proportion_gi
        return prevalence

    def get_params(self, deep=True):
        # not implemented: returns None
        pass

    def set_params(self, **parameters):
        # not implemented: parameters are ignored
        pass
|
||||
|
||||
|
||||
class TrivialRejectorQuantifier(BinaryQuantifier):
    """Constant quantifier that assigns all the mass to class 0, whatever the input."""

    @property
    def classes_(self):
        return np.asarray([0, 1])

    def fit(self, data: LabelledCollection):
        # there is nothing to learn
        return self

    def quantify(self, instances):
        # [prevalence of class 0, prevalence of class 1]
        return np.asarray([1, 0])

    def get_params(self, deep=True):
        return None

    def set_params(self, **parameters):
        return None
|
||||
|
||||
|
||||
class TrivialAcceptorQuantifier(BinaryQuantifier):
    """Constant quantifier that assigns all the mass to class 1, whatever the input."""

    @property
    def classes_(self):
        return np.asarray([0, 1])

    def fit(self, data: LabelledCollection):
        # there is nothing to learn
        return self

    def quantify(self, instances):
        # [prevalence of class 0, prevalence of class 1]
        return np.asarray([0, 1])

    def get_params(self, deep=True):
        return None

    def set_params(self, **parameters):
        return None
|
||||
|
||||
|
||||
class ClassWeightPCC(BaseQuantifier):
    """PCC whose classifier is re-trained with class weights derived from a first guess.

    A PACC quantifier provides a preliminary prevalence estimate of the sample;
    the ratio between that estimate and the training prevalence is turned into
    class weights, a copy of the classifier is trained with them, and PCC over
    that classifier yields the final estimate.
    """

    def __init__(self, estimator=LogisticRegression):
        self.estimator = estimator
        self.learner = PACC(self.estimator())
        self.deployed = False

    @property
    def classes_(self):
        return self.train.classes_

    def get_params(self, deep=True):
        return self.learner.get_params()

    def set_params(self, **parameters):
        self.learner.set_params(**parameters)

    def fit(self, data: LabelledCollection, fit_learner=True):
        """Keep the training set and fit the preliminary PACC quantifier on it."""
        self.train = data
        self.learner.fit(self.train)
        return self

    def _get_class_weight(self, prevalence):
        """Map the guessed prevalence to a {class: weight} dictionary."""
        ratio = prevalence / self.train.prevalence()
        smallest = ratio.min()
        # a non-positive minimum cannot be used as normalization factor
        divisor = 1E-3 if smallest <= 0 else smallest
        ratio /= divisor
        return {0: ratio[0], 1: ratio[1]}

    def quantify(self, instances):
        """Estimate the prevalence of `instances` with the re-weighted PCC."""
        first_guess = self.learner.quantify(instances)
        reweighted = clone(self.learner.learner)
        reweighted.set_params(class_weight=self._get_class_weight(first_guess))
        return PCC(reweighted).fit(self.train).quantify(instances)
|
||||
|
||||
|
||||
class PosteriorConditionalAdjustemnt(BaseQuantifier):
    """Probabilistic adjustment computed separately for buckets of the positive posterior.

    Training documents are sorted by their cross-validated positive posterior
    and cut into `k` consecutive buckets; each bucket gets its own adjustment
    matrix. At quantification time the documents are assigned to buckets by
    posterior, the soft counts of each bucket are adjusted with the matrix of
    that bucket, and the results are averaged weighted by bucket size.
    """

    def __init__(self):
        self.estimator = LogisticRegression()
        self.k = 3  # number of posterior buckets

    def get_adjustment_matrix(self, y, prob):
        """Return the 2x2 matrix whose column j is the mean posterior vector of
        the documents with true class j; the identity column is used for a class
        with no document in `y`."""
        n_classes = 2
        classes = [0, 1]
        confusion = np.empty(shape=(n_classes, n_classes))
        for i, class_ in enumerate(classes):
            index = y == class_
            if index.any():
                confusion[i] = prob[index].mean(axis=0)
            elif i == 0:
                confusion[i] = np.asarray([1, 0])
            else:
                confusion[i] = np.asarray([0, 1])

        confusion = confusion.T
        return confusion

    def fit(self, data: LabelledCollection, fit_learner=True):
        """Compute the per-bucket adjustment matrices and fit the final classifier."""
        # kept so that the classes_ property has something to read
        # (previously self.train was never assigned)
        self.train = data
        X, y = data.Xy
        proba = cross_val_predict(self.estimator, X, y, n_jobs=-1, method='predict_proba')

        order = np.argsort(proba[:,1])
        proba = proba[order]
        y = y[order]
        X = X[order]  # to keep the alignment for the final classifier
        n = len(data)
        bucket_size = n // self.k
        self.buckets = {}
        self.prob_separations = []
        for bucket in range(self.k):
            from_pos = bucket*bucket_size
            # The last bucket runs to the end of the data. The former remainder
            # `n % bucket_size` is not the number of left-over documents in
            # general (n=8, k=3 gives 0 instead of 2), which silently dropped
            # the highest-posterior documents.
            to_pos = n if bucket == self.k-1 else from_pos + bucket_size
            slice_b = slice(from_pos, to_pos)
            y_b = y[slice_b]
            proba_b = proba[slice_b]
            self.buckets[bucket] = self.get_adjustment_matrix(y_b, proba_b)
            # upper posterior limit of the bucket
            self.prob_separations.append(proba_b[-1,1])
        self.prob_separations[-1] = 1  # the last one should account for the entire prob

        self.estimator.fit(X,y)
        return self

    def quantify(self, instances):
        """Return the [negative, positive] prevalence estimate for `instances`."""
        proba = self.estimator.predict_proba(instances)
        #proba = sorted(proba, key=lambda p:p[1])

        # builtin float replaces the `np.float` alias, which NumPy no longer provides
        prev = np.zeros(shape=2, dtype=float)
        n = proba.shape[0]
        pos_posterior = proba[:,1]
        last_bucket = len(self.prob_separations) - 1
        last_prob_sep = 0
        for b, prob_sep in enumerate(self.prob_separations):
            in_bucket = pos_posterior >= last_prob_sep
            if b == last_bucket:
                # closed on the right, so that a posterior of exactly 1 is not left out
                in_bucket = np.logical_and(in_bucket, pos_posterior <= prob_sep)
            else:
                in_bucket = np.logical_and(in_bucket, pos_posterior < prob_sep)
            proba_b = proba[in_bucket]
            last_prob_sep=prob_sep
            if proba_b.size > 0:
                pcc_b = F.prevalence_from_probabilities(proba_b, binarize=False)
                adj_matrix = self.buckets[b]
                pacc_b = ACC.solve_adjustment(adj_matrix, pcc_b)
                bucket_prev = proba_b.shape[0] / n  # share of the sample in this bucket
                print(f'bucket {b} -> {F.strprev(pacc_b)} with prop {bucket_prev:.4f}')
                prev += (pacc_b*bucket_prev)

        print(F.strprev(prev))
        return prev

    def set_params(self, **parameters):
        # the classifier lives in self.estimator; `self.learner` does not exist in this class
        self.estimator.set_params(**parameters)

    def get_params(self, deep=True):
        return self.estimator.get_params()

    @property
    def classes_(self):
        return self.train.classes_
|
|
@ -0,0 +1,211 @@
|
|||
import matplotlib.pyplot as plt
|
||||
import pandas as pd
|
||||
import sys, os, pathlib
|
||||
|
||||
|
||||
class eDiscoveryPlot:
|
||||
|
||||
def __init__(self, datapath, outdir='./plots', loop=True, save=True, showYdist=False, showCost=True, refreshEach=10):
    """Store the plot settings, configure matplotlib and create the figure.

    datapath is the file later read by plot(); the name of the image is the
    file name of datapath with ".csv" replaced by ".png". The figure has four
    panels plus one for each of showYdist and showCost that is enabled.

    NOTE(review): the indentation of this method was reconstructed (it is not
    preserved in the reviewed copy); the font-size update is assumed to belong
    to the `else` branch -- confirm against the original file.
    """
    self.outdir = outdir
    self.datapath = datapath
    self.plotname = pathlib.Path(datapath).name.replace(".csv", ".png")
    self.loop = loop
    self.save = save
    self.showYdist = showYdist
    self.showCost = showCost
    self.refreshEach = refreshEach

    # number of stacked panels
    nPlots = 4
    if showYdist:
        nPlots+=1
    if showCost:
        nPlots += 1

    # figure size and resolution depend on the `loop` flag
    if not loop:
        plt.rcParams['figure.figsize'] = [12, 12]
        plt.rcParams['figure.dpi'] = 200
    else:
        plt.rcParams['figure.figsize'] = [14, 18]
        plt.rcParams['figure.dpi'] = 50
        plt.rcParams.update({'font.size': 15})

    # plot the data
    self.fig, self.axs = plt.subplots(nPlots)
    self.calls=0
|
||||
|
||||
def plot(self):
|
||||
|
||||
# if (self.calls+1) % self.refreshEach != 0:
|
||||
# self.calls+=1
|
||||
# return
|
||||
|
||||
fig, axs = self.fig, self.axs
|
||||
loop, save = self.loop, self.save
|
||||
|
||||
aXn = 0
|
||||
df = pd.read_csv(self.datapath, sep='\t')
|
||||
|
||||
xs = df['it']
|
||||
|
||||
y_r = df['R']
|
||||
y_rhat = df['Rhat']
|
||||
y_rhatCC = df['RhatCC']
|
||||
axs[aXn].plot(xs, y_rhat, label='$\hat{R}_{Q}$')
|
||||
axs[aXn].plot(xs, y_rhatCC, label='$\hat{R}_{CC}$')
|
||||
axs[aXn].plot(xs, y_r, label='$R$')
|
||||
axs[aXn].grid()
|
||||
axs[aXn].set_ylabel('Recall')
|
||||
axs[aXn].set_ylim(0, 1)
|
||||
aXn += 1
|
||||
|
||||
y_r = df['te-prev']
|
||||
y_rhat = df['te-estim']
|
||||
y_rhatCC = df['te-estimCC']
|
||||
axs[aXn].plot(xs, y_rhat, label='te-$\hat{Pr}(\oplus)_{Q}$')
|
||||
axs[aXn].plot(xs, y_rhatCC, label='te-$\hat{Pr}(\oplus)_{CC}$')
|
||||
axs[aXn].plot(xs, y_r, label='te-$Pr(\oplus)$')
|
||||
axs[aXn].legend()
|
||||
axs[aXn].grid()
|
||||
axs[aXn].set_ylabel('Pool prevalence')
|
||||
aXn += 1
|
||||
|
||||
y_ae = df['AE']
|
||||
y_ae_cc = df['AE_CC']
|
||||
axs[aXn].plot(xs, y_ae, label='AE$_{Q}$')
|
||||
axs[aXn].plot(xs, y_ae_cc, label='AE$_{CC}$')
|
||||
axs[aXn].legend()
|
||||
axs[aXn].grid()
|
||||
axs[aXn].set_ylabel('Quantification error')
|
||||
aXn += 1
|
||||
|
||||
# classifier performance (not very reliable)
|
||||
#axs[aXn].plot(xs, df['MF1_Q'], label='$F_1(clf(Q))$')
|
||||
#axs[aXn].plot(xs, df['MF1_Clf'], label='$F_1(clf(CC))$')
|
||||
#axs[aXn].legend()
|
||||
#axs[aXn].grid()
|
||||
#axs[aXn].set_ylabel('Classifiers performance')
|
||||
#aXn += 1
|
||||
|
||||
if self.showCost:
|
||||
cost = df['tr-size']
|
||||
idealcost = df['ICost']
|
||||
totalcost = cost + idealcost
|
||||
axs[aXn].plot(xs, cost, label='Cost')
|
||||
axs[aXn].plot(xs, idealcost, label='IdealCost')
|
||||
axs[aXn].plot(xs, totalcost, label='TotalCost')
|
||||
axs[aXn].legend()
|
||||
axs[aXn].grid()
|
||||
axs[aXn].set_ylabel('Cost')
|
||||
aXn += 1
|
||||
|
||||
# distribution of posterior probabilities in the pool
|
||||
# if self.showYdist:
|
||||
# positive_posteriors = posteriors[y==1,1]
|
||||
# negative_posteriors = posteriors[y==0,1]
|
||||
# axs[aXn].hist(negative_posteriors, bins=50, label='negative', density=True, alpha=.75)
|
||||
# axs[aXn].hist(positive_posteriors, bins=50, label='positive', density=True, alpha=.75)
|
||||
# axs[aXn].legend()
|
||||
# axs[aXn].grid()
|
||||
# axs[aXn].set_xlim(0, 1)
|
||||
# axs[aXn].set_ylabel('te-$Pr(\oplus)$ distribution')
|
||||
# aXn += 1
|
||||
|
||||
axs[aXn].plot(xs, df['Shift'], '--k', label='shift (AE)')
|
||||
axs[aXn].plot(xs, df['tr-prev'], 'y', label='tr-$Pr(\oplus)$')
|
||||
axs[aXn].plot(xs, df['te-prev'], 'r', label='te-$Pr(\oplus)$')
|
||||
axs[aXn].legend()
|
||||
axs[aXn].grid()
|
||||
axs[aXn].set_ylabel('Train-Test Shift')
|
||||
aXn += 1
|
||||
|
||||
for i in range(aXn):
|
||||
if self.calls==0:
|
||||
# Shrink current axis by 20%
|
||||
box = axs[i].get_position()
|
||||
axs[i].set_position([box.x0, box.y0, box.width * 0.8, box.height])
|
||||
fig.tight_layout()
|
||||
|
||||
# Put a legend to the right of the current axis
|
||||
axs[i].legend(loc='center left', bbox_to_anchor=(1, 0.5))
|
||||
|
||||
# if save:
|
||||
# os.makedirs(self.outdir, exist_ok=True)
|
||||
# plt.savefig(f'{self.outdir}/{self.plotname}')
|
||||
|
||||
if loop:
|
||||
plt.pause(.5)
|
||||
for i in range(aXn):
|
||||
axs[i].cla()
|
||||
|
||||
self.calls += 1
|
||||
|
||||
|
||||
class InOutDistPlot:
    """
    Two-panel live figure with histograms of the values in column 1 of two posterior matrices, split by
    label: the upper panel is titled 'training distribution' and the lower one 'pool distribution'.
    """

    def __init__(self, refreshEach=1):
        """
        :param refreshEach: the figure is redrawn only on calls whose 1-based index is a multiple of this value
        """
        self.refreshEach = refreshEach

        self.fig, self.axs = plt.subplots(2)
        self.calls = 0

    def _plot_dist(self, posteriors, y, aXn, title):
        """
        Draws, on axes number aXn, one histogram for the rows labelled 0 and one for the rows labelled 1.

        :param posteriors: array indexed as ``posteriors[mask, 1]``, i.e., only its column 1 is plotted
            (presumably the posterior probability of the positive class -- confirm with the caller)
        :param y: array of labels aligned with the rows of posteriors; compared against 0 and 1
        :param aXn: index of the axes in ``self.axs`` to draw on
        :param title: text used as the y-label of the axes
        """
        positive_posteriors = posteriors[y == 1, 1]
        negative_posteriors = posteriors[y == 0, 1]
        ax = self.axs[aXn]
        # raw strings: '\o' is not a valid escape sequence and triggers a warning in regular string literals
        ax.hist(negative_posteriors, bins=50, label=r'$Pr(x|\ominus)$', density=False, alpha=.75)
        ax.hist(positive_posteriors, bins=50, label=r'$Pr(x|\oplus)$', density=False, alpha=.75)
        ax.legend()
        ax.grid()
        ax.set_xlim(0, 1)
        ax.set_ylabel(title)

    def plot(self, in_posteriors, in_y, out_posteriors, out_y):
        """
        Redraws both panels, pauses briefly so that the GUI can render them, and clears the axes afterwards.
        Calls that do not fall on the ``refreshEach`` schedule only increase the call counter.

        :param in_posteriors: posteriors drawn in the 'training distribution' panel
        :param in_y: labels aligned with in_posteriors
        :param out_posteriors: posteriors drawn in the 'pool distribution' panel
        :param out_y: labels aligned with out_posteriors
        """
        if (self.calls + 1) % self.refreshEach != 0:
            self.calls += 1
            return

        fig, axs = self.fig, self.axs

        aXn = 0

        # in-posteriors distribution
        self._plot_dist(in_posteriors, in_y, aXn, title='training distribution')
        aXn += 1

        # out-posteriors distribution
        self._plot_dist(out_posteriors, out_y, aXn, title='pool distribution')
        aXn += 1

        for i in range(aXn):
            if self.calls == 0:
                # Shrink current axis by 20% (only on the very first call)
                box = axs[i].get_position()
                axs[i].set_position([box.x0, box.y0, box.width * 0.8, box.height])
                fig.tight_layout()

            # Put a legend to the right of the current axis
            axs[i].legend(loc='center left', bbox_to_anchor=(1, 0.5))

        plt.pause(.5)
        for i in range(aXn):
            axs[i].cla()

        self.calls += 1
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Command-line entry point: python plot.py <result_input_path> <dynamic (0|1)>

    # explicit check instead of `assert`, which is stripped when Python runs with -O
    if len(sys.argv) != 3:
        sys.exit(f'wrong args, syntax is: python {sys.argv[0]} <result_input_path> <dynamic (0|1)>')

    file = str(sys.argv[1])
    loop = bool(int(sys.argv[2]))

    # the dynamic flag used to be parsed but never handed to the plot, so "0" behaved exactly like "1"
    figure = eDiscoveryPlot(file, loop=loop)

    try:
        if loop:
            # dynamic mode: keep re-reading the results file and redrawing until interrupted
            while True:
                figure.plot()
        else:
            # static mode: draw once and keep the window open until it is closed
            figure.plot()
            plt.show()
    except KeyboardInterrupt:
        print('\n[stop]')
|
||||
|
|
@ -0,0 +1,18 @@
|
|||
#!/bin/bash
# Launches one run of main.py with the settings defined below.
# `set -x` echoes every command before executing it.
set -x

# experiment settings, forwarded as command-line options to main.py
dataset=RCV1.C4
iter=100
k=100
initsize=500
initprev=-1
seed=1
Q=URBQ
CLS=lr
sampling=relevance_sampling

# path assembled from the settings above; only referenced by the commented-out plot.py call at the bottom
filepath="./results/classifier:${CLS}__dataset:${dataset}__initprev:${initprev}__initsize:${initsize}__iter:${iter}__k:${k}__quantifier:${Q}__sampling:${sampling}__seed:${seed}.csv"

PYTHONPATH='.:..' python3 main.py \
    --quantifier "${Q}" \
    --classifier "${CLS}" \
    --sampling "${sampling}" \
    --dataset "${dataset}" \
    --iter "${iter}" \
    --k "${k}" \
    --initsize "${initsize}" \
    --initprev "${initprev}" \
    --seed "${seed}"

# live plotting of the results is currently disabled:
#sleep 2
#PYTHONPATH='.:..' python3 plot.py $filepath 1
|
|
@ -143,7 +143,7 @@ class LabelledCollection:
|
|||
else:
|
||||
raise NotImplementedError('unsupported operation for collection types')
|
||||
labels = np.concatenate([self.labels, other.labels])
|
||||
return LabelledCollection(join_instances, labels)
|
||||
return LabelledCollection(join_instances, labels, classes_=self.classes_)
|
||||
|
||||
@property
|
||||
def Xy(self):
|
||||
|
|
|
@ -200,9 +200,9 @@ class ACC(AggregativeQuantifier):
|
|||
# kFCV estimation of parameters
|
||||
y, y_ = [], []
|
||||
kfcv = StratifiedKFold(n_splits=val_split)
|
||||
pbar = tqdm(kfcv.split(*data.Xy), total=val_split)
|
||||
for k, (training_idx, validation_idx) in enumerate(pbar):
|
||||
pbar.set_description(f'{self.__class__.__name__} fitting fold {k}')
|
||||
# pbar = tqdm(kfcv.split(*data.Xy), total=val_split)
|
||||
for k, (training_idx, validation_idx) in enumerate(kfcv.split(*data.Xy)):
|
||||
# pbar.set_description(f'{self.__class__.__name__} fitting fold {k}')
|
||||
training = data.sampling_from_index(training_idx)
|
||||
validation = data.sampling_from_index(validation_idx)
|
||||
learner, val_data = training_helper(self.learner, training, fit_learner, val_split=validation)
|
||||
|
@ -226,6 +226,7 @@ class ACC(AggregativeQuantifier):
|
|||
|
||||
# estimate the matrix with entry (i,j) being the estimate of P(yi|yj), that is, the probability that a
|
||||
# document that belongs to yj ends up being classified as belonging to yi
|
||||
self.conf_matrix_ = confusion_matrix(y, y_).T
|
||||
self.Pte_cond_estim_ = confusion_matrix(y, y_).T / class_count
|
||||
|
||||
return self
|
||||
|
@ -289,9 +290,8 @@ class PACC(AggregativeProbabilisticQuantifier):
|
|||
# kFCV estimation of parameters
|
||||
y, y_ = [], []
|
||||
kfcv = StratifiedKFold(n_splits=val_split)
|
||||
pbar = tqdm(kfcv.split(*data.Xy), total=val_split)
|
||||
for k, (training_idx, validation_idx) in enumerate(pbar):
|
||||
pbar.set_description(f'{self.__class__.__name__} fitting fold {k}')
|
||||
for k, (training_idx, validation_idx) in enumerate(kfcv.split(*data.Xy)):
|
||||
# pbar.set_description(f'{self.__class__.__name__} fitting fold {k}')
|
||||
training = data.sampling_from_index(training_idx)
|
||||
validation = data.sampling_from_index(validation_idx)
|
||||
learner, val_data = training_helper(
|
||||
|
|
Loading…
Reference in New Issue