# Forked from moreo/QuaPy (code-viewer header: 190 lines, 7.8 KiB, Python).
import sys
|
|
import sklearn
|
|
from sklearn.base import BaseEstimator
|
|
from sklearn.calibration import CalibratedClassifierCV
|
|
from sklearn.linear_model import LogisticRegression
|
|
from sklearn.metrics import f1_score
|
|
from sklearn.svm import LinearSVC, SVC
|
|
|
|
import quapy as qp
|
|
from eDiscovery.method import RegionAdjustment, RegionProbAdjustment, RegionProbAdjustmentGlobal, RegionAdjustmentQ, \
|
|
ClassWeightPCC, PosteriorConditionalAdjustemnt
|
|
from quapy.data import LabelledCollection
|
|
from quapy.method.aggregative import EMQ, CC, PACC, PCC, HDy, ACC
|
|
import numpy as np
|
|
from itertools import chain
|
|
import argparse
|
|
|
|
|
|
|
|
def NewClassifier(classifiername):
    """Instantiate a fresh, unfitted classifier from its short name.

    Supported names:
      - ``'lr'``: ``LogisticRegression`` with balanced class weights.
      - ``'svm'``: ``LinearSVC`` with balanced class weights, wrapped in
        ``CalibratedClassifierCV``.

    :param classifiername: short name of the classifier (``'lr'`` or ``'svm'``)
    :return: a new classifier instance
    :raises ValueError: if ``classifiername`` is not a supported name
    """
    if classifiername == 'lr':
        return LogisticRegression(class_weight='balanced')
    if classifiername == 'svm':
        # alternative kept for reference:
        # SVC(class_weight='balanced', probability=True, kernel='linear')
        return CalibratedClassifierCV(LinearSVC(class_weight='balanced'))
    # Fail fast instead of silently returning None, which would only surface
    # later as an obscure error inside the quantifier that receives it.
    raise ValueError('unknown classifier', classifiername)
|
|
|
|
|
|
def NewQuantifier(quantifiername, classifiername):
    """Build a fresh quantifier from its short name.

    Recognised names and what they build:
      - ``'EMQ'``:  EMQ on a CalibratedClassifierCV-wrapped classifier
      - ``'CC'``, ``'HDy'``, ``'PCC'``: the homonymous method on the classifier
      - ``'ACC'``, ``'PACC'``: the homonymous method with ``val_split=0.4``
      - ``'CW'``:   ClassWeightPCC
      - ``'SRSQ'``: supervised regions, then single-label quantification
        (RegionAdjustmentQ over ACC, ``k=4``)
      - ``'URBQ'``: unsupervised regions, then binary quantifications
        (RegionProbAdjustmentGlobal over ClassWeightPCC, ``k=10``, k-means)
      - ``'PCAD'``: posterior-conditional adjustment

    :param quantifiername: short name of the quantification method
    :param classifiername: short name forwarded to ``NewClassifier``
    :return: a new quantifier instance
    :raises ValueError: if ``quantifiername`` is not recognised
    """
    def newQ():
        # factory for the per-region quantifier used by 'URBQ'
        # (PACC and CC-on-calibrated-classifier were tried as alternatives)
        return ClassWeightPCC()

    # Ordered (name, zero-argument factory) pairs. Factories are lambdas so
    # that only the requested quantifier (and its classifier) is instantiated.
    factories = (
        ('EMQ', lambda: EMQ(CalibratedClassifierCV(NewClassifier(classifiername)))),
        ('CC', lambda: CC(NewClassifier(classifiername))),
        ('HDy', lambda: HDy(NewClassifier(classifiername))),
        ('PCC', lambda: PCC(NewClassifier(classifiername))),
        ('ACC', lambda: ACC(NewClassifier(classifiername), val_split=0.4)),
        ('PACC', lambda: PACC(NewClassifier(classifiername), val_split=0.4)),
        ('CW', lambda: ClassWeightPCC()),
        # EMQ and PACC were tried as the inner quantifier of SRSQ; ACC is the one in use
        ('SRSQ', lambda: RegionAdjustmentQ(ACC(NewClassifier(classifiername)), k=4)),
        ('URBQ', lambda: RegionProbAdjustmentGlobal(newQ, k=10, clustering='kmeans')),
        ('PCAD', lambda: PosteriorConditionalAdjustemnt()),
    )

    for name, build in factories:
        if quantifiername == name:
            return build()

    raise ValueError('unknown quantifier', quantifiername)
|
|
|
|
|
|
def experiment_name(args:argparse.Namespace):
    """Derive a result-file name from the parsed command-line arguments.

    Every attribute of ``args`` contributes a ``name:value`` token; tokens are
    ordered alphabetically by attribute name, joined with ``'__'`` and the
    suffix ``'.csv'`` is appended.

    :param args: namespace holding the experiment options
    :return: the file name as a string
    """
    options = vars(args)
    tokens = (f'{name}:{options[name]}' for name in sorted(options))
    return '__'.join(tokens) + '.csv'
|
|
|
|
|
|
def split_from_index(collection: LabelledCollection, index: np.ndarray):
    """Partition ``collection`` into the items at ``index`` and all the others.

    :param collection: the collection to split
    :param index: positions of the items to select
    :return: tuple ``(selected, remaining)``, both produced by
        ``collection.sampling_from_index``; the remaining positions are
        passed in ascending order
    """
    chosen = set(index)
    # walking range() in order yields the complement already sorted
    complement = np.asarray(
        [position for position in range(len(collection)) if position not in chosen],
        dtype=int,
    )
    selected = collection.sampling_from_index(index)
    remaining = collection.sampling_from_index(complement)
    return selected, remaining
|
|
|
|
|
|
def move_documents(target: LabelledCollection, origin: LabelledCollection, idx_origin: np.ndarray):
    """Move the documents of ``origin`` indexed by ``idx_origin`` into ``target``.

    :param target: collection that receives the documents
    :param origin: collection the documents are taken from
    :param idx_origin: positions (within ``origin``) of the documents to move
    :return: tuple ``(target + moved documents, origin without them)``
    """
    moved, remaining_origin = split_from_index(origin, idx_origin)
    return target + moved, remaining_origin
|
|
|
|
|
|
def uniform_sampling(pool: LabelledCollection, classifier: BaseEstimator, k: int, *args):
    """Draw ``k`` distinct pool positions uniformly at random.

    ``classifier`` and ``*args`` are not used here (presumably accepted only
    so that all sampling policies share one call signature).

    :return: array of ``k`` positions in ``range(len(pool))``
    """
    pool_size = len(pool)
    return np.random.choice(pool_size, k, replace=False)
|
|
|
|
|
|
def proportional_sampling(pool: LabelledCollection, classifier: BaseEstimator, k: int, *args):
    """Draw ``k`` distinct pool positions with probability proportional to
    the classifier's posterior for class column 1 (taken as the relevant class).

    :return: array of ``k`` positions in ``range(len(pool))``
    """
    posterior = classifier.predict_proba(pool.instances)[:, 1].flatten()
    # normalise the posteriors so they form a sampling distribution
    weights = posterior / posterior.sum()
    return np.random.choice(len(pool), k, replace=False, p=weights)
|
|
|
|
|
|
def relevance_sampling(pool: LabelledCollection, classifier: BaseEstimator, k: int, *args):
    """Select the ``k`` pool positions with the highest posterior for class
    column 1 (taken as the relevant class).

    :return: array with (at most) ``k`` positions, most relevant first
    """
    posterior = classifier.predict_proba(pool.instances)[:, 1].flatten()
    ranking = np.argsort(-posterior)  # negated so the highest posterior comes first
    return ranking[:k]
|
|
|
|
|
|
def uncertainty_sampling(pool: LabelledCollection, classifier: BaseEstimator, k: int, *args):
    """Select the ``k`` pool positions whose posterior for class column 1 is
    closest to 0.5.

    :return: array with (at most) ``k`` positions, most uncertain first
    """
    posterior = classifier.predict_proba(pool.instances)[:, 1].flatten()
    distance_to_half = np.abs(posterior - 0.5)
    return np.argsort(distance_to_half)[:k]
|
|
|
|
|
|
def mix_sampling(pool: LabelledCollection, classifier: BaseEstimator, k: int, *args):
    """Select ``k`` pool positions by alternating between the relevance and
    the uncertainty rankings.

    The two top-``k`` rankings are interleaved (relevance first), repeated
    positions are dropped keeping their first occurrence, and the first
    ``k`` survivors are returned.

    :return: array with (at most) ``k`` distinct positions
    """
    by_relevance = relevance_sampling(pool, classifier, k)
    by_uncertainty = uncertainty_sampling(pool, classifier, k)
    alternating = chain.from_iterable(zip(by_relevance, by_uncertainty))
    # dict keys keep insertion order, i.e. the first occurrence of each position
    distinct_in_order = list(dict.fromkeys(alternating))
    return np.asarray(distinct_in_order)[:k]
|
|
|
|
|
|
def adaptive_sampling(pool: LabelledCollection, classifier: BaseEstimator, k: int, progress: float):
    """Select pool positions mixing relevance and uncertainty sampling in a
    ratio driven by ``progress``.

    ``int(k*progress/100)`` positions are requested from relevance sampling
    and the rest of the ``k`` from uncertainty sampling. The two selections
    are merged with ``np.unique``, so the result is sorted and can hold
    fewer than ``k`` positions when the selections overlap.

    :param progress: share (as a percentage) of ``k`` devoted to relevance sampling
    :return: sorted array of distinct positions
    """
    n_relevance = int(k*progress/100)
    n_uncertainty = k - n_relevance
    selections = [
        relevance_sampling(pool, classifier, n_relevance),
        uncertainty_sampling(pool, classifier, n_uncertainty),
    ]
    return np.unique(np.concatenate(selections))
|
|
|
|
|
|
def negative_sampling_index(pool: LabelledCollection, classifier: BaseEstimator, k: int):
    """Select the ``k`` pool positions with the highest posterior for class
    column 0 (taken as the negative class).

    :return: array with (at most) ``k`` positions, most confidently negative first
    """
    negative_posterior = classifier.predict_proba(pool.instances)[:, 0].flatten()
    return np.argsort(-negative_posterior)[:k]
|
|
|
|
|
|
def recall(train_prev, pool_prev, train_size, pool_size):
    """Compute the recall reached so far: positives already in the training
    set over all positives (training set plus pool).

    :param train_prev: prevalence vector of the training set (index 1 = positive class)
    :param pool_prev: prevalence vector of the pool (index 1 = positive class)
    :param train_size: number of documents in the training set
    :param pool_size: number of documents in the pool
    :return: fraction of positives that are in the training set
    """
    positives_in_train = train_prev[1] * train_size
    positives_in_pool = pool_prev[1] * pool_size
    return positives_in_train / (positives_in_train + positives_in_pool)
|
|
|
|
|
|
def create_dataset(datasetname):
    """Load one of the supported datasets as a single collection.

    Supported names:
      - ``'imdb.10K.75p'``: the IMDb reviews training set (tf-idf, ``min_df=5``),
        resampled with ``sampling(10000, 0.75)``.
      - ``'RCV1.C4'``: the RCV1 training subset, made binary on the first
        category whose positive prevalence lies in (0.1, 0.2).
      - ``'hp'``: the HP reviews (tf-idf, ``min_df=5``), training plus test,
        with the binary labels inverted (``1 - label``).

    :param datasetname: name of the dataset to load
    :return: the dataset as a collection
    :raises SystemExit: with a non-zero status if ``datasetname`` is unknown
    """
    if datasetname == 'imdb.10K.75p':
        data = qp.datasets.fetch_reviews('imdb', tfidf=True, min_df=5)
        return data.training.sampling(10000, 0.75)

    if datasetname == 'RCV1.C4':
        # Import the submodule explicitly: a bare `import sklearn` does not
        # guarantee that `sklearn.datasets` has been loaded as an attribute.
        from sklearn.datasets import fetch_rcv1

        X, y = fetch_rcv1(subset='train', return_X_y=True)
        y = y.toarray()
        prev = y.mean(axis=0).flatten()
        # choose the first category having a positive prevalence between [0.1,0.2]
        # (realistic scenario for e-Discovery); this category happens to be the
        # cat with id 4
        target_cat = np.argwhere(np.logical_and(prev > 0.1, prev < 0.2)).flatten()[0]
        print('chosen cat', target_cat)
        y = y[:, target_cat].flatten()
        return LabelledCollection(X, y)

    if datasetname == 'hp':
        data = qp.datasets.fetch_reviews('hp', tfidf=True, min_df=5)
        collection = data.training + data.test
        # swap the two classes
        return LabelledCollection(instances=collection.instances, labels=1-collection.labels)

    # An unknown dataset is an error: report it on stderr and abort with a
    # non-zero status so that shells and schedulers do not see it as success.
    print(f'unknown dataset {datasetname}. Abort', file=sys.stderr)
    sys.exit(1)
|
|
|
|
|
|
def estimate_prev_CC(train, pool: LabelledCollection, classifiername:str):
    """Estimate the pool prevalence with a CC quantifier trained on ``train``.

    :param train: labelled data used to fit the quantifier
    :param pool: collection whose prevalence is estimated
    :param classifiername: short name forwarded to ``NewClassifier``
    :return: tuple ``(estimated prevalence, the quantifier's learner)``
    """
    base_classifier = NewClassifier(classifiername)
    fitted_cc = CC(base_classifier).fit(train)
    pool_prevalence = fitted_cc.quantify(pool.instances)
    return pool_prevalence, fitted_cc.learner
|
|
|
|
|
|
def estimate_prev_Q(train, pool, quantifiername, classifiername):
    """Estimate the pool prevalence with the named quantifier trained on ``train``.

    :param train: labelled data used to fit the quantifier
    :param pool: collection whose prevalence is estimated
    :param quantifiername: short name forwarded to ``NewQuantifier``
    :param classifiername: short name forwarded to ``NewQuantifier``
    :return: tuple ``(estimated prevalence, the quantifier)``
    """
    quantifier = NewQuantifier(quantifiername, classifiername)
    # (disabled) quantifier._find_regions((train+pool).instances)
    quantifier.fit(train)
    return quantifier.quantify(pool.instances), quantifier
|
|
|
|
|
|
def eval_classifier(learner, test:LabelledCollection):
    """Score ``learner`` on ``test`` with the binary F1 measure.

    :param learner: fitted classifier exposing ``predict``
    :param test: labelled collection to evaluate on
    :return: the F1 score (``average='binary'``)
    """
    predicted = learner.predict(test.instances)
    return f1_score(test.labels, predicted, average='binary')
|
|
|
|
|
|
def ideal_cost(classifier, pool):
    """Return the idealized review cost of the ranking induced by ``classifier``.

    The cost is the number of documents that must be reviewed, following the
    rank produced by the classifier, until the last relevant document has been
    processed. It is "idealized" because it assumes the optimal stopping point
    (right after the last relevant document) is known.

    :param classifier: fitted classifier exposing ``predict_proba``
    :param pool: collection exposing ``instances`` and binary ``labels``
    :return: number of documents to review (0 if the pool has no relevant documents)
    """
    num_relevant = np.sum(pool.labels)
    if num_relevant == 0:
        # nothing to find (this also covers an empty pool): no document needs review
        return 0

    prob = classifier.predict_proba(pool.instances)
    # col 0 has negative posterior prob, so the natural order is "by relevance"
    order = np.argsort(prob[:, 0])
    ranked_labels = pool.labels[order]

    # 0-based rank at which the running count of relevant documents first
    # reaches the total, i.e. the position of the last relevant document
    last_relevant_rank = np.argwhere(np.cumsum(ranked_labels) == num_relevant).min()
    # a document at 0-based rank r is the (r+1)-th document reviewed
    return int(last_relevant_rank) + 1