import sys
import sklearn
# `import sklearn` alone does not guarantee that the `datasets` submodule is loaded;
# create_dataset() needs sklearn.datasets.fetch_rcv1, so import it explicitly.
import sklearn.datasets
from sklearn.base import BaseEstimator
from sklearn.calibration import CalibratedClassifierCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.svm import LinearSVC, SVC
import quapy as qp
from eDiscovery.method import RegionAdjustment, RegionProbAdjustment, RegionProbAdjustmentGlobal, RegionAdjustmentQ, \
    ClassWeightPCC, PosteriorConditionalAdjustemnt
from quapy.data import LabelledCollection
from quapy.method.aggregative import EMQ, CC, PACC, PCC, HDy, ACC
import numpy as np
from itertools import chain
import argparse


def NewClassifier(classifiername):
    """Create a new, unfitted classifier from its short name.

    :param classifiername: 'lr' (class-balanced logistic regression) or 'svm'
        (class-balanced LinearSVC wrapped in CalibratedClassifierCV)
    :return: a new estimator instance
    :raises ValueError: if `classifiername` is not one of the supported names
    """
    if classifiername == 'lr':
        return LogisticRegression(class_weight='balanced')
    elif classifiername == 'svm':
        # alternative: SVC(class_weight='balanced', probability=True, kernel='linear')
        return CalibratedClassifierCV(LinearSVC(class_weight='balanced'))
    # fail here rather than returning None and crashing later inside a quantifier
    raise ValueError('unknown classifier', classifiername)


def NewQuantifier(quantifiername, classifiername):
    """Create a new, unfitted quantifier from its short name.

    :param quantifiername: one of 'EMQ', 'CC', 'HDy', 'PCC', 'ACC', 'PACC', 'CW', 'SRSQ',
        'URBQ', 'PCAD'
    :param classifiername: name passed to NewClassifier for the quantifiers that wrap a
        classifier ('CW', 'URBQ' and 'PCAD' do not use it)
    :return: a new quantifier instance
    :raises ValueError: if `quantifiername` is unknown
    """
    if quantifiername == 'EMQ':
        # alternative: EMQ(NewClassifier(classifiername))
        return EMQ(CalibratedClassifierCV(NewClassifier(classifiername)))
    if quantifiername == 'CC':
        return CC(NewClassifier(classifiername))
    if quantifiername == 'HDy':
        return HDy(NewClassifier(classifiername))
    if quantifiername == 'PCC':
        return PCC(NewClassifier(classifiername))
    if quantifiername == 'ACC':
        return ACC(NewClassifier(classifiername), val_split=0.4)
    if quantifiername == 'PACC':
        return PACC(NewClassifier(classifiername), val_split=0.4)
    if quantifiername == 'CW':
        return ClassWeightPCC()
    if quantifiername == 'SRSQ':
        # supervised regions, then single-label quantification
        # alternatives for q:
        #   EMQ(CalibratedClassifierCV(NewClassifier(classifiername)))
        #   PACC(NewClassifier(classifiername), val_split=0.4)
        q = ACC(NewClassifier(classifiername))
        return RegionAdjustmentQ(q, k=4)
    if quantifiername == 'URBQ':
        # unsupervised regions, then binary quantifications
        def newQ():
            # alternatives:
            #   PACC(NewClassifier(classifiername), val_split=0.4)
            #   CC(CalibratedClassifierCV(NewClassifier(classifiername)))
            return ClassWeightPCC()
        return RegionProbAdjustmentGlobal(newQ, k=10, clustering='kmeans')
    if quantifiername == 'PCAD':
        # posterior-conditional adjustment
        return PosteriorConditionalAdjustemnt()
    raise ValueError('unknown quantifier', quantifiername)


def experiment_name(args: argparse.Namespace):
    """Build a CSV file name that encodes all the arguments of an experiment.

    :param args: the parsed command-line arguments
    :return: the 'key:value' pairs of `args`, sorted by key and joined with '__',
        followed by '.csv'
    """
    return '__'.join([f'{k}:{getattr(args, k)}' for k in sorted(vars(args))]) + '.csv'


def split_from_index(collection: LabelledCollection, index: np.ndarray):
    """Split a collection into the documents indexed by `index` and all the remaining ones.

    :param collection: the collection to split
    :param index: positions (within `collection`) of the documents to select
    :return: a tuple (selected, rest), both obtained via `collection.sampling_from_index`;
        `rest` follows ascending position order
    """
    in_index_set = set(index)
    out_index_set = set(range(len(collection))) - in_index_set
    out_index = np.asarray(sorted(out_index_set), dtype=int)
    return collection.sampling_from_index(index), collection.sampling_from_index(out_index)


def move_documents(target: LabelledCollection, origin: LabelledCollection, idx_origin: np.ndarray):
    """Move the documents indexed by `idx_origin` from `origin` to `target`.

    :param target: the collection that receives the documents
    :param origin: the collection the documents are taken from
    :param idx_origin: positions (within `origin`) of the documents to move
    :return: a tuple (enhanced_target, reduced_origin)
    """
    selected, reduced_origin = split_from_index(origin, idx_origin)
    enhanced_target = target + selected
    return enhanced_target, reduced_origin


# Sampling policies. They share the signature (pool, classifier, k, *args) so that they can be
# used interchangeably; the extra positional argument is only used by adaptive_sampling.

def uniform_sampling(pool: LabelledCollection, classifier: BaseEstimator, k: int, *args):
    """Choose `k` distinct pool positions uniformly at random (`classifier` is not used)."""
    return np.random.choice(len(pool), k, replace=False)


def proportional_sampling(pool: LabelledCollection, classifier: BaseEstimator, k: int, *args):
    """Choose `k` distinct pool positions at random, with probability proportional to the
    posterior of the class in column 1 of `classifier.predict_proba`."""
    prob = classifier.predict_proba(pool.instances)[:, 1].flatten()
    return np.random.choice(len(pool), k, replace=False, p=prob / prob.sum())


def relevance_sampling(pool: LabelledCollection, classifier: BaseEstimator, k: int, *args):
    """Return the positions of the `k` pool documents with the highest posterior for the
    class in column 1 of `classifier.predict_proba`, most relevant first."""
    prob = classifier.predict_proba(pool.instances)[:, 1].flatten()
    top_relevant_idx = np.argsort(-prob)[:k]
    return top_relevant_idx


def uncertainty_sampling(pool: LabelledCollection, classifier: BaseEstimator, k: int, *args):
    """Return the positions of the `k` pool documents whose posterior (column 1 of
    `classifier.predict_proba`) is closest to 0.5, most uncertain first."""
    prob = classifier.predict_proba(pool.instances)[:, 1].flatten()
    top_uncertain_idx = np.argsort(np.abs(prob - 0.5))[:k]
    return top_uncertain_idx


def mix_sampling(pool: LabelledCollection, classifier: BaseEstimator, k: int, *args):
    """Interleave the relevance and the uncertainty rankings and return the first `k`
    distinct positions (relevance, uncertainty, relevance, ...)."""
    relevance_idx = relevance_sampling(pool, classifier, k)
    uncertainty_idx = uncertainty_sampling(pool, classifier, k)
    interleave_idx = np.asarray(list(chain.from_iterable(zip(relevance_idx, uncertainty_idx))))
    # np.unique sorts by value; `return_index` gives the first occurrence of each value, so
    # sorting those occurrences restores the interleaved order without duplicates
    _, unique_idx = np.unique(interleave_idx, return_index=True)
    top_interleaved_idx = interleave_idx[np.sort(unique_idx)][:k]
    return top_interleaved_idx


def adaptive_sampling(pool: LabelledCollection, classifier: BaseEstimator, k: int, progress: float):
    """Combine relevance and uncertainty sampling, giving more room to relevance sampling
    as `progress` grows.

    :param pool: the pool of documents to choose from
    :param classifier: a fitted classifier exposing `predict_proba`
    :param k: the total budget, split into int(k*progress/100) documents for relevance
        sampling and the remainder for uncertainty sampling
    :param progress: a percentage (it is divided by 100)
    :return: the sorted, distinct positions selected by either policy; these can be fewer
        than `k` when both policies pick the same document
    """
    relevance_k = int(k * progress / 100)
    uncertainty_k = k - relevance_k
    relevance_idx = relevance_sampling(pool, classifier, relevance_k)
    uncertainty_idx = uncertainty_sampling(pool, classifier, uncertainty_k)
    idx = np.concatenate([relevance_idx, uncertainty_idx])
    idx = np.unique(idx)
    return idx


def negative_sampling_index(pool: LabelledCollection, classifier: BaseEstimator, k: int):
    """Return the positions of the `k` pool documents with the highest posterior for the
    class in column 0 of `classifier.predict_proba`."""
    prob = classifier.predict_proba(pool.instances)[:, 0].flatten()
    top_relevant_idx = np.argsort(-prob)[:k]
    return top_relevant_idx


def recall(train_prev, pool_prev, train_size, pool_size):
    """Fraction of all the positive documents that are already in the training set.

    :param train_prev: prevalence values of the training set (index 1 is the positive class)
    :param pool_prev: prevalence values of the pool (index 1 is the positive class)
    :param train_size: number of documents in the training set
    :param pool_size: number of documents in the pool
    :return: positives in training / (positives in training + positives in pool)
    """
    # NOTE(review): there is no guard for the case in which there are no positives at all
    # (zero denominator)
    frac_tr_pos = train_prev[1]
    frac_te_pos = pool_prev[1]
    return (frac_tr_pos * train_size) / (frac_tr_pos * train_size + frac_te_pos * pool_size)


def create_dataset(datasetname):
    """Load one of the datasets used in the experiments.

    :param datasetname: 'imdb.10K.75p', 'RCV1.C4', or 'hp'
    :return: the dataset as a LabelledCollection
    :raises SystemExit: with a non-zero exit status if `datasetname` is unknown
    """
    if datasetname == 'imdb.10K.75p':
        data = qp.datasets.fetch_reviews('imdb', tfidf=True, min_df=5)
        collection = data.training.sampling(10000, 0.75)
        return collection

    elif datasetname == 'RCV1.C4':
        X, y = sklearn.datasets.fetch_rcv1(subset='train', return_X_y=True)
        y = y.toarray()
        prev = y.mean(axis=0).flatten()
        # choose the first category having a positive prevalence between [0.1,0.2] (realistic scenario for e-Discovery)
        # this category happens to be the cat with id 4
        target_cat = np.argwhere(np.logical_and(prev > 0.1, prev < 0.2)).flatten()[0]
        print('chosen cat', target_cat)
        y = y[:, target_cat].flatten()
        return LabelledCollection(X, y)

    elif datasetname == 'hp':
        data = qp.datasets.fetch_reviews('hp', tfidf=True, min_df=5)
        collection = data.training + data.test
        # the binary labels are flipped (0 <-> 1)
        collection = LabelledCollection(instances=collection.instances, labels=1 - collection.labels)
        return collection

    # passing the message to sys.exit writes it to stderr and sets a non-zero exit status,
    # so that callers of the script can tell that the run has failed
    sys.exit(f'unknown dataset {datasetname}. Abort')


def estimate_prev_CC(train, pool: LabelledCollection, classifiername: str):
    """Fit a CC quantifier on `train` and estimate the class prevalence of `pool`.

    :return: a tuple (estimated prevalence of the pool, the learner of the quantifier)
    """
    q = CC(NewClassifier(classifiername)).fit(train)
    return q.quantify(pool.instances), q.learner


def estimate_prev_Q(train, pool, quantifiername, classifiername):
    """Fit the requested quantifier on `train` and estimate the class prevalence of `pool`.

    :return: a tuple (estimated prevalence of the pool, the quantifier)
    """
    q = NewQuantifier(quantifiername, classifiername)
    # q._find_regions((train+pool).instances)
    q.fit(train)
    prev = q.quantify(pool.instances)
    return prev, q


def eval_classifier(learner, test: LabelledCollection):
    """Return the binary F1 score of the predictions of `learner` on `test`."""
    predictions = learner.predict(test.instances)
    true_labels = test.labels
    f1 = f1_score(true_labels, predictions, average='binary')
    return f1


def ideal_cost(classifier, pool):
    """Cost of reviewing the pool, in ranking order, until the last relevant document.

    :param classifier: a fitted classifier exposing `predict_proba`
    :param pool: the pool of documents, with its true labels
    :return: the 0-based rank position of the last relevant document
    """
    # returns the cost (in terms of number of documents) to review until the last relevant document
    # is processed, assuming the rank produced by this classifier. The cost is said to be "idealized" since
    # one assumes to know the optimal stopping point (reached after the last relevant is encountered)
    # NOTE(review): the value returned is a 0-based position, i.e., one less than the number of
    # documents reviewed -- confirm whether callers expect a position or a count
    prob = classifier.predict_proba(pool.instances)
    order = np.argsort(prob[:, 0])  # col 0 has negative posterior prob, so the natural order is "by relevance"
    ranked_labels = pool.labels[order]
    num_relevant = np.sum(pool.labels)
    idealized_cost = np.argwhere(np.cumsum(ranked_labels) == num_relevant).min()
    return idealized_cost