# Active learning simulation for e-Discovery: documents are iteratively moved from an unlabelled pool
# to the training set, and quantification methods (CC vs. EMQ) are used to estimate the prevalence of
# relevant documents in the pool and, from it, the recall attained so far.

import os.path
import sys
import sklearn.datasets
from sklearn.base import BaseEstimator
from sklearn.calibration import CalibratedClassifierCV
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
import quapy as qp
from quapy.method.base import BaseQuantifier
from quapy.data import LabelledCollection
from quapy.method.aggregative import EMQ, CC, PACC, PCC, HDy, ACC
from quapy import functional as F
import numpy as np
from itertools import chain


def split_from_index(collection: LabelledCollection, index: np.ndarray):
    # splits the collection into the documents indexed by `index` and the remaining ones
    in_index_set = set(index)
    out_index_set = set(range(len(collection))) - in_index_set
    out_index = np.asarray(list(out_index_set), dtype=int)
    return collection.sampling_from_index(index), collection.sampling_from_index(out_index)


def relevance_sampling_index(pool: LabelledCollection, classifier: BaseEstimator, k: int):
    # indexes of the k pool documents with the highest posterior probability of being relevant (class 1)
    prob = classifier.predict_proba(pool.instances)[:, 1].flatten()
    top_relevant_idx = np.argsort(-prob)[:k]
    return top_relevant_idx


def uncertainty_sampling_index(pool: LabelledCollection, classifier: BaseEstimator, k: int):
    # indexes of the k pool documents the classifier is most uncertain about (posterior closest to 0.5)
    prob = classifier.predict_proba(pool.instances)[:, 1].flatten()
    top_uncertain_idx = np.argsort(np.abs(prob - 0.5))[:k]
    return top_uncertain_idx


def mix_rel_unc_sampling_index(pool: LabelledCollection, classifier: BaseEstimator, k: int):
    # interleaves relevance and uncertainty sampling, removes duplicates, and keeps the top k indexes
    relevance_idx = relevance_sampling_index(pool, classifier, k)
    uncertainty_idx = uncertainty_sampling_index(pool, classifier, k)
    interleave_idx = np.asarray(list(chain.from_iterable(zip(relevance_idx, uncertainty_idx))))
    _, unique_idx = np.unique(interleave_idx, return_index=True)
    top_interleaved_idx = interleave_idx[np.sort(unique_idx)][:k]
    return top_interleaved_idx


def negative_sampling_index(pool: LabelledCollection, classifier: BaseEstimator, k: int):
    # indexes of the k pool documents with the highest posterior probability of being non-relevant (class 0)
    prob = classifier.predict_proba(pool.instances)[:, 0].flatten()
    top_negative_idx = np.argsort(-prob)[:k]
    return top_negative_idx


def recall(train_prev, pool_prev, train_size, pool_size):
    # estimated recall: fraction of all positive documents that are already in the training (labelled) set
    frac_tr_pos = train_prev[1]
    frac_te_pos = pool_prev[1]
    recall = (frac_tr_pos * train_size) / (frac_tr_pos * train_size + frac_te_pos * pool_size)
    return recall


def NewClassifier():
    # return CalibratedClassifierCV(LinearSVC(class_weight='balanced'))
    return LogisticRegression(class_weight=None)


def NewQuantifier():
    return EMQ(CalibratedClassifierCV(NewClassifier()))


def create_dataset(datasetname):
    if datasetname == 'imdb.10K.75p':
        data = qp.datasets.fetch_reviews('imdb', tfidf=True, min_df=5)
        collection = data.training.sampling(10000, 0.75)
        return collection

    elif datasetname == 'RCV1.C4':
        X, y = sklearn.datasets.fetch_rcv1(subset='train', return_X_y=True)
        y = y.toarray()
        prev = y.mean(axis=0).flatten()
        # choose the first category having a positive prevalence in [0.1, 0.2] (a realistic scenario for e-Discovery);
        # this category happens to be the one with id 4
        target_cat = np.argwhere(np.logical_and(prev > 0.1, prev < 0.2)).flatten()[0]
        print('chosen cat', target_cat)
        y = y[:, target_cat].flatten()
        return LabelledCollection(X, y)


def estimate_prev_CC(train, pool):
    # baseline prevalence estimate via Classify & Count; also returns the underlying classifier
    q = CC(NewClassifier()).fit(train)
    return q.quantify(pool.instances), q.learner


def estimate_prev_Q(train, pool, classifier):
    # prevalence estimate via a proper quantification method (EMQ over a calibrated classifier)
    # q = qp.model_selection.GridSearchQ(
    #     ACC(LogisticRegression()),
    #     param_grid={'C': np.logspace(-3, 3, 7), 'class_weight': [None, 'balanced']},
    #     sample_size=len(train),
    #     protocol='app',
    #     n_prevpoints=21,
    #     n_repetitions=10)

    q = NewQuantifier()
    # q = ACC(NewClassifier())

    # borrow (supposedly negative) pool documents
    # train_pos = train.counts()[1]
    # train_negs = train.counts()[0]
    # neg_idx = negative_sampling_index(pool, classifier, max(train_pos - train_negs, 5))
    # neg_sample = pool.sampling_from_index(neg_idx)
    # train_augmented = train + LabelledCollection(neg_sample.instances, [0] * len(neg_sample))
    # q.fit(train_augmented)

    q.fit(train)
    # q.fit(first_train)

    # bootstrap_prev = qp.evaluation.natural_prevalence_prediction(q, pool, sample_size=len(train), n_repetitions=50)[1].mean(axis=0).flatten()

    prev = q.quantify(pool.instances)
    return prev, None
    # return q.quantify(pool_instances), None


def tee(msg):
    # writes to the (global) results file and echoes to stdout
    foo.write(msg + '\n')
    foo.flush()
    print(msg)


datasetname = 'RCV1.C4'
collection = qp.util.pickled_resource(f'./dataset/{datasetname}.pkl', create_dataset, datasetname)
nD = len(collection)

# initial labelled data selection
init_nD = 1000
init_prev = [0.5, 0.5]
idx = collection.sampling_index(init_nD, *init_prev)
train, pool = split_from_index(collection, idx)
# first_train = LabelledCollection(train.instances, train.labels)

# number of documents to label at each active learning iteration
k = 100
recall_target = 0.99

outputdir = './results'
qp.util.create_if_not_exist(outputdir)

# sampling_fn, sampling_name = relevance_sampling_index, 'relevance'
sampling_fn, sampling_name = mix_rel_unc_sampling_index, 'mix'

q_name = NewQuantifier().__class__.__name__

experiment_suffix = f'{sampling_name}_{q_name}'

i = 0
with open(os.path.join(outputdir, f'{datasetname}_{experiment_suffix}.csv'), 'wt') as foo:
    # columns: iteration, % of the collection labelled, train/pool sizes, true train/pool prevalence,
    # pool prevalence estimated by the quantifier and by CC, true recall, recall estimated from the
    # quantifier and from CC, train-pool prior shift, and absolute errors of both prevalence estimates
    tee('it\t%\ttr-size\tte-size\ttr-prev\tte-prev\tte-estim\tte-estimCC\tR\tRhat\tRhatCC\tShift\tAE\tAE_CC')
    while True:
        # estimate the prevalence of the relevant class in the pool, both with CC and with the quantifier
        pool_p_hat_cc, classifier = estimate_prev_CC(train, pool)
        pool_p_hat, _ = estimate_prev_Q(train, pool, classifier)

        tr_p = train.prevalence()
        te_p = pool.prevalence()
        nDtr = len(train)
        nDte = len(pool)

        # recall estimated from CC and from the quantifier, vs. true recall
        r_hat_cc = recall(tr_p, pool_p_hat_cc, nDtr, nDte)
        r_hat = recall(tr_p, pool_p_hat, nDtr, nDte)
        r = recall(tr_p, te_p, nDtr, nDte)

        # prior probability shift between the training set and the pool
        tr_te_shift = qp.error.ae(tr_p, te_p)

        progress = 100 * nDtr / nD

        # absolute error of both prevalence estimates
        q_ae = qp.error.ae(te_p, pool_p_hat)
        cc_ae = qp.error.ae(te_p, pool_p_hat_cc)

        tee(
            f'{i}\t{progress:.2f}\t{nDtr}\t{nDte}\t{tr_p[1]:.3f}\t{te_p[1]:.3f}\t{pool_p_hat[1]:.3f}\t{pool_p_hat_cc[1]:.3f}'
            f'\t{r:.3f}\t{r_hat:.3f}\t{r_hat_cc:.3f}\t{tr_te_shift:.5f}\t{q_ae:.4f}\t{cc_ae:.4f}')

        if nDte < k:
            break

        # select the next batch of k documents and move them from the pool to the training set
        top_relevant_idx = sampling_fn(pool, classifier, k)
        selected, pool = split_from_index(pool, top_relevant_idx)
        train = train + selected
        i += 1