import os.path

import numpy as np
from sklearn.base import BaseEstimator
from sklearn.calibration import CalibratedClassifierCV
from sklearn.datasets import fetch_rcv1
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC

import quapy as qp
from quapy.data import LabelledCollection
from quapy.method.aggregative import ACC, CC, PACC


def split_from_index(collection: LabelledCollection, index: np.ndarray):
    # splits the collection into the documents selected by `index` and all the remaining ones
    in_index_set = set(index)
    out_index_set = set(range(len(collection))) - in_index_set
    out_index = np.asarray(list(out_index_set), dtype=int)
    return collection.sampling_from_index(index), collection.sampling_from_index(out_index)


def relevance_sampling_index(pool: LabelledCollection, classifier: BaseEstimator, k: int):
    # indexes of the k pool documents with the highest posterior probability of relevance (class 1)
    prob = classifier.predict_proba(pool.instances)[:, 1].flatten()
    top_relevant_idx = np.argsort(-prob)[:k]
    return top_relevant_idx


def negative_sampling_index(pool: LabelledCollection, classifier: BaseEstimator, k: int):
    # indexes of the k pool documents with the highest posterior probability of irrelevance (class 0)
    prob = classifier.predict_proba(pool.instances)[:, 0].flatten()
    top_negative_idx = np.argsort(-prob)[:k]
    return top_negative_idx


def recall(train_prev, pool_prev, train_size, pool_size):
    # estimated recall of the labelling process: the number of positives found so far (in the
    # training set) over the estimated total number of positives (training set plus pool)
    frac_tr_pos = train_prev[1]
    frac_te_pos = pool_prev[1]
    return (frac_tr_pos * train_size) / (frac_tr_pos * train_size + frac_te_pos * pool_size)
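
# A quick sanity check of the recall estimate (illustrative numbers only, not part of the
# experimental protocol): 1000 training documents at 50% positives plus a 9000-document pool
# with an estimated 10% positives gives (0.5*1000) / (0.5*1000 + 0.1*9000) = 500/1400 ≈ 0.357
assert np.isclose(recall([0.5, 0.5], [0.9, 0.1], 1000, 9000), 500 / 1400)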

def NewClassifier():
    # a calibrated linear SVM, so that predict_proba is available for relevance/negative sampling
    return CalibratedClassifierCV(LinearSVC(class_weight='balanced'))


def create_dataset(datasetname):
    if datasetname == 'imdb.10K.75p':
        data = qp.datasets.fetch_reviews('imdb', tfidf=True, min_df=5)
        return data.training.sampling(10000, 0.75)
    elif datasetname == 'RCV1.C4':
        X, y = fetch_rcv1(subset='train', return_X_y=True)
        y = y.toarray()
        prev = y.mean(axis=0).flatten()
        # choose the first category having a positive prevalence in [0.1, 0.2]
        # (a realistic scenario for e-Discovery); this happens to be the category with id 4
        target_cat = np.argwhere(np.logical_and(prev > 0.1, prev < 0.2)).flatten()[0]
        print('chosen cat', target_cat)
        y = y[:, target_cat].flatten()
        return LabelledCollection(X, y)


def estimate_prev_CC(train, pool):
    # Classify & Count: the prevalence estimate is the fraction of documents classified as positive
    q = CC(NewClassifier()).fit(train)
    return q.quantify(pool.instances), q.learner


def estimate_prev_Q(train, pool, classifier):
    # q = qp.model_selection.GridSearchQ(
    #     ACC(LogisticRegression()),
    #     param_grid={'C': np.logspace(-3, 3, 7), 'class_weight': [None, 'balanced']},
    #     sample_size=len(train),
    #     protocol='app',
    #     n_prevpoints=21,
    #     n_repetitions=10)
    q = ACC(NewClassifier())

    # borrow (supposedly negative) pool documents
    # train_pos = train.counts()[1]
    # train_negs = train.counts()[0]
    # neg_idx = negative_sampling_index(pool, classifier, max(train_pos - train_negs, 5))
    # neg_sample = pool.sampling_from_index(neg_idx)
    # train_augmented = train + LabelledCollection(neg_sample.instances, [0] * len(neg_sample))
    # q.fit(train_augmented)
    q.fit(train)
    # q.fit(first_train)
    # bootstrap_prev = qp.evaluation.natural_prevalence_prediction(
    #     q, pool, sample_size=len(train), n_repetitions=50)[1].mean(axis=0).flatten()

    prev = q.quantify(pool.instances)
    return prev, None


def tee(msg):
    # writes to both the results file (the global `foo`, opened below) and standard output
    foo.write(msg + '\n')
    foo.flush()
    print(msg)


datasetname = 'RCV1.C4'
collection = qp.util.pickled_resource(f'./dataset/{datasetname}.pkl', create_dataset, datasetname)
# data = qp.datasets.fetch_reviews('imdb', tfidf=True, min_df=5)
# collection = data.training + data.test
# collection = data.training.sampling(10000, 0.75)
nD = len(collection)

# initial labelled data selection
init_nD = 1000
init_prev = [0.5, 0.5]
idx = collection.sampling_index(init_nD, *init_prev)
train, pool = split_from_index(collection, idx)
first_train = LabelledCollection(train.instances, train.labels)

k = 25  # number of documents labelled at each iteration
recall_target = 0.95

# Q, q_name = ClassifyAndCount(CalibratedClassifierCV(LinearSVC())), "CC"
# CC = ClassifyAndCount(LogisticRegression(class_weight='balanced'))
# Q, q_name = qp.model_selection.GridSearchQ(
#     PACC(LogisticRegression(), val_split=3),
#     param_grid={'C': np.logspace(-2, 2, 5), 'class_weight': [None, 'balanced']},
#     sample_size=1000,
#     protocol='app',
#     n_prevpoints=21,
#     n_repetitions=10), "PACC"
# Q, q_name = PACC(LogisticRegression(class_weight='balanced')), 'PACC'

outputdir = './results'
qp.util.create_if_not_exist(outputdir)

i = 0
with open(os.path.join(outputdir, f'{datasetname}.csv'), 'wt') as foo:
    tee('it\t%\ttr-size\tte-size\ttr-prev\tte-prev\tte-estim\tte-estimCC\tR\tRhat\tRhatCC\tShift\tAE\tAE_CC')
    while True:
        pool_p_hat_cc, classifier = estimate_prev_CC(train, pool)
        pool_p_hat, _ = estimate_prev_Q(train, pool, classifier)

        tr_p = train.prevalence()
        te_p = pool.prevalence()
        nDtr = len(train)
        nDte = len(pool)

        r_hat_cc = recall(tr_p, pool_p_hat_cc, nDtr, nDte)  # recall estimated via CC
        r_hat = recall(tr_p, pool_p_hat, nDtr, nDte)        # recall estimated via ACC
        r = recall(tr_p, te_p, nDtr, nDte)                  # true recall (uses the pool's true prevalence)

        tr_te_shift = qp.error.ae(tr_p, te_p)
        proc_percent = 100 * nDtr / nD
        q_ae = qp.error.ae(te_p, pool_p_hat)
        cc_ae = qp.error.ae(te_p, pool_p_hat_cc)

        tee(f'{i}\t{proc_percent:.2f}\t{nDtr}\t{nDte}\t{tr_p[1]:.3f}\t{te_p[1]:.3f}\t{pool_p_hat[1]:.3f}\t{pool_p_hat_cc[1]:.3f}'
            f'\t{r:.3f}\t{r_hat:.3f}\t{r_hat_cc:.3f}\t{tr_te_shift:.5f}\t{q_ae:.4f}\t{cc_ae:.4f}')

        if nDte < k:
            break

        # label the k most relevant documents and move them from the pool to the training set
        top_relevant_idx = relevance_sampling_index(pool, classifier, k)
        selected, pool = split_from_index(pool, top_relevant_idx)
        train = train + selected
        i += 1
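
# Note that `recall_target` is defined above but never consulted in the loop. A minimal
# sketch of a recall-based stopping rule (an assumption about the intended use, not part
# of the original protocol) would add, right after the tee(...) progress line:
#
#     if r_hat >= recall_target:
#         tee(f'estimated recall {r_hat:.3f} reached target {recall_target}; stopping')
#         break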