1
0
Fork 0

mixing relevance and uncertainty sampling

This commit is contained in:
Alejandro Moreo Fernandez 2022-01-14 20:17:21 +01:00
parent 47b71bd5f2
commit b051ed4781
2 changed files with 94 additions and 75 deletions

View File

@ -6,7 +6,7 @@ import sklearn
from sklearn.base import BaseEstimator
from sklearn.calibration import CalibratedClassifierCV
from sklearn.linear_model import LogisticRegression
from sklearn.svm import OneClassSVM, LinearSVC
from sklearn.svm import LinearSVC
import quapy as qp
from method.base import BaseQuantifier
@ -14,6 +14,7 @@ from quapy.data import LabelledCollection
from quapy.method.aggregative import EMQ, CC, PACC, PCC, HDy, ACC
from quapy import functional as F
import numpy as np
from itertools import chain
def split_from_index(collection: LabelledCollection, index: np.ndarray):
@ -29,6 +30,21 @@ def relevance_sampling_index(pool:LabelledCollection, classifier:BaseEstimator,
return top_relevant_idx
def uncertainty_sampling_index(pool: LabelledCollection, classifier: BaseEstimator, k: int):
    """Return the positions of the `k` pool instances the classifier is least sure about.

    Uncertainty is the absolute distance between the posterior in column 1 of
    `classifier.predict_proba(pool.instances)` and 0.5: the smaller the
    distance, the more uncertain the instance.
    NOTE(review): column 1 is presumably the positive (relevant) class -- confirm.

    :param pool: collection whose `instances` are scored by the classifier
    :param classifier: fitted estimator exposing `predict_proba`
    :param k: maximum number of positions to return
    :return: array of positions into `pool`, most uncertain first; it holds
        fewer than `k` entries when the pool is smaller than `k`
    """
    posteriors = np.asarray(classifier.predict_proba(pool.instances))[:, 1]
    distance_to_boundary = np.abs(posteriors - 0.5)
    # a stable sort keeps equally uncertain instances in pool order, so the
    # selection among ties is reproducible
    return np.argsort(distance_to_boundary, kind='stable')[:k]
def mix_rel_unc_sampling_index(pool: LabelledCollection, classifier: BaseEstimator, k: int):
    """Pick `k` pool positions by alternating relevance and uncertainty sampling.

    The rankings returned by `relevance_sampling_index` and
    `uncertainty_sampling_index` are interleaved (relevance first), duplicates
    are dropped keeping the first occurrence, and the first `k` survivors are
    returned.

    :param pool: collection to sample from
    :param classifier: fitted estimator passed on to both sampling functions
    :param k: maximum number of positions to return
    :return: integer array of at most `k` distinct positions into `pool`
    """
    relevance_idx = np.asarray(relevance_sampling_index(pool, classifier, k))
    uncertainty_idx = np.asarray(uncertainty_sampling_index(pool, classifier, k))

    # pair the two rankings rank by rank; any surplus of the longer one is ignored
    n_pairs = min(len(relevance_idx), len(uncertainty_idx))
    # an explicit integer dtype keeps the result usable as an index even when
    # there is nothing to interleave (k == 0 or an empty pool)
    interleaved_idx = np.empty(2 * n_pairs, dtype=np.intp)
    interleaved_idx[0::2] = relevance_idx[:n_pairs]
    interleaved_idx[1::2] = uncertainty_idx[:n_pairs]

    # np.unique reports the first position of every distinct value; sorting
    # those positions restores the interleaved order without duplicates
    _, first_occurrence = np.unique(interleaved_idx, return_index=True)
    return interleaved_idx[np.sort(first_occurrence)][:k]
def negative_sampling_index(pool: LabelledCollection, classifier: BaseEstimator, k: int):
prob = classifier.predict_proba(pool.instances)[:, 0].flatten()
top_relevant_idx = np.argsort(-prob)[:k]
@ -43,7 +59,12 @@ def recall(train_prev, pool_prev, train_size, pool_size):
def NewClassifier():
    """Build a fresh, unfitted classifier.

    :return: a `LogisticRegression` with no class re-weighting
    """
    # previous choice, kept for reference:
    # return CalibratedClassifierCV(LinearSVC(class_weight='balanced'))
    return LogisticRegression(class_weight=None)
def NewQuantifier():
    """Build a fresh EMQ quantifier over a calibrated classifier from `NewClassifier`."""
    base_classifier = NewClassifier()
    calibrated_classifier = CalibratedClassifierCV(base_classifier)
    return EMQ(calibrated_classifier)
def create_dataset(datasetname):
@ -78,7 +99,8 @@ def estimate_prev_Q(train, pool, classifier):
# n_prevpoints=21,
# n_repetitions=10)
q = ACC(NewClassifier())
q = NewQuantifier()
# q = ACC(NewClassifier())
# borrow (supposedly negative) pool documents
# train_pos = train.counts()[1]
# train_negs = train.counts()[0]
@ -86,7 +108,7 @@ def estimate_prev_Q(train, pool, classifier):
# neg_sample = pool.sampling_from_index(neg_idx)
# train_augmented = train + LabelledCollection(neg_sample.instances, [0]*len(neg_sample))
# q.fit(train_augmented)
# q.fit(train)
q.fit(train)
# q.fit(first_train)
# bootstrap_prev = qp.evaluation.natural_prevalence_prediction(q, pool, sample_size=len(train), n_repetitions=50)[1].mean(axis=0).flatten()
prev = q.quantify(pool.instances)
@ -102,10 +124,6 @@ def tee(msg):
datasetname = 'RCV1.C4'
collection = qp.util.pickled_resource(f'./dataset/{datasetname}.pkl', create_dataset, datasetname)
# data = qp.datasets.fetch_reviews('imdb', tfidf=True, min_df=5)
# collection = data.training + data.test
# collection = data.training.sampling(10000, 0.75)
nD = len(collection)
# initial labelled data selection
@ -113,30 +131,22 @@ init_nD = 1000
init_prev = [0.5, 0.5]
idx = collection.sampling_index(init_nD, *init_prev)
train, pool = split_from_index(collection, idx)
first_train = LabelledCollection(train.instances, train.labels)
k = 25
recall_target = 0.95
# Q, q_name = ClassifyAndCount(CalibratedClassifierCV(LinearSVC())), "CC"
# CC = ClassifyAndCount(LogisticRegression(class_weight='balanced'))
# Q, q_name = qp.model_selection.GridSearchQ(
# PACC(LogisticRegression(), val_split=3),
# param_grid={'C':np.logspace(-2,2,5), 'class_weight':[None, 'balanced']},
# sample_size=1000,
# protocol='app',
# n_prevpoints=21,
# n_repetitions=10), "PACC"
# Q, q_name = PACC(LogisticRegression(class_weight='balanced')), 'PACC'
# CC = ClassifyAndCount(LogisticRegression(class_weight='balanced'))
#first_train = LabelledCollection(train.instances, train.labels)
k = 100
recall_target = 0.99
outputdir = './results'
qp.util.create_if_not_exist(outputdir)
# sampling_fn, sampling_name = relevance_sampling_index, 'relevance'
sampling_fn, sampling_name = mix_rel_unc_sampling_index, 'mix'
q_name = NewQuantifier().__class__.__name__
experiment_suffix = f'{sampling_name}_{q_name}'
i = 0
with open(os.path.join(outputdir, f'{datasetname}.csv'), 'wt') as foo:
with open(os.path.join(outputdir, f'{datasetname}_{experiment_suffix}.csv'), 'wt') as foo:
tee('it\t%\ttr-size\tte-size\ttr-prev\tte-prev\tte-estim\tte-estimCC\tR\tRhat\tRhatCC\tShift\tAE\tAE_CC')
while True:
@ -153,23 +163,20 @@ with open(os.path.join(outputdir, f'{datasetname}.csv'), 'wt') as foo:
r = recall(tr_p, te_p, nDtr, nDte)
tr_te_shift = qp.error.ae(tr_p, te_p)
proc_percent = 100*nDtr/nD
progress = 100 * nDtr / nD
q_ae = qp.error.ae(te_p, pool_p_hat)
cc_ae = qp.error.ae(te_p, pool_p_hat_cc)
tee(f'{i}\t{proc_percent:.2f}\t{nDtr}\t{nDte}\t{tr_p[1]:.3f}\t{te_p[1]:.3f}\t{pool_p_hat[1]:.3f}\t{pool_p_hat_cc[1]:.3f}'
tee(
f'{i}\t{progress:.2f}\t{nDtr}\t{nDte}\t{tr_p[1]:.3f}\t{te_p[1]:.3f}\t{pool_p_hat[1]:.3f}\t{pool_p_hat_cc[1]:.3f}'
f'\t{r:.3f}\t{r_hat:.3f}\t{r_hat_cc:.3f}\t{tr_te_shift:.5f}\t{q_ae:.4f}\t{cc_ae:.4f}')
if nDte < k:
break
top_relevant_idx = relevance_sampling_index(pool, classifier, k)
top_relevant_idx = sampling_fn(pool, classifier, k)
selected, pool = split_from_index(pool, top_relevant_idx)
train = train + selected
i += 1

View File

@ -1,11 +1,19 @@
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sys, os, pathlib
assert len(sys.argv) == 3, f'wrong args, syntax is: python {sys.argv[0]} <result_input_path> <dynamic (0|1)>'
file = sys.argv[1] #'./results/RCV1.C4.csv'
loop = bool(int(sys.argv[2]))
plotname = pathlib.Path(file).name.replace(".csv", ".png")
file = './results/RCV1.C4.csv'
# plot the data
fig, axs = plt.subplots(3)
try:
while True:
df = pd.read_csv(file, sep='\t')
@ -38,12 +46,16 @@ while True:
axs[2].legend()
axs[2].grid()
#plt.pause(1.0)
os.makedirs('./plots', exist_ok=True)
plt.savefig(f'./plots/{plotname}')
if not loop:
break
else:
plt.pause(.5)
axs[0].cla()
axs[1].cla()
axs[2].cla()
plt.savefig('./plot.png')
break
#plt.show()
except KeyboardInterrupt:
print("\n[exit]")