forked from moreo/QuaPy
mixing relevance and uncertainty sampling
This commit is contained in:
parent
47b71bd5f2
commit
b051ed4781
|
@ -6,7 +6,7 @@ import sklearn
|
||||||
from sklearn.base import BaseEstimator
|
from sklearn.base import BaseEstimator
|
||||||
from sklearn.calibration import CalibratedClassifierCV
|
from sklearn.calibration import CalibratedClassifierCV
|
||||||
from sklearn.linear_model import LogisticRegression
|
from sklearn.linear_model import LogisticRegression
|
||||||
from sklearn.svm import OneClassSVM, LinearSVC
|
from sklearn.svm import LinearSVC
|
||||||
|
|
||||||
import quapy as qp
|
import quapy as qp
|
||||||
from method.base import BaseQuantifier
|
from method.base import BaseQuantifier
|
||||||
|
@ -14,22 +14,38 @@ from quapy.data import LabelledCollection
|
||||||
from quapy.method.aggregative import EMQ, CC, PACC, PCC, HDy, ACC
|
from quapy.method.aggregative import EMQ, CC, PACC, PCC, HDy, ACC
|
||||||
from quapy import functional as F
|
from quapy import functional as F
|
||||||
import numpy as np
|
import numpy as np
|
||||||
|
from itertools import chain
|
||||||
|
|
||||||
|
|
||||||
def split_from_index(collection: "LabelledCollection", index: np.ndarray):
    """Split `collection` into the items at `index` and all remaining items.

    :param collection: the collection to split; it must support `len()` and
        `sampling_from_index`
    :param index: positions (within `collection`) of the items to select
    :return: a tuple `(selected, rest)`; `selected` follows the order given in
        `index`, while `rest` contains every other position in ascending order
    """
    # coerce once so that an empty selection is still a valid integer index
    index = np.asarray(index, dtype=int)
    # setdiff1d returns the sorted complement, which makes the order of the
    # remaining items deterministic (iterating a Python set gives no such guarantee)
    out_index = np.setdiff1d(np.arange(len(collection)), index)
    return collection.sampling_from_index(index), collection.sampling_from_index(out_index)
|
||||||
|
|
||||||
|
|
||||||
def relevance_sampling_index(pool: "LabelledCollection", classifier: "BaseEstimator", k: int):
    """Select the `k` pool items with the highest posterior in column 1.

    :param pool: collection whose `instances` are scored by the classifier
    :param classifier: fitted model exposing `predict_proba`; column 1 of its
        output is presumably the positive (relevant) class -- confirm against
        the label encoding
    :param k: maximum number of indexes to return
    :return: array with (at most) `k` pool positions, most relevant first
    """
    prob = np.asarray(classifier.predict_proba(pool.instances))[:, 1]
    # a stable sort breaks ties by pool position, so repeated runs pick the
    # same documents when several posteriors are equal
    return np.argsort(-prob, kind='stable')[:k]
|
||||||
|
|
||||||
|
|
||||||
def negative_sampling_index(pool:LabelledCollection, classifier:BaseEstimator, k:int):
|
def uncertainty_sampling_index(pool: "LabelledCollection", classifier: "BaseEstimator", k: int):
    """Select the `k` pool items whose column-1 posterior is closest to 0.5.

    :param pool: collection whose `instances` are scored by the classifier
    :param classifier: fitted model exposing `predict_proba`
    :param k: maximum number of indexes to return
    :return: array with (at most) `k` pool positions, most uncertain first
    """
    prob = np.asarray(classifier.predict_proba(pool.instances))[:, 1]
    distance_to_boundary = np.abs(prob - 0.5)
    # a stable sort breaks ties by pool position, keeping the selection reproducible
    return np.argsort(distance_to_boundary, kind='stable')[:k]
|
||||||
|
|
||||||
|
|
||||||
|
def mix_rel_unc_sampling_index(pool: "LabelledCollection", classifier: "BaseEstimator", k: int):
    """Select `k` pool items by alternating relevance and uncertainty picks.

    The two rankings follow the same criteria as `relevance_sampling_index`
    (highest column-1 posterior first) and `uncertainty_sampling_index`
    (posterior closest to 0.5 first). They are interleaved as
    relevant, uncertain, relevant, uncertain, ...; an item proposed by both
    rankings is kept only at its first occurrence.

    :param pool: collection whose `instances` are scored by the classifier
    :param classifier: fitted model exposing `predict_proba`
    :param k: maximum number of indexes to return
    :return: integer array with (at most) `k` distinct pool positions
    """
    # score the pool only once; both rankings derive from the same posteriors
    prob = np.asarray(classifier.predict_proba(pool.instances))[:, 1]
    relevance_idx = np.argsort(-prob, kind='stable')[:k]
    uncertainty_idx = np.argsort(np.abs(prob - 0.5), kind='stable')[:k]

    # row-wise flattening of the (n, 2) stack yields r0, u0, r1, u1, ...
    interleaved_idx = np.column_stack((relevance_idx, uncertainty_idx)).ravel()

    # np.unique sorts by value; sorting the first-occurrence positions instead
    # restores the interleaved order while dropping the duplicates
    _, first_seen = np.unique(interleaved_idx, return_index=True)
    return interleaved_idx[np.sort(first_seen)][:k]
|
||||||
|
|
||||||
|
|
||||||
|
def negative_sampling_index(pool: "LabelledCollection", classifier: "BaseEstimator", k: int):
    """Select the `k` pool items with the highest posterior in column 0.

    :param pool: collection whose `instances` are scored by the classifier
    :param classifier: fitted model exposing `predict_proba`; column 0 of its
        output is presumably the negative (non-relevant) class -- confirm
        against the label encoding
    :param k: maximum number of indexes to return
    :return: array with (at most) `k` pool positions, most confidently
        negative first
    """
    prob = np.asarray(classifier.predict_proba(pool.instances))[:, 0]
    # a stable sort breaks ties by pool position, keeping the selection reproducible
    top_negative_idx = np.argsort(-prob, kind='stable')[:k]
    return top_negative_idx
|
||||||
|
@ -43,22 +59,27 @@ def recall(train_prev, pool_prev, train_size, pool_size):
|
||||||
|
|
||||||
|
|
||||||
def NewClassifier():
    """Build a fresh classifier instance for the experiment.

    Currently a logistic regression without class re-weighting. The previous
    choice was a calibrated, class-balanced linear SVM:
    ``CalibratedClassifierCV(LinearSVC(class_weight='balanced'))``.
    """
    classifier = LogisticRegression(class_weight=None)
    return classifier
|
||||||
|
|
||||||
|
|
||||||
|
def NewQuantifier():
    """Build a fresh EMQ quantifier on top of a calibrated `NewClassifier()`."""
    calibrated_classifier = CalibratedClassifierCV(NewClassifier())
    return EMQ(calibrated_classifier)
|
||||||
|
|
||||||
|
|
||||||
def create_dataset(datasetname):
|
def create_dataset(datasetname):
|
||||||
if datasetname=='imdb.10K.75p':
|
if datasetname == 'imdb.10K.75p':
|
||||||
data = qp.datasets.fetch_reviews('imdb', tfidf=True, min_df=5)
|
data = qp.datasets.fetch_reviews('imdb', tfidf=True, min_df=5)
|
||||||
collection = data.training.sampling(10000, 0.75)
|
collection = data.training.sampling(10000, 0.75)
|
||||||
return collection
|
return collection
|
||||||
|
|
||||||
elif datasetname=='RCV1.C4':
|
elif datasetname == 'RCV1.C4':
|
||||||
X, y = sklearn.datasets.fetch_rcv1(subset='train', return_X_y=True)
|
X, y = sklearn.datasets.fetch_rcv1(subset='train', return_X_y=True)
|
||||||
y = y.toarray()
|
y = y.toarray()
|
||||||
prev = y.mean(axis=0).flatten()
|
prev = y.mean(axis=0).flatten()
|
||||||
# choose the first category having a positive prevalence between [0.1,0.2] (realistic scenario for e-Discovery)
|
# choose the first category having a positive prevalence between [0.1,0.2] (realistic scenario for e-Discovery)
|
||||||
# this category happens to be the cat with id 4
|
# this category happens to be the cat with id 4
|
||||||
target_cat = np.argwhere(np.logical_and(prev>0.1, prev<0.2)).flatten()[0]
|
target_cat = np.argwhere(np.logical_and(prev > 0.1, prev < 0.2)).flatten()[0]
|
||||||
print('chosen cat', target_cat)
|
print('chosen cat', target_cat)
|
||||||
y = y[:, target_cat].flatten()
|
y = y[:, target_cat].flatten()
|
||||||
return LabelledCollection(X, y)
|
return LabelledCollection(X, y)
|
||||||
|
@ -78,7 +99,8 @@ def estimate_prev_Q(train, pool, classifier):
|
||||||
# n_prevpoints=21,
|
# n_prevpoints=21,
|
||||||
# n_repetitions=10)
|
# n_repetitions=10)
|
||||||
|
|
||||||
q = ACC(NewClassifier())
|
q = NewQuantifier()
|
||||||
|
# q = ACC(NewClassifier())
|
||||||
# borrow (supposedly negative) pool documents
|
# borrow (supposedly negative) pool documents
|
||||||
# train_pos = train.counts()[1]
|
# train_pos = train.counts()[1]
|
||||||
# train_negs = train.counts()[0]
|
# train_negs = train.counts()[0]
|
||||||
|
@ -86,7 +108,7 @@ def estimate_prev_Q(train, pool, classifier):
|
||||||
# neg_sample = pool.sampling_from_index(neg_idx)
|
# neg_sample = pool.sampling_from_index(neg_idx)
|
||||||
# train_augmented = train + LabelledCollection(neg_sample.instances, [0]*len(neg_sample))
|
# train_augmented = train + LabelledCollection(neg_sample.instances, [0]*len(neg_sample))
|
||||||
# q.fit(train_augmented)
|
# q.fit(train_augmented)
|
||||||
# q.fit(train)
|
q.fit(train)
|
||||||
# q.fit(first_train)
|
# q.fit(first_train)
|
||||||
# bootstrap_prev = qp.evaluation.natural_prevalence_prediction(q, pool, sample_size=len(train), n_repetitions=50)[1].mean(axis=0).flatten()
|
# bootstrap_prev = qp.evaluation.natural_prevalence_prediction(q, pool, sample_size=len(train), n_repetitions=50)[1].mean(axis=0).flatten()
|
||||||
prev = q.quantify(pool.instances)
|
prev = q.quantify(pool.instances)
|
||||||
|
@ -95,17 +117,13 @@ def estimate_prev_Q(train, pool, classifier):
|
||||||
|
|
||||||
|
|
||||||
def tee(msg):
    """Log `msg` both to the results file and to standard output.

    The message is written as one line to the module-level file handle `foo`,
    which is flushed immediately, and then printed.
    """
    line = msg + '\n'
    foo.write(line)
    foo.flush()
    print(msg)
|
||||||
|
|
||||||
|
|
||||||
datasetname = 'RCV1.C4'
|
datasetname = 'RCV1.C4'
|
||||||
collection = qp.util.pickled_resource(f'./dataset/{datasetname}.pkl', create_dataset, datasetname)
|
collection = qp.util.pickled_resource(f'./dataset/{datasetname}.pkl', create_dataset, datasetname)
|
||||||
|
|
||||||
# data = qp.datasets.fetch_reviews('imdb', tfidf=True, min_df=5)
|
|
||||||
# collection = data.training + data.test
|
|
||||||
# collection = data.training.sampling(10000, 0.75)
|
|
||||||
nD = len(collection)
|
nD = len(collection)
|
||||||
|
|
||||||
# initial labelled data selection
|
# initial labelled data selection
|
||||||
|
@ -113,30 +131,22 @@ init_nD = 1000
|
||||||
init_prev = [0.5, 0.5]
|
init_prev = [0.5, 0.5]
|
||||||
idx = collection.sampling_index(init_nD, *init_prev)
|
idx = collection.sampling_index(init_nD, *init_prev)
|
||||||
train, pool = split_from_index(collection, idx)
|
train, pool = split_from_index(collection, idx)
|
||||||
first_train = LabelledCollection(train.instances, train.labels)
|
#first_train = LabelledCollection(train.instances, train.labels)
|
||||||
|
|
||||||
k = 25
|
|
||||||
recall_target = 0.95
|
|
||||||
|
|
||||||
# Q, q_name = ClassifyAndCount(CalibratedClassifierCV(LinearSVC())), "CC"
|
|
||||||
# CC = ClassifyAndCount(LogisticRegression(class_weight='balanced'))
|
|
||||||
|
|
||||||
# Q, q_name = qp.model_selection.GridSearchQ(
|
|
||||||
# PACC(LogisticRegression(), val_split=3),
|
|
||||||
# param_grid={'C':np.logspace(-2,2,5), 'class_weight':[None, 'balanced']},
|
|
||||||
# sample_size=1000,
|
|
||||||
# protocol='app',
|
|
||||||
# n_prevpoints=21,
|
|
||||||
# n_repetitions=10), "PACC"
|
|
||||||
# Q, q_name = PACC(LogisticRegression(class_weight='balanced')), 'PACC'
|
|
||||||
# CC = ClassifyAndCount(LogisticRegression(class_weight='balanced'))
|
|
||||||
|
|
||||||
|
k = 100
|
||||||
|
recall_target = 0.99
|
||||||
|
|
||||||
outputdir = './results'
|
outputdir = './results'
|
||||||
qp.util.create_if_not_exist(outputdir)
|
qp.util.create_if_not_exist(outputdir)
|
||||||
|
|
||||||
|
# sampling_fn, sampling_name = relevance_sampling_index, 'relevance'
|
||||||
|
sampling_fn, sampling_name = mix_rel_unc_sampling_index, 'mix'
|
||||||
|
q_name = NewQuantifier().__class__.__name__
|
||||||
|
|
||||||
|
experiment_suffix = f'{sampling_name}_{q_name}'
|
||||||
|
|
||||||
i = 0
|
i = 0
|
||||||
with open(os.path.join(outputdir, f'{datasetname}.csv'), 'wt') as foo:
|
with open(os.path.join(outputdir, f'{datasetname}_{experiment_suffix}.csv'), 'wt') as foo:
|
||||||
tee('it\t%\ttr-size\tte-size\ttr-prev\tte-prev\tte-estim\tte-estimCC\tR\tRhat\tRhatCC\tShift\tAE\tAE_CC')
|
tee('it\t%\ttr-size\tte-size\ttr-prev\tte-prev\tte-estim\tte-estimCC\tR\tRhat\tRhatCC\tShift\tAE\tAE_CC')
|
||||||
while True:
|
while True:
|
||||||
|
|
||||||
|
@ -153,23 +163,20 @@ with open(os.path.join(outputdir, f'{datasetname}.csv'), 'wt') as foo:
|
||||||
r = recall(tr_p, te_p, nDtr, nDte)
|
r = recall(tr_p, te_p, nDtr, nDte)
|
||||||
tr_te_shift = qp.error.ae(tr_p, te_p)
|
tr_te_shift = qp.error.ae(tr_p, te_p)
|
||||||
|
|
||||||
proc_percent = 100*nDtr/nD
|
progress = 100 * nDtr / nD
|
||||||
|
|
||||||
q_ae = qp.error.ae(te_p, pool_p_hat)
|
q_ae = qp.error.ae(te_p, pool_p_hat)
|
||||||
cc_ae = qp.error.ae(te_p, pool_p_hat_cc)
|
cc_ae = qp.error.ae(te_p, pool_p_hat_cc)
|
||||||
|
|
||||||
tee(f'{i}\t{proc_percent:.2f}\t{nDtr}\t{nDte}\t{tr_p[1]:.3f}\t{te_p[1]:.3f}\t{pool_p_hat[1]:.3f}\t{pool_p_hat_cc[1]:.3f}'
|
tee(
|
||||||
|
f'{i}\t{progress:.2f}\t{nDtr}\t{nDte}\t{tr_p[1]:.3f}\t{te_p[1]:.3f}\t{pool_p_hat[1]:.3f}\t{pool_p_hat_cc[1]:.3f}'
|
||||||
f'\t{r:.3f}\t{r_hat:.3f}\t{r_hat_cc:.3f}\t{tr_te_shift:.5f}\t{q_ae:.4f}\t{cc_ae:.4f}')
|
f'\t{r:.3f}\t{r_hat:.3f}\t{r_hat_cc:.3f}\t{tr_te_shift:.5f}\t{q_ae:.4f}\t{cc_ae:.4f}')
|
||||||
|
|
||||||
if nDte < k:
|
if nDte < k:
|
||||||
break
|
break
|
||||||
|
|
||||||
top_relevant_idx = relevance_sampling_index(pool, classifier, k)
|
top_relevant_idx = sampling_fn(pool, classifier, k)
|
||||||
selected, pool = split_from_index(pool, top_relevant_idx)
|
selected, pool = split_from_index(pool, top_relevant_idx)
|
||||||
train = train + selected
|
train = train + selected
|
||||||
|
|
||||||
i += 1
|
i += 1
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -1,49 +1,61 @@
|
||||||
import matplotlib.pyplot as plt
|
import matplotlib.pyplot as plt
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
|
import sys, os, pathlib
|
||||||
|
|
||||||
|
# Validate the command line explicitly: an `assert` would be stripped under `python -O`.
if len(sys.argv) != 3:
    sys.exit(f'wrong args, syntax is: python {sys.argv[0]} <result_input_path> <dynamic (0|1)>')

file = sys.argv[1]  # e.g., './results/RCV1.C4.csv'
loop = bool(int(sys.argv[2]))  # 1: keep re-reading the file and refreshing the plot; 0: plot once

# the figure is saved under ./plots with the name of the input file
plotname = pathlib.Path(file).name.replace(".csv", ".png")


def _plot_true_vs_estimates(ax, xs, df, true_col, estim_col, estim_cc_col, label):
    """Draw the two estimate columns and the true-value column of `df` on `ax`."""
    # raw strings: '\h' is not a valid escape sequence in a regular string literal
    ax.plot(xs, df[estim_col], label=r'$\hat{' + label + '}$')
    ax.plot(xs, df[estim_cc_col], label=r'$\hat{' + label + '}_{CC}$')
    ax.plot(xs, df[true_col], label=label)
    ax.legend()
    ax.grid()


# plot the data
fig, axs = plt.subplots(3)

try:
    while True:
        # re-read the whole file at every refresh to pick up newly written rows
        df = pd.read_csv(file, sep='\t')
        xs = df['it']

        _plot_true_vs_estimates(axs[0], xs, df, 'R', 'Rhat', 'RhatCC', label='R')
        _plot_true_vs_estimates(axs[1], xs, df, 'te-prev', 'te-estim', 'te-estimCC', label='P')

        axs[2].plot(xs, df['AE'], label='AE')
        axs[2].plot(xs, df['AE_CC'], label='AE-CC')
        axs[2].legend()
        axs[2].grid()

        os.makedirs('./plots', exist_ok=True)
        plt.savefig(f'./plots/{plotname}')

        if not loop:
            break

        # dynamic mode: show the current state briefly, then clear the axes for the next redraw
        plt.pause(.5)
        for ax in axs:
            ax.cla()

except KeyboardInterrupt:
    print("\n[exit]")
|
Loading…
Reference in New Issue