first commit

This commit is contained in:
Alejandro Moreo Fernandez 2021-11-30 15:55:32 +01:00
parent 537a95fa18
commit 671ef1efea
1 changed file with 80 additions and 0 deletions

80
eDiscovery/main.py Normal file
View File

@@ -0,0 +1,80 @@
from sklearn.base import BaseEstimator
from sklearn.calibration import CalibratedClassifierCV
from sklearn.linear_model import LogisticRegression
import quapy as qp
from method.base import BaseQuantifier
from quapy.data import LabelledCollection
from quapy.method.aggregative import EMQ, ClassifyAndCount, PACC
from quapy import functional as F
import numpy as np
def split_from_index(collection: LabelledCollection, index: np.ndarray):
    """
    Partition `collection` into two disjoint subsets: the instances at the
    positions listed in `index`, and all remaining instances.

    :param collection: the labelled collection to split
    :param index: integer positions selecting the first subset
    :return: a tuple (selected, rest), each obtained via
        `collection.sampling_from_index`
    """
    # np.setdiff1d returns the complement as a sorted, unique array, which is
    # deterministic -- unlike iterating a Python set, whose order is unspecified.
    out_index = np.setdiff1d(np.arange(len(collection)), index)
    return collection.sampling_from_index(index), collection.sampling_from_index(out_index)
def relevance_sampling_index(pool: LabelledCollection, classifier: BaseEstimator, k: int):
    """
    Return the positions of the k pool instances the classifier scores as most
    likely to belong to the positive (relevant) class.

    :param pool: collection whose `instances` are scored by the classifier
    :param classifier: a fitted probabilistic classifier (exposes predict_proba)
    :param k: number of top-scoring positions to return
    :return: array with the indices of the k highest-probability instances
    """
    posterior = classifier.predict_proba(pool.instances)[:, 1].flatten()
    # negate so that argsort's ascending order ranks by descending relevance
    ranking = np.argsort(-posterior)
    return ranking[:k]
def recall(train_prev, pool_prev, train_len, pool_len):
    """
    Estimate the recall of the review process so far: the fraction of all
    positive (relevant) documents that already lie in the training (labelled)
    set, computed from class prevalences and set sizes.

    :param train_prev: prevalence vector of the training set (index 1 = positive)
    :param pool_prev: prevalence vector of the unlabelled pool (index 1 = positive)
    :param train_len: number of documents in the training set
    :param pool_len: number of documents in the pool
    :return: estimated recall in [0, 1]
    """
    total = train_len + pool_len
    w_train = train_len / total
    w_pool = pool_len / total
    # positives found so far vs. positives still sitting in the pool,
    # both expressed as fractions of the whole collection
    found = train_prev[1] * w_train
    return found / (found + pool_prev[1] * w_pool)
# ---------------------------------------------------------------------------
# Active-learning loop for technology-assisted review (e-discovery):
# iteratively label the k most relevant-looking pool documents, retrain,
# and track estimated vs. true recall of the relevant class.
# ---------------------------------------------------------------------------
data = qp.datasets.fetch_reviews('imdb', tfidf=True, min_df=5)
# collection = data.training + data.test
# work over a fixed-size sample drawn at 75% positive prevalence
collection = data.training.sampling(10000, 0.75)
nD = len(collection)

# initial labelled data selection: a small balanced seed set
init_nD = 100
init_prev = 0.5
idx = collection.sampling_index(init_nD, init_prev)
train, pool = split_from_index(collection, idx)

k = 50                # documents moved from pool to train per iteration
recall_target = 0.95  # intended stopping criterion (see commented check below)

# Q = EMQ(CalibratedClassifierCV(LogisticRegression()))
# Q = ClassifyAndCount(LogisticRegression())
Q = PACC(LogisticRegression())

i = 0
while True:
    Q.fit(train)
    # estimated prevalence of the (shrinking) pool vs. its true prevalence
    pool_p_hat = Q.quantify(pool.instances)
    tr_p = train.prevalence()
    te_p = pool.prevalence()
    nDtr = len(train)
    nDte = len(pool)
    # recall estimated from the quantifier vs. recall from true pool labels
    r_hat = recall(tr_p, pool_p_hat, nDtr, nDte)
    r = recall(tr_p, te_p, nDtr, nDte)
    r_error = abs(r_hat-r)
    proc_percent = 100*nDtr/nD  # share of the collection already labelled
    print(f'{i}\t [{proc_percent:.2f}%] tr-prev={F.strprev(tr_p)} te-prev={F.strprev(te_p)} te-estim={F.strprev(pool_p_hat)} R={r:.3f} Rhat={r_hat:.3f} E={r_error:.3f}')
    # if r_hat >= recall_target:
    # NOTE(review): stopping on processed fraction rather than on the
    # estimated recall target -- presumably for experimentation; confirm.
    if proc_percent > 95:
        break
    # relevance sampling: pull the k most-likely-relevant pool docs into train
    top_relevant_idx = relevance_sampling_index(pool, Q.learner, k)
    selected, pool = split_from_index(pool, top_relevant_idx)
    train = train + selected
    i += 1