Adding the prevalence of the judged relevant documents for each query
This commit is contained in:
parent
a1a716dc4a
commit
e1f6149f71
Retrieval
|
@ -46,12 +46,14 @@ class RetrievedSamples:
|
||||||
def __init__(self,
|
def __init__(self,
|
||||||
class_home: str,
|
class_home: str,
|
||||||
test_rankings_path: str,
|
test_rankings_path: str,
|
||||||
|
test_query_prevs_path: str,
|
||||||
vectorizer,
|
vectorizer,
|
||||||
class_name,
|
class_name,
|
||||||
classes=None
|
classes=None
|
||||||
):
|
):
|
||||||
self.class_home = class_home
|
self.class_home = class_home
|
||||||
self.test_rankings_df = pd.read_json(test_rankings_path)
|
self.test_rankings_df = pd.read_json(test_rankings_path)
|
||||||
|
self.test_query_prevs_df = pd.read_json(test_query_prevs_path)
|
||||||
self.vectorizer = vectorizer
|
self.vectorizer = vectorizer
|
||||||
self.class_name = class_name
|
self.class_name = class_name
|
||||||
self.classes=classes
|
self.classes=classes
|
||||||
|
@ -75,10 +77,14 @@ class RetrievedSamples:
|
||||||
|
|
||||||
# loads the test sample
|
# loads the test sample
|
||||||
query_id = self._get_query_id_from_path(file)
|
query_id = self._get_query_id_from_path(file)
|
||||||
sel_df = tests_df[tests_df.qid == int(query_id)]
|
sel_df = tests_df[tests_df.qid == query_id]
|
||||||
Xte, yte, score_te = get_text_label_score(sel_df, class_name, vectorizer, filter_classes=self.classes)
|
Xte, yte, score_te = get_text_label_score(sel_df, class_name, vectorizer, filter_classes=self.classes)
|
||||||
|
|
||||||
yield (Xtr, ytr, score_tr), (Xte, yte, score_te)
|
# gets the prevalence of all judged relevant documents for the query
|
||||||
|
df = self.test_query_prevs_df
|
||||||
|
q_rel_prevs = df.loc[df.id == query_id][class_name+'_proportions'].values[0]
|
||||||
|
|
||||||
|
yield (Xtr, ytr, score_tr), (Xte, yte, score_te), q_rel_prevs
|
||||||
|
|
||||||
def _list_queries(self):
|
def _list_queries(self):
|
||||||
return sorted(glob(join(self.class_home, 'training_Query*200SPLIT.json')))
|
return sorted(glob(join(self.class_home, 'training_Query*200SPLIT.json')))
|
||||||
|
@ -109,6 +115,7 @@ class RetrievedSamples:
|
||||||
qid = path
|
qid = path
|
||||||
qid = qid[:qid.index(posfix)]
|
qid = qid[:qid.index(posfix)]
|
||||||
qid = qid[qid.index(prefix) + len(prefix):]
|
qid = qid[qid.index(prefix) + len(prefix):]
|
||||||
|
qid = int(qid)
|
||||||
return qid
|
return qid
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -61,8 +61,8 @@ def methods(classifier, class_name):
|
||||||
yield ('CC', ClassifyAndCount(classifier))
|
yield ('CC', ClassifyAndCount(classifier))
|
||||||
# yield ('PCC', PCC(classifier))
|
# yield ('PCC', PCC(classifier))
|
||||||
# yield ('ACC', ACC(classifier, val_split=5, n_jobs=-1))
|
# yield ('ACC', ACC(classifier, val_split=5, n_jobs=-1))
|
||||||
yield ('PACC', PACC(classifier, val_split=5, n_jobs=-1))
|
yield ('PACC2', PACC(classifier, val_split=5, n_jobs=-1))
|
||||||
yield ('PACC-s', PACC(classifier, val_split=5, n_jobs=-1))
|
# yield ('PACC-s', PACC(classifier, val_split=5, n_jobs=-1))
|
||||||
# yield ('EMQ', EMQ(classifier, exact_train_prev=True))
|
# yield ('EMQ', EMQ(classifier, exact_train_prev=True))
|
||||||
# yield ('EMQ-Platt', EMQ(classifier, exact_train_prev=True, recalib='platt'))
|
# yield ('EMQ-Platt', EMQ(classifier, exact_train_prev=True, recalib='platt'))
|
||||||
# yield ('EMQh', EMQ(classifier, exact_train_prev=False))
|
# yield ('EMQh', EMQ(classifier, exact_train_prev=False))
|
||||||
|
@ -80,7 +80,7 @@ def methods(classifier, class_name):
|
||||||
yield ('KDEy-ML', KDEyML(classifier, val_split=5, n_jobs=-1, bandwidth=kde_param[class_name]))
|
yield ('KDEy-ML', KDEyML(classifier, val_split=5, n_jobs=-1, bandwidth=kde_param[class_name]))
|
||||||
# yield ('KDE005', KDEyML(classifier, val_split=5, n_jobs=-1, bandwidth=0.005))
|
# yield ('KDE005', KDEyML(classifier, val_split=5, n_jobs=-1, bandwidth=0.005))
|
||||||
yield ('KDE01', KDEyML(classifier, val_split=5, n_jobs=-1, bandwidth=0.01))
|
yield ('KDE01', KDEyML(classifier, val_split=5, n_jobs=-1, bandwidth=0.01))
|
||||||
yield ('KDE01-s', KDEyML(classifier, val_split=5, n_jobs=-1, bandwidth=0.01))
|
# yield ('KDE01-s', KDEyML(classifier, val_split=5, n_jobs=-1, bandwidth=0.01))
|
||||||
# yield ('KDE02', KDEyML(classifier, val_split=5, n_jobs=-1, bandwidth=0.02))
|
# yield ('KDE02', KDEyML(classifier, val_split=5, n_jobs=-1, bandwidth=0.02))
|
||||||
# yield ('KDE03', KDEyML(classifier, val_split=5, n_jobs=-1, bandwidth=0.03))
|
# yield ('KDE03', KDEyML(classifier, val_split=5, n_jobs=-1, bandwidth=0.03))
|
||||||
# yield ('KDE04', KDEyML(classifier, val_split=5, n_jobs=-1, bandwidth=0.04))
|
# yield ('KDE04', KDEyML(classifier, val_split=5, n_jobs=-1, bandwidth=0.04))
|
||||||
|
@ -144,7 +144,7 @@ def run_experiment():
|
||||||
}
|
}
|
||||||
|
|
||||||
pbar = tqdm(experiment_prot(), total=experiment_prot.total())
|
pbar = tqdm(experiment_prot(), total=experiment_prot.total())
|
||||||
for train, test in pbar:
|
for train, test, q_rel_prevs in pbar:
|
||||||
Xtr, ytr, score_tr = train
|
Xtr, ytr, score_tr = train
|
||||||
Xte, yte, score_te = test
|
Xte, yte, score_te = test
|
||||||
|
|
||||||
|
@ -154,7 +154,7 @@ def run_experiment():
|
||||||
else:
|
else:
|
||||||
train_col = LabelledCollection(Xtr, ytr, classes=classifier_trained.classes_)
|
train_col = LabelledCollection(Xtr, ytr, classes=classifier_trained.classes_)
|
||||||
|
|
||||||
idx, max_score_round_robin = get_idx_score_matrix_per_class(train_col, score_tr)
|
# idx, max_score_round_robin = get_idx_score_matrix_per_class(train_col, score_tr)
|
||||||
|
|
||||||
if method_name not in ['Naive', 'NaiveQuery'] and not method_name.endswith('-s'):
|
if method_name not in ['Naive', 'NaiveQuery'] and not method_name.endswith('-s'):
|
||||||
quantifier.fit(train_col, val_split=train_col, fit_classifier=False)
|
quantifier.fit(train_col, val_split=train_col, fit_classifier=False)
|
||||||
|
@ -167,11 +167,11 @@ def run_experiment():
|
||||||
if method_name == 'NaiveQuery':
|
if method_name == 'NaiveQuery':
|
||||||
train_k = reduceAtK(train_col, k)
|
train_k = reduceAtK(train_col, k)
|
||||||
quantifier.fit(train_k)
|
quantifier.fit(train_k)
|
||||||
elif method_name.endswith('-s'):
|
# elif method_name.endswith('-s'):
|
||||||
test_min_score = score_te[k] if k < len(score_te) else score_te[-1]
|
# test_min_score = score_te[k] if k < len(score_te) else score_te[-1]
|
||||||
train_k = reduce_train_at_score(train_col, idx, max_score_round_robin, test_min_score)
|
# train_k = reduce_train_at_score(train_col, idx, max_score_round_robin, test_min_score)
|
||||||
print(f'{k=}, {test_min_score=} {len(train_k)=}')
|
# print(f'{k=}, {test_min_score=} {len(train_k)=}')
|
||||||
quantifier.fit(train_k, val_split=train_k, fit_classifier=False)
|
# quantifier.fit(train_k, val_split=train_k, fit_classifier=False)
|
||||||
|
|
||||||
estim_prev = quantifier.quantify(test_k.instances)
|
estim_prev = quantifier.quantify(test_k.instances)
|
||||||
|
|
||||||
|
@ -245,6 +245,7 @@ if __name__ == '__main__':
|
||||||
train_data_path = join(data_home, class_name, 'FULL', 'classifier_training.json') # <-------- fixed classifier
|
train_data_path = join(data_home, class_name, 'FULL', 'classifier_training.json') # <-------- fixed classifier
|
||||||
classifier_path = join('classifiers', 'FULL', f'classifier_{class_name}.pkl') # <------------ fixed classifier
|
classifier_path = join('classifiers', 'FULL', f'classifier_{class_name}.pkl') # <------------ fixed classifier
|
||||||
test_rankings_path = join(data_home, 'testRanking_Results.json')
|
test_rankings_path = join(data_home, 'testRanking_Results.json')
|
||||||
|
test_query_prevs_path = join(data_home, 'prevelance_vectors_judged_docs.json')
|
||||||
results_home = join('results'+exp_posfix, class_name, data_size)
|
results_home = join('results'+exp_posfix, class_name, data_size)
|
||||||
|
|
||||||
tfidf, classifier_trained = qp.util.pickled_resource(classifier_path, train_classifier, train_data_path)
|
tfidf, classifier_trained = qp.util.pickled_resource(classifier_path, train_classifier, train_data_path)
|
||||||
|
@ -252,6 +253,7 @@ if __name__ == '__main__':
|
||||||
experiment_prot = RetrievedSamples(
|
experiment_prot = RetrievedSamples(
|
||||||
class_home,
|
class_home,
|
||||||
test_rankings_path,
|
test_rankings_path,
|
||||||
|
test_query_prevs_path,
|
||||||
vectorizer=tfidf,
|
vectorizer=tfidf,
|
||||||
class_name=class_name,
|
class_name=class_name,
|
||||||
classes=classifier_trained.classes_
|
classes=classifier_trained.classes_
|
||||||
|
|
|
@ -2,26 +2,15 @@ import pandas as pd
|
||||||
|
|
||||||
from os.path import join
|
from os.path import join
|
||||||
|
|
||||||
from Retrieval.commons import load_json_sample
|
|
||||||
from quapy.data import LabelledCollection
|
from quapy.data import LabelledCollection
|
||||||
|
|
||||||
data_home = 'data'
|
data_home = 'data'
|
||||||
CLASS_NAME = 'continent'
|
CLASS_NAME = 'continent'
|
||||||
datasize = '100K'
|
datasize = '100K'
|
||||||
|
|
||||||
file_path = join(data_home, CLASS_NAME, datasize, 'training_Query-84Sample-200SPLIT.json')
|
file_path = join(data_home, 'prevelance_vectors_judged_docs.json')
|
||||||
|
|
||||||
text, classes = load_json_sample(file_path, CLASS_NAME)
|
df = pd.read_json(file_path)
|
||||||
|
|
||||||
|
pd.set_option('display.max_columns', None)
|
||||||
data = LabelledCollection(text, classes)
|
print(df)
|
||||||
print(data.classes_)
|
|
||||||
print(data.prevalence())
|
|
||||||
print('done')
|
|
||||||
|
|
||||||
test_ranking_path = join(data_home, 'testRanking_Results.json')
|
|
||||||
# obj = json.load(open(test_ranking_path))
|
|
||||||
|
|
||||||
|
|
||||||
df = pd.read_json(test_ranking_path)
|
|
||||||
print('done')
|
|
Loading…
Reference in New Issue