From e1f6149f71c4d87e0845fd548f93238f4026fe56 Mon Sep 17 00:00:00 2001
From: Alejandro Moreo
Date: Thu, 2 May 2024 10:59:16 +0200
Subject: [PATCH] adding the prevalence of the judged relevant documents for
 each query

---
 Retrieval/commons.py     | 11 +++++++++--
 Retrieval/experiments.py | 24 +++++++++++++-----------
 Retrieval/tmp.py         | 19 ++++---------------
 3 files changed, 26 insertions(+), 28 deletions(-)

diff --git a/Retrieval/commons.py b/Retrieval/commons.py
index 26a34b4..9100b22 100644
--- a/Retrieval/commons.py
+++ b/Retrieval/commons.py
@@ -46,12 +46,14 @@ class RetrievedSamples:
     def __init__(self,
                  class_home: str,
                  test_rankings_path: str,
+                 test_query_prevs_path: str,
                  vectorizer,
                  class_name,
                  classes=None
                  ):
         self.class_home = class_home
         self.test_rankings_df = pd.read_json(test_rankings_path)
+        self.test_query_prevs_df = pd.read_json(test_query_prevs_path)
         self.vectorizer = vectorizer
         self.class_name = class_name
         self.classes=classes
@@ -75,10 +77,14 @@ class RetrievedSamples:
 
             # loads the test sample
             query_id = self._get_query_id_from_path(file)
-            sel_df = tests_df[tests_df.qid == int(query_id)]
+            sel_df = tests_df[tests_df.qid == query_id]
             Xte, yte, score_te = get_text_label_score(sel_df, class_name, vectorizer, filter_classes=self.classes)
 
-            yield (Xtr, ytr, score_tr), (Xte, yte, score_te)
+            # gets the prevalence of all judged relevant documents for the query
+            df = self.test_query_prevs_df
+            q_rel_prevs = df.loc[df.id == query_id][class_name+'_proportions'].values[0]
+
+            yield (Xtr, ytr, score_tr), (Xte, yte, score_te), q_rel_prevs
 
     def _list_queries(self):
         return sorted(glob(join(self.class_home, 'training_Query*200SPLIT.json')))
@@ -109,6 +115,7 @@ class RetrievedSamples:
         qid = path
         qid = qid[:qid.index(posfix)]
         qid = qid[qid.index(prefix) + len(prefix):]
+        qid = int(qid)
         return qid
 
 
diff --git a/Retrieval/experiments.py b/Retrieval/experiments.py
index cd7088f..3da29d3 100644
--- a/Retrieval/experiments.py
+++ b/Retrieval/experiments.py
@@ -61,8 +61,8 @@ def methods(classifier, class_name):
     yield ('CC', ClassifyAndCount(classifier))
     # yield ('PCC', PCC(classifier))
     # yield ('ACC', ACC(classifier, val_split=5, n_jobs=-1))
-    yield ('PACC', PACC(classifier, val_split=5, n_jobs=-1))
-    yield ('PACC-s', PACC(classifier, val_split=5, n_jobs=-1))
+    yield ('PACC2', PACC(classifier, val_split=5, n_jobs=-1))
+    # yield ('PACC-s', PACC(classifier, val_split=5, n_jobs=-1))
     # yield ('EMQ', EMQ(classifier, exact_train_prev=True))
     # yield ('EMQ-Platt', EMQ(classifier, exact_train_prev=True, recalib='platt'))
     # yield ('EMQh', EMQ(classifier, exact_train_prev=False))
@@ -80,7 +80,7 @@ def methods(classifier, class_name):
     yield ('KDEy-ML', KDEyML(classifier, val_split=5, n_jobs=-1, bandwidth=kde_param[class_name]))
     # yield ('KDE005', KDEyML(classifier, val_split=5, n_jobs=-1, bandwidth=0.005))
     yield ('KDE01', KDEyML(classifier, val_split=5, n_jobs=-1, bandwidth=0.01))
-    yield ('KDE01-s', KDEyML(classifier, val_split=5, n_jobs=-1, bandwidth=0.01))
+    # yield ('KDE01-s', KDEyML(classifier, val_split=5, n_jobs=-1, bandwidth=0.01))
     # yield ('KDE02', KDEyML(classifier, val_split=5, n_jobs=-1, bandwidth=0.02))
     # yield ('KDE03', KDEyML(classifier, val_split=5, n_jobs=-1, bandwidth=0.03))
     # yield ('KDE04', KDEyML(classifier, val_split=5, n_jobs=-1, bandwidth=0.04))
@@ -144,7 +144,7 @@ def run_experiment():
     }
 
     pbar = tqdm(experiment_prot(), total=experiment_prot.total())
-    for train, test in pbar:
+    for train, test, q_rel_prevs in pbar:
         Xtr, ytr, score_tr = train
         Xte, yte, score_te = test
 
@@ -154,7 +154,7 @@ def run_experiment():
         else:
             train_col = LabelledCollection(Xtr, ytr, classes=classifier_trained.classes_)
 
-        idx, max_score_round_robin = get_idx_score_matrix_per_class(train_col, score_tr)
+        # idx, max_score_round_robin = get_idx_score_matrix_per_class(train_col, score_tr)
 
         if method_name not in ['Naive', 'NaiveQuery'] and not method_name.endswith('-s'):
             quantifier.fit(train_col, val_split=train_col, fit_classifier=False)
@@ -167,11 +167,11 @@ def run_experiment():
             if method_name == 'NaiveQuery':
                 train_k = reduceAtK(train_col, k)
                 quantifier.fit(train_k)
-            elif method_name.endswith('-s'):
-                test_min_score = score_te[k] if k < len(score_te) else score_te[-1]
-                train_k = reduce_train_at_score(train_col, idx, max_score_round_robin, test_min_score)
-                print(f'{k=}, {test_min_score=} {len(train_k)=}')
-                quantifier.fit(train_k, val_split=train_k, fit_classifier=False)
+            # elif method_name.endswith('-s'):
+            #     test_min_score = score_te[k] if k < len(score_te) else score_te[-1]
+            #     train_k = reduce_train_at_score(train_col, idx, max_score_round_robin, test_min_score)
+            #     print(f'{k=}, {test_min_score=} {len(train_k)=}')
+            #     quantifier.fit(train_k, val_split=train_k, fit_classifier=False)
 
             estim_prev = quantifier.quantify(test_k.instances)
 
@@ -245,6 +245,7 @@ if __name__ == '__main__':
         train_data_path = join(data_home, class_name, 'FULL', 'classifier_training.json')  # <-------- fixed classifier
         classifier_path = join('classifiers', 'FULL', f'classifier_{class_name}.pkl')  # <------------ fixed classifier
         test_rankings_path = join(data_home, 'testRanking_Results.json')
+        test_query_prevs_path = join(data_home, 'prevelance_vectors_judged_docs.json')
         results_home = join('results'+exp_posfix, class_name, data_size)
 
         tfidf, classifier_trained = qp.util.pickled_resource(classifier_path, train_classifier, train_data_path)
@@ -252,6 +253,7 @@ if __name__ == '__main__':
         experiment_prot = RetrievedSamples(
             class_home,
             test_rankings_path,
+            test_query_prevs_path,
             vectorizer=tfidf,
             class_name=class_name,
             classes=classifier_trained.classes_
@@ -273,7 +275,7 @@ if __name__ == '__main__':
                 table_mrae.add(benchmark=benchmark_name(class_name, k), method=method_name, v=results['mrae'][k])
 
-        Table.LatexPDF(f'./latex{exp_posfix}/{class_name}{exp_posfix}.pdf', tables=tables_mrae)
+        Table.LatexPDF(f'./latex{exp_posfix}/{class_name}{exp_posfix}.pdf', tables=tables_mrae)
 
diff --git a/Retrieval/tmp.py b/Retrieval/tmp.py
index 6e10f87..dde3a4c 100644
--- a/Retrieval/tmp.py
+++ b/Retrieval/tmp.py
@@ -2,26 +2,15 @@ import pandas as pd
 
 from os.path import join
-from Retrieval.commons import load_json_sample
 from quapy.data import LabelledCollection
 
 
 data_home = 'data'
 CLASS_NAME = 'continent'
 datasize = '100K'
 
-file_path = join(data_home, CLASS_NAME, datasize, 'training_Query-84Sample-200SPLIT.json')
+file_path = join(data_home, 'prevelance_vectors_judged_docs.json')
 
-text, classes = load_json_sample(file_path, CLASS_NAME)
+df = pd.read_json(file_path)
 
-
-data = LabelledCollection(text, classes)
-print(data.classes_)
-print(data.prevalence())
-print('done')
-
-test_ranking_path = join(data_home, 'testRanking_Results.json')
-# obj = json.load(open(test_ranking_path))
-
-
-df = pd.read_json(test_ranking_path)
-print('done')
\ No newline at end of file
+pd.set_option('display.max_columns', None)
+print(df)
\ No newline at end of file
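
---
A minimal sketch (not part of the commit) of how the new third element of the yielded tuple could be consumed downstream. The file name and the '<class_name>_proportions' column layout are taken from the patch above; the uniform dummy estimate and the numpy-based error computation are illustrative assumptions, standing in for the estim_prev produced by quantifier.quantify(test_k.instances) in experiments.py:

# sketch: compare a hypothetical estimated prevalence vector against the
# prevalence of the judged relevant documents for one query
from os.path import join

import numpy as np
import pandas as pd

data_home = 'data'
class_name = 'continent'

# the same file the patch passes to RetrievedSamples as test_query_prevs_path
prevs_df = pd.read_json(join(data_home, 'prevelance_vectors_judged_docs.json'))

# pick one query id and fetch its judged-relevant prevalence vector,
# mirroring the lookup added in RetrievedSamples
query_id = int(prevs_df.id.values[0])
q_rel_prevs = np.asarray(
    prevs_df.loc[prevs_df.id == query_id][class_name + '_proportions'].values[0]
)

# hypothetical estimate (uniform over the classes); in experiments.py this
# would come from quantifier.quantify(test_k.instances)
estim_prev = np.full(len(q_rel_prevs), 1 / len(q_rel_prevs))

# absolute error between the estimate and the judged-relevant prevalence
print(f'{query_id=} AE={np.abs(estim_prev - q_rel_prevs).mean():.4f}')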