From e1f6149f71c4d87e0845fd548f93238f4026fe56 Mon Sep 17 00:00:00 2001
From: Alejandro Moreo
Date: Thu, 2 May 2024 10:59:16 +0200
Subject: [PATCH] adding the prevalence of the judged relevant documents for
 each query

---
 Retrieval/commons.py     | 11 +++++++++--
 Retrieval/experiments.py | 24 +++++++++++++-----------
 Retrieval/tmp.py         | 19 ++++---------------
 3 files changed, 26 insertions(+), 28 deletions(-)

diff --git a/Retrieval/commons.py b/Retrieval/commons.py
index 26a34b4..9100b22 100644
--- a/Retrieval/commons.py
+++ b/Retrieval/commons.py
@@ -46,12 +46,14 @@ class RetrievedSamples:
     def __init__(self,
                  class_home: str,
                  test_rankings_path: str,
+                 test_query_prevs_path: str,
                  vectorizer,
                  class_name,
                  classes=None
                  ):
         self.class_home = class_home
         self.test_rankings_df = pd.read_json(test_rankings_path)
+        self.test_query_prevs_df = pd.read_json(test_query_prevs_path)
         self.vectorizer = vectorizer
         self.class_name = class_name
         self.classes=classes
@@ -75,10 +77,14 @@ class RetrievedSamples:
 
             # loads the test sample
             query_id = self._get_query_id_from_path(file)
-            sel_df = tests_df[tests_df.qid == int(query_id)]
+            sel_df = tests_df[tests_df.qid == query_id]
             Xte, yte, score_te = get_text_label_score(sel_df, class_name, vectorizer, filter_classes=self.classes)
 
-            yield (Xtr, ytr, score_tr), (Xte, yte, score_te)
+            # gets the prevalence of all judged relevant documents for the query
+            df = self.test_query_prevs_df
+            q_rel_prevs = df.loc[df.id == query_id][class_name+'_proportions'].values[0]
+
+            yield (Xtr, ytr, score_tr), (Xte, yte, score_te), q_rel_prevs
 
     def _list_queries(self):
         return sorted(glob(join(self.class_home, 'training_Query*200SPLIT.json')))
@@ -109,6 +115,7 @@ class RetrievedSamples:
         qid = path
         qid = qid[:qid.index(posfix)]
         qid = qid[qid.index(prefix) + len(prefix):]
+        qid = int(qid)
         return qid
 
 
diff --git a/Retrieval/experiments.py b/Retrieval/experiments.py
index cd7088f..3da29d3 100644
--- a/Retrieval/experiments.py
+++ b/Retrieval/experiments.py
@@ -61,8 +61,8 @@ def methods(classifier, class_name):
     yield ('CC', ClassifyAndCount(classifier))
     # yield ('PCC', PCC(classifier))
     # yield ('ACC', ACC(classifier, val_split=5, n_jobs=-1))
-    yield ('PACC', PACC(classifier, val_split=5, n_jobs=-1))
-    yield ('PACC-s', PACC(classifier, val_split=5, n_jobs=-1))
+    yield ('PACC2', PACC(classifier, val_split=5, n_jobs=-1))
+    # yield ('PACC-s', PACC(classifier, val_split=5, n_jobs=-1))
     # yield ('EMQ', EMQ(classifier, exact_train_prev=True))
     # yield ('EMQ-Platt', EMQ(classifier, exact_train_prev=True, recalib='platt'))
     # yield ('EMQh', EMQ(classifier, exact_train_prev=False))
@@ -80,7 +80,7 @@ def methods(classifier, class_name):
     yield ('KDEy-ML', KDEyML(classifier, val_split=5, n_jobs=-1, bandwidth=kde_param[class_name]))
     # yield ('KDE005', KDEyML(classifier, val_split=5, n_jobs=-1, bandwidth=0.005))
     yield ('KDE01', KDEyML(classifier, val_split=5, n_jobs=-1, bandwidth=0.01))
-    yield ('KDE01-s', KDEyML(classifier, val_split=5, n_jobs=-1, bandwidth=0.01))
+    # yield ('KDE01-s', KDEyML(classifier, val_split=5, n_jobs=-1, bandwidth=0.01))
     # yield ('KDE02', KDEyML(classifier, val_split=5, n_jobs=-1, bandwidth=0.02))
     # yield ('KDE03', KDEyML(classifier, val_split=5, n_jobs=-1, bandwidth=0.03))
     # yield ('KDE04', KDEyML(classifier, val_split=5, n_jobs=-1, bandwidth=0.04))
@@ -144,7 +144,7 @@ def run_experiment():
     }
 
     pbar = tqdm(experiment_prot(), total=experiment_prot.total())
-    for train, test in pbar:
+    for train, test, q_rel_prevs in pbar:
         Xtr, ytr, score_tr = train
         Xte, yte, score_te = test
 
@@ -154,7 +154,7 @@ def run_experiment():
         else:
             train_col = LabelledCollection(Xtr, ytr, classes=classifier_trained.classes_)
 
-        idx, max_score_round_robin = get_idx_score_matrix_per_class(train_col, score_tr)
+        # idx, max_score_round_robin = get_idx_score_matrix_per_class(train_col, score_tr)
 
         if method_name not in ['Naive', 'NaiveQuery'] and not method_name.endswith('-s'):
             quantifier.fit(train_col, val_split=train_col, fit_classifier=False)
@@ -167,11 +167,11 @@ def run_experiment():
             if method_name == 'NaiveQuery':
                 train_k = reduceAtK(train_col, k)
                 quantifier.fit(train_k)
-            elif method_name.endswith('-s'):
-                test_min_score = score_te[k] if k < len(score_te) else score_te[-1]
-                train_k = reduce_train_at_score(train_col, idx, max_score_round_robin, test_min_score)
-                print(f'{k=}, {test_min_score=} {len(train_k)=}')
-                quantifier.fit(train_k, val_split=train_k, fit_classifier=False)
+            # elif method_name.endswith('-s'):
+            #     test_min_score = score_te[k] if k < len(score_te) else score_te[-1]
+            #     train_k = reduce_train_at_score(train_col, idx, max_score_round_robin, test_min_score)
+            #     print(f'{k=}, {test_min_score=} {len(train_k)=}')
+            #     quantifier.fit(train_k, val_split=train_k, fit_classifier=False)
 
             estim_prev = quantifier.quantify(test_k.instances)
 
@@ -245,6 +245,7 @@ if __name__ == '__main__':
         train_data_path = join(data_home, class_name, 'FULL', 'classifier_training.json')  # <-------- fixed classifier
         classifier_path = join('classifiers', 'FULL', f'classifier_{class_name}.pkl')  # <------------ fixed classifier
         test_rankings_path = join(data_home, 'testRanking_Results.json')
+        test_query_prevs_path = join(data_home, 'prevelance_vectors_judged_docs.json')
         results_home = join('results'+exp_posfix, class_name, data_size)
 
         tfidf, classifier_trained = qp.util.pickled_resource(classifier_path, train_classifier, train_data_path)
@@ -252,6 +253,7 @@ if __name__ == '__main__':
         experiment_prot = RetrievedSamples(
             class_home,
             test_rankings_path,
+            test_query_prevs_path,
             vectorizer=tfidf,
             class_name=class_name,
             classes=classifier_trained.classes_
@@ -273,7 +275,7 @@ if __name__ == '__main__':
                 table_mrae.add(benchmark=benchmark_name(class_name, k), method=method_name, v=results['mrae'][k])
 
-        Table.LatexPDF(f'./latex{exp_posfix}/{class_name}{exp_posfix}.pdf', tables=tables_mrae)
+        Table.LatexPDF(f'./latex{exp_posfix}/{class_name}{exp_posfix}.pdf', tables=tables_mrae)
 
diff --git a/Retrieval/tmp.py b/Retrieval/tmp.py
index 6e10f87..dde3a4c 100644
--- a/Retrieval/tmp.py
+++ b/Retrieval/tmp.py
@@ -2,26 +2,15 @@ import pandas as pd
 
 from os.path import join
-from Retrieval.commons import load_json_sample
 from quapy.data import LabelledCollection
 
 
 data_home = 'data'
 CLASS_NAME = 'continent'
 datasize = '100K'
 
-file_path = join(data_home, CLASS_NAME, datasize, 'training_Query-84Sample-200SPLIT.json')
+file_path = join(data_home, 'prevelance_vectors_judged_docs.json')
 
-text, classes = load_json_sample(file_path, CLASS_NAME)
+df = pd.read_json(file_path)
 
-
-data = LabelledCollection(text, classes)
-print(data.classes_)
-print(data.prevalence())
-print('done')
-
-test_ranking_path = join(data_home, 'testRanking_Results.json')
-# obj = json.load(open(test_ranking_path))
-
-
-df = pd.read_json(test_ranking_path)
-print('done')
\ No newline at end of file
+pd.set_option('display.max_columns', None)
+print(df)
\ No newline at end of file
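
---
A minimal sketch (not part of the commit) of how the new third element of the yielded tuple could be consumed downstream. The file name and the '<class_name>_proportions' column layout are taken from the patch above; the uniform dummy estimate and the numpy-based error computation are illustrative assumptions, standing in for the estim_prev produced by quantifier.quantify(test_k.instances) in experiments.py:

# sketch: compare a hypothetical estimated prevalence vector against the
# prevalence of the judged relevant documents for one query
from os.path import join

import numpy as np
import pandas as pd

data_home = 'data'
class_name = 'continent'

# the same file the patch passes to RetrievedSamples as test_query_prevs_path
prevs_df = pd.read_json(join(data_home, 'prevelance_vectors_judged_docs.json'))

# pick one query id and fetch its judged-relevant prevalence vector,
# mirroring the lookup added in RetrievedSamples
query_id = int(prevs_df.id.values[0])
q_rel_prevs = np.asarray(
    prevs_df.loc[prevs_df.id == query_id][class_name + '_proportions'].values[0]
)

# hypothetical estimate (uniform over the classes); in experiments.py this
# would come from quantifier.quantify(test_k.instances)
estim_prev = np.full(len(q_rel_prevs), 1 / len(q_rel_prevs))

# absolute error between the estimate and the judged-relevant prevalence
print(f'{query_id=} AE={np.abs(estim_prev - q_rel_prevs).mean():.4f}')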