From fd1ab667ac97c2a5cda9e524b489b079c42cf4da Mon Sep 17 00:00:00 2001
From: Alejandro Moreo <alejandro.moreo@isti.cnr.it>
Date: Thu, 2 May 2024 09:28:50 +0200
Subject: [PATCH] preparing baselines
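
Re-enable the automatic download of the LeQua 2024 data from Zenodo
(record 11091067, v2; no test prevalences are published yet), with
data_home now defaulting to the quapy home directory. Widen the logistic
regression grid (C in logspace(-4, 4), max_iter=3000), temporarily restrict
the baselines to CC and ACC, store the selected hyper-parameters next to
each pickled model, point the prediction loop at the test samples, and zip
the per-task predictions into CodaLab submission files.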

---
 LeQua2024/_lequa2024.py    | 42 ++++++++++++++++++++----------------
 LeQua2024/baselines.py     | 38 ++++++++++++++++++++------------
 LeQua2024/predict.py       |  5 +++--
 LeQua2024/run_baselines.sh | 44 +++++++++++++++++++++++++++-----------
 4 files changed, 82 insertions(+), 47 deletions(-)
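
Usage sketch for the re-enabled fetcher (not part of the patch): it assumes
fetch_lequa2024 mirrors quapy's fetch_lequa2022, i.e., that it returns a
training LabelledCollection plus validation/test sample generators.

    from LeQua2024._lequa2024 import fetch_lequa2024

    # with data_home=None the data is downloaded once into the quapy home dir
    train, gen_val, gen_test = fetch_lequa2024(task='T1')
    print(train.prevalence())  # training prevalence vector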

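The model-selection block in baselines.py now keeps the fitted GridSearchQ
in a separate variable instead of overwriting the quantifier. A sketch of
the resulting pattern (error=qp.error.mrae is inferred from the MRAE
printout; arguments elided in the hunk are omitted):

    import quapy as qp

    optimizer = qp.model_selection.GridSearchQ(
        quantifier, param_grid,
        protocol=gen_val,      # validation sample generator
        error=qp.error.mrae,   # inferred from the printed best score
        verbose=True, n_jobs=-1
    ).fit(train)
    print(f'best MRAE={optimizer.best_score_:.5f} ({optimizer.best_params_})')
    quantifier = optimizer.best_model()
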
diff --git a/LeQua2024/_lequa2024.py b/LeQua2024/_lequa2024.py
index 285549d..b4b6c82 100644
--- a/LeQua2024/_lequa2024.py
+++ b/LeQua2024/_lequa2024.py
@@ -1,15 +1,20 @@
+import zipfile
+
 import pandas as pd
 import os
 from os.path import join
 
+import quapy as qp
 from scripts.data import load_vector_documents
 
 from quapy.data import LabelledCollection
 from quapy.protocol import AbstractProtocol
-
+from quapy.util import download_file_if_not_exists
 
 LEQUA2024_TASKS = ['T1', 'T2', 'T3', 'T4']
 
+LEQUA2024_ZENODO = 'https://zenodo.org/record/11091067'  # v2, no ground truth for test yet
+
 
 class LabelledCollectionsFromDir(AbstractProtocol):
 
@@ -25,34 +30,35 @@ class LabelledCollectionsFromDir(AbstractProtocol):
             yield lc
 
 
-def fetch_lequa2024(task, data_home='./data', merge_T3=False):
+def fetch_lequa2024(task, data_home=None, merge_T3=False):
 
     from quapy.data._lequa2022 import SamplesFromDir
 
     assert task in LEQUA2024_TASKS, \
         f'Unknown task {task}. Valid ones are {LEQUA2024_TASKS}'
 
-    # if data_home is None:
-    #     data_home = get_quapy_home()
+    if data_home is None:
+        data_home = qp.util.get_quapy_home()
+
     lequa_dir = data_home
 
-    # URL_TRAINDEV=f'https://zenodo.org/record/6546188/files/{task}.train_dev.zip'
-    # URL_TEST=f'https://zenodo.org/record/6546188/files/{task}.test.zip'
-    # URL_TEST_PREV=f'https://zenodo.org/record/6546188/files/{task}.test_prevalences.zip'
+    URL_TRAINDEV=f'{LEQUA2024_ZENODO}/files/{task}.train_dev.zip'
+    URL_TEST=f'{LEQUA2024_ZENODO}/files/{task}.test.zip'
+    # URL_TEST_PREV=f'{LEQUA2024_ZENODO}/files/{task}.test_prevalences.zip'
 
-    # lequa_dir = join(data_home, 'lequa2024')
-    # os.makedirs(lequa_dir, exist_ok=True)
+    lequa_dir = join(data_home, 'lequa2024')
+    os.makedirs(lequa_dir, exist_ok=True)
 
-    # def download_unzip_and_remove(unzipped_path, url):
-    #     tmp_path = join(lequa_dir, task + '_tmp.zip')
-    #     download_file_if_not_exists(url, tmp_path)
-    #     with zipfile.ZipFile(tmp_path) as file:
-    #         file.extractall(unzipped_path)
-    #     os.remove(tmp_path)
+    def download_unzip_and_remove(unzipped_path, url):
+        tmp_path = join(lequa_dir, task + '_tmp.zip')
+        download_file_if_not_exists(url, tmp_path)
+        with zipfile.ZipFile(tmp_path) as file:
+            file.extractall(unzipped_path)
+        os.remove(tmp_path)
 
-    # if not os.path.exists(join(lequa_dir, task)):
-    #     download_unzip_and_remove(lequa_dir, URL_TRAINDEV)
-    #     download_unzip_and_remove(lequa_dir, URL_TEST)
+    if not os.path.exists(join(lequa_dir, task)):
+        download_unzip_and_remove(lequa_dir, URL_TRAINDEV)
+        download_unzip_and_remove(lequa_dir, URL_TEST)
     #     download_unzip_and_remove(lequa_dir, URL_TEST_PREV)
 
     load_fn = load_vector_documents
diff --git a/LeQua2024/baselines.py b/LeQua2024/baselines.py
index ff4c33c..400f298 100644
--- a/LeQua2024/baselines.py
+++ b/LeQua2024/baselines.py
@@ -1,11 +1,12 @@
 import argparse
 import pickle
 import os
+import sys
 from os.path import join
 from sklearn.linear_model import LogisticRegression as LR
 
 from scripts.constants import SAMPLE_SIZE
-from LeQua2024._lequa2024 import LEQUA2024_TASKS, fetch_lequa2024
+from LeQua2024._lequa2024 import LEQUA2024_TASKS, fetch_lequa2024, LEQUA2024_ZENODO
 from quapy.method.aggregative import *
 from quapy.method.non_aggregative import MaximumLikelihoodPrevalenceEstimation as MLPE
 import quapy.functional as F
@@ -18,11 +19,11 @@ BINARY_TASKS = ['T1', 'T4']
 
 
 def new_cls():
-    return LR(n_jobs=-1)
+    return LR(n_jobs=-1, max_iter=3000)
 
 
 lr_params = {
-    'C': np.logspace(-3, 3, 7),
+    'C': np.logspace(-4, 4, 9),
     'class_weight': [None, 'balanced']
 }
 
@@ -37,19 +38,23 @@ def baselines():
 
     yield CC(new_cls()), "CC", q_params
     yield ACC(new_cls()), "ACC", q_params
-    yield PCC(new_cls()), "PCC", q_params
-    yield PACC(new_cls()), "PACC", q_params
-    yield EMQ(CalibratedClassifierCV(new_cls())), "SLD-Platt", wrap_params(wrap_params(lr_params, 'estimator'), 'classifier')
-    yield EMQ(new_cls()), "SLD", q_params
+    # yield PCC(new_cls()), "PCC", q_params
+    # yield PACC(new_cls()), "PACC", q_params
+    # yield EMQ(CalibratedClassifierCV(new_cls())), "SLD-Platt", wrap_params(wrap_params(lr_params, 'estimator'), 'classifier')
+    # yield EMQ(new_cls()), "SLD", q_params
     # yield EMQ(new_cls()), "SLD-BCTS", {**q_params, 'recalib': ['bcts'], 'val_split': [5]}
-    yield MLPE(), "MLPE", None
-    if args.task in BINARY_TASKS:
-        yield MS2(new_cls()), "MedianSweep2", q_params
+    # yield MLPE(), "MLPE", None
+    # if args.task in BINARY_TASKS:
+    #     yield MS2(new_cls()), "MedianSweep2", q_params
 
 
 def main(args):
 
     models_path = qp.util.create_if_not_exist(join('./models', args.task))
+    hyperparams_path = qp.util.create_if_not_exist(join('./hyperparams', args.task))
+
+    # create_if_not_exist already calls os.makedirs(..., exist_ok=True), so
+    # both directories are guaranteed to exist at this point
 
     qp.environ['SAMPLE_SIZE'] = SAMPLE_SIZE[args.task]
 
@@ -63,12 +68,15 @@ def main(args):
     for quantifier, q_name, param_grid in baselines():
 
         model_path = os.path.join(models_path, q_name + '.pkl')
+        modelparams_path = os.path.join(hyperparams_path, q_name + '.pkl')
         if os.path.exists(model_path):
             print(f'a pickle for {q_name} exists already in {model_path}; skipping!')
             continue
 
+        print(f'starting model fitting for {q_name}')
+
         if param_grid is not None:
-            quantifier = qp.model_selection.GridSearchQ(
+            optimizer = qp.model_selection.GridSearchQ(
                 quantifier,
                 param_grid,
                 protocol=gen_val,
@@ -77,13 +85,14 @@ def main(args):
                 verbose=True,
                 n_jobs=-1
             ).fit(train)
-            print(f'{q_name} got MRAE={quantifier.best_score_:.5f} (hyper-params: {quantifier.best_params_})')
-            quantifier = quantifier.best_model()
+            print(f'{q_name} got MRAE={optimizer.best_score_:.5f} (hyper-params: {optimizer.best_params_})')
+            quantifier = optimizer.best_model()
         else:
             quantifier.fit(train)
 
         print(f'saving model in {model_path}')
         pickle.dump(quantifier, open(model_path, 'wb'), protocol=pickle.HIGHEST_PROTOCOL)
+        pickle.dump(quantifier.get_params(), open(modelparams_path, 'wb'), protocol=pickle.HIGHEST_PROTOCOL)
 
 
 if __name__ == '__main__':
@@ -92,7 +101,8 @@ if __name__ == '__main__':
     parser.add_argument('task', metavar='TASK', type=str, choices=LEQUA2024_TASKS,
                         help=f'Code of the task; available ones are {LEQUA2024_TASKS}')
     parser.add_argument('datadir', metavar='DATA-PATH', type=str,
-                        help='Path of the directory containing LeQua 2024 data', default='./data')
+                        help='Path of the directory containing LeQua 2024 data (default is ./data)',
+                        nargs='?', default='./data')
     args = parser.parse_args()
 
     main(args)
diff --git a/LeQua2024/predict.py b/LeQua2024/predict.py
index ee6fc0a..d1129b6 100644
--- a/LeQua2024/predict.py
+++ b/LeQua2024/predict.py
@@ -21,7 +21,8 @@ def main(args):
     # check the number of samples
     nsamples = len(glob(os.path.join(args.samples, f'*.txt')))
     if nsamples not in {constants.DEV_SAMPLES, constants.TEST_SAMPLES}:
-        print(f'Warning: The number of samples (.txt) in {args.samples} does neither coincide with the expected number of '
+        print(f'Warning: The number of samples (.txt) in {args.samples} '
+              f'coincides neither with the expected number of '
               f'dev samples ({constants.DEV_SAMPLES}) nor with the expected number of '
               f'test samples ({constants.TEST_SAMPLES}).')
 
@@ -39,7 +40,7 @@ def main(args):
 
 
 if __name__=='__main__':
-    parser = argparse.ArgumentParser(description='LeQua2022 prediction script')
+    parser = argparse.ArgumentParser(description='LeQua2024 prediction script')
     parser.add_argument('model', metavar='MODEL-PATH', type=str,
                         help='Path of saved model')
     parser.add_argument('samples', metavar='SAMPLES-PATH', type=str,
diff --git a/LeQua2024/run_baselines.sh b/LeQua2024/run_baselines.sh
index d4fc761..92e1abd 100755
--- a/LeQua2024/run_baselines.sh
+++ b/LeQua2024/run_baselines.sh
@@ -7,29 +7,47 @@ set -x
 # T3: ordinal (n=5)
 # T4: covariate shift (n=2)
 
-# --------------------------------------------------------------------------------
-# DEV
-# --------------------------------------------------------------------------------
+# prepare the environment: download the official LeQua 2024 scripts (done only once)
+SCRIPTS_URL=https://github.com/HLT-ISTI/LeQua2024_scripts/archive/refs/heads/main.zip
+
+# download and unzip the LeQua 2024 scripts
+
+if [ ! -d "./scripts" ]; then
+    echo "Downloading $SCRIPTS_URL into ./scripts"
+    wget -qO scripts.zip "$SCRIPTS_URL"
+    unzip -q scripts.zip
+    mv "LeQua2024_scripts-main" "scripts"
+    rm scripts.zip
+    echo "[Done]"
+else
+    echo "LeQua 2024 scripts already exist in ./scripts"
+fi
 
-mkdir results
 
 for task in T1 T2 T3 T4 ; do
 
-  echo "" > results/$task.txt
-
   PYTHONPATH=.:scripts/:.. python3 baselines.py $task data/
 
-  SAMPLES=data/$task/public/dev_samples
-  TRUEPREVS=data/$task/public/dev_prevalences.txt
+  TEST_SAMPLES=data/lequa2024/$task/public/test_samples
 
   for pickledmodel in models/$task/*.pkl ; do
     model=$(basename "$pickledmodel" .pkl)
-
-    PREDICTIONS=predictions/$task/$model.txt
-
-    PYTHONPATH=.:scripts/:.. python3 predict.py models/$task/$model.pkl $SAMPLES $PREDICTIONS
-    PYTHONPATH=.:scripts/:.. python3 scripts/evaluate.py $task $TRUEPREVS $PREDICTIONS >> results/$task.txt
+    PREDICTIONS=predictions/$model/task_"${task: -1}".csv
+    PYTHONPATH=.:scripts/:.. python3 predict.py models/$task/$model.pkl $TEST_SAMPLES $PREDICTIONS
   done
 
 done
 
+echo "generating submission files for CodaLab in folder ./submission_files"
+
+mkdir -p submission_files
+
+for modelname in predictions/* ; do
+  modelname=$(basename "$modelname")
+  submission_name=submission_files/$modelname.zip
+  rm -f "$submission_name"
+  echo "zipping results for $modelname"
+  zip -j "$submission_name" "predictions/$modelname"/task_*.csv
+done
+
+echo "[Done]"
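
-- 
A hedged inspection sketch (not part of the patch): loading a pickled
baseline and the hyper-parameters stored alongside it. Paths follow the
scripts above; the sample file name '0.txt' and the (features, labels)
return of load_vector_documents are assumptions.

    import pickle
    from scripts.data import load_vector_documents

    # one pickled baseline and its stored hyper-parameters
    model = pickle.load(open('models/T1/CC.pkl', 'rb'))
    params = pickle.load(open('hyperparams/T1/CC.pkl', 'rb'))
    print(params)

    # quantify a single test sample (test samples carry no labels,
    # hence the discarded second return value)
    X, _ = load_vector_documents('data/lequa2024/T1/public/test_samples/0.txt')
    print(model.quantify(X))  # estimated class prevalence vector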