diff --git a/LeQua2024/_lequa2024.py b/LeQua2024/_lequa2024.py
index 285549d..b4b6c82 100644
--- a/LeQua2024/_lequa2024.py
+++ b/LeQua2024/_lequa2024.py
@@ -1,15 +1,20 @@
+import zipfile
+
 import pandas as pd
 import os
 from os.path import join
+import quapy as qp
 from scripts.data import load_vector_documents
 from quapy.data import LabelledCollection
 from quapy.protocol import AbstractProtocol
-
+from quapy.util import download_file_if_not_exists
 
 LEQUA2024_TASKS = ['T1', 'T2', 'T3', 'T4']
 
+LEQUA2024_ZENODO = 'https://zenodo.org/record/11091067'  # v2, no ground truth for test yet
+
 
 class LabelledCollectionsFromDir(AbstractProtocol):
 
@@ -25,34 +30,35 @@ class LabelledCollectionsFromDir(AbstractProtocol):
         yield lc
 
 
-def fetch_lequa2024(task, data_home='./data', merge_T3=False):
+def fetch_lequa2024(task, data_home=None, merge_T3=False):
 
     from quapy.data._lequa2022 import SamplesFromDir
 
     assert task in LEQUA2024_TASKS, \
         f'Unknown task {task}. Valid ones are {LEQUA2024_TASKS}'
 
-    # if data_home is None:
-    #     data_home = get_quapy_home()
+    if data_home is None:
+        data_home = qp.util.get_quapy_home()
 
-    # URL_TRAINDEV=f'https://zenodo.org/record/6546188/files/{task}.train_dev.zip'
-    # URL_TEST=f'https://zenodo.org/record/6546188/files/{task}.test.zip'
-    # URL_TEST_PREV=f'https://zenodo.org/record/6546188/files/{task}.test_prevalences.zip'
+    URL_TRAINDEV = f'{LEQUA2024_ZENODO}/files/{task}.train_dev.zip'
+    URL_TEST = f'{LEQUA2024_ZENODO}/files/{task}.test.zip'
+    # URL_TEST_PREV = f'{LEQUA2024_ZENODO}/files/{task}.test_prevalences.zip'
 
-    # lequa_dir = join(data_home, 'lequa2024')
-    # os.makedirs(lequa_dir, exist_ok=True)
+    lequa_dir = join(data_home, 'lequa2024')
+    os.makedirs(lequa_dir, exist_ok=True)
 
-    # def download_unzip_and_remove(unzipped_path, url):
-    #     tmp_path = join(lequa_dir, task + '_tmp.zip')
-    #     download_file_if_not_exists(url, tmp_path)
-    #     with zipfile.ZipFile(tmp_path) as file:
-    #         file.extractall(unzipped_path)
-    #     os.remove(tmp_path)
+    def download_unzip_and_remove(unzipped_path, url):
+        tmp_path = join(lequa_dir, task + '_tmp.zip')
+        download_file_if_not_exists(url, tmp_path)
+        with zipfile.ZipFile(tmp_path) as file:
+            file.extractall(unzipped_path)
+        os.remove(tmp_path)
 
-    # if not os.path.exists(join(lequa_dir, task)):
-    #     download_unzip_and_remove(lequa_dir, URL_TRAINDEV)
-    #     download_unzip_and_remove(lequa_dir, URL_TEST)
+    if not os.path.exists(join(lequa_dir, task)):
+        download_unzip_and_remove(lequa_dir, URL_TRAINDEV)
+        download_unzip_and_remove(lequa_dir, URL_TEST)
         # download_unzip_and_remove(lequa_dir, URL_TEST_PREV)
 
     load_fn = load_vector_documents
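A minimal usage sketch of the rewritten fetch_lequa2024 (not part of the patch; it assumes the function keeps the (train, gen_val, gen_test) return convention of quapy's fetch_lequa2022, and that the generators yield (sample, prevalence) pairs as in LeQua 2022):

    from LeQua2024._lequa2024 import fetch_lequa2024

    # the first call downloads T2.train_dev.zip and T2.test.zip from Zenodo
    # into <quapy_home>/lequa2024; subsequent calls reuse the local copy
    train, gen_val, gen_test = fetch_lequa2024('T2')

    train.stats()  # prints number of instances, classes, and prevalences

    for sample, prevalence in gen_val():
        ...  # one validation sample at a time, with its true prevalence vector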
diff --git a/LeQua2024/baselines.py b/LeQua2024/baselines.py
index ff4c33c..400f298 100644
--- a/LeQua2024/baselines.py
+++ b/LeQua2024/baselines.py
@@ -1,11 +1,12 @@
 import argparse
 import pickle
 import os
+import sys
 from os.path import join
 from sklearn.linear_model import LogisticRegression as LR
 from scripts.constants import SAMPLE_SIZE
-from LeQua2024._lequa2024 import LEQUA2024_TASKS, fetch_lequa2024
+from LeQua2024._lequa2024 import LEQUA2024_TASKS, fetch_lequa2024, LEQUA2024_ZENODO
 from quapy.method.aggregative import *
 from quapy.method.non_aggregative import MaximumLikelihoodPrevalenceEstimation as MLPE
 import quapy.functional as F
@@ -18,11 +19,11 @@ BINARY_TASKS = ['T1', 'T4']
 
 
 def new_cls():
-    return LR(n_jobs=-1)
+    return LR(n_jobs=-1, max_iter=3000)
 
 
 lr_params = {
-    'C': np.logspace(-3, 3, 7),
+    'C': np.logspace(-4, 4, 9),
     'class_weight': [None, 'balanced']
 }
 
@@ -37,19 +38,23 @@ def baselines():
     yield CC(new_cls()), "CC", q_params
     yield ACC(new_cls()), "ACC", q_params
-    yield PCC(new_cls()), "PCC", q_params
-    yield PACC(new_cls()), "PACC", q_params
-    yield EMQ(CalibratedClassifierCV(new_cls())), "SLD-Platt", wrap_params(wrap_params(lr_params, 'estimator'), 'classifier')
-    yield EMQ(new_cls()), "SLD", q_params
+    # yield PCC(new_cls()), "PCC", q_params
+    # yield PACC(new_cls()), "PACC", q_params
+    # yield EMQ(CalibratedClassifierCV(new_cls())), "SLD-Platt", wrap_params(wrap_params(lr_params, 'estimator'), 'classifier')
+    # yield EMQ(new_cls()), "SLD", q_params
     # yield EMQ(new_cls()), "SLD-BCTS", {**q_params, 'recalib': ['bcts'], 'val_split': [5]}
-    yield MLPE(), "MLPE", None
-    if args.task in BINARY_TASKS:
-        yield MS2(new_cls()), "MedianSweep2", q_params
+    # yield MLPE(), "MLPE", None
+    # if args.task in BINARY_TASKS:
+    #     yield MS2(new_cls()), "MedianSweep2", q_params
 
 
 def main(args):
 
     models_path = qp.util.create_if_not_exist(join('./models', args.task))
+    hyperparams_path = qp.util.create_if_not_exist(join('./hyperparams', args.task))
 
     qp.environ['SAMPLE_SIZE'] = SAMPLE_SIZE[args.task]
 
@@ -63,12 +68,15 @@ def main(args):
 
     for quantifier, q_name, param_grid in baselines():
 
         model_path = os.path.join(models_path, q_name + '.pkl')
+        modelparams_path = os.path.join(hyperparams_path, q_name + '.pkl')
         if os.path.exists(model_path):
             print(f'a pickle for {q_name} exists already in {model_path}; skipping!')
             continue
 
+        print(f'starting model fitting for {q_name}')
+
         if param_grid is not None:
-            quantifier = qp.model_selection.GridSearchQ(
+            optimizer = qp.model_selection.GridSearchQ(
                 quantifier,
                 param_grid,
                 protocol=gen_val,
@@ -77,13 +85,14 @@
                 verbose=True,
                 n_jobs=-1
             ).fit(train)
-            print(f'{q_name} got MRAE={quantifier.best_score_:.5f} (hyper-params: {quantifier.best_params_})')
-            quantifier = quantifier.best_model()
+            print(f'{q_name} got MRAE={optimizer.best_score_:.5f} (hyper-params: {optimizer.best_params_})')
+            quantifier = optimizer.best_model()
         else:
             quantifier.fit(train)
 
         print(f'saving model in {model_path}')
         pickle.dump(quantifier, open(model_path, 'wb'), protocol=pickle.HIGHEST_PROTOCOL)
+        pickle.dump(quantifier.get_params(), open(modelparams_path, 'wb'), protocol=pickle.HIGHEST_PROTOCOL)
 
 
 if __name__ == '__main__':
@@ -92,7 +101,8 @@
     parser.add_argument('task', metavar='TASK', type=str, choices=LEQUA2024_TASKS,
                         help=f'Code of the task; available ones are {LEQUA2024_TASKS}')
-    parser.add_argument('datadir', metavar='DATA-PATH', type=str,
-                        help='Path of the directory containing LeQua 2024 data', default='./data')
+    parser.add_argument('datadir', metavar='DATA-PATH', type=str, nargs='?', default='./data',
+                        help='Path of the directory containing LeQua 2024 data (default: ./data)')
 
     args = parser.parse_args()
     main(args)
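The script now pickles each model and its selected hyper-parameters separately. A minimal sketch of reading them back (not part of the patch; the task T1, the model name CC, and the variable sample are illustrative, while quantify() is quapy's standard prediction method):

    import pickle

    with open('./models/T1/CC.pkl', 'rb') as f:
        quantifier = pickle.load(f)
    with open('./hyperparams/T1/CC.pkl', 'rb') as f:
        print(pickle.load(f))  # the dict stored via quantifier.get_params()

    # given a feature matrix 'sample' such as those produced by
    # load_vector_documents, the estimated class prevalences are obtained with:
    # prevalence_estimate = quantifier.quantify(sample)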
diff --git a/LeQua2024/predict.py b/LeQua2024/predict.py
index ee6fc0a..d1129b6 100644
--- a/LeQua2024/predict.py
+++ b/LeQua2024/predict.py
@@ -21,7 +21,8 @@ def main(args):
 
     # check the number of samples
     nsamples = len(glob(os.path.join(args.samples, f'*.txt')))
     if nsamples not in {constants.DEV_SAMPLES, constants.TEST_SAMPLES}:
-        print(f'Warning: The number of samples (.txt) in {args.samples} does neither coincide with the expected number of '
+        print(f'Warning: The number of samples (.txt) in {args.samples} '
+              f'coincides neither with the expected number of '
               f'dev samples ({constants.DEV_SAMPLES}) nor with the expected number of '
               f'test samples ({constants.TEST_SAMPLES}).')
 
@@ -39,7 +40,7 @@ def main(args):
 
 if __name__=='__main__':
-    parser = argparse.ArgumentParser(description='LeQua2022 prediction script')
+    parser = argparse.ArgumentParser(description='LeQua2024 prediction script')
     parser.add_argument('model', metavar='MODEL-PATH', type=str,
                         help='Path of saved model')
     parser.add_argument('samples', metavar='SAMPLES-PATH', type=str,
diff --git a/LeQua2024/run_baselines.sh b/LeQua2024/run_baselines.sh
index d4fc761..92e1abd 100755
--- a/LeQua2024/run_baselines.sh
+++ b/LeQua2024/run_baselines.sh
@@ -7,29 +7,47 @@ set -x
 
 # T3: ordinal (n=5)
 # T4: covariate shift (n=2)
 
-# --------------------------------------------------------------------------------
-# DEV
-# --------------------------------------------------------------------------------
+# prepare the environment: download and unzip the official LeQua 2024 scripts (done only once)
+SCRIPTS_URL=https://github.com/HLT-ISTI/LeQua2024_scripts/archive/refs/heads/main.zip
+
+if [ ! -d "./scripts" ]; then
+  echo "Downloading $SCRIPTS_URL into ./scripts"
+  wget -qO scripts.zip "$SCRIPTS_URL"
+  unzip -q scripts.zip
+  mv "LeQua2024_scripts-main" "scripts"
+  rm scripts.zip
+  echo "[Done]"
+else
+  echo "LeQua 2024 scripts already exist"
+fi
 
-mkdir results
 
 for task in T1 T2 T3 T4 ; do
 
-  echo "" > results/$task.txt
 
   PYTHONPATH=.:scripts/:.. python3 baselines.py $task data/
 
-  SAMPLES=data/$task/public/dev_samples
-  TRUEPREVS=data/$task/public/dev_prevalences.txt
+  TEST_SAMPLES=data/lequa2024/$task/public/test_samples
 
   for pickledmodel in models/$task/*.pkl ; do
     model=$(basename "$pickledmodel" .pkl)
-
-    PREDICTIONS=predictions/$task/$model.txt
-
-    PYTHONPATH=.:scripts/:.. python3 predict.py models/$task/$model.pkl $SAMPLES $PREDICTIONS
-    PYTHONPATH=.:scripts/:.. python3 scripts/evaluate.py $task $TRUEPREVS $PREDICTIONS >> results/$task.txt
+    PREDICTIONS=predictions/$model/task_"${task: -1}".csv
+    PYTHONPATH=.:scripts/:.. python3 predict.py models/$task/$model.pkl $TEST_SAMPLES $PREDICTIONS
   done
 
 done
 
+echo "generating submission files for codalab in folder ./submission_files"
+
+mkdir -p submission_files
+
+for modelname in predictions/* ; do
+  modelname=$(basename "$modelname")
+  submission_name=submission_files/$modelname.zip
+  rm -f $submission_name
+  echo "zipping results for $modelname"
+  zip -j $submission_name predictions/$modelname/task_*.csv
+done
+
+echo "[Done]"
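For reference, a usage sketch of the resulting pipeline (not part of the patch; the model name CC is illustrative, paths follow the script above):

    # train and model-select all baselines for one task
    PYTHONPATH=.:scripts/:.. python3 baselines.py T1 data/

    # produce test predictions for a single pickled model
    PYTHONPATH=.:scripts/:.. python3 predict.py models/T1/CC.pkl \
        data/lequa2024/T1/public/test_samples predictions/CC/task_1.csv

    # or run everything, including the codalab submission zips
    ./run_baselines.sh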