preparing baselines
This commit is contained in:
parent
6c5bd674ea
commit
fd1ab667ac
|
@ -1,15 +1,20 @@
|
||||||
|
import zipfile
|
||||||
|
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
import os
|
import os
|
||||||
from os.path import join
|
from os.path import join
|
||||||
|
|
||||||
|
import quapy as qp
|
||||||
from scripts.data import load_vector_documents
|
from scripts.data import load_vector_documents
|
||||||
|
|
||||||
from quapy.data import LabelledCollection
|
from quapy.data import LabelledCollection
|
||||||
from quapy.protocol import AbstractProtocol
|
from quapy.protocol import AbstractProtocol
|
||||||
|
from quapy.util import download_file_if_not_exists
|
||||||
|
|
||||||
LEQUA2024_TASKS = ['T1', 'T2', 'T3', 'T4']
|
LEQUA2024_TASKS = ['T1', 'T2', 'T3', 'T4']
|
||||||
|
|
||||||
|
LEQUA2024_ZENODO = 'https://zenodo.org/record/11091067' # v2, no ground truth for test yet
|
||||||
|
|
||||||
|
|
||||||
class LabelledCollectionsFromDir(AbstractProtocol):
|
class LabelledCollectionsFromDir(AbstractProtocol):
|
||||||
|
|
||||||
|
@ -25,34 +30,35 @@ class LabelledCollectionsFromDir(AbstractProtocol):
|
||||||
yield lc
|
yield lc
|
||||||
|
|
||||||
|
|
||||||
def fetch_lequa2024(task, data_home='./data', merge_T3=False):
|
def fetch_lequa2024(task, data_home=None, merge_T3=False):
|
||||||
|
|
||||||
from quapy.data._lequa2022 import SamplesFromDir
|
from quapy.data._lequa2022 import SamplesFromDir
|
||||||
|
|
||||||
assert task in LEQUA2024_TASKS, \
|
assert task in LEQUA2024_TASKS, \
|
||||||
f'Unknown task {task}. Valid ones are {LEQUA2024_TASKS}'
|
f'Unknown task {task}. Valid ones are {LEQUA2024_TASKS}'
|
||||||
|
|
||||||
# if data_home is None:
|
if data_home is None:
|
||||||
# data_home = get_quapy_home()
|
data_home = qp.util.get_quapy_home()
|
||||||
|
|
||||||
lequa_dir = data_home
|
lequa_dir = data_home
|
||||||
|
|
||||||
# URL_TRAINDEV=f'https://zenodo.org/record/6546188/files/{task}.train_dev.zip'
|
URL_TRAINDEV=f'{LEQUA2024_ZENODO}/files/{task}.train_dev.zip'
|
||||||
# URL_TEST=f'https://zenodo.org/record/6546188/files/{task}.test.zip'
|
URL_TEST=f'{LEQUA2024_ZENODO}/files/{task}.test.zip'
|
||||||
# URL_TEST_PREV=f'https://zenodo.org/record/6546188/files/{task}.test_prevalences.zip'
|
# URL_TEST_PREV=f'{LEQUA2024_ZENODO}/files/{task}.test_prevalences.zip'
|
||||||
|
|
||||||
# lequa_dir = join(data_home, 'lequa2024')
|
lequa_dir = join(data_home, 'lequa2024')
|
||||||
# os.makedirs(lequa_dir, exist_ok=True)
|
os.makedirs(lequa_dir, exist_ok=True)
|
||||||
|
|
||||||
# def download_unzip_and_remove(unzipped_path, url):
|
def download_unzip_and_remove(unzipped_path, url):
|
||||||
# tmp_path = join(lequa_dir, task + '_tmp.zip')
|
tmp_path = join(lequa_dir, task + '_tmp.zip')
|
||||||
# download_file_if_not_exists(url, tmp_path)
|
download_file_if_not_exists(url, tmp_path)
|
||||||
# with zipfile.ZipFile(tmp_path) as file:
|
with zipfile.ZipFile(tmp_path) as file:
|
||||||
# file.extractall(unzipped_path)
|
file.extractall(unzipped_path)
|
||||||
# os.remove(tmp_path)
|
os.remove(tmp_path)
|
||||||
|
|
||||||
# if not os.path.exists(join(lequa_dir, task)):
|
if not os.path.exists(join(lequa_dir, task)):
|
||||||
# download_unzip_and_remove(lequa_dir, URL_TRAINDEV)
|
download_unzip_and_remove(lequa_dir, URL_TRAINDEV)
|
||||||
# download_unzip_and_remove(lequa_dir, URL_TEST)
|
download_unzip_and_remove(lequa_dir, URL_TEST)
|
||||||
# download_unzip_and_remove(lequa_dir, URL_TEST_PREV)
|
# download_unzip_and_remove(lequa_dir, URL_TEST_PREV)
|
||||||
|
|
||||||
load_fn = load_vector_documents
|
load_fn = load_vector_documents
|
||||||
|
|
|
@ -1,11 +1,12 @@
|
||||||
import argparse
|
import argparse
|
||||||
import pickle
|
import pickle
|
||||||
import os
|
import os
|
||||||
|
import sys
|
||||||
from os.path import join
|
from os.path import join
|
||||||
from sklearn.linear_model import LogisticRegression as LR
|
from sklearn.linear_model import LogisticRegression as LR
|
||||||
|
|
||||||
from scripts.constants import SAMPLE_SIZE
|
from scripts.constants import SAMPLE_SIZE
|
||||||
from LeQua2024._lequa2024 import LEQUA2024_TASKS, fetch_lequa2024
|
from LeQua2024._lequa2024 import LEQUA2024_TASKS, fetch_lequa2024, LEQUA2024_ZENODO
|
||||||
from quapy.method.aggregative import *
|
from quapy.method.aggregative import *
|
||||||
from quapy.method.non_aggregative import MaximumLikelihoodPrevalenceEstimation as MLPE
|
from quapy.method.non_aggregative import MaximumLikelihoodPrevalenceEstimation as MLPE
|
||||||
import quapy.functional as F
|
import quapy.functional as F
|
||||||
|
@ -18,11 +19,11 @@ BINARY_TASKS = ['T1', 'T4']
|
||||||
|
|
||||||
|
|
||||||
def new_cls():
|
def new_cls():
|
||||||
return LR(n_jobs=-1)
|
return LR(n_jobs=-1, max_iter=3000)
|
||||||
|
|
||||||
|
|
||||||
lr_params = {
|
lr_params = {
|
||||||
'C': np.logspace(-3, 3, 7),
|
'C': np.logspace(-4, 4, 9),
|
||||||
'class_weight': [None, 'balanced']
|
'class_weight': [None, 'balanced']
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -37,19 +38,23 @@ def baselines():
|
||||||
|
|
||||||
yield CC(new_cls()), "CC", q_params
|
yield CC(new_cls()), "CC", q_params
|
||||||
yield ACC(new_cls()), "ACC", q_params
|
yield ACC(new_cls()), "ACC", q_params
|
||||||
yield PCC(new_cls()), "PCC", q_params
|
# yield PCC(new_cls()), "PCC", q_params
|
||||||
yield PACC(new_cls()), "PACC", q_params
|
# yield PACC(new_cls()), "PACC", q_params
|
||||||
yield EMQ(CalibratedClassifierCV(new_cls())), "SLD-Platt", wrap_params(wrap_params(lr_params, 'estimator'), 'classifier')
|
# yield EMQ(CalibratedClassifierCV(new_cls())), "SLD-Platt", wrap_params(wrap_params(lr_params, 'estimator'), 'classifier')
|
||||||
yield EMQ(new_cls()), "SLD", q_params
|
# yield EMQ(new_cls()), "SLD", q_params
|
||||||
# yield EMQ(new_cls()), "SLD-BCTS", {**q_params, 'recalib': ['bcts'], 'val_split': [5]}
|
# yield EMQ(new_cls()), "SLD-BCTS", {**q_params, 'recalib': ['bcts'], 'val_split': [5]}
|
||||||
yield MLPE(), "MLPE", None
|
# yield MLPE(), "MLPE", None
|
||||||
if args.task in BINARY_TASKS:
|
# if args.task in BINARY_TASKS:
|
||||||
yield MS2(new_cls()), "MedianSweep2", q_params
|
# yield MS2(new_cls()), "MedianSweep2", q_params
|
||||||
|
|
||||||
|
|
||||||
def main(args):
|
def main(args):
|
||||||
|
|
||||||
models_path = qp.util.create_if_not_exist(join('./models', args.task))
|
models_path = qp.util.create_if_not_exist(join('./models', args.task))
|
||||||
|
hyperparams_path = qp.util.create_if_not_exist(join('./hyperparams', args.task))
|
||||||
|
|
||||||
|
os.makedirs(models_path, exist_ok=True)
|
||||||
|
os.makedirs(hyperparams_path, exist_ok=True)
|
||||||
|
|
||||||
qp.environ['SAMPLE_SIZE'] = SAMPLE_SIZE[args.task]
|
qp.environ['SAMPLE_SIZE'] = SAMPLE_SIZE[args.task]
|
||||||
|
|
||||||
|
@ -63,12 +68,15 @@ def main(args):
|
||||||
for quantifier, q_name, param_grid in baselines():
|
for quantifier, q_name, param_grid in baselines():
|
||||||
|
|
||||||
model_path = os.path.join(models_path, q_name + '.pkl')
|
model_path = os.path.join(models_path, q_name + '.pkl')
|
||||||
|
modelparams_path = os.path.join(hyperparams_path, q_name + '.pkl')
|
||||||
if os.path.exists(model_path):
|
if os.path.exists(model_path):
|
||||||
print(f'a pickle for {q_name} exists already in {model_path}; skipping!')
|
print(f'a pickle for {q_name} exists already in {model_path}; skipping!')
|
||||||
continue
|
continue
|
||||||
|
|
||||||
|
print(f'starting model fitting for {q_name}')
|
||||||
|
|
||||||
if param_grid is not None:
|
if param_grid is not None:
|
||||||
quantifier = qp.model_selection.GridSearchQ(
|
optimizer = qp.model_selection.GridSearchQ(
|
||||||
quantifier,
|
quantifier,
|
||||||
param_grid,
|
param_grid,
|
||||||
protocol=gen_val,
|
protocol=gen_val,
|
||||||
|
@ -77,13 +85,14 @@ def main(args):
|
||||||
verbose=True,
|
verbose=True,
|
||||||
n_jobs=-1
|
n_jobs=-1
|
||||||
).fit(train)
|
).fit(train)
|
||||||
print(f'{q_name} got MRAE={quantifier.best_score_:.5f} (hyper-params: {quantifier.best_params_})')
|
print(f'{q_name} got MRAE={optimizer.best_score_:.5f} (hyper-params: {optimizer.best_params_})')
|
||||||
quantifier = quantifier.best_model()
|
quantifier = optimizer.best_model()
|
||||||
else:
|
else:
|
||||||
quantifier.fit(train)
|
quantifier.fit(train)
|
||||||
|
|
||||||
print(f'saving model in {model_path}')
|
print(f'saving model in {model_path}')
|
||||||
pickle.dump(quantifier, open(model_path, 'wb'), protocol=pickle.HIGHEST_PROTOCOL)
|
pickle.dump(quantifier, open(model_path, 'wb'), protocol=pickle.HIGHEST_PROTOCOL)
|
||||||
|
pickle.dump(quantifier.get_params(), open(modelparams_path, 'wb'), protocol=pickle.HIGHEST_PROTOCOL)
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
|
@ -92,7 +101,8 @@ if __name__ == '__main__':
|
||||||
parser.add_argument('task', metavar='TASK', type=str, choices=LEQUA2024_TASKS,
|
parser.add_argument('task', metavar='TASK', type=str, choices=LEQUA2024_TASKS,
|
||||||
help=f'Code of the task; available ones are {LEQUA2024_TASKS}')
|
help=f'Code of the task; available ones are {LEQUA2024_TASKS}')
|
||||||
parser.add_argument('datadir', metavar='DATA-PATH', type=str,
|
parser.add_argument('datadir', metavar='DATA-PATH', type=str,
|
||||||
help='Path of the directory containing LeQua 2024 data', default='./data')
|
help='Path of the directory containing LeQua 2024 data (default is ./data)',
|
||||||
|
default='./data')
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
|
|
||||||
main(args)
|
main(args)
|
||||||
|
|
|
@ -21,7 +21,8 @@ def main(args):
|
||||||
# check the number of samples
|
# check the number of samples
|
||||||
nsamples = len(glob(os.path.join(args.samples, f'*.txt')))
|
nsamples = len(glob(os.path.join(args.samples, f'*.txt')))
|
||||||
if nsamples not in {constants.DEV_SAMPLES, constants.TEST_SAMPLES}:
|
if nsamples not in {constants.DEV_SAMPLES, constants.TEST_SAMPLES}:
|
||||||
print(f'Warning: The number of samples (.txt) in {args.samples} does neither coincide with the expected number of '
|
print(f'Warning: The number of samples (.txt) in {args.samples} '
|
||||||
|
f'does neither coincide with the expected number of '
|
||||||
f'dev samples ({constants.DEV_SAMPLES}) nor with the expected number of '
|
f'dev samples ({constants.DEV_SAMPLES}) nor with the expected number of '
|
||||||
f'test samples ({constants.TEST_SAMPLES}).')
|
f'test samples ({constants.TEST_SAMPLES}).')
|
||||||
|
|
||||||
|
@ -39,7 +40,7 @@ def main(args):
|
||||||
|
|
||||||
|
|
||||||
if __name__=='__main__':
|
if __name__=='__main__':
|
||||||
parser = argparse.ArgumentParser(description='LeQua2022 prediction script')
|
parser = argparse.ArgumentParser(description='LeQua2024 prediction script')
|
||||||
parser.add_argument('model', metavar='MODEL-PATH', type=str,
|
parser.add_argument('model', metavar='MODEL-PATH', type=str,
|
||||||
help='Path of saved model')
|
help='Path of saved model')
|
||||||
parser.add_argument('samples', metavar='SAMPLES-PATH', type=str,
|
parser.add_argument('samples', metavar='SAMPLES-PATH', type=str,
|
||||||
|
|
|
@ -7,29 +7,47 @@ set -x
|
||||||
# T3: ordinal (n=5)
|
# T3: ordinal (n=5)
|
||||||
# T4: covariate shift (n=2)
|
# T4: covariate shift (n=2)
|
||||||
|
|
||||||
# --------------------------------------------------------------------------------
|
# preparing the environment: downloads the official LeQua 2024 scripts (only once and for all)
|
||||||
# DEV
|
SCRIPTS_URL=https://github.com/HLT-ISTI/LeQua2024_scripts/archive/refs/heads/main.zip
|
||||||
# --------------------------------------------------------------------------------
|
|
||||||
|
# download and unzip the LeQua 2024 scripts
|
||||||
|
|
||||||
|
if [ ! -d "./scripts" ]; then
|
||||||
|
echo "Downloading $SCRIPTS_URL into ./scripts"
|
||||||
|
wget -qO scripts.zip "$SCRIPTS_URL"
|
||||||
|
unzip -q scripts.zip
|
||||||
|
mv "LeQua2024_scripts-main" "scripts"
|
||||||
|
rm scripts.zip
|
||||||
|
echo "[Done]"
|
||||||
|
else
|
||||||
|
echo "LeQua 2024 scripts already exists"
|
||||||
|
fi
|
||||||
|
|
||||||
mkdir results
|
|
||||||
|
|
||||||
for task in T1 T2 T3 T4 ; do
|
for task in T1 T2 T3 T4 ; do
|
||||||
|
|
||||||
echo "" > results/$task.txt
|
|
||||||
|
|
||||||
PYTHONPATH=.:scripts/:.. python3 baselines.py $task data/
|
PYTHONPATH=.:scripts/:.. python3 baselines.py $task data/
|
||||||
|
|
||||||
SAMPLES=data/$task/public/dev_samples
|
TEST_SAMPLES=data/lequa2024/$task/public/test_samples
|
||||||
TRUEPREVS=data/$task/public/dev_prevalences.txt
|
|
||||||
|
|
||||||
for pickledmodel in models/$task/*.pkl ; do
|
for pickledmodel in models/$task/*.pkl ; do
|
||||||
model=$(basename "$pickledmodel" .pkl)
|
model=$(basename "$pickledmodel" .pkl)
|
||||||
|
PREDICTIONS=predictions/$model/task_"${task: -1}".csv
|
||||||
PREDICTIONS=predictions/$task/$model.txt
|
PYTHONPATH=.:scripts/:.. python3 predict.py models/$task/$model.pkl $TEST_SAMPLES $PREDICTIONS
|
||||||
|
|
||||||
PYTHONPATH=.:scripts/:.. python3 predict.py models/$task/$model.pkl $SAMPLES $PREDICTIONS
|
|
||||||
PYTHONPATH=.:scripts/:.. python3 scripts/evaluate.py $task $TRUEPREVS $PREDICTIONS >> results/$task.txt
|
|
||||||
done
|
done
|
||||||
|
|
||||||
done
|
done
|
||||||
|
|
||||||
|
echo "generating submission files for codalab in folder ./submission_files"
|
||||||
|
|
||||||
|
mkdir -p submission_files
|
||||||
|
|
||||||
|
for modelname in predictions/* ; do
|
||||||
|
modelname=$(basename "$modelname")
|
||||||
|
submission_name=submission_files/$modelname.zip
|
||||||
|
rm -f $submission_name
|
||||||
|
echo "zipping results for $modelname"
|
||||||
|
zip -j $submission_name predictions/$modelname/task_*.csv
|
||||||
|
done
|
||||||
|
|
||||||
|
echo "[Done]"
|
||||||
|
|
Loading…
Reference in New Issue