From 5da9fa0b097fffc9b90c583e651591a4a227a116 Mon Sep 17 00:00:00 2001 From: Alejandro Moreo Date: Fri, 12 Jul 2024 09:41:40 +0200 Subject: [PATCH] adding lequa2024 datasets and example 4b --- examples/4b.lequa2024_experiments.py | 52 ++++++++++++++++++ quapy/data/{_lequa2022.py => _lequa.py} | 63 +++++++++++++++++++++- quapy/data/datasets.py | 72 ++++++++++++++++++++++++- quapy/error.py | 32 +++++++++++ 4 files changed, 216 insertions(+), 3 deletions(-) create mode 100644 examples/4b.lequa2024_experiments.py rename quapy/data/{_lequa2022.py => _lequa.py} (71%) diff --git a/examples/4b.lequa2024_experiments.py b/examples/4b.lequa2024_experiments.py new file mode 100644 index 0000000..4ce5a43 --- /dev/null +++ b/examples/4b.lequa2024_experiments.py @@ -0,0 +1,52 @@ +import numpy as np +from sklearn.linear_model import LogisticRegression +import quapy as qp +import quapy.functional as F +from quapy.data.datasets import LEQUA2024_SAMPLE_SIZE, fetch_lequa2024 +from quapy.evaluation import evaluation_report +from quapy.method.aggregative import KDEyML +from quapy.model_selection import GridSearchQ +import pandas as pd + +""" +This example shows how to use the LeQua datasets (new in v0.1.9). For more information about the datasets, and the +LeQua competition itself, check: +https://lequa2024.github.io/index (the site of the competition) +""" + +# there are 4 tasks: T1 (binary), T2 (multiclass), T3 (ordinal), T4 (binary - covariate & prior shift) +task = 'T2' + +# set the sample size in the environment. The sample size is task-dependent and can be consulted by doing: +qp.environ['SAMPLE_SIZE'] = LEQUA2024_SAMPLE_SIZE[task] +qp.environ['N_JOBS'] = -1 + +# the fetch method returns a training set (an instance of LabelledCollection) and two generators: one for the +# validation set and another for the test sets. 
These generators are both instances of classes that extend +# AbstractProtocol (i.e., classes that implement sampling generation procedures) and, in particular, are instances +# of SamplesFromDir, a protocol that simply iterates over pre-generated samples (those provided for the competition) +# stored in a directory. +training, val_generator, test_generator = fetch_lequa2024(task=task) + +# define the quantifier +quantifier = KDEyML(classifier=LogisticRegression()) + +# model selection +param_grid = { + 'classifier__C': np.logspace(-3, 3, 7), # classifier-dependent: inverse of regularization strength + 'classifier__class_weight': ['balanced', None], # classifier-dependent: weights of each class + 'bandwidth': np.linspace(0.01, 0.2, 20) # quantifier-dependent: bandwidth of the kernel +} +model_selection = GridSearchQ(quantifier, param_grid, protocol=val_generator, error='mrae', refit=False, verbose=True) +quantifier = model_selection.fit(training) + +# evaluation +report = evaluation_report(quantifier, protocol=test_generator, error_metrics=['mae', 'mrae'], verbose=True) + +# printing results +pd.set_option('display.expand_frame_repr', False) +report['estim-prev'] = report['estim-prev'].map(F.strprev) +print(report) + +print('Averaged values:') +print(report.mean()) diff --git a/quapy/data/_lequa2022.py b/quapy/data/_lequa.py similarity index 71% rename from quapy/data/_lequa2022.py rename to quapy/data/_lequa.py index 449eab6..e162f4c 100644 --- a/quapy/data/_lequa2022.py +++ b/quapy/data/_lequa.py @@ -4,6 +4,8 @@ import numpy as np import os from quapy.protocol import AbstractProtocol +from quapy.data import LabelledCollection + DEV_SAMPLES = 1000 TEST_SAMPLES = 5000 @@ -12,6 +14,13 @@ ERROR_TOL = 1E-3 def load_category_map(path): + """ + Loads the category map, i.e., a mapping of numerical ids of labels with a human readable name. 
+ + :param path: path to the label map file + :return: a dictionary cat2code (i.e., cat2code[cat_name] gives access to the category id) and a list code2cat (i.e., + code2cat[cat_id] gives access to the category name) + """ cat2code = {} with open(path, 'rt') as fin: for line in fin: @@ -22,6 +31,16 @@ def load_category_map(path): def load_raw_documents(path): + """ + Loads raw documents. In case the sample is unlabelled, + the labels returned are None + + :param path: path to the data sample containing the raw documents + :return: a tuple with the documents (a list of `n` strings) and + the labels (a np.ndarray of shape `(n,)` if the sample is labelled, + or None if the sample is unlabelled), with `n` the number of instances in the sample + (250 for T2A, 1000 for T2B) + """ df = pd.read_csv(path) documents = list(df["text"].values) labels = None @@ -30,7 +49,16 @@ def load_raw_documents(path): return documents, labels -def load_vector_documents(path): +def load_vector_documents_2022(path): + """ + Loads vectorized documents. In case the sample is unlabelled, + the labels returned are None + + :param path: path to the data sample containing the vectorized documents + :return: a tuple with the documents (np.ndarray of shape `(n,300)`) and the labels (a np.ndarray of shape `(n,)` if + the sample is labelled, or None if the sample is unlabelled), with `n` the number of instances in the sample + (250 for T1A, 1000 for T1B) + """ D = pd.read_csv(path).to_numpy(dtype=float) labelled = D.shape[1] == 301 if labelled: @@ -40,6 +68,25 @@ def load_vector_documents(path): return X, y +def load_vector_documents_2024(path): + """ + Loads vectorized documents. 
In case the sample is unlabelled, + the labels returned are None + + :param path: path to the data sample containing the vectorized documents + :return: a tuple with the documents (np.ndarray of shape `(n,256)`) and the labels (a np.ndarray of shape `(n,)` if + the sample is labelled, or None if the sample is unlabelled), with `n` the number of instances in the sample + (250 for T1 and T4, 1000 for T2, and 200 for T3) + """ + D = pd.read_csv(path).to_numpy(dtype=float) + labelled = D.shape[1] == 257 + if labelled: + X, y = D[:,1:], D[:,0].astype(int).flatten() + else: + X, y = D, None + return X, y + + class SamplesFromDir(AbstractProtocol): def __init__(self, path_dir:str, ground_truth_path:str, load_fn): @@ -53,6 +100,20 @@ class SamplesFromDir(AbstractProtocol): yield sample, prevalence +class LabelledCollectionsFromDir(AbstractProtocol): + + def __init__(self, path_dir:str, ground_truth_path:str, load_fn): + self.path_dir = path_dir + self.load_fn = load_fn + self.true_prevs = pd.read_csv(ground_truth_path, index_col=0) + + def __call__(self): + for id, prevalence in self.true_prevs.iterrows(): + collection_path = os.path.join(self.path_dir, f'{id}.txt') + lc = LabelledCollection.load(path=collection_path, loader_func=self.load_fn) + yield lc + + class ResultSubmission: def __init__(self): diff --git a/quapy/data/datasets.py b/quapy/data/datasets.py index 1daea64..451651c 100644 --- a/quapy/data/datasets.py +++ b/quapy/data/datasets.py @@ -85,6 +85,8 @@ LEQUA2022_VECTOR_TASKS = ['T1A', 'T1B'] LEQUA2022_TEXT_TASKS = ['T2A', 'T2B'] LEQUA2022_TASKS = LEQUA2022_VECTOR_TASKS + LEQUA2022_TEXT_TASKS +LEQUA2024_TASKS = ['T1', 'T2', 'T3', 'T4'] + _TXA_SAMPLE_SIZE = 250 _TXB_SAMPLE_SIZE = 1000 @@ -99,6 +101,13 @@ LEQUA2022_SAMPLE_SIZE = { 'multiclass': _TXB_SAMPLE_SIZE } +LEQUA2024_SAMPLE_SIZE = { + 'T1': 250, + 'T2': 1000, + 'T3': 200, + 'T4': 250, +} + def fetch_reviews(dataset_name, tfidf=False, min_df=None, data_home=None, pickle=False) -> Dataset: """ @@ -806,7 +815,7 @@ 
def fetch_lequa2022(task, data_home=None): that return a series of samples stored in a directory which are labelled by prevalence. """ - from quapy.data._lequa2022 import load_raw_documents, load_vector_documents, SamplesFromDir + from quapy.data._lequa import load_raw_documents, load_vector_documents_2022, SamplesFromDir assert task in LEQUA2022_TASKS, \ f'Unknown task {task}. Valid ones are {LEQUA2022_TASKS}' @@ -833,7 +842,7 @@ def fetch_lequa2022(task, data_home=None): download_unzip_and_remove(lequa_dir, URL_TEST_PREV) if task in ['T1A', 'T1B']: - load_fn = load_vector_documents + load_fn = load_vector_documents_2022 elif task in ['T2A', 'T2B']: load_fn = load_raw_documents @@ -851,6 +860,65 @@ def fetch_lequa2022(task, data_home=None): return train, val_gen, test_gen +def fetch_lequa2024(task, data_home=None, merge_T3=False): + + from quapy.data._lequa import load_vector_documents_2024, SamplesFromDir, LabelledCollectionsFromDir + + assert task in LEQUA2024_TASKS, \ + f'Unknown task {task}. 
Valid ones are {LEQUA2024_TASKS}' + + if data_home is None: + data_home = get_quapy_home() + + lequa_dir = data_home + + LEQUA2024_ZENODO = 'https://zenodo.org/records/11661820' # v3, last one with labels + + URL_TRAINDEV=f'{LEQUA2024_ZENODO}/files/{task}.train_dev.zip' + URL_TEST=f'{LEQUA2024_ZENODO}/files/{task}.test.zip' + URL_TEST_PREV=f'{LEQUA2024_ZENODO}/files/{task}.test_prevalences.zip' + + lequa_dir = join(data_home, 'lequa2024') + os.makedirs(lequa_dir, exist_ok=True) + + def download_unzip_and_remove(unzipped_path, url): + tmp_path = join(lequa_dir, task + '_tmp.zip') + download_file_if_not_exists(url, tmp_path) + with zipfile.ZipFile(tmp_path) as file: + file.extractall(unzipped_path) + os.remove(tmp_path) + + if not os.path.exists(join(lequa_dir, task)): + download_unzip_and_remove(lequa_dir, URL_TRAINDEV) + download_unzip_and_remove(lequa_dir, URL_TEST) + download_unzip_and_remove(lequa_dir, URL_TEST_PREV) + + load_fn = load_vector_documents_2024 + + val_samples_path = join(lequa_dir, task, 'public', 'dev_samples') + val_true_prev_path = join(lequa_dir, task, 'public', 'dev_prevalences.txt') + val_gen = SamplesFromDir(val_samples_path, val_true_prev_path, load_fn=load_fn) + + test_samples_path = join(lequa_dir, task, 'public', 'test_samples') + test_true_prev_path = join(lequa_dir, task, 'public', 'test_prevalences.txt') + test_gen = SamplesFromDir(test_samples_path, test_true_prev_path, load_fn=load_fn) + + if task != 'T3': + tr_path = join(lequa_dir, task, 'public', 'training_data.txt') + train = LabelledCollection.load(tr_path, loader_func=load_fn) + return train, val_gen, test_gen + else: + training_samples_path = join(lequa_dir, task, 'public', 'training_samples') + training_true_prev_path = join(lequa_dir, task, 'public', 'training_prevalences.txt') + train_gen = LabelledCollectionsFromDir(training_samples_path, training_true_prev_path, load_fn=load_fn) + if merge_T3: + train = LabelledCollection.join(*list(train_gen())) + return train, val_gen, 
test_gen + else: + return train_gen, val_gen, test_gen + + + def fetch_IFCB(single_sample_train=True, for_model_selection=False, data_home=None): """ Loads the IFCB dataset for quantification from `Zenodo `_ (for more diff --git a/quapy/error.py b/quapy/error.py index 3e21333..f867d5c 100644 --- a/quapy/error.py +++ b/quapy/error.py @@ -285,6 +285,36 @@ def mnrae(prevs, prevs_hat, eps=None): return nrae(prevs, prevs_hat, eps).mean() +def nmd(prevs, prevs_hat): + """ + Computes the Normalized Match Distance, i.e., the Match Distance multiplied by the factor + `1/(n-1)`, with `n` the number of classes, to guarantee the measure ranges between 0 (best prediction) and 1 (worst prediction). + + :param prevs: array-like of shape `(n_classes,)` or `(n_instances, n_classes)` with the true prevalence values + :param prevs_hat: array-like of shape `(n_classes,)` or `(n_instances, n_classes)` with the predicted prevalence values + :return: float in [0,1] + """ + n = prevs.shape[-1] + return (1./(n-1))*np.mean(match_distance(prevs, prevs_hat)) + + +def md(prevs, prevs_hat, ERROR_TOL=1E-3): + """ + Computes the Match Distance, under the assumption that the cost in mistaking class i with class i+1 is 1 in + all cases. 
+ + :param prevs: array-like of shape `(n_classes,)` or `(n_instances, n_classes)` with the true prevalence values + :param prevs_hat: array-like of shape `(n_classes,)` or `(n_instances, n_classes)` with the predicted prevalence values + :return: float + """ + P = np.cumsum(prevs, axis=-1) + P_hat = np.cumsum(prevs_hat, axis=-1) + assert np.all(np.isclose(P_hat[..., -1], 1.0, rtol=ERROR_TOL)), \ + 'arg error in match_distance: the array does not represent a valid distribution' + distances = np.abs(P-P_hat) + return distances[..., :-1].sum(axis=-1) + + def smooth(prevs, eps): """ Smooths a prevalence distribution with :math:`\\epsilon` (`eps`) as: :math:`\\underline{p}(y)=\\frac{\\epsilon+p(y)}{\\epsilon|\\mathcal{Y}|+ @@ -328,3 +358,5 @@ normalized_absolute_error = nae normalized_relative_absolute_error = nrae mean_normalized_absolute_error = mnae mean_normalized_relative_absolute_error = mnrae +normalized_match_distance = nmd +match_distance = md