From 5da9fa0b097fffc9b90c583e651591a4a227a116 Mon Sep 17 00:00:00 2001
From: Alejandro Moreo <alejandro.moreo@isti.cnr.it>
Date: Fri, 12 Jul 2024 09:41:40 +0200
Subject: [PATCH] adding lequa2024 datasets and example 4b

---
 examples/4b.lequa2024_experiments.py    | 52 ++++++++++++++++++
 quapy/data/{_lequa2022.py => _lequa.py} | 63 +++++++++++++++++++++-
 quapy/data/datasets.py                  | 72 ++++++++++++++++++++++++-
 quapy/error.py                          | 32 +++++++++++
 4 files changed, 216 insertions(+), 3 deletions(-)
 create mode 100644 examples/4b.lequa2024_experiments.py
 rename quapy/data/{_lequa2022.py => _lequa.py} (71%)

diff --git a/examples/4b.lequa2024_experiments.py b/examples/4b.lequa2024_experiments.py
new file mode 100644
index 0000000..4ce5a43
--- /dev/null
+++ b/examples/4b.lequa2024_experiments.py
@@ -0,0 +1,52 @@
+import numpy as np
+from sklearn.linear_model import LogisticRegression
+import quapy as qp
+import quapy.functional as F
+from quapy.data.datasets import LEQUA2024_SAMPLE_SIZE, fetch_lequa2024
+from quapy.evaluation import evaluation_report
+from quapy.method.aggregative import KDEyML
+from quapy.model_selection import GridSearchQ
+import pandas as pd
+
+"""
+This example shows how to use the LeQua datasets (new in v0.1.9). For more information about the datasets, and the
+LeQua competition itself, check:
+https://lequa2024.github.io/index (the site of the competition)
+"""
+
+# there are 4 tasks: T1 (binary), T2 (multiclass), T3 (ordinal), T4 (binary - covariate & prior shift)
+task = 'T2'
+
+# set the sample size in the environment. The sample size is task-dependent and can be consulted by doing:
+qp.environ['SAMPLE_SIZE'] = LEQUA2024_SAMPLE_SIZE[task]
+qp.environ['N_JOBS'] = -1
+
+# the fetch method returns a training set (an instance of LabelledCollection; for task T3, pass merge_T3=True to
+# obtain it as a single collection) and two generators: one for the validation set and another for the test sets.
+# These generators are both instances of classes that extend AbstractProtocol (i.e., classes that implement sampling
+# generation procedures) and, in particular, are instances of SamplesFromDir, a protocol that simply iterates over
+# pre-generated samples (those provided for the competition) stored in a directory.
+training, val_generator, test_generator = fetch_lequa2024(task=task)
+
+# define the quantifier
+quantifier = KDEyML(classifier=LogisticRegression())
+
+# model selection
+param_grid = {
+    'classifier__C': np.logspace(-3, 3, 7),          # classifier-dependent: inverse of regularization strength
+    'classifier__class_weight': ['balanced', None],  # classifier-dependent: weights of each class
+    'bandwidth': np.linspace(0.01, 0.2, 20)          # quantifier-dependent: bandwidth of the kernel
+}
+model_selection = GridSearchQ(quantifier, param_grid, protocol=val_generator, error='mrae', refit=False, verbose=True)
+quantifier = model_selection.fit(training)
+
+# evaluation
+report = evaluation_report(quantifier, protocol=test_generator, error_metrics=['mae', 'mrae'], verbose=True)
+
+# printing results
+pd.set_option('display.expand_frame_repr', False)
+report['estim-prev'] = report['estim-prev'].map(F.strprev)
+print(report)
+
+print('Averaged values:')
+print(report.mean(numeric_only=True))  # average the error columns only; the prevalence columns are not numeric
diff --git a/quapy/data/_lequa2022.py b/quapy/data/_lequa.py
similarity index 71%
rename from quapy/data/_lequa2022.py
rename to quapy/data/_lequa.py
index 449eab6..e162f4c 100644
--- a/quapy/data/_lequa2022.py
+++ b/quapy/data/_lequa.py
@@ -4,6 +4,8 @@ import numpy as np
 import os
 
 from quapy.protocol import AbstractProtocol
+from quapy.data import LabelledCollection
+
 
 DEV_SAMPLES = 1000
 TEST_SAMPLES = 5000
@@ -12,6 +14,13 @@ ERROR_TOL = 1E-3
 
 
 def load_category_map(path):
+    """
+    Loads the category map, i.e., a mapping of numerical ids of labels with a human readable name.
+
+    :param path: path to the label map file
+    :return: a dictionary cat2code (i.e., cat2code[cat_name] gives access to the category id) and a list code2cat (i.e.,
+        code2cat[cat_id] gives access to the category name)
+    """
     cat2code = {}
     with open(path, 'rt') as fin:
         for line in fin:
@@ -22,6 +31,16 @@ def load_category_map(path):
 
 
 def load_raw_documents(path):
+    """
+    Loads raw documents. In case the sample is unlabelled,
+    the labels returned are None
+
+    :param path: path to the data sample containing the raw documents
+    :return: a tuple with the documents (a list of `n` strings) and
+        the labels (a np.ndarray of shape `(n,)` if the sample is labelled,
+        or None if the sample is unlabelled), with `n` the number of instances in the sample
+        (250 for T2A, 1000 for T2B)
+    """
     df = pd.read_csv(path)
     documents = list(df["text"].values)
     labels = None
@@ -30,7 +49,16 @@ def load_raw_documents(path):
     return documents, labels
 
 
-def load_vector_documents(path):
+def load_vector_documents_2022(path):
+    """
+    Loads vectorized documents. In case the sample is unlabelled,
+    the labels returned are None
+
+    :param path: path to the data sample containing the vectorized documents
+    :return: a tuple with the documents (np.ndarray of shape `(n,300)`) and the labels (a np.ndarray of shape `(n,)` if
+        the sample is labelled, or None if the sample is unlabelled), with `n` the number of instances in the sample
+        (250 for T1A, 1000 for T1B)
+    """
     D = pd.read_csv(path).to_numpy(dtype=float)
     labelled = D.shape[1] == 301
     if labelled:
@@ -40,6 +68,25 @@ def load_vector_documents(path):
     return X, y
 
 
+def load_vector_documents_2024(path):
+    """
+    Reads a LeQua 2024 sample of vectorized documents from a csv file. Unlabelled samples
+    yield None in place of the labels.
+
+    :param path: path to the csv file containing the vectorized documents of the sample
+    :return: a tuple `(X, y)` where `X` is the np.ndarray with the document vectors (shape `(n,256)`) and `y` is a
+        np.ndarray of shape `(n,)` with the integer labels, or None if the sample is unlabelled; `n` is the number
+        of instances in the sample (250 for T1 and T4, 1000 for T2, and 200 for T3)
+    """
+    data = pd.read_csv(path).to_numpy(dtype=float)
+    n_columns = data.shape[1]
+    if n_columns != 257:
+        # no leading label column: every column is a feature
+        return data, None
+    # labelled sample: the first column holds the label, the remaining 256 the features
+    return data[:, 1:], data[:, 0].astype(int).flatten()
+
+
 class SamplesFromDir(AbstractProtocol):
 
     def __init__(self, path_dir:str, ground_truth_path:str, load_fn):
@@ -53,6 +100,20 @@ class SamplesFromDir(AbstractProtocol):
             yield sample, prevalence
 
 
+class LabelledCollectionsFromDir(AbstractProtocol):
+    """Protocol that loads and yields one LabelledCollection per entry of a ground-truth prevalence file."""
+    def __init__(self, path_dir:str, ground_truth_path:str, load_fn):
+        self.path_dir = path_dir
+        self.load_fn = load_fn
+        self.true_prevs = pd.read_csv(ground_truth_path, index_col=0)
+
+    def __call__(self):
+        # only the index (the sample ids) is needed here; the prevalence values of each row are not used
+        for sample_id in self.true_prevs.index:
+            collection_path = os.path.join(self.path_dir, f'{sample_id}.txt')
+            yield LabelledCollection.load(path=collection_path, loader_func=self.load_fn)
+
+
 class ResultSubmission:
 
     def __init__(self):
diff --git a/quapy/data/datasets.py b/quapy/data/datasets.py
index 1daea64..451651c 100644
--- a/quapy/data/datasets.py
+++ b/quapy/data/datasets.py
@@ -85,6 +85,8 @@ LEQUA2022_VECTOR_TASKS = ['T1A', 'T1B']
 LEQUA2022_TEXT_TASKS = ['T2A', 'T2B']
 LEQUA2022_TASKS = LEQUA2022_VECTOR_TASKS + LEQUA2022_TEXT_TASKS
 
+LEQUA2024_TASKS = ['T1', 'T2', 'T3', 'T4']
+
 _TXA_SAMPLE_SIZE = 250
 _TXB_SAMPLE_SIZE = 1000
 
@@ -99,6 +101,13 @@ LEQUA2022_SAMPLE_SIZE = {
     'multiclass': _TXB_SAMPLE_SIZE
 }
 
+LEQUA2024_SAMPLE_SIZE = {
+    'T1': 250,
+    'T2': 1000,
+    'T3': 200,
+    'T4': 250,
+}
+
 
 def fetch_reviews(dataset_name, tfidf=False, min_df=None, data_home=None, pickle=False) -> Dataset:
     """
@@ -806,7 +815,7 @@ def fetch_lequa2022(task, data_home=None):
         that return a series of samples stored in a directory which are labelled by prevalence.
     """
 
-    from quapy.data._lequa2022 import load_raw_documents, load_vector_documents, SamplesFromDir
+    from quapy.data._lequa import load_raw_documents, load_vector_documents_2022, SamplesFromDir
 
     assert task in LEQUA2022_TASKS, \
         f'Unknown task {task}. Valid ones are {LEQUA2022_TASKS}'
@@ -833,7 +842,7 @@ def fetch_lequa2022(task, data_home=None):
         download_unzip_and_remove(lequa_dir, URL_TEST_PREV)
 
     if task in ['T1A', 'T1B']:
-        load_fn = load_vector_documents
+        load_fn = load_vector_documents_2022
     elif task in ['T2A', 'T2B']:
         load_fn = load_raw_documents
 
@@ -851,6 +860,65 @@ def fetch_lequa2022(task, data_home=None):
     return train, val_gen, test_gen
 
 
+def fetch_lequa2024(task, data_home=None, merge_T3=False):
+    """
+    Loads the data of one task of the LeQua 2024 competition, downloading it from Zenodo
+    (record 11661820) if the task folder is not yet present in the local data directory.
+
+    :param task: a string representing the task name; valid ones are T1, T2, T3, and T4
+    :param data_home: directory where the `lequa2024` folder is created (default: `get_quapy_home()`)
+    :param merge_T3: only applies to T3, whose training data is given as a collection of samples; if True,
+        all training samples are joined into a single LabelledCollection (default False)
+    :return: a tuple `(train, val_gen, test_gen)` where `train` is a LabelledCollection (for T3 with
+        `merge_T3=False`, it is instead a LabelledCollectionsFromDir generating the training samples),
+        and `val_gen` and `test_gen` are instances of SamplesFromDir
+    """
+    from quapy.data._lequa import load_vector_documents_2024, SamplesFromDir, LabelledCollectionsFromDir
+
+    assert task in LEQUA2024_TASKS, \
+        f'Unknown task {task}. Valid ones are {LEQUA2024_TASKS}'
+
+    if data_home is None:
+        data_home = get_quapy_home()
+
+    LEQUA2024_ZENODO = 'https://zenodo.org/records/11661820'  # v3, last one with labels
+    URL_TRAINDEV=f'{LEQUA2024_ZENODO}/files/{task}.train_dev.zip'
+    URL_TEST=f'{LEQUA2024_ZENODO}/files/{task}.test.zip'
+    URL_TEST_PREV=f'{LEQUA2024_ZENODO}/files/{task}.test_prevalences.zip'
+
+    lequa_dir = join(data_home, 'lequa2024')
+    os.makedirs(lequa_dir, exist_ok=True)
+
+    def download_unzip_and_remove(unzipped_path, url):
+        tmp_path = join(lequa_dir, task + '_tmp.zip')
+        download_file_if_not_exists(url, tmp_path)
+        with zipfile.ZipFile(tmp_path) as file:
+            file.extractall(unzipped_path)
+        os.remove(tmp_path)
+
+    if not os.path.exists(join(lequa_dir, task)):
+        for url in (URL_TRAINDEV, URL_TEST, URL_TEST_PREV):
+            download_unzip_and_remove(lequa_dir, url)
+
+    load_fn = load_vector_documents_2024
+    public_dir = join(lequa_dir, task, 'public')
+    val_gen = SamplesFromDir(
+        join(public_dir, 'dev_samples'), join(public_dir, 'dev_prevalences.txt'), load_fn=load_fn)
+    test_gen = SamplesFromDir(
+        join(public_dir, 'test_samples'), join(public_dir, 'test_prevalences.txt'), load_fn=load_fn)
+
+    if task != 'T3':
+        train = LabelledCollection.load(join(public_dir, 'training_data.txt'), loader_func=load_fn)
+        return train, val_gen, test_gen
+
+    # T3 ships its training data as a directory of samples (plus a prevalence file) instead of a single file
+    train_gen = LabelledCollectionsFromDir(
+        join(public_dir, 'training_samples'), join(public_dir, 'training_prevalences.txt'), load_fn=load_fn)
+    train = LabelledCollection.join(*train_gen()) if merge_T3 else train_gen
+    return train, val_gen, test_gen
+
+
+
 def fetch_IFCB(single_sample_train=True, for_model_selection=False, data_home=None):
     """
     Loads the IFCB dataset for quantification from `Zenodo <https://zenodo.org/records/10036244>`_ (for more
diff --git a/quapy/error.py b/quapy/error.py
index 3e21333..f867d5c 100644
--- a/quapy/error.py
+++ b/quapy/error.py
@@ -285,6 +285,36 @@ def mnrae(prevs, prevs_hat, eps=None):
     return nrae(prevs, prevs_hat, eps).mean()
 
 
+def nmd(prevs, prevs_hat):
+    """
+    Computes the Normalized Match Distance, i.e., the Match Distance multiplied by the factor `1/(n-1)`, with `n`
+    the number of classes, to guarantee the measure ranges between 0 (best prediction) and 1 (worst prediction).
+
+    :param prevs: array-like of shape `(n_classes,)` or `(n_instances, n_classes)`  with the true prevalence values
+    :param prevs_hat: array-like of shape `(n_classes,)` or `(n_instances, n_classes)` with the predicted prevalence values
+    :return: float in [0,1]
+    """
+    n = np.asarray(prevs).shape[-1]  # `prevs` is documented as array-like, so it may be a plain list
+    return (1./(n-1))*np.mean(match_distance(prevs, prevs_hat))
+
+
+def md(prevs, prevs_hat, ERROR_TOL=1E-3):
+    """
+    Computes the Match Distance, assuming that the cost of mistaking class i with class i+1 is always 1.
+
+    :param prevs: array-like of shape `(n_classes,)` or `(n_instances, n_classes)`  with the true prevalence values
+    :param prevs_hat: array-like of shape `(n_classes,)` or `(n_instances, n_classes)` with the predicted prevalence values
+    :param ERROR_TOL: relative tolerance used when checking that the values in `prevs_hat` sum up to 1
+    :return: float (one value per row, i.e., an array of shape `(n_instances,)`, when the inputs are 2-dimensional)
+    """
+    cum_true = np.cumsum(prevs, axis=-1)
+    cum_pred = np.cumsum(prevs_hat, axis=-1)
+    total_mass_ok = np.isclose(cum_pred[..., -1], 1.0, rtol=ERROR_TOL)
+    assert np.all(total_mass_ok), \
+        'arg error in match_distance: the array does not represent a valid distribution'
+    return np.abs(cum_true - cum_pred)[..., :-1].sum(axis=-1)
+
+
 def smooth(prevs, eps):
     """ Smooths a prevalence distribution with :math:`\\epsilon` (`eps`) as:
     :math:`\\underline{p}(y)=\\frac{\\epsilon+p(y)}{\\epsilon|\\mathcal{Y}|+
@@ -328,3 +358,5 @@ normalized_absolute_error = nae
 normalized_relative_absolute_error = nrae
 mean_normalized_absolute_error = mnae
 mean_normalized_relative_absolute_error = mnrae
+normalized_match_distance = nmd
+match_distance = md