adding lequa2024 datasets and example 4b

This commit is contained in:
Alejandro Moreo Fernandez 2024-07-12 09:41:40 +02:00
parent b4571d96c7
commit 5da9fa0b09
4 changed files with 216 additions and 3 deletions

View File

@ -0,0 +1,52 @@
import numpy as np
from sklearn.linear_model import LogisticRegression
import quapy as qp
import quapy.functional as F
from quapy.data.datasets import LEQUA2024_SAMPLE_SIZE, fetch_lequa2024
from quapy.evaluation import evaluation_report
from quapy.method.aggregative import KDEyML
from quapy.model_selection import GridSearchQ
import pandas as pd
"""
This example shows how to use the LeQua datasets (new in v0.1.9). For more information about the datasets, and the
LeQua competition itself, check:
https://lequa2024.github.io/index (the site of the competition)
"""
# there are 4 tasks: T1 (binary), T2 (multiclass), T3 (ordinal), T4 (binary - covariate & prior shift)
task = 'T2'

# set the sample size in the environment. The sample size is task-dependent and can be consulted by doing:
qp.environ['SAMPLE_SIZE'] = LEQUA2024_SAMPLE_SIZE[task]
qp.environ['N_JOBS'] = -1

# the fetch method returns a training set (an instance of LabelledCollection) and two generators: one for the
# validation set and another for the test sets. These generators are both instances of classes that extend
# AbstractProtocol (i.e., classes that implement sampling generation procedures) and, in particular, are instances
# of SamplesFromDir, a protocol that simply iterates over pre-generated samples (those provided for the competition)
# stored in a directory.
training, val_generator, test_generator = fetch_lequa2024(task=task)

# define the quantifier
quantifier = KDEyML(classifier=LogisticRegression())

# model selection
param_grid = {
    'classifier__C': np.logspace(-3, 3, 7),  # classifier-dependent: inverse of regularization strength
    'classifier__class_weight': ['balanced', None],  # classifier-dependent: weights of each class
    'bandwidth': np.linspace(0.01, 0.2, 20)  # quantifier-dependent: bandwidth of the kernel
}
model_selection = GridSearchQ(quantifier, param_grid, protocol=val_generator, error='mrae', refit=False, verbose=True)
quantifier = model_selection.fit(training)

# evaluation
report = evaluation_report(quantifier, protocol=test_generator, error_metrics=['mae', 'mrae'], verbose=True)

# printing results
pd.set_option('display.expand_frame_repr', False)
report['estim-prev'] = report['estim-prev'].map(F.strprev)
print(report)

print('Averaged values:')
# the report also holds non-numeric columns (e.g., 'estim-prev' has just been converted to strings), so the
# average is restricted to the numeric ones; otherwise recent versions of pandas raise a TypeError
print(report.mean(numeric_only=True))

View File

@ -4,6 +4,8 @@ import numpy as np
import os
from quapy.protocol import AbstractProtocol
from quapy.data import LabelledCollection
DEV_SAMPLES = 1000
TEST_SAMPLES = 5000
@ -12,6 +14,13 @@ ERROR_TOL = 1E-3
def load_category_map(path):
"""
Loads the category map, i.e., a mapping of numerical ids of labels with a human readable name.
:param path: path to the label map file
:return: a dictionary cat2code (i.e., cat2code[cat_name] gives access to the category id) and a list code2cat (i.e.,
code2cat[cat_id] gives access to the category name)
"""
cat2code = {}
with open(path, 'rt') as fin:
for line in fin:
@ -22,6 +31,16 @@ def load_category_map(path):
def load_raw_documents(path):
"""
Loads raw documents. In case the sample is unlabelled,
the labels returned are None
:param path: path to the data sample containing the raw documents
:return: a tuple with the documents (np.ndarray of strings of shape `(n,)`) and
the labels (a np.ndarray of shape `(n,)` if the sample is labelled,
or None if the sample is unlabelled), with `n` the number of instances in the sample
(250 for T1A, 1000 for T1B)
"""
df = pd.read_csv(path)
documents = list(df["text"].values)
labels = None
@ -30,7 +49,16 @@ def load_raw_documents(path):
return documents, labels
def load_vector_documents(path):
def load_vector_documents_2022(path):
"""
Loads vectorized documents. In case the sample is unlabelled,
the labels returned are None
:param path: path to the data sample containing the raw documents
:return: a tuple with the documents (np.ndarray of shape `(n,300)`) and the labels (a np.ndarray of shape `(n,)` if
the sample is labelled, or None if the sample is unlabelled), with `n` the number of instances in the sample
(250 for T1A, 1000 for T1B)
"""
D = pd.read_csv(path).to_numpy(dtype=float)
labelled = D.shape[1] == 301
if labelled:
@ -40,6 +68,25 @@ def load_vector_documents(path):
return X, y
def load_vector_documents_2024(path):
    """
    Reads a csv file of vectorized documents (LeQua 2024 format). A file with 257 columns is considered
    labelled: its first column holds the labels and the remaining 256 columns hold the features. Any other
    file is considered unlabelled and is returned as-is, with None as the labels.

    :param path: path to the data sample containing the vectorized documents
    :return: a tuple with the documents (np.ndarray of shape `(n,256)`) and the labels (a np.ndarray of shape `(n,)` if
        the sample is labelled, or None if the sample is unlabelled), with `n` the number of instances in the sample
        (250 for T1 and T4, 1000 for T2, and 200 for T3)
    """
    data = pd.read_csv(path).to_numpy(dtype=float)

    # unlabelled sample: every column is a feature
    if data.shape[1] != 257:
        return data, None

    # labelled sample: column 0 is the label, the rest are the features
    labels = data[:, 0].astype(int).ravel()
    features = data[:, 1:]
    return features, labels
class SamplesFromDir(AbstractProtocol):
def __init__(self, path_dir:str, ground_truth_path:str, load_fn):
@ -53,6 +100,20 @@ class SamplesFromDir(AbstractProtocol):
yield sample, prevalence
class LabelledCollectionsFromDir(AbstractProtocol):
    """
    Protocol that iterates over pre-generated labelled samples stored in a directory, yielding each of them as
    an instance of LabelledCollection.

    :param path_dir: path to the directory containing the samples; the sample with id `<id>` is read from the
        file `<id>.txt`
    :param ground_truth_path: path to a csv file whose first column (read as the index) holds the sample ids;
        only the ids are used here, in order to locate the sample files
    :param load_fn: function passed as `loader_func` to `LabelledCollection.load` for reading each sample file
    """

    def __init__(self, path_dir:str, ground_truth_path:str, load_fn):
        self.path_dir = path_dir
        self.load_fn = load_fn
        self.true_prevs = pd.read_csv(ground_truth_path, index_col=0)

    def __call__(self):
        # only the ids are needed; iterating over the index avoids building one (unused) Series per row
        for sample_id in self.true_prevs.index:
            collection_path = os.path.join(self.path_dir, f'{sample_id}.txt')
            yield LabelledCollection.load(path=collection_path, loader_func=self.load_fn)
class ResultSubmission:
def __init__(self):

View File

@ -85,6 +85,8 @@ LEQUA2022_VECTOR_TASKS = ['T1A', 'T1B']
# LeQua2022 tasks whose samples are raw text documents
LEQUA2022_TEXT_TASKS = ['T2A', 'T2B']
# all the LeQua2022 tasks (vector-based and text-based)
LEQUA2022_TASKS = LEQUA2022_VECTOR_TASKS + LEQUA2022_TEXT_TASKS
# task names accepted by fetch_lequa2024
LEQUA2024_TASKS = ['T1', 'T2', 'T3', 'T4']
_TXA_SAMPLE_SIZE = 250
_TXB_SAMPLE_SIZE = 1000
@ -99,6 +101,13 @@ LEQUA2022_SAMPLE_SIZE = {
'multiclass': _TXB_SAMPLE_SIZE
}
# sample size (number of instances in each sample) of every LeQua2024 task, indexed by task name
LEQUA2024_SAMPLE_SIZE = {
'T1': 250,
'T2': 1000,
'T3': 200,
'T4': 250,
}
def fetch_reviews(dataset_name, tfidf=False, min_df=None, data_home=None, pickle=False) -> Dataset:
"""
@ -806,7 +815,7 @@ def fetch_lequa2022(task, data_home=None):
that return a series of samples stored in a directory which are labelled by prevalence.
"""
from quapy.data._lequa2022 import load_raw_documents, load_vector_documents, SamplesFromDir
from quapy.data._lequa import load_raw_documents, load_vector_documents_2022, SamplesFromDir
assert task in LEQUA2022_TASKS, \
f'Unknown task {task}. Valid ones are {LEQUA2022_TASKS}'
@ -833,7 +842,7 @@ def fetch_lequa2022(task, data_home=None):
download_unzip_and_remove(lequa_dir, URL_TEST_PREV)
if task in ['T1A', 'T1B']:
load_fn = load_vector_documents
load_fn = load_vector_documents_2022
elif task in ['T2A', 'T2B']:
load_fn = load_raw_documents
@ -851,6 +860,65 @@ def fetch_lequa2022(task, data_home=None):
return train, val_gen, test_gen
def fetch_lequa2024(task, data_home=None, merge_T3=False):
    """
    Loads the LeQua 2024 data for the requested task. The data is downloaded from Zenodo and unzipped in
    `<data_home>/lequa2024` unless the directory `<data_home>/lequa2024/<task>` already exists.

    :param task: a string representing the task name; valid ones are those listed in `LEQUA2024_TASKS`
    :param data_home: specify the quapy home directory where the data will be downloaded and stored (leave
        empty to use the directory returned by `get_quapy_home()`)
    :param merge_T3: bool, only relevant for task T3, whose training data is provided as a set of samples. If
        True, all the training samples are joined in a single LabelledCollection; if False (default), a protocol
        that iterates over the training samples (each of them a LabelledCollection) is returned instead
    :return: a tuple `(train, val_gen, test_gen)` where `train` is an instance of LabelledCollection (or a
        protocol of type LabelledCollectionsFromDir for task T3 when `merge_T3=False`), and `val_gen` and
        `test_gen` are instances of SamplesFromDir that iterate over the validation and test samples, respectively
    """
    from quapy.data._lequa import load_vector_documents_2024, SamplesFromDir, LabelledCollectionsFromDir

    assert task in LEQUA2024_TASKS, \
        f'Unknown task {task}. Valid ones are {LEQUA2024_TASKS}'

    if data_home is None:
        data_home = get_quapy_home()

    LEQUA2024_ZENODO = 'https://zenodo.org/records/11661820'  # v3, last one with labels

    URL_TRAINDEV=f'{LEQUA2024_ZENODO}/files/{task}.train_dev.zip'
    URL_TEST=f'{LEQUA2024_ZENODO}/files/{task}.test.zip'
    URL_TEST_PREV=f'{LEQUA2024_ZENODO}/files/{task}.test_prevalences.zip'

    lequa_dir = join(data_home, 'lequa2024')
    os.makedirs(lequa_dir, exist_ok=True)

    def download_unzip_and_remove(unzipped_path, url):
        tmp_path = join(lequa_dir, task + '_tmp.zip')
        download_file_if_not_exists(url, tmp_path)
        try:
            with zipfile.ZipFile(tmp_path) as file:
                file.extractall(unzipped_path)
        finally:
            # the temporal file is shared by all downloads; it is always removed so that a failed extraction
            # does not leave behind a file that would be taken as already downloaded in the next call
            os.remove(tmp_path)

    if not os.path.exists(join(lequa_dir, task)):
        download_unzip_and_remove(lequa_dir, URL_TRAINDEV)
        download_unzip_and_remove(lequa_dir, URL_TEST)
        download_unzip_and_remove(lequa_dir, URL_TEST_PREV)

    load_fn = load_vector_documents_2024

    val_samples_path = join(lequa_dir, task, 'public', 'dev_samples')
    val_true_prev_path = join(lequa_dir, task, 'public', 'dev_prevalences.txt')
    val_gen = SamplesFromDir(val_samples_path, val_true_prev_path, load_fn=load_fn)

    test_samples_path = join(lequa_dir, task, 'public', 'test_samples')
    test_true_prev_path = join(lequa_dir, task, 'public', 'test_prevalences.txt')
    test_gen = SamplesFromDir(test_samples_path, test_true_prev_path, load_fn=load_fn)

    if task != 'T3':
        # the training set comes as one single file
        tr_path = join(lequa_dir, task, 'public', 'training_data.txt')
        train = LabelledCollection.load(tr_path, loader_func=load_fn)
        return train, val_gen, test_gen

    # T3: the training set comes as a directory of labelled samples
    training_samples_path = join(lequa_dir, task, 'public', 'training_samples')
    training_true_prev_path = join(lequa_dir, task, 'public', 'training_prevalences.txt')
    train_gen = LabelledCollectionsFromDir(training_samples_path, training_true_prev_path, load_fn=load_fn)
    if merge_T3:
        train = LabelledCollection.join(*train_gen())
        return train, val_gen, test_gen
    return train_gen, val_gen, test_gen
def fetch_IFCB(single_sample_train=True, for_model_selection=False, data_home=None):
"""
Loads the IFCB dataset for quantification from `Zenodo <https://zenodo.org/records/10036244>`_ (for more

View File

@ -285,6 +285,36 @@ def mnrae(prevs, prevs_hat, eps=None):
return nrae(prevs, prevs_hat, eps).mean()
def nmd(prevs, prevs_hat):
    """
    Computes the Normalized Match Distance; which is the Match Distance multiplied by the factor
    `1/(n-1)`, with `n` the number of classes, to guarantee the measure ranges between 0 (best prediction)
    and 1 (worst prediction). If more than one instance is given, the result is averaged across instances.

    :param prevs: array-like of shape `(n_classes,)` or `(n_instances, n_classes)` with the true prevalence values
    :param prevs_hat: array-like of shape `(n_classes,)` or `(n_instances, n_classes)` with the predicted prevalence values
    :return: float in [0,1]
    """
    # np.asarray allows for any array-like (e.g., plain lists), which do not have the attribute `shape`
    n = np.asarray(prevs).shape[-1]
    return (1./(n-1))*np.mean(match_distance(prevs, prevs_hat))
def md(prevs, prevs_hat, ERROR_TOL=1E-3):
    """
    Computes the Match Distance, under the assumption that the cost in mistaking class i with class i+1 is 1 in
    all cases. It is computed as the sum, across all classes but the last one, of the absolute differences
    between the cumulative true prevalence values and the cumulative predicted prevalence values.

    :param prevs: array-like of shape `(n_classes,)` or `(n_instances, n_classes)` with the true prevalence values
    :param prevs_hat: array-like of shape `(n_classes,)` or `(n_instances, n_classes)` with the predicted prevalence values
    :param ERROR_TOL: relative tolerance allowed when checking that the predicted prevalence values sum up to 1
    :return: float if the inputs have shape `(n_classes,)`, or an array of shape `(n_instances,)` if the inputs
        have shape `(n_instances, n_classes)`
    """
    cumulative_true = np.cumsum(prevs, axis=-1)
    cumulative_pred = np.cumsum(prevs_hat, axis=-1)

    # the last cumulative value is the total mass of the predicted distribution, which must be (close to) 1
    sums_up_to_one = np.isclose(cumulative_pred[..., -1], 1.0, rtol=ERROR_TOL)
    assert np.all(sums_up_to_one), \
        'arg error in match_distance: the array does not represent a valid distribution'

    # the last position is discarded (both cumulative distributions end in 1)
    gaps = np.abs(cumulative_true - cumulative_pred)[..., :-1]
    return gaps.sum(axis=-1)
def smooth(prevs, eps):
""" Smooths a prevalence distribution with :math:`\\epsilon` (`eps`) as:
:math:`\\underline{p}(y)=\\frac{\\epsilon+p(y)}{\\epsilon|\\mathcal{Y}|+
@ -328,3 +358,5 @@ normalized_absolute_error = nae
normalized_relative_absolute_error = nrae
mean_normalized_absolute_error = mnae
mean_normalized_relative_absolute_error = mnrae
# long-name aliases of the match distance functions
normalized_match_distance = nmd
match_distance = md