adding lequa2024 datasets and example 4b
This commit is contained in:
parent
b4571d96c7
commit
5da9fa0b09
|
@ -0,0 +1,52 @@
|
|||
import numpy as np
|
||||
from sklearn.linear_model import LogisticRegression
|
||||
import quapy as qp
|
||||
import quapy.functional as F
|
||||
from quapy.data.datasets import LEQUA2024_SAMPLE_SIZE, fetch_lequa2024
|
||||
from quapy.evaluation import evaluation_report
|
||||
from quapy.method.aggregative import KDEyML
|
||||
from quapy.model_selection import GridSearchQ
|
||||
import pandas as pd
|
||||
|
||||
"""
This example shows how to use the LeQua datasets (new in v0.1.9). For more information about the datasets, and the
LeQua competition itself, check:
https://lequa2024.github.io/index (the site of the competition)
"""

# there are 4 tasks: T1 (binary), T2 (multiclass), T3 (ordinal), T4 (binary - covariate & prior shift)
task = 'T2'

# set the sample size in the environment. The sample size is task-dependent and can be consulted by doing:
qp.environ['SAMPLE_SIZE'] = LEQUA2024_SAMPLE_SIZE[task]
qp.environ['N_JOBS'] = -1

# the fetch method returns a training set (an instance of LabelledCollection) and two generators: one for the
# validation set and another for the test sets. These generators are both instances of classes that extend
# AbstractProtocol (i.e., classes that implement sampling generation procedures) and, in particular, are instances
# of SamplesFromDir, a protocol that simply iterates over pre-generated samples (those provided for the competition)
# stored in a directory.
training, val_generator, test_generator = fetch_lequa2024(task=task)

# define the quantifier
quantifier = KDEyML(classifier=LogisticRegression())

# model selection
param_grid = {
    'classifier__C': np.logspace(-3, 3, 7),          # classifier-dependent: inverse of regularization strength
    'classifier__class_weight': ['balanced', None],  # classifier-dependent: weights of each class
    'bandwidth': np.linspace(0.01, 0.2, 20),         # quantifier-dependent: bandwidth of the kernel
}
model_selection = GridSearchQ(quantifier, param_grid, protocol=val_generator, error='mrae', refit=False, verbose=True)
quantifier = model_selection.fit(training)

# evaluation
report = evaluation_report(quantifier, protocol=test_generator, error_metrics=['mae', 'mrae'], verbose=True)

# printing results
pd.set_option('display.expand_frame_repr', False)
report['estim-prev'] = report['estim-prev'].map(F.strprev)
print(report)

print('Averaged values:')
# the report holds non-numeric columns (e.g., 'estim-prev' has just been converted to strings), and recent
# versions of pandas raise a TypeError when averaging those; restrict the average to the numeric columns
print(report.mean(numeric_only=True))
|
|
@ -4,6 +4,8 @@ import numpy as np
|
|||
import os
|
||||
|
||||
from quapy.protocol import AbstractProtocol
|
||||
from quapy.data import LabelledCollection
|
||||
|
||||
|
||||
DEV_SAMPLES = 1000
|
||||
TEST_SAMPLES = 5000
|
||||
|
@ -12,6 +14,13 @@ ERROR_TOL = 1E-3
|
|||
|
||||
|
||||
def load_category_map(path):
|
||||
"""
|
||||
Loads the category map, i.e., a mapping of numerical ids of labels with a human readable name.
|
||||
|
||||
:param path: path to the label map file
|
||||
:return: a dictionary cat2code (i.e., cat2code[cat_name] gives access to the category id) and a list code2cat (i.e.,
|
||||
code2cat[cat_id] gives access to the category name)
|
||||
"""
|
||||
cat2code = {}
|
||||
with open(path, 'rt') as fin:
|
||||
for line in fin:
|
||||
|
@ -22,6 +31,16 @@ def load_category_map(path):
|
|||
|
||||
|
||||
def load_raw_documents(path):
|
||||
"""
|
||||
Loads raw documents. In case the sample is unlabelled,
|
||||
the labels returned are None
|
||||
|
||||
:param path: path to the data sample containing the raw documents
|
||||
:return: a tuple with the documents (np.ndarray of strings of shape `(n,)`) and
|
||||
the labels (a np.ndarray of shape `(n,)` if the sample is labelled,
|
||||
or None if the sample is unlabelled), with `n` the number of instances in the sample
|
||||
(250 for T1A, 1000 for T1B)
|
||||
"""
|
||||
df = pd.read_csv(path)
|
||||
documents = list(df["text"].values)
|
||||
labels = None
|
||||
|
@ -30,7 +49,16 @@ def load_raw_documents(path):
|
|||
return documents, labels
|
||||
|
||||
|
||||
def load_vector_documents(path):
|
||||
def load_vector_documents_2022(path):
|
||||
"""
|
||||
Loads vectorized documents. In case the sample is unlabelled,
|
||||
the labels returned are None
|
||||
|
||||
:param path: path to the data sample containing the raw documents
|
||||
:return: a tuple with the documents (np.ndarray of shape `(n,300)`) and the labels (a np.ndarray of shape `(n,)` if
|
||||
the sample is labelled, or None if the sample is unlabelled), with `n` the number of instances in the sample
|
||||
(250 for T1A, 1000 for T1B)
|
||||
"""
|
||||
D = pd.read_csv(path).to_numpy(dtype=float)
|
||||
labelled = D.shape[1] == 301
|
||||
if labelled:
|
||||
|
@ -40,6 +68,25 @@ def load_vector_documents(path):
|
|||
return X, y
|
||||
|
||||
|
||||
def load_vector_documents_2024(path):
    """
    Loads vectorized documents. In case the sample is unlabelled,
    the labels returned are None

    :param path: path to the csv file containing the vectorized documents (one document per row, preceded by
        a header line); labelled files carry the label in the first column, followed by the 256 features
    :return: a tuple with the documents (np.ndarray of shape `(n,256)`) and the labels (a np.ndarray of shape `(n,)` if
        the sample is labelled, or None if the sample is unlabelled), with `n` the number of instances in the sample
        (250 for T1 and T4, 1000 for T2, and 200 for T3)
    """
    n_features = 256  # dimensionality of the document vectors
    D = pd.read_csv(path).to_numpy(dtype=float)
    # a labelled sample has exactly one extra (leading) column holding the class label;
    # any other number of columns is treated as an unlabelled sample
    if D.shape[1] == n_features + 1:
        return D[:, 1:], D[:, 0].astype(int)
    return D, None
|
||||
|
||||
|
||||
class SamplesFromDir(AbstractProtocol):
|
||||
|
||||
def __init__(self, path_dir:str, ground_truth_path:str, load_fn):
|
||||
|
@ -53,6 +100,20 @@ class SamplesFromDir(AbstractProtocol):
|
|||
yield sample, prevalence
|
||||
|
||||
|
||||
class LabelledCollectionsFromDir(AbstractProtocol):
    """
    Protocol that iterates over pre-generated labelled samples stored in a directory, yielding one
    :class:`LabelledCollection` per sample.

    :param path_dir: path to the directory containing the sample files, named `<id>.txt`
    :param ground_truth_path: path to a csv file whose first column (used as the index) holds the sample ids;
        only the ids are used to locate the sample files
    :param load_fn: loader function passed as `loader_func` to `LabelledCollection.load` for each sample file
    """

    def __init__(self, path_dir: str, ground_truth_path: str, load_fn):
        self.path_dir = path_dir
        self.load_fn = load_fn
        self.true_prevs = pd.read_csv(ground_truth_path, index_col=0)

    def __call__(self):
        # only the sample ids are needed here: the prevalence values stored in the ground-truth file
        # are not used, since each collection is loaded along with its own labels
        for sample_id in self.true_prevs.index:
            collection_path = os.path.join(self.path_dir, f'{sample_id}.txt')
            yield LabelledCollection.load(path=collection_path, loader_func=self.load_fn)
|
||||
|
||||
|
||||
class ResultSubmission:
|
||||
|
||||
def __init__(self):
|
|
@ -85,6 +85,8 @@ LEQUA2022_VECTOR_TASKS = ['T1A', 'T1B']
|
|||
LEQUA2022_TEXT_TASKS = ['T2A', 'T2B']
|
||||
LEQUA2022_TASKS = LEQUA2022_VECTOR_TASKS + LEQUA2022_TEXT_TASKS
|
||||
|
||||
LEQUA2024_TASKS = ['T1', 'T2', 'T3', 'T4']
|
||||
|
||||
_TXA_SAMPLE_SIZE = 250
|
||||
_TXB_SAMPLE_SIZE = 1000
|
||||
|
||||
|
@ -99,6 +101,13 @@ LEQUA2022_SAMPLE_SIZE = {
|
|||
'multiclass': _TXB_SAMPLE_SIZE
|
||||
}
|
||||
|
||||
LEQUA2024_SAMPLE_SIZE = {
|
||||
'T1': 250,
|
||||
'T2': 1000,
|
||||
'T3': 200,
|
||||
'T4': 250,
|
||||
}
|
||||
|
||||
|
||||
def fetch_reviews(dataset_name, tfidf=False, min_df=None, data_home=None, pickle=False) -> Dataset:
|
||||
"""
|
||||
|
@ -806,7 +815,7 @@ def fetch_lequa2022(task, data_home=None):
|
|||
that return a series of samples stored in a directory which are labelled by prevalence.
|
||||
"""
|
||||
|
||||
from quapy.data._lequa2022 import load_raw_documents, load_vector_documents, SamplesFromDir
|
||||
from quapy.data._lequa import load_raw_documents, load_vector_documents_2022, SamplesFromDir
|
||||
|
||||
assert task in LEQUA2022_TASKS, \
|
||||
f'Unknown task {task}. Valid ones are {LEQUA2022_TASKS}'
|
||||
|
@ -833,7 +842,7 @@ def fetch_lequa2022(task, data_home=None):
|
|||
download_unzip_and_remove(lequa_dir, URL_TEST_PREV)
|
||||
|
||||
if task in ['T1A', 'T1B']:
|
||||
load_fn = load_vector_documents
|
||||
load_fn = load_vector_documents_2022
|
||||
elif task in ['T2A', 'T2B']:
|
||||
load_fn = load_raw_documents
|
||||
|
||||
|
@ -851,6 +860,65 @@ def fetch_lequa2022(task, data_home=None):
|
|||
return train, val_gen, test_gen
|
||||
|
||||
|
||||
def fetch_lequa2024(task, data_home=None, merge_T3=False):
    """
    Loads the data of the LeQua 2024 competition for the given task. The data is downloaded from Zenodo the
    first time a task is requested and is stored in the folder `lequa2024` within `data_home`.

    :param task: a string representing the task name; valid ones are T1, T2, T3, and T4
    :param data_home: directory where the data is (or will be) stored; if None (default), the directory
        returned by `get_quapy_home()` is used
    :param merge_T3: bool (default False), only has an effect when `task` is 'T3'. The training data of T3
        comes as a set of labelled samples; if True, all of them are joined into a single
        :class:`quapy.data.base.LabelledCollection`, otherwise a generator protocol that yields one
        labelled collection per training sample is returned in place of the training set
    :return: a tuple `(train, val_gen, test_gen)` where `train` is an instance of
        :class:`quapy.data.base.LabelledCollection` (or an instance of `LabelledCollectionsFromDir` for task
        T3 when `merge_T3` is False), and `val_gen` and `test_gen` are instances of `SamplesFromDir`
        that iterate over the validation and test samples, respectively
    """

    from quapy.data._lequa import load_vector_documents_2024, SamplesFromDir, LabelledCollectionsFromDir

    assert task in LEQUA2024_TASKS, \
        f'Unknown task {task}. Valid ones are {LEQUA2024_TASKS}'

    if data_home is None:
        data_home = get_quapy_home()

    LEQUA2024_ZENODO = 'https://zenodo.org/records/11661820'  # v3, last one with labels

    URL_TRAINDEV = f'{LEQUA2024_ZENODO}/files/{task}.train_dev.zip'
    URL_TEST = f'{LEQUA2024_ZENODO}/files/{task}.test.zip'
    URL_TEST_PREV = f'{LEQUA2024_ZENODO}/files/{task}.test_prevalences.zip'

    lequa_dir = join(data_home, 'lequa2024')
    os.makedirs(lequa_dir, exist_ok=True)

    def download_unzip_and_remove(unzipped_path, url):
        tmp_path = join(lequa_dir, task + '_tmp.zip')
        download_file_if_not_exists(url, tmp_path)
        try:
            with zipfile.ZipFile(tmp_path) as file:
                file.extractall(unzipped_path)
        finally:
            # always remove the temporary file: the same path is reused for every archive, and a leftover
            # (possibly corrupted) file would prevent any subsequent download from taking place
            os.remove(tmp_path)

    # the data is downloaded only if the folder of the task does not exist yet
    if not os.path.exists(join(lequa_dir, task)):
        download_unzip_and_remove(lequa_dir, URL_TRAINDEV)
        download_unzip_and_remove(lequa_dir, URL_TEST)
        download_unzip_and_remove(lequa_dir, URL_TEST_PREV)

    load_fn = load_vector_documents_2024
    public_dir = join(lequa_dir, task, 'public')

    val_samples_path = join(public_dir, 'dev_samples')
    val_true_prev_path = join(public_dir, 'dev_prevalences.txt')
    val_gen = SamplesFromDir(val_samples_path, val_true_prev_path, load_fn=load_fn)

    test_samples_path = join(public_dir, 'test_samples')
    test_true_prev_path = join(public_dir, 'test_prevalences.txt')
    test_gen = SamplesFromDir(test_samples_path, test_true_prev_path, load_fn=load_fn)

    if task != 'T3':
        # the training set comes as one single labelled file
        tr_path = join(public_dir, 'training_data.txt')
        train = LabelledCollection.load(tr_path, loader_func=load_fn)
        return train, val_gen, test_gen

    # in T3, the training set comes as a directory of labelled samples
    training_samples_path = join(public_dir, 'training_samples')
    training_true_prev_path = join(public_dir, 'training_prevalences.txt')
    train_gen = LabelledCollectionsFromDir(training_samples_path, training_true_prev_path, load_fn=load_fn)
    if merge_T3:
        train = LabelledCollection.join(*train_gen())
        return train, val_gen, test_gen
    return train_gen, val_gen, test_gen
|
||||
|
||||
|
||||
|
||||
def fetch_IFCB(single_sample_train=True, for_model_selection=False, data_home=None):
|
||||
"""
|
||||
Loads the IFCB dataset for quantification from `Zenodo <https://zenodo.org/records/10036244>`_ (for more
|
||||
|
|
|
@ -285,6 +285,36 @@ def mnrae(prevs, prevs_hat, eps=None):
|
|||
return nrae(prevs, prevs_hat, eps).mean()
|
||||
|
||||
|
||||
def nmd(prevs, prevs_hat):
    """
    Computes the Normalized Match Distance; which is the Match Distance multiplied by the factor
    `1/(n-1)`, with `n` the number of classes, to guarantee the measure ranges between 0 (best prediction)
    and 1 (worst prediction).

    :param prevs: array-like of shape `(n_classes,)` or `(n_instances, n_classes)` with the true prevalence values
    :param prevs_hat: array-like of shape `(n_classes,)` or `(n_instances, n_classes)` with the predicted prevalence values
    :return: float in [0,1]
    """
    # plain lists or tuples have no `shape` attribute; convert first so that any array-like is accepted
    prevs = np.asarray(prevs)
    n = prevs.shape[-1]
    return (1. / (n - 1)) * np.mean(match_distance(prevs, prevs_hat))
|
||||
|
||||
|
||||
def md(prevs, prevs_hat, ERROR_TOL=1E-3):
    """
    Computes the Match Distance, under the assumption that the cost in mistaking class i with class i+1 is 1 in
    all cases. The distance is obtained by summing, over all classes but the last one, the absolute
    differences between the cumulative distributions of `prevs` and `prevs_hat`.

    :param prevs: array-like of shape `(n_classes,)` or `(n_instances, n_classes)` with the true prevalence values
    :param prevs_hat: array-like of shape `(n_classes,)` or `(n_instances, n_classes)` with the predicted prevalence values
    :param ERROR_TOL: tolerance allowed when checking that the values of `prevs_hat` add up to 1
    :return: float if the inputs are of shape `(n_classes,)`, or a np.ndarray of shape `(n_instances,)` if
        the inputs are of shape `(n_instances, n_classes)`
    """
    cumulative_true = np.cumsum(prevs, axis=-1)
    cumulative_pred = np.cumsum(prevs_hat, axis=-1)

    # the last cumulative value is the total mass of the predicted distribution
    sums_to_one = np.isclose(cumulative_pred[..., -1], 1.0, rtol=ERROR_TOL)
    assert np.all(sums_to_one), \
        'arg error in match_distance: the array does not represent a valid distribution'

    # the last position is left out of the sum
    gaps = np.abs(cumulative_true - cumulative_pred)[..., :-1]
    return gaps.sum(axis=-1)
|
||||
|
||||
|
||||
def smooth(prevs, eps):
|
||||
""" Smooths a prevalence distribution with :math:`\\epsilon` (`eps`) as:
|
||||
:math:`\\underline{p}(y)=\\frac{\\epsilon+p(y)}{\\epsilon|\\mathcal{Y}|+
|
||||
|
@ -328,3 +358,5 @@ normalized_absolute_error = nae
|
|||
normalized_relative_absolute_error = nrae
|
||||
mean_normalized_absolute_error = mnae
|
||||
mean_normalized_relative_absolute_error = mnrae
|
||||
normalized_match_distance = nmd
|
||||
match_distance = md
|
||||
|
|
Loading…
Reference in New Issue