From a8230827e2164beca29c33809be205453ce58322 Mon Sep 17 00:00:00 2001
From: Alejandro Moreo
Date: Thu, 8 Feb 2024 14:33:22 +0100
Subject: [PATCH] testing IFCB dataset
docs/build/html/ | 28 ++++++++--------
docs/build/html/quapy.method.html | 3 --
docs/build/html/searchindex.js | 2 +-
examples/ | 54 +++++++++++++++++++++----------
quapy/data/ | 4 +++
quapy/data/ | 11 ++++---
quapy/method/ | 17 +++++++++-
7 files changed, 78 insertions(+), 41 deletions(-)
@@ -627,30 +626,31 @@ otherwise. module fetch_IFCB ( single_sample_train = True , data_home = None ) [source]
diff --git a/examples/ b/examples/
index bf73f10..8fb39d1 100644
--- a/examples/
+++ b/examples/
@@ -1,29 +1,49 @@
+import numpy as np
import quapy as qp
from sklearn.linear_model import LogisticRegression
+from quapy.model_selection import GridSearchQ
from quapy.evaluation import evaluation_report
-def newLR():
- return LogisticRegression(n_jobs=-1)
+print('Quantifying the IFCB dataset with PACC\n')
+# model selection
+print('loading dataset for model selection...', end='')
+train, val_gen = qp.datasets.fetch_IFCB(for_model_selection=True, single_sample_train=True)
+print(f'\ttraining size={len(train)}, features={train.X.shape[1]}, classes={train.n_classes}')
+print(f'\tvalidation samples={}')
-quantifiers = [
- ('CC', qp.method.aggregative.CC(newLR())),
- ('ACC', qp.method.aggregative.ACC(newLR())),
- ('PCC', qp.method.aggregative.PCC(newLR())),
- ('PACC', qp.method.aggregative.PACC(newLR())),
- ('HDy', qp.method.aggregative.DMy(newLR())),
- ('EMQ', qp.method.aggregative.EMQ(newLR()))
+print('model selection starts')
+quantifier = qp.method.aggregative.PACC(LogisticRegression())
+mod_sel = GridSearchQ(
+ quantifier,
+ param_grid={
+ 'classifier__C': np.logspace(-3,3,7),
+ 'classifier__class_weight': [None, 'balanced']
+ },
+ protocol=val_gen,
+ refit=False,
+ n_jobs=-1,
+ verbose=True,
+ raise_errors=True
-for quant_name, quantifier in quantifiers:
+print(f'model selection chose hyperparameters: {mod_sel.best_params_}')
+quantifier = mod_sel.best_model_
- print("Experiment with "+quant_name)
+print('loading dataset for test...', end='')
+train, test_gen = qp.datasets.fetch_IFCB(for_model_selection=False, single_sample_train=True)
+print(f'\ttraining size={len(train)}, features={train.X.shape[1]}, classes={train.n_classes}')
+print(f'\ttest samples={}')
- train, test_gen = qp.datasets.fetch_IFCB()
+print('training on the whole dataset before test')
- report = evaluation_report(quantifier, protocol=test_gen, error_metrics=['mae'], verbose=True)
- print(report.mean())
+report = evaluation_report(quantifier, protocol=test_gen, error_metrics=['mae'], verbose=True)
diff --git a/quapy/data/ b/quapy/data/
index 79e7eb3..96af189 100644
--- a/quapy/data/
+++ b/quapy/data/
@@ -4,6 +4,7 @@ import math
from quapy.protocol import AbstractProtocol
from pathlib import Path
def get_sample_list(path_dir):
"""Gets a sample list finding the csv files in a directory
@@ -19,6 +20,7 @@ def get_sample_list(path_dir):
return samples
def generate_modelselection_split(samples, split=0.3):
"""This function generates a train/test split for model selection
without the use of random numbers so the split is always the same
@@ -37,6 +39,7 @@ def generate_modelselection_split(samples, split=0.3):
train = [item for i, item in enumerate(samples) if i not in test_indices]
return train, test
class IFCBTrainSamplesFromDir(AbstractProtocol):
def __init__(self, path_dir:str, classes: list, samples: list = None):
@@ -64,6 +67,7 @@ class IFCBTrainSamplesFromDir(AbstractProtocol):
return len(self.samples)
class IFCBTestSamples(AbstractProtocol):
def __init__(self, path_dir:str, test_prevalences: pd.DataFrame, samples: list = None, classes: list=None):
diff --git a/quapy/data/ b/quapy/data/
index 6e05f55..3d426f2 100644
--- a/quapy/data/
+++ b/quapy/data/
@@ -734,13 +734,14 @@ def fetch_lequa2022(task, data_home=None):
return train, val_gen, test_gen
def fetch_IFCB(single_sample_train=True, for_model_selection=False, data_home=None):
- Loads the IFCB dataset for quantification `. For more
- information on this dataset check the zenodo site.
- This dataset is based on the data available publicly at .
- The scripts for the processing are available at
+ Loads the IFCB dataset for quantification from `Zenodo <https://zenodo.org/records/10036244>`_ (for more
+ information on this dataset, please follow the Zenodo link).
+ This dataset is based on the data publicly available at the
+ `WHOI-Plankton repo <https://github.com/hsosik/WHOI-Plankton>`_.
+ The processing scripts are available at `P. González's repo <https://github.com/pglez82/IFCB_Zenodo>`_.
Basically, this is the IFCB dataset with precomputed features for testing quantification algorithms.
The datasets are downloaded only once, and stored for fast reuse.
diff --git a/quapy/method/ b/quapy/method/
index a831bcb..f49bd3e 100644
--- a/quapy/method/
+++ b/quapy/method/
@@ -60,6 +60,19 @@ class AggregativeQuantifier(BaseQuantifier, ABC):
+ def _check_non_empty_classes(self, data: LabelledCollection):
+ """
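+ Checks that every class in `data` has at least one instance (i.e., that no class has zero prevalence).
+ :param data: the LabelledCollection whose class prevalence values are inspected
+ :return: Nothing. Raises a ValueError naming the empty classes if any class has no examples.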
+ """
+ sample_prevs = data.prevalence()
+ empty_classes = np.argwhere(sample_prevs==0).flatten()
+ if len(empty_classes)>0:
+ empty_class_names = data.classes_[empty_classes]
+ raise ValueError(f'classes {empty_class_names} have no training examples')
def fit(self, data: LabelledCollection, fit_classifier=True, val_split=None):
Trains the aggregative quantifier. This comes down to training a classifier and an aggregation function.
@@ -93,6 +106,9 @@ class AggregativeQuantifier(BaseQuantifier, ABC):
self._check_classifier(adapt_if_necessary=(self._classifier_method() == 'predict_proba'))
+ if fit_classifier:
+ self._check_non_empty_classes(data)
if predict_on is None:
predict_on = self.val_split
@@ -100,7 +116,6 @@ class AggregativeQuantifier(BaseQuantifier, ABC):
if fit_classifier:*data.Xy)
predictions = None
elif isinstance(predict_on, float):
if fit_classifier:
if not (0. < predict_on < 1.):