<span class="sig-prename descclassname"><span class="pre"></span></span><span class="sig-name descname"><span class="pre">fetch_IFCB</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">single_sample_train</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">True</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">for_model_selection</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">False</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">data_home</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em><span class="sig-paren">)</span><a class="reference internal" href="_modules/quapy/data/datasets.html#fetch_IFCB"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="" title="Link to this definition"></a></dt>
<dd><p>Loads the IFCB dataset for quantification from <a class="reference external" href="">Zenodo</a> (for more
information on this dataset, please follow the zenodo link).
This dataset is based on the data available publicly at
<a class="reference external" href="">WHOI-Plankton repo</a>.
The scripts for the processing are available at <a class="reference external" href="">P. González’s repo</a>.
Basically, this is the IFCB dataset with precomputed features for testing quantification algorithms.</p>
<p>The datasets are downloaded only once, and stored for fast reuse.</p>
import numpy as np
import quapy as qp
from sklearn.linear_model import LogisticRegression
from quapy.model_selection import GridSearchQ
from quapy.evaluation import evaluation_report
def newLR():
    """Build a fresh logistic-regression classifier for a quantifier.

    A new, unfitted estimator is created on every call, so each caller
    receives its own instance rather than sharing a single object.

    :return: a ``LogisticRegression`` instance created with ``n_jobs=-1``
    """
    return LogisticRegression(n_jobs=-1)
print('Quantifying the IFCB dataset with PACC\n')

# model selection
print('loading dataset for model selection...', end='')
train, val_gen = qp.datasets.fetch_IFCB(for_model_selection=True, single_sample_train=True)
print(f'\ttraining size={len(train)}, features={train.X.shape[1]}, classes={train.n_classes}')
# An empty `{}` replacement field is a SyntaxError in an f-string; report how
# many samples the validation protocol will generate.
print(f'\tvalidation samples={val_gen.total()}')
quantifiers = [
('CC', qp.method.aggregative.CC(newLR())),
('ACC', qp.method.aggregative.ACC(newLR())),
('PCC', qp.method.aggregative.PCC(newLR())),
('PACC', qp.method.aggregative.PACC(newLR())),
('HDy', qp.method.aggregative.DMy(newLR())),
('EMQ', qp.method.aggregative.EMQ(newLR()))
print('model selection starts')
quantifier = qp.method.aggregative.PACC(LogisticRegression())
mod_sel = GridSearchQ(
'classifier__C': np.logspace(-3,3,7),
'classifier__class_weight': [None, 'balanced']
for quant_name, quantifier in quantifiers:
print(f'model selection chose hyperparameters: {mod_sel.best_params_}')
quantifier = mod_sel.best_model_
print("Experiment with "+quant_name)
train, test_gen = qp.datasets.fetch_IFCB()
# test: reload the data with the train/test partition (no model-selection split)
print('loading dataset for test...', end='')
train, test_gen = qp.datasets.fetch_IFCB(for_model_selection=False, single_sample_train=True)
print(f'\ttraining size={len(train)}, features={train.X.shape[1]}, classes={train.n_classes}')
# An empty `{}` replacement field is a SyntaxError in an f-string; report how
# many samples the test protocol will generate.
print(f'\ttest samples={test_gen.total()}')
print('training on the whole dataset before test')
report = evaluation_report(quantifier, protocol=test_gen, error_metrics=['mae'], verbose=True)
from quapy.protocol import AbstractProtocol
from pathlib import Path
def get_sample_list(path_dir):
"""Gets a sample list finding the csv files in a directory
return samples
def generate_modelselection_split(samples, split=0.3):
"""This function generates a train/test split for model selection
without the use of random numbers so the split is always the same
train = [item for i, item in enumerate(samples) if i not in test_indices]
return train, test
class IFCBTrainSamplesFromDir(AbstractProtocol):
def __init__(self, path_dir:str, classes: list, samples: list = None):
@ -64,6 +67,7 @@ class IFCBTrainSamplesFromDir(AbstractProtocol):
return len(self.samples)
class IFCBTestSamples(AbstractProtocol):
def __init__(self, path_dir:str, test_prevalences: pd.DataFrame, samples: list = None, classes: list=None):
@ -734,13 +734,14 @@ def fetch_lequa2022(task, data_home=None):
return train, val_gen, test_gen
def fetch_IFCB(single_sample_train=True, for_model_selection=False, data_home=None):
Loads the IFCB dataset for quantification from `Zenodo <>`_. For more
information on this dataset check the zenodo site.
This dataset is based on the data available publicly at <>.
The scripts for the processing are available at <>
Loads the IFCB dataset for quantification from `Zenodo <>`_ (for more
information on this dataset, please follow the zenodo link).
This dataset is based on the data available publicly at
`WHOI-Plankton repo <>`_.
The scripts for the processing are available at `P. González's repo <>`_.
Basically, this is the IFCB dataset with precomputed features for testing quantification algorithms.
The datasets are downloaded only once, and stored for fast reuse.
def _check_non_empty_classes(self, data: LabelledCollection):
    """
    Asserts all classes have positive instances.

    :param data: LabelledCollection
    :return: Nothing. May raise an exception.
    :raises ValueError: if at least one class has a prevalence of exactly zero in `data`
    """
    sample_prevs = data.prevalence()
    # positions of the classes whose prevalence is exactly zero
    empty_classes = np.argwhere(sample_prevs == 0).flatten()
    if len(empty_classes) > 0:
        # index `classes_` so the message reports class names, not positions
        empty_class_names = data.classes_[empty_classes]
        raise ValueError(f'classes {empty_class_names} have no training examples')
def fit(self, data: LabelledCollection, fit_classifier=True, val_split=None):
Trains the aggregative quantifier. This comes down to training a classifier and an aggregation function.
self._check_classifier(adapt_if_necessary=(self._classifier_method() == 'predict_proba'))
if fit_classifier:
if predict_on is None:
predict_on = self.val_split
if fit_classifier:
predictions = None
elif isinstance(predict_on, float):
if fit_classifier:
if not (0. < predict_on < 1.):
