forked from moreo/QuaPy
added dataset IFCB plankton
commit f18bce5f80 (parent cc5ab8ad70)
@@ -0,0 +1,24 @@
import quapy as qp
from sklearn.linear_model import LogisticRegression
from quapy.evaluation import evaluation_report


def newLR():
    return LogisticRegression(n_jobs=-1)


quantifiers = {'CC': qp.method.aggregative.CC(newLR()),
               'ACC': qp.method.aggregative.ACC(newLR()),
               'PCC': qp.method.aggregative.PCC(newLR()),
               'PACC': qp.method.aggregative.PACC(newLR()),
               'HDy': qp.method.aggregative.DistributionMatching(newLR()),
               'EMQ': qp.method.aggregative.EMQ(newLR())}

for quant_name, quantifier in quantifiers.items():
    print("Experiment with " + quant_name)

    train, test_gen = qp.datasets.fetch_IFCB()

    quantifier.fit(train)

    report = evaluation_report(quantifier, protocol=test_gen, error_metrics=['mae'], verbose=True)
    print(report.mean())
@@ -0,0 +1,34 @@
import os
import pandas as pd
from quapy.protocol import AbstractProtocol


class IFCBTrainSamplesFromDir(AbstractProtocol):

    def __init__(self, path_dir: str, classes: list):
        self.path_dir = path_dir
        self.classes = classes
        self.samples = []
        for filename in os.listdir(path_dir):
            if filename.endswith('.csv'):
                self.samples.append(filename)

    def __call__(self):
        for sample in self.samples:
            s = pd.read_csv(os.path.join(self.path_dir, sample))
            # all columns but the first, which holds the class label
            X = s.iloc[:, 1:].to_numpy()
            y = s.iloc[:, 0].to_numpy()
            yield X, y


class IFCBTestSamples(AbstractProtocol):

    def __init__(self, path_dir: str, test_prevalences_path: str):
        self.path_dir = path_dir
        self.test_prevalences = pd.read_csv(os.path.join(path_dir, test_prevalences_path))

    def __call__(self):
        for _, test_sample in self.test_prevalences.iterrows():
            # load the sample from disk
            X = pd.read_csv(os.path.join(self.path_dir, test_sample['sample'] + '.csv')).to_numpy()
            prevalences = test_sample.iloc[1:].to_numpy().astype(float)
            yield X, prevalences
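Both classes follow QuaPy's AbstractProtocol convention: calling the protocol object yields one sample per iteration. Below is a minimal sketch of driving them directly, assuming the IFCB archives have already been unzipped under ~/quapy_data/ifcb (the layout used by fetch_IFCB below); the paths are illustrative and not part of this commit.

import os
import pandas as pd
from quapy.data._ifcb import IFCBTrainSamplesFromDir, IFCBTestSamples

# assumed layout: train/ and test/ directories plus test_prevalences.csv under ~/quapy_data/ifcb
ifcb_dir = os.path.expanduser('~/quapy_data/ifcb')
prev_path = os.path.join(ifcb_dir, 'test_prevalences.csv')

# class names are all columns of the prevalence file except the leading sample identifier
classes = pd.read_csv(prev_path).columns[1:]

train_gen = IFCBTrainSamplesFromDir(path_dir=os.path.join(ifcb_dir, 'train'), classes=classes)
for X, y in train_gen():
    print(X.shape, y.shape)   # one training sample, labelled example by example

test_gen = IFCBTestSamples(path_dir=os.path.join(ifcb_dir, 'test'), test_prevalences_path=prev_path)
for X, prevalences in test_gen():
    print(X.shape, prevalences)   # one unlabelled sample plus its true prevalence vector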
@@ -732,3 +732,82 @@ def fetch_lequa2022(task, data_home=None):

    return train, val_gen, test_gen


def fetch_IFCB(single_sample_train=True, data_home=None):
    """
    Loads the IFCB dataset for quantification from `Zenodo <https://zenodo.org/records/10036244>`_.
    For more information on this dataset, check the Zenodo site.
    This dataset is based on the data made publicly available at
    `WHOI-Plankton <https://github.com/hsosik/WHOI-Plankton>`_; the scripts used for the processing
    are available at `IFCB_Zenodo <https://github.com/pglez82/IFCB_Zenodo>`_.

    Basically, this is the IFCB dataset with precomputed features, prepared for testing quantification algorithms.

    The datasets are downloaded only once and stored for fast reuse.

    :param single_sample_train: boolean. If True (default), the training data is returned as a single
        :class:`quapy.data.base.LabelledCollection` (all examples together).
        If False, a generator of training samples is returned instead.
        Each example in the training set has an individual class label.
    :param data_home: specify the quapy home directory where collections will be dumped (leave empty to use the
        default ~/quapy_data/ directory)
    :return: a tuple `(train, test_gen)` where `train` is an instance of
        :class:`quapy.data.base.LabelledCollection` if `single_sample_train` is True, or an instance of
        :class:`quapy.data._ifcb.IFCBTrainSamplesFromDir` otherwise, i.e., a sampling protocol that
        returns a series of samples labelled example by example;
        `test_gen` is an instance of :class:`quapy.data._ifcb.IFCBTestSamples`,
        i.e., a sampling protocol that returns a series of samples labelled by prevalence.
    """

    from quapy.data._ifcb import IFCBTrainSamplesFromDir, IFCBTestSamples

    if data_home is None:
        data_home = get_quapy_home()

    URL_TRAIN = 'https://zenodo.org/records/10036244/files/IFCB.train.zip'
    URL_TEST = 'https://zenodo.org/records/10036244/files/IFCB.test.zip'
    URL_TEST_PREV = 'https://zenodo.org/records/10036244/files/IFCB.test_prevalences.zip'

    ifcb_dir = join(data_home, 'ifcb')
    os.makedirs(ifcb_dir, exist_ok=True)

    def download_unzip_and_remove(unzipped_path, url):
        tmp_path = join(ifcb_dir, 'ifcb_tmp.zip')
        download_file_if_not_exists(url, tmp_path)
        with zipfile.ZipFile(tmp_path) as file:
            file.extractall(unzipped_path)
        os.remove(tmp_path)

    if not os.path.exists(os.path.join(ifcb_dir, 'train')):
        download_unzip_and_remove(ifcb_dir, URL_TRAIN)
    if not os.path.exists(os.path.join(ifcb_dir, 'test')):
        download_unzip_and_remove(ifcb_dir, URL_TEST)
    if not os.path.exists(os.path.join(ifcb_dir, 'test_prevalences.csv')):
        download_unzip_and_remove(ifcb_dir, URL_TEST_PREV)

    # load the test prevalences; the class names are all columns but the first (the sample identifier)
    test_true_prev_path = join(ifcb_dir, 'test_prevalences.csv')
    test_true_prev = pd.read_csv(test_true_prev_path)
    classes = test_true_prev.columns[1:]

    # load the train samples
    train_samples_path = join(ifcb_dir, 'train')
    train_gen = IFCBTrainSamplesFromDir(path_dir=train_samples_path, classes=classes)

    # load the test samples
    test_samples_path = join(ifcb_dir, 'test')
    test_gen = IFCBTestSamples(path_dir=test_samples_path, test_prevalences_path=test_true_prev_path)

    # if the user requests it, join all the train samples into a single LabelledCollection
    if single_sample_train:
        X = []
        y = []
        for X_, y_ in train_gen():
            X.append(X_)
            y.append(y_)

        X = np.vstack(X)
        y = np.concatenate(y)
        train = LabelledCollection(X, y, classes=classes)
        return train, test_gen
    else:
        return train_gen, test_gen
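As a usage sketch of the two return modes described in the docstring (the quantifier and learner choices are illustrative; only fetch_IFCB and the protocols above come from this commit):

import quapy as qp
from quapy.evaluation import evaluation_report
from quapy.method.aggregative import PACC
from sklearn.linear_model import LogisticRegression

# default mode: all training examples pooled into one LabelledCollection
train, test_gen = qp.datasets.fetch_IFCB()
quantifier = PACC(LogisticRegression())
quantifier.fit(train)
report = evaluation_report(quantifier, protocol=test_gen, error_metrics=['mae'])
print(report.mean())

# alternative mode: keep the training samples separate and iterate over them
train_gen, test_gen = qp.datasets.fetch_IFCB(single_sample_train=False)
for X, y in train_gen():
    print(X.shape, y.shape)   # one labelled training sample per iteration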