This commit is contained in:
Alejandro Moreo Fernandez 2024-04-12 13:35:13 +02:00
parent a04723a976
commit b53d417240
3 changed files with 15 additions and 20 deletions

View File

@@ -1,20 +1,17 @@
import os import os
import pandas as pd import pandas as pd
import math import math
from quapy.data import LabelledCollection from quapy.data import LabelledCollection
from quapy.protocol import AbstractProtocol from quapy.protocol import AbstractProtocol
from pathlib import Path from pathlib import Path
def get_sample_list(path_dir): def get_sample_list(path_dir):
"""Gets a sample list finding the csv files in a directory """
Gets a sample list finding the csv files in a directory
Args: :param path_dir: directory to look for samples
path_dir (_type_): directory to look for samples :return: list of samples
Returns:
_type_: list of samples
""" """
samples = [] samples = []
for filename in sorted(os.listdir(path_dir)): for filename in sorted(os.listdir(path_dir)):
@@ -23,18 +20,15 @@ def get_sample_list(path_dir):
return samples return samples
def generate_modelselection_split(samples, split=0.3): def generate_modelselection_split(samples, test_prop=0.3):
"""This function generates a train/test split for model selection """This function generates a train/test partition for model selection
without the use of random numbers so the split is always the same without the use of random numbers so the split is always the same
Args: :param samples: list of samples
samples (_type_): list of samples :param test_prop: float, percentage saved for test. Defaults to 0.3.
split (float, optional): percentage saved for test. Defaults to 0.3. :return: list of samples to use as train and list of samples to use as test
Returns:
_type_: list of samples to use as train and list of samples to use as test
""" """
num_items_to_pick = math.ceil(len(samples) * split) num_items_to_pick = math.ceil(len(samples) * test_prop)
step_size = math.floor(len(samples) / num_items_to_pick) step_size = math.floor(len(samples) / num_items_to_pick)
test_indices = [i * step_size for i in range(num_items_to_pick)] test_indices = [i * step_size for i in range(num_items_to_pick)]
test = [samples[i] for i in test_indices] test = [samples[i] for i in test_indices]

View File

@@ -735,14 +735,15 @@ def fetch_lequa2022(task, data_home=None):
return train, val_gen, test_gen return train, val_gen, test_gen
def fetch_IFCB(single_sample_train=True, for_model_selection=False, data_home=None): def fetch_IFCB(single_sample_train=True, for_model_selection=False, data_home=None):
""" """
Loads the IFCB dataset for quantification from `Zenodo <https://zenodo.org/records/10036244>`_ (for more Loads the IFCB dataset for quantification from `Zenodo <https://zenodo.org/records/10036244>`_ (for more
information on this dataset, please follow the zenodo link). information on this dataset, please follow the zenodo link).
This dataset is based on the data available publicly at This dataset is based on the data available publicly at
`WHOI-Plankton repo <https://github.com/hsosik/WHOI-Plankton>`_. `WHOI-Plankton repo <https://github.com/hsosik/WHOI-Plankton>`_.
The scripts for the processing are available at `P. González's repo <https://github.com/pglez82/IFCB_Zenodo>`_. The dataset already comes with processed features.
Basically, this is the IFCB dataset with precomputed features for testing quantification algorithms. The scripts used for the processing are available at `P. González's repo <https://github.com/pglez82/IFCB_Zenodo>`_.
The datasets are downloaded only once, and stored for fast reuse. The datasets are downloaded only once, and stored for fast reuse.
@@ -798,7 +799,7 @@ def fetch_IFCB(single_sample_train=True, for_model_selection=False, data_home=No
if for_model_selection: if for_model_selection:
# In this case, return 70% of training data as the training set and 30% as the test set # In this case, return 70% of training data as the training set and 30% as the test set
samples = get_sample_list(train_samples_path) samples = get_sample_list(train_samples_path)
train, test = generate_modelselection_split(samples, split=0.3) train, test = generate_modelselection_split(samples, test_prop=0.3)
train_gen = IFCBTrainSamplesFromDir(path_dir=train_samples_path, classes=classes, samples=train) train_gen = IFCBTrainSamplesFromDir(path_dir=train_samples_path, classes=classes, samples=train)
# Test prevalence is computed from class labels # Test prevalence is computed from class labels

View File

@@ -577,7 +577,7 @@ class PACC(AggregativeSoftQuantifier):
raise ValueError(f"unknown solver; valid ones are {ACC.SOLVERS}") raise ValueError(f"unknown solver; valid ones are {ACC.SOLVERS}")
if self.method not in ACC.METHODS: if self.method not in ACC.METHODS:
raise ValueError(f"unknown method; valid ones are {ACC.METHODS}") raise ValueError(f"unknown method; valid ones are {ACC.METHODS}")
if self.clipping not in ACC.NORMALIZATIONS: if self.norm not in ACC.NORMALIZATIONS:
raise ValueError(f"unknown clipping; valid ones are {ACC.NORMALIZATIONS}") raise ValueError(f"unknown clipping; valid ones are {ACC.NORMALIZATIONS}")
def aggregation_fit(self, classif_predictions: LabelledCollection, data: LabelledCollection): def aggregation_fit(self, classif_predictions: LabelledCollection, data: LabelledCollection):