merged
This commit is contained in:
parent
a04723a976
commit
b53d417240
|
@ -1,20 +1,17 @@
|
||||||
import os
|
import os
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
import math
|
import math
|
||||||
|
|
||||||
from quapy.data import LabelledCollection
|
from quapy.data import LabelledCollection
|
||||||
from quapy.protocol import AbstractProtocol
|
from quapy.protocol import AbstractProtocol
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
|
|
||||||
def get_sample_list(path_dir):
|
def get_sample_list(path_dir):
|
||||||
"""Gets a sample list finding the csv files in a directory
|
"""
|
||||||
|
Gets a sample list finding the csv files in a directory
|
||||||
|
|
||||||
Args:
|
:param path_dir: directory to look for samples
|
||||||
path_dir (_type_): directory to look for samples
|
:return: list of samples
|
||||||
|
|
||||||
Returns:
|
|
||||||
_type_: list of samples
|
|
||||||
"""
|
"""
|
||||||
samples = []
|
samples = []
|
||||||
for filename in sorted(os.listdir(path_dir)):
|
for filename in sorted(os.listdir(path_dir)):
|
||||||
|
@ -23,18 +20,15 @@ def get_sample_list(path_dir):
|
||||||
return samples
|
return samples
|
||||||
|
|
||||||
|
|
||||||
def generate_modelselection_split(samples, split=0.3):
|
def generate_modelselection_split(samples, test_prop=0.3):
|
||||||
"""This function generates a train/test split for model selection
|
"""This function generates a train/test partition for model selection
|
||||||
without the use of random numbers so the split is always the same
|
without the use of random numbers so the split is always the same
|
||||||
|
|
||||||
Args:
|
:param samples: list of samples
|
||||||
samples (_type_): list of samples
|
:param test_prop: float, percentage saved for test. Defaults to 0.3.
|
||||||
split (float, optional): percentage saved for test. Defaults to 0.3.
|
:return: list of samples to use as train and list of samples to use as test
|
||||||
|
|
||||||
Returns:
|
|
||||||
_type_: list of samples to use as train and list of samples to use as test
|
|
||||||
"""
|
"""
|
||||||
num_items_to_pick = math.ceil(len(samples) * split)
|
num_items_to_pick = math.ceil(len(samples) * test_prop)
|
||||||
step_size = math.floor(len(samples) / num_items_to_pick)
|
step_size = math.floor(len(samples) / num_items_to_pick)
|
||||||
test_indices = [i * step_size for i in range(num_items_to_pick)]
|
test_indices = [i * step_size for i in range(num_items_to_pick)]
|
||||||
test = [samples[i] for i in test_indices]
|
test = [samples[i] for i in test_indices]
|
||||||
|
|
|
@ -735,14 +735,15 @@ def fetch_lequa2022(task, data_home=None):
|
||||||
return train, val_gen, test_gen
|
return train, val_gen, test_gen
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def fetch_IFCB(single_sample_train=True, for_model_selection=False, data_home=None):
|
def fetch_IFCB(single_sample_train=True, for_model_selection=False, data_home=None):
|
||||||
"""
|
"""
|
||||||
Loads the IFCB dataset for quantification from `Zenodo <https://zenodo.org/records/10036244>`_ (for more
|
Loads the IFCB dataset for quantification from `Zenodo <https://zenodo.org/records/10036244>`_ (for more
|
||||||
information on this dataset, please follow the zenodo link).
|
information on this dataset, please follow the zenodo link).
|
||||||
This dataset is based on the data available publicly at
|
This dataset is based on the data available publicly at
|
||||||
`WHOI-Plankton repo <https://github.com/hsosik/WHOI-Plankton>`_.
|
`WHOI-Plankton repo <https://github.com/hsosik/WHOI-Plankton>`_.
|
||||||
The scripts for the processing are available at `P. González's repo <https://github.com/pglez82/IFCB_Zenodo>`_.
|
The dataset already comes with processed features.
|
||||||
Basically, this is the IFCB dataset with precomputed features for testing quantification algorithms.
|
The scripts used for the processing are available at `P. González's repo <https://github.com/pglez82/IFCB_Zenodo>`_.
|
||||||
|
|
||||||
The datasets are downloaded only once, and stored for fast reuse.
|
The datasets are downloaded only once, and stored for fast reuse.
|
||||||
|
|
||||||
|
@ -798,7 +799,7 @@ def fetch_IFCB(single_sample_train=True, for_model_selection=False, data_home=No
|
||||||
if for_model_selection:
|
if for_model_selection:
|
||||||
# In this case, return 70% of training data as the training set and 30% as the test set
|
# In this case, return 70% of training data as the training set and 30% as the test set
|
||||||
samples = get_sample_list(train_samples_path)
|
samples = get_sample_list(train_samples_path)
|
||||||
train, test = generate_modelselection_split(samples, split=0.3)
|
train, test = generate_modelselection_split(samples, test_prop=0.3)
|
||||||
train_gen = IFCBTrainSamplesFromDir(path_dir=train_samples_path, classes=classes, samples=train)
|
train_gen = IFCBTrainSamplesFromDir(path_dir=train_samples_path, classes=classes, samples=train)
|
||||||
|
|
||||||
# Test prevalence is computed from class labels
|
# Test prevalence is computed from class labels
|
||||||
|
|
|
@ -577,7 +577,7 @@ class PACC(AggregativeSoftQuantifier):
|
||||||
raise ValueError(f"unknown solver; valid ones are {ACC.SOLVERS}")
|
raise ValueError(f"unknown solver; valid ones are {ACC.SOLVERS}")
|
||||||
if self.method not in ACC.METHODS:
|
if self.method not in ACC.METHODS:
|
||||||
raise ValueError(f"unknown method; valid ones are {ACC.METHODS}")
|
raise ValueError(f"unknown method; valid ones are {ACC.METHODS}")
|
||||||
if self.clipping not in ACC.NORMALIZATIONS:
|
if self.norm not in ACC.NORMALIZATIONS:
|
||||||
raise ValueError(f"unknown clipping; valid ones are {ACC.NORMALIZATIONS}")
|
raise ValueError(f"unknown clipping; valid ones are {ACC.NORMALIZATIONS}")
|
||||||
|
|
||||||
def aggregation_fit(self, classif_predictions: LabelledCollection, data: LabelledCollection):
|
def aggregation_fit(self, classif_predictions: LabelledCollection, data: LabelledCollection):
|
||||||
|
|
Loading…
Reference in New Issue