forked from moreo/QuaPy
added dataset IFCB plankton
commit f18bce5f80 (parent cc5ab8ad70)
@@ -0,0 +1,24 @@
import quapy as qp
from sklearn.linear_model import LogisticRegression
from quapy.evaluation import evaluation_report


def newLR():
    return LogisticRegression(n_jobs=-1)


quantifiers = {'CC': qp.method.aggregative.CC(newLR()),
               'ACC': qp.method.aggregative.ACC(newLR()),
               'PCC': qp.method.aggregative.PCC(newLR()),
               'PACC': qp.method.aggregative.PACC(newLR()),
               'HDy': qp.method.aggregative.DistributionMatching(newLR()),
               'EMQ': qp.method.aggregative.EMQ(newLR())}

for quant_name, quantifier in quantifiers.items():
    print("Experiment with " + quant_name)

    train, test_gen = qp.datasets.fetch_IFCB()

    quantifier.fit(train)

    report = evaluation_report(quantifier, protocol=test_gen, error_metrics=['mae'], verbose=True)
    print(report.mean())
@@ -0,0 +1,34 @@
import os
import pandas as pd
from quapy.protocol import AbstractProtocol


class IFCBTrainSamplesFromDir(AbstractProtocol):

    def __init__(self, path_dir: str, classes: list):
        self.path_dir = path_dir
        self.classes = classes
        self.samples = []
        for filename in os.listdir(path_dir):
            if filename.endswith('.csv'):
                self.samples.append(filename)

    def __call__(self):
        for sample in self.samples:
            s = pd.read_csv(os.path.join(self.path_dir, sample))
            # all columns but the first, which holds the class label
            X = s.iloc[:, 1:].to_numpy()
            y = s.iloc[:, 0].to_numpy()
            yield X, y


class IFCBTestSamples(AbstractProtocol):

    def __init__(self, path_dir: str, test_prevalences_path: str):
        self.path_dir = path_dir
        self.test_prevalences = pd.read_csv(os.path.join(path_dir, test_prevalences_path))

    def __call__(self):
        for _, test_sample in self.test_prevalences.iterrows():
            # load the sample from disk
            X = pd.read_csv(os.path.join(self.path_dir, test_sample['sample'] + '.csv')).to_numpy()
            prevalences = test_sample.iloc[1:].to_numpy().astype(float)
            yield X, prevalences
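Both classes follow QuaPy's AbstractProtocol convention: calling the protocol object yields one sample per iteration. Below is a minimal sketch of driving them directly, assuming the IFCB archives have already been unzipped under ~/quapy_data/ifcb (the layout used by fetch_IFCB below); the paths are illustrative and not part of this commit.

import os
import pandas as pd
from quapy.data._ifcb import IFCBTrainSamplesFromDir, IFCBTestSamples

# assumed layout: train/ and test/ directories plus test_prevalences.csv under ~/quapy_data/ifcb
ifcb_dir = os.path.expanduser('~/quapy_data/ifcb')
prev_path = os.path.join(ifcb_dir, 'test_prevalences.csv')

# class names are all columns of the prevalence file except the leading sample identifier
classes = pd.read_csv(prev_path).columns[1:]

train_gen = IFCBTrainSamplesFromDir(path_dir=os.path.join(ifcb_dir, 'train'), classes=classes)
for X, y in train_gen():
    print(X.shape, y.shape)   # one training sample, labelled example by example

test_gen = IFCBTestSamples(path_dir=os.path.join(ifcb_dir, 'test'), test_prevalences_path=prev_path)
for X, prevalences in test_gen():
    print(X.shape, prevalences)   # one unlabelled sample plus its true prevalence vector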
@@ -732,3 +732,82 @@ def fetch_lequa2022(task, data_home=None):

    return train, val_gen, test_gen


def fetch_IFCB(single_sample_train=True, data_home=None):
    """
    Loads the IFCB dataset for quantification from `Zenodo <https://zenodo.org/records/10036244>`_.
    For more information on this dataset, check the Zenodo site.
    This dataset is based on the data made publicly available at
    `WHOI-Plankton <https://github.com/hsosik/WHOI-Plankton>`_; the scripts used for the processing
    are available at `IFCB_Zenodo <https://github.com/pglez82/IFCB_Zenodo>`_.

    Basically, this is the IFCB dataset with precomputed features, prepared for testing quantification algorithms.

    The datasets are downloaded only once and stored for fast reuse.

    :param single_sample_train: boolean. If True (default), the training data is returned as a single
        :class:`quapy.data.base.LabelledCollection` (all examples together).
        If False, a generator of training samples is returned instead.
        Each example in the training set has an individual class label.
    :param data_home: specify the quapy home directory where collections will be dumped (leave empty to use the
        default ~/quapy_data/ directory)
    :return: a tuple `(train, test_gen)` where `train` is an instance of
        :class:`quapy.data.base.LabelledCollection` if `single_sample_train` is True, or an instance of
        :class:`quapy.data._ifcb.IFCBTrainSamplesFromDir` otherwise, i.e., a sampling protocol that
        returns a series of samples labelled example by example;
        `test_gen` is an instance of :class:`quapy.data._ifcb.IFCBTestSamples`,
        i.e., a sampling protocol that returns a series of samples labelled by prevalence.
    """

    from quapy.data._ifcb import IFCBTrainSamplesFromDir, IFCBTestSamples

    if data_home is None:
        data_home = get_quapy_home()

    URL_TRAIN = 'https://zenodo.org/records/10036244/files/IFCB.train.zip'
    URL_TEST = 'https://zenodo.org/records/10036244/files/IFCB.test.zip'
    URL_TEST_PREV = 'https://zenodo.org/records/10036244/files/IFCB.test_prevalences.zip'

    ifcb_dir = join(data_home, 'ifcb')
    os.makedirs(ifcb_dir, exist_ok=True)

    def download_unzip_and_remove(unzipped_path, url):
        tmp_path = join(ifcb_dir, 'ifcb_tmp.zip')
        download_file_if_not_exists(url, tmp_path)
        with zipfile.ZipFile(tmp_path) as file:
            file.extractall(unzipped_path)
        os.remove(tmp_path)

    if not os.path.exists(os.path.join(ifcb_dir, 'train')):
        download_unzip_and_remove(ifcb_dir, URL_TRAIN)
    if not os.path.exists(os.path.join(ifcb_dir, 'test')):
        download_unzip_and_remove(ifcb_dir, URL_TEST)
    if not os.path.exists(os.path.join(ifcb_dir, 'test_prevalences.csv')):
        download_unzip_and_remove(ifcb_dir, URL_TEST_PREV)

    # load the test prevalences; the class names are all columns but the first (the sample identifier)
    test_true_prev_path = join(ifcb_dir, 'test_prevalences.csv')
    test_true_prev = pd.read_csv(test_true_prev_path)
    classes = test_true_prev.columns[1:]

    # load the train samples
    train_samples_path = join(ifcb_dir, 'train')
    train_gen = IFCBTrainSamplesFromDir(path_dir=train_samples_path, classes=classes)

    # load the test samples
    test_samples_path = join(ifcb_dir, 'test')
    test_gen = IFCBTestSamples(path_dir=test_samples_path, test_prevalences_path=test_true_prev_path)

    # if the user requests it, join all the train samples into a single LabelledCollection
    if single_sample_train:
        X = []
        y = []
        for X_, y_ in train_gen():
            X.append(X_)
            y.append(y_)

        X = np.vstack(X)
        y = np.concatenate(y)
        train = LabelledCollection(X, y, classes=classes)
        return train, test_gen
    else:
        return train_gen, test_gen
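As a usage sketch of the two return modes described in the docstring (the quantifier and learner choices are illustrative; only fetch_IFCB and the protocols above come from this commit):

import quapy as qp
from quapy.evaluation import evaluation_report
from quapy.method.aggregative import PACC
from sklearn.linear_model import LogisticRegression

# default mode: all training examples pooled into one LabelledCollection
train, test_gen = qp.datasets.fetch_IFCB()
quantifier = PACC(LogisticRegression())
quantifier.fit(train)
report = evaluation_report(quantifier, protocol=test_gen, error_metrics=['mae'])
print(report.mean())

# alternative mode: keep the training samples separate and iterate over them
train_gen, test_gen = qp.datasets.fetch_IFCB(single_sample_train=False)
for X, y in train_gen():
    print(X.shape, y.shape)   # one labelled training sample per iteration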