# QuaPy/BayesianKDEy/datasets.py
# (file-viewer metadata from the original paste: 270 lines, 7.9 KiB, Python)
import inspect
from abc import ABC, abstractmethod
from functools import lru_cache
import numpy as np
from pathlib import Path
import quapy as qp
from commons import MockClassifierFromPosteriors
from quapy.protocol import UPP
from quapy.data import LabelledCollection
from quapy.method.aggregative import EMQ
def fetchMNIST(modality, data_home='./data/mnist_basiccnn'):
    """Load the pre-computed CNN outputs for MNIST as three LabelledCollections.

    :param modality: which array to use as covariates; one of 'features',
        'predictions', 'logits' (keys stored in the .npz files)
    :param data_home: directory containing the mnist_basiccnn_*_out.npz files
    :return: (train, val, test) LabelledCollections sharing the train classes
    """
    valid_modalities = ('features', 'predictions', 'logits')
    assert modality in valid_modalities, f'unknown modality, valid ones are {valid_modalities}'
    data_home = Path(data_home)

    def load_split(split):
        # each .npz bundles the requested modality plus the 'targets' labels
        data = np.load(data_home / f'mnist_basiccnn_{split}_out.npz')
        return data[modality], data['targets']

    train_X, train_y = load_split('train')
    val_X, val_y = load_split('val')
    test_X, test_y = load_split('test')

    print(f'loaded MNIST ({modality=}): '
          f'#train={len(train_y)}, #val={len(val_y)}, #test={len(test_y)}, #features={train_X.shape[1]}')

    # val and test reuse the class order inferred from the training labels
    train = LabelledCollection(train_X, train_y)
    val = LabelledCollection(val_X, val_y, classes=train.classes_)
    test = LabelledCollection(test_X, test_y, classes=train.classes_)
    return train, val, test
class DatasetHandler(ABC):
    """Abstract interface for dataset access in quantification experiments.

    Subclasses expose the training data plus two sample-generation protocols:
    one for evaluation and one for model selection.
    """

    def __init__(self, name):
        self.name = name  # dataset identifier (also used by __repr__)

    @classmethod
    def get_defaults(cls):
        """Return a dict with the default values of this class' __init__ parameters.

        Parameters without a default (e.g., `self`) are omitted.
        """
        sig = inspect.signature(cls.__init__)
        defaults = {
            name: param.default
            for name, param in sig.parameters.items()
            if param.default is not inspect.Parameter.empty
        }
        return defaults

    @abstractmethod
    def get_training(self):
        """Return the training collection."""
        ...

    @abstractmethod
    def get_train_testprot_for_eval(self):
        """Return (training data, test sample-generation protocol) for evaluation."""
        ...

    @abstractmethod
    def get_train_valprot_for_modsel(self):
        """Return (training data, validation sample-generation protocol) for model selection."""
        ...

    @classmethod
    @abstractmethod
    def get_datasets(cls):
        """Return the list of dataset names this handler can serve."""
        ...

    @classmethod
    def iter(cls, **kwargs):
        """Yield one handler instance per dataset name, forwarding **kwargs to __init__."""
        for name in cls.get_datasets():
            yield cls(name, **kwargs)

    def __repr__(self):
        return f'{self.__class__.__name__}({self.name})'

    @classmethod
    @abstractmethod
    def is_binary(cls):
        # fixed: classmethod first parameter renamed self -> cls (PEP 8 idiom)
        """Return True if the handler serves binary datasets, False for multiclass."""
        ...
class MNISTHandler(DatasetHandler):
    """Handler for MNIST represented by the outputs of a pre-trained CNN.

    The original training split was already consumed to fit the neural
    network, so quantifiers are trained on the validation split instead.
    """

    def __init__(self, n_val_samples=100, n_test_samples=100, sample_size=500, random_state=0):
        super().__init__(name='MNIST')
        self.n_val_samples = n_val_samples    # number of validation samples (model selection)
        self.n_test_samples = n_test_samples  # number of test samples (evaluation)
        self.sample_size = sample_size        # instances per generated sample
        self.random_state = random_state

    def dataset(self):
        """Load and memoize the (train, val, test) splits of posterior predictions."""
        # per-instance memoization: @lru_cache on an instance method would key on
        # `self` and keep the instance alive for the cache's lifetime (ruff B019)
        if not hasattr(self, '_dataset_cache'):
            self._dataset_cache = fetchMNIST(modality='predictions')
        return self._dataset_cache

    def get_training(self):
        return self.dataset()[0]

    def get_validation(self):
        return self.dataset()[1]

    def get_train_testprot_for_eval(self):
        # note that the training goes on the validation split, since the proper
        # training was used for training the neural network
        _, val, test = self.dataset()
        test_prot = UPP(test, sample_size=self.sample_size, repeats=self.n_test_samples, random_state=self.random_state)
        return val, test_prot

    def get_train_valprot_for_modsel(self):
        # the training split is never used (was used to train a neural model);
        # we consider the validation split as our training data, so we return a new split on it
        _, val, _ = self.dataset()
        train, val = val.split_stratified(train_prop=0.6, random_state=self.random_state)
        val_prot = UPP(val, sample_size=self.sample_size, repeats=self.n_val_samples, random_state=self.random_state)
        return train, val_prot

    @classmethod
    def get_datasets(cls):
        return ['MNIST']

    @classmethod
    def iter(cls, **kwargs):
        # single dataset: yield exactly one configured handler (name is fixed)
        yield cls(**kwargs)

    def __repr__(self):
        return f'{self.name}'

    @classmethod
    def is_binary(cls):
        # fixed: classmethod first parameter renamed self -> cls
        return False
# LeQua multiclass tasks
class LeQuaHandler(DatasetHandler):
    """Handler for the LeQua multiclass tasks (2022 T1B and 2024 T2).

    These datasets ship their own validation/test sample generators, so no
    UPP protocol is built here.
    """

    DATASETS = ['LeQua2022', 'LeQua2024']

    def __init__(self, name, sample_size=1000):
        super().__init__(name)
        # LeQua multiclass samples have 1000 instances each; exposed as a
        # parameter (backward-compatible default) for consistency with the
        # other handlers
        self.sample_size = sample_size

    def get_training(self):
        return self.dataset()[0]

    def get_train_testprot_for_eval(self):
        training, _, test_generator = self.dataset()
        return training, test_generator

    def get_train_valprot_for_modsel(self):
        training, val_generator, _ = self.dataset()
        return training, val_generator

    def dataset(self):
        """Fetch and memoize (training, val_generator, test_generator) for the task."""
        # per-instance memoization: @lru_cache on an instance method would key on
        # `self` and keep the instance alive for the cache's lifetime (ruff B019)
        if not hasattr(self, '_dataset_cache'):
            if self.name == 'LeQua2022':
                self._dataset_cache = qp.datasets.fetch_lequa2022(task='T1B')
            elif self.name == 'LeQua2024':
                self._dataset_cache = qp.datasets.fetch_lequa2024(task='T2')
            else:
                raise ValueError(f'unexpected dataset name {self.name}; valid ones are {self.DATASETS}')
        return self._dataset_cache

    def __repr__(self):
        return self.name

    @classmethod
    def iter(cls):
        for name in cls.DATASETS:
            yield cls(name)

    @classmethod
    def is_binary(cls):
        # fixed: classmethod first parameter renamed self -> cls
        return False

    @classmethod
    def get_datasets(cls):
        return cls.DATASETS
class UCIDatasetHandler(DatasetHandler, ABC):
    """Common scaffolding shared by the UCI (binary and multiclass) handlers.

    Concrete subclasses define DATASETS and a memoized `dataset()` method.
    """

    DATASETS = []

    def __init__(self, name, n_val_samples, n_test_samples, sample_size, random_state):
        super().__init__(name=name)
        self.sample_size = sample_size        # instances per generated sample
        self.n_val_samples = n_val_samples    # number of validation samples (model selection)
        self.n_test_samples = n_test_samples  # number of test samples (evaluation)
        self.random_state = random_state

    def _sample_protocol(self, data, repeats):
        # builds a UPP sample generator over `data` with this handler's settings
        return qp.protocol.UPP(data, sample_size=self.sample_size, repeats=repeats,
                               random_state=self.random_state)

    def get_training(self):
        return self.dataset().training

    def get_train_testprot_for_eval(self):
        training, test = self.dataset().train_test
        return training, self._sample_protocol(test, self.n_test_samples)

    def get_train_valprot_for_modsel(self):
        # carve a validation portion out of the training split for model selection
        training, val = self.dataset().training.split_stratified(
            train_prop=0.6, random_state=self.random_state)
        return training, self._sample_protocol(val, self.n_val_samples)

    @classmethod
    def get_datasets(cls):
        return cls.DATASETS

    @classmethod
    def iter(cls):
        for name in cls.DATASETS:
            yield cls(name)
class UCIMulticlassHandler(UCIDatasetHandler):
    """Handler for the UCI multiclass datasets ('hcv' and 'poker_hand' excluded)."""

    DATASETS = [d for d in qp.datasets.UCI_MULTICLASS_DATASETS if d not in frozenset(['hcv', 'poker_hand'])]

    def __init__(self, name, n_val_samples=100, n_test_samples=100, sample_size=1000, random_state=0):
        super().__init__(name, n_val_samples, n_test_samples, sample_size, random_state)

    def dataset(self):
        """Fetch and memoize the UCI multiclass dataset (rare classes filtered out)."""
        # per-instance memoization: @lru_cache on an instance method would key on
        # `self` and keep the instance alive for the cache's lifetime (ruff B019)
        if not hasattr(self, '_dataset_cache'):
            self._dataset_cache = qp.datasets.fetch_UCIMulticlassDataset(self.name, min_class_support=0.01)
        return self._dataset_cache

    def __repr__(self):
        return f'{self.name}(UCI-multiclass)'

    @classmethod
    def is_binary(cls):
        # fixed: classmethod first parameter renamed self -> cls
        return False
class UCIBinaryHandler(UCIDatasetHandler):
    """Handler for the UCI binary datasets.

    BUGFIX: this class previously inherited directly from DatasetHandler, so
    (a) super().__init__(name, n_val_samples, n_test_samples, sample_size,
    random_state) raised a TypeError (DatasetHandler.__init__ only accepts
    `name`), and (b) the abstract get_training/get_*prot_* methods were left
    unimplemented, making the class uninstantiable. Inheriting from
    UCIDatasetHandler (as UCIMulticlassHandler does) provides both the
    matching __init__ signature and the data-access methods.
    """

    DATASETS = qp.datasets.UCI_BINARY_DATASETS.copy()

    def __init__(self, name, n_val_samples=100, n_test_samples=100, sample_size=500, random_state=0):
        super().__init__(name, n_val_samples, n_test_samples, sample_size, random_state)

    def dataset(self):
        """Fetch and memoize the UCI binary dataset."""
        # per-instance memoization: @lru_cache on an instance method would key on
        # `self` and keep the instance alive for the cache's lifetime (ruff B019)
        if not hasattr(self, '_dataset_cache'):
            self._dataset_cache = qp.datasets.fetch_UCIBinaryDataset(self.name)
        return self._dataset_cache

    def __repr__(self):
        return f'{self.name}(UCI-binary)'

    @classmethod
    def is_binary(cls):
        # fixed: classmethod first parameter renamed self -> cls
        return True
if __name__ == '__main__':
    # quick sanity check: quantify over the MNIST posterior predictions with EMQ
    train, val, test = fetchMNIST(modality='predictions')

    # the mock classifier just replays the stored posteriors as its predictions
    classifier = MockClassifierFromPosteriors()
    classifier.fit(*train.Xy)

    # alternatives previously tried: KDEyML / PACC (both with fit_classifier=False)
    quantifier = EMQ(classifier=classifier, fit_classifier=False)
    quantifier.fit(*val.Xy)

    protocol = UPP(test, repeats=100, sample_size=500)
    report = qp.evaluation.evaluation_report(quantifier, protocol, verbose=True)
    print(report.mean(numeric_only=True))