import inspect
from abc import ABC, abstractmethod
from functools import lru_cache
from pathlib import Path

import numpy as np
import quapy as qp
from quapy.data import LabelledCollection
from quapy.method.aggregative import EMQ
from quapy.protocol import UPP

from commons import MockClassifierFromPosteriors


def fetchMNIST(modality, data_home='./data/mnist_basiccnn'):
    """Loads the precomputed outputs (features, posterior predictions, or logits) of a
    basic CNN trained on MNIST, and returns the train/val/test splits as
    LabelledCollection objects sharing the same class order."""
    MODALITY = ('features', 'predictions', 'logits')
    assert modality in MODALITY, f'unknown modality {modality}; valid ones are {MODALITY}'

    data_home = Path(data_home)

    # Load training data
    train_data = np.load(data_home/'mnist_basiccnn_train_out.npz')
    train_X = train_data[modality]
    train_y = train_data['targets']

    # Load validation data
    val_data = np.load(data_home/'mnist_basiccnn_val_out.npz')
    val_X = val_data[modality]
    val_y = val_data['targets']

    # Load test data
    test_data = np.load(data_home/'mnist_basiccnn_test_out.npz')
    test_X = test_data[modality]
    test_y = test_data['targets']

    print(f'loaded MNIST ({modality=}): '
          f'#train={len(train_y)}, #val={len(val_y)}, #test={len(test_y)}, #features={train_X.shape[1]}')

    train = LabelledCollection(train_X, train_y)
    val = LabelledCollection(val_X, val_y, classes=train.classes_)
    test = LabelledCollection(test_X, test_y, classes=train.classes_)

    return train, val, test


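# Usage sketch (illustrative; the .npz files are assumed to have been produced
# beforehand by the CNN training script under ./data/mnist_basiccnn):
#
#   train, val, test = fetchMNIST(modality='logits')
#   print(train.classes_)   # the ten MNIST digit classes

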
class DatasetHandler(ABC):
    """Common interface for the dataset handlers used in the experiments: each handler
    exposes the training data, a validation protocol for model selection, and a test
    protocol for evaluation."""

    def __init__(self, name):
        self.name = name

    @classmethod
    def get_defaults(cls):
        # collect the default values of the constructor's keyword arguments
        sig = inspect.signature(cls.__init__)

        defaults = {
            name: param.default
            for name, param in sig.parameters.items()
            if param.default is not inspect.Parameter.empty
        }

        return defaults

    @abstractmethod
    def get_training(self): ...

    @abstractmethod
    def get_train_testprot_for_eval(self): ...

    @abstractmethod
    def get_train_valprot_for_modsel(self): ...

    @classmethod
    @abstractmethod
    def get_datasets(cls): ...

    @classmethod
    def iter(cls, **kwargs):
        for name in cls.get_datasets():
            yield cls(name, **kwargs)

    def __repr__(self):
        return f'{self.__class__.__name__}({self.name})'

    @classmethod
    @abstractmethod
    def is_binary(cls): ...


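# Usage sketch: get_defaults() inspects the subclass constructor signature; e.g., for
# the MNISTHandler defined below it would return
# {'n_val_samples': 100, 'n_test_samples': 100, 'sample_size': 500, 'random_state': 0}

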
class MNISTHandler(DatasetHandler):

    def __init__(self, n_val_samples=100, n_test_samples=100, sample_size=500, random_state=0):
        super().__init__(name='MNIST')
        self.n_val_samples = n_val_samples
        self.n_test_samples = n_test_samples
        self.sample_size = sample_size
        self.random_state = random_state

    # cached so that the .npz files are loaded only once per handler instance
    @lru_cache(maxsize=None)
    def dataset(self):
        return fetchMNIST(modality='predictions')

    def get_training(self):
        return self.dataset()[0]

    def get_validation(self):
        return self.dataset()[1]

    def get_train_testprot_for_eval(self):
        # note that the validation split plays the role of the training data here,
        # since the proper training split was already used to train the neural network
        _, val, test = self.dataset()
        test_prot = UPP(test, sample_size=self.sample_size, repeats=self.n_test_samples, random_state=self.random_state)
        return val, test_prot

    def get_train_valprot_for_modsel(self):
        # the training split is never used here (it served to train the neural model);
        # we treat the validation split as our training data and re-split it
        _, val, _ = self.dataset()
        train, val = val.split_stratified(train_prop=0.6, random_state=self.random_state)
        val_prot = UPP(val, sample_size=self.sample_size, repeats=self.n_val_samples, random_state=self.random_state)
        return train, val_prot

    @classmethod
    def get_datasets(cls):
        return ['MNIST']

    @classmethod
    def iter(cls, **kwargs):
        yield cls(**kwargs)

    def __repr__(self):
        return f'{self.name}'

    @classmethod
    def is_binary(cls):
        return False


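# Sketch of how a handler is consumed (EMQ is just an illustrative choice, mirroring
# the smoke test at the bottom of this file):
#
#   handler = MNISTHandler()
#   clf = MockClassifierFromPosteriors()
#   clf.fit(*handler.get_training().Xy)
#   train, test_prot = handler.get_train_testprot_for_eval()
#   quantifier = EMQ(classifier=clf, fit_classifier=False)
#   quantifier.fit(*train.Xy)
#   mae = qp.evaluation.evaluate(quantifier, protocol=test_prot, error_metric='mae')

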
# LeQua multiclass tasks
class LeQuaHandler(DatasetHandler):

    DATASETS = ['LeQua2022', 'LeQua2024']

    def __init__(self, name):
        super().__init__(name)
        self.sample_size = 1000

    def get_training(self):
        return self.dataset()[0]

    def get_train_testprot_for_eval(self):
        training, _, test_generator = self.dataset()
        return training, test_generator

    def get_train_valprot_for_modsel(self):
        training, val_generator, _ = self.dataset()
        return training, val_generator

    @lru_cache(maxsize=None)
    def dataset(self):
        if self.name == 'LeQua2022':
            return qp.datasets.fetch_lequa2022(task='T1B')
        elif self.name == 'LeQua2024':
            return qp.datasets.fetch_lequa2024(task='T2')
        else:
            raise ValueError(f'unexpected dataset name {self.name}; valid ones are {self.DATASETS}')

    def __repr__(self):
        return self.name

    @classmethod
    def iter(cls):
        for name in cls.DATASETS:
            yield cls(name)

    @classmethod
    def is_binary(cls):
        return False

    @classmethod
    def get_datasets(cls):
        return cls.DATASETS


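# Note: unlike the other handlers, the LeQua datasets ship with their own validation
# and test sample generators (the official competition samples), so LeQuaHandler
# builds no UPP protocol; sample_size=1000 records the fixed size of those samples.

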
class UCIDatasetHandler(DatasetHandler, ABC):

    DATASETS = []

    def __init__(self, name, n_val_samples, n_test_samples, sample_size, random_state):
        super().__init__(name=name)
        self.sample_size = sample_size
        self.n_val_samples = n_val_samples
        self.n_test_samples = n_test_samples
        self.random_state = random_state

    def get_training(self):
        return self.dataset().training

    def get_train_testprot_for_eval(self):
        # UPP draws the prevalence of every test sample uniformly at random from
        # the simplex, thus simulating prior probability shift
        training, test = self.dataset().train_test
        test_generator = qp.protocol.UPP(test, sample_size=self.sample_size, repeats=self.n_test_samples,
                                         random_state=self.random_state)
        return training, test_generator

    def get_train_valprot_for_modsel(self):
        # a fresh 60/40 stratified split of the training data provides the
        # validation samples for model selection
        training = self.dataset().training
        training, val = training.split_stratified(train_prop=0.6, random_state=self.random_state)
        val_generator = qp.protocol.UPP(val, sample_size=self.sample_size, repeats=self.n_val_samples,
                                        random_state=self.random_state)
        return training, val_generator

    @classmethod
    def get_datasets(cls):
        return cls.DATASETS

    @classmethod
    def iter(cls):
        for name in cls.DATASETS:
            yield cls(name)


class UCIMulticlassHandler(UCIDatasetHandler):

    # hcv and poker_hand are excluded from the benchmark
    DATASETS = [d for d in qp.datasets.UCI_MULTICLASS_DATASETS if d not in frozenset(['hcv', 'poker_hand'])]

    def __init__(self, name, n_val_samples=100, n_test_samples=100, sample_size=1000, random_state=0):
        super().__init__(name, n_val_samples, n_test_samples, sample_size, random_state)

    @lru_cache(maxsize=None)
    def dataset(self):
        # min_class_support filters out the most underrepresented classes
        return qp.datasets.fetch_UCIMulticlassDataset(self.name, min_class_support=0.01)

    def __repr__(self):
        return f'{self.name}(UCI-multiclass)'

    @classmethod
    def is_binary(cls):
        return False


class UCIBinaryHandler(UCIDatasetHandler):

    DATASETS = qp.datasets.UCI_BINARY_DATASETS.copy()

    def __init__(self, name, n_val_samples=100, n_test_samples=100, sample_size=500, random_state=0):
        super().__init__(name, n_val_samples, n_test_samples, sample_size, random_state)

    @lru_cache(maxsize=None)
    def dataset(self):
        return qp.datasets.fetch_UCIBinaryDataset(self.name)

    def __repr__(self):
        return f'{self.name}(UCI-binary)'

    @classmethod
    def is_binary(cls):
        return True


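# Sketch: enumerating every benchmark family together with its per-family defaults
# (illustrative only):
#
#   for handler_cls in (MNISTHandler, LeQuaHandler, UCIMulticlassHandler, UCIBinaryHandler):
#       for handler in handler_cls.iter():
#           print(handler, handler.get_defaults())

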
if __name__ == '__main__':
    # smoke test: run EMQ on top of the precomputed CNN posteriors, using a mock
    # classifier built from those posteriors
    train, val, test = fetchMNIST(modality='predictions')

    cls = MockClassifierFromPosteriors()
    cls.fit(*train.Xy)

    # q = KDEyML(classifier=cls, fit_classifier=False)
    # q = PACC(classifier=cls, fit_classifier=False)
    q = EMQ(classifier=cls, fit_classifier=False)
    q.fit(*val.Xy)

    test_prot = UPP(test, repeats=100, sample_size=500)
    report = qp.evaluation.evaluation_report(q, test_prot, verbose=True)
    print(report.mean(numeric_only=True))