diff --git a/BayesianKDEy/_bayeisan_kdey.py b/BayesianKDEy/_bayeisan_kdey.py
index aa6c8d1..908342b 100644
--- a/BayesianKDEy/_bayeisan_kdey.py
+++ b/BayesianKDEy/_bayeisan_kdey.py
@@ -14,6 +14,7 @@ from numbers import Number
 import jax
 import jax.numpy as jnp
+from jax.scipy.special import logsumexp
 import numpyro
 import numpyro.distributions as dist
 from numpyro.infer import MCMC, NUTS
@@ -245,9 +246,10 @@ class BayesianKDEy(AggregativeSoftQuantifier, KDEBase, WithConfidenceABC):
 
     def _bayesian_numpyro(self, X_probs):
         kdes = self.mix_densities
-        test_densities = np.asarray(
-            [self.pdf(kde_i, X_probs, self.kernel) for kde_i in kdes]
+        # work in log-space: raw KDE densities underflow to 0 in high dimensions
+        test_log_densities = np.asarray(
+            [self.pdf(kde_i, X_probs, self.kernel, log_densities=True) for kde_i in kdes]
         )
 
         n_classes = X_probs.shape[-1]
         if isinstance(self.prior, str) and self.prior == 'uniform':
@@ -265,7 +267,7 @@ class BayesianKDEy(AggregativeSoftQuantifier, KDEBase, WithConfidenceABC):
         )
 
         rng_key = jax.random.PRNGKey(self.mcmc_seed)
-        mcmc.run(rng_key, test_densities=test_densities, alpha=alpha)
+        mcmc.run(rng_key, test_log_densities=test_log_densities, alpha=alpha)
 
         samples_z = mcmc.get_samples()["z"]
 
@@ -274,11 +276,11 @@ class BayesianKDEy(AggregativeSoftQuantifier, KDEBase, WithConfidenceABC):
 
         return samples_prev
 
-    def _numpyro_model(self, test_densities, alpha):
+    def _numpyro_model(self, test_log_densities, alpha):
         """
-        test_densities: shape (n_classes, n_instances,)
+        test_log_densities: shape (n_classes, n_instances,)
         """
-        n_classes = test_densities.shape[0]
+        n_classes = test_log_densities.shape[0]
 
         # sample in unconstrained R^(n_classes-1)
         z = numpyro.sample(
@@ -297,10 +299,12 @@ class BayesianKDEy(AggregativeSoftQuantifier, KDEBase, WithConfidenceABC):
         # if alpha is None, then this corresponds to a weak logistic-normal prior
 
         # likelihood
-        test_densities = jnp.array(test_densities)
-        likelihoods = jnp.dot(prev, test_densities)
+        # log p(X|prev) = sum_i log sum_c prev_c p_c(x_i), evaluated stably in log-space
+        log_likelihood = jnp.sum(
+            logsumexp(jnp.log(prev)[:, None] + test_log_densities, axis=0)
+        )
         numpyro.factor(
-            "loglik", (1.0 / self.temperature) * jnp.sum(jnp.log(likelihoods + 1e-10))
+            "loglik", (1.0 / self.temperature) * log_likelihood
         )
 
 
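The rewritten likelihood relies on the identity log sum_c prev_c * p_c(x) = logsumexp_c(log prev_c + log p_c(x)). A minimal NumPy/SciPy sketch (illustration only, not part of the patch) of why the log-space form survives where the direct mixture underflows:

    import numpy as np
    from scipy.special import logsumexp

    log_dens = np.array([[-900.0, -2.0],    # log p_c(x_i): class 0 on two instances
                         [-901.0, -1.5]])   # class 1
    prev = np.array([0.3, 0.7])

    naive = np.log(prev @ np.exp(log_dens))                       # exp(-900) underflows, log(0) = -inf
    stable = logsumexp(np.log(prev)[:, None] + log_dens, axis=0)  # finite: approx [-900.58, -1.63]
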
diff --git a/BayesianKDEy/datasets.py b/BayesianKDEy/datasets.py
index e418667..f4c80f3 100644
--- a/BayesianKDEy/datasets.py
+++ b/BayesianKDEy/datasets.py
@@ -12,37 +12,65 @@ from quapy.data import LabelledCollection
 from quapy.method.aggregative import EMQ
 
 
-def fetchMNIST(modality, data_home='./data/mnist_basiccnn'):
+
+def fetchVisual(modality, dataset, net, data_home='./data'):
     MODALITY = ('features', 'predictions', 'logits')
     assert modality in MODALITY, f'unknown modality, valid ones are {MODALITY}'
-    data_home = Path(data_home)
+    file_prefix = f'{dataset}_{net}'
+    data_home = Path(data_home) / file_prefix
 
     # Load training data
-    train_data = np.load(data_home/'mnist_basiccnn_train_out.npz')
+    train_data = np.load(data_home/f'{file_prefix}_train_out.npz')
     train_X = train_data[modality]
     train_y = train_data['targets']
 
     # Load validation data
-    val_data = np.load(data_home/'mnist_basiccnn_val_out.npz')
+    val_data = np.load(data_home/f'{file_prefix}_val_out.npz')
     val_X = val_data[modality]
     val_y = val_data['targets']
 
     # Load test data
-    test_data = np.load(data_home/'mnist_basiccnn_test_out.npz')
+    test_data = np.load(data_home/f'{file_prefix}_test_out.npz')
     test_X = test_data[modality]
     test_y = test_data['targets']
 
-    print(f'loaded MNIST ({modality=}): '
-          f'#train={len(train_y)}, #val={len(val_y)}, #test={len(test_y)}, #features={train_X.shape[1]}')
-
     train = LabelledCollection(train_X, train_y)
     val = LabelledCollection(val_X, val_y, classes=train.classes_)
     test = LabelledCollection(test_X, test_y, classes=train.classes_)
 
+    def show_prev_stats(data: LabelledCollection):
+        p = data.prevalence()
+        return f'prevs in [{min(p)*100:.3f}%, {max(p)*100:.3f}%]'
+
+    print(f'loaded {dataset} ({modality=}): '
+          f'#train={len(train)}({show_prev_stats(train)}), '
+          f'#val={len(val)}({show_prev_stats(val)}), '
+          f'#test={len(test)}({show_prev_stats(test)}), '
+          f'#features={train_X.shape[1]}, '
+          f'#classes={len(set(train_y))}')
+
     return train, val, test
 
 
+def fetchMNIST(modality, data_home='./data'):
+    return fetchVisual(modality, dataset='mnist', net='basiccnn', data_home=data_home)
+
+def fetchCIFAR100coarse(modality, data_home='./data'):
+    return fetchVisual(modality, dataset='cifar100coarse', net='resnet18', data_home=data_home)
+
+def fetchCIFAR100(modality, data_home='./data'):
+    return fetchVisual(modality, dataset='cifar100', net='resnet18', data_home=data_home)
+
+def fetchCIFAR10(modality, data_home='./data'):
+    return fetchVisual(modality, dataset='cifar10', net='resnet18', data_home=data_home)
+
+def fetchFashionMNIST(modality, data_home='./data'):
+    return fetchVisual(modality, dataset='fashionmnist', net='basiccnn', data_home=data_home)
+
+def fetchSVHN(modality, data_home='./data'):
+    return fetchVisual(modality, dataset='svhn', net='resnet18', data_home=data_home)
+
+
 class DatasetHandler(ABC):
@@ -87,10 +115,19 @@ class DatasetHandler(ABC):
     def is_binary(self): ...
 
 
-class MNISTHandler(DatasetHandler):
+class VisualDataHandler(DatasetHandler):
 
-    def __init__(self, n_val_samples=100, n_test_samples=100, sample_size=500, random_state=0):
-        super().__init__(name='MNIST')
+    def __init__(self, name, n_val_samples=100, n_test_samples=100, sample_size=500, random_state=0):
+        # modality 'features'   : classifier features; the idea is to learn a LogisticRegression on top
+        # modality 'predictions': posterior probabilities
+        # modality 'logits'     : pre-softmax scores
+        super().__init__(name=name)
+        modality = 'predictions'
+        if name.endswith('-f'):
+            modality = 'features'
+        elif name.endswith('-l'):
+            modality = 'logits'
+        self.modality = modality
         self.n_val_samples = n_val_samples
         self.n_test_samples = n_test_samples
         self.sample_size = sample_size
@@ -98,35 +135,61 @@ class MNISTHandler(DatasetHandler):
 
     @lru_cache(maxsize=None)
     def dataset(self):
-        return fetchMNIST(modality='predictions')
+        name = self.name.lower()
+        name = name.replace('-f', '')
+        name = name.replace('-l', '')
+
+        if name == 'mnist':
+            data = fetchMNIST(modality=self.modality)
+        elif name == 'cifar100coarse':
+            data = fetchCIFAR100coarse(modality=self.modality)
+        elif name == 'cifar100':
+            data = fetchCIFAR100(modality=self.modality)
+        elif name == 'cifar10':
+            data = fetchCIFAR10(modality=self.modality)
+        elif name == 'fashionmnist':
+            data = fetchFashionMNIST(modality=self.modality)
+        elif name == 'svhn':
+            data = fetchSVHN(modality=self.modality)
+        else:
+            raise ValueError(f'unknown dataset {name}')
+
+        # the training set was used to train the feature-extraction network;
+        # we use the validation portion as the training set for the quantifiers
+        net_train, val, test = data
+        train, val = val.split_stratified(train_prop=0.6, random_state=self.random_state)
+        return train, val, test
 
     def get_training(self):
-        return self.dataset()[0]
+        train, val, test = self.dataset()
+        return train
 
     def get_validation(self):
-        return self.dataset()[1]
+        train, val, test = self.dataset()
+        return val
 
     def get_train_testprot_for_eval(self):
-        # note that the training goes on the validation split, since the proper training was used for training the neural network
-        _, val, test = self.dataset()
+        train, val, test = self.dataset()
         test_prot = UPP(test, sample_size=self.sample_size, repeats=self.n_test_samples, random_state=self.random_state)
-        return val, test_prot
+        return train+val, test_prot
 
     def get_train_valprot_for_modsel(self):
-        # the training split is never used (was used to train a neural model)
-        # we consider the validation split as our training data, so we return a new split on it
-        _, val, _ = self.dataset()
-        train, val = val.split_stratified(train_prop=0.6, random_state=self.random_state)
+        train, val, test = self.dataset()
         val_prot = UPP(val, sample_size=self.sample_size, repeats=self.n_val_samples, random_state=self.random_state)
         return train, val_prot
 
     @classmethod
     def get_datasets(cls):
-        return ['MNIST']
+        datasets = ['cifar10', 'mnist', 'cifar100coarse', 'fashionmnist', 'svhn']  # + ['cifar100']
+        # datasets_feat = [f'{d}-f' for d in datasets]
+        datasets_feat = [f'{d}-l' for d in datasets]
+        return datasets_feat  # + datasets
 
     @classmethod
     def iter(cls, **kwargs):
-        yield cls(**kwargs)
+        for name in cls.get_datasets():
+            yield cls(name, **kwargs)
 
     def __repr__(self):
         return f'{self.name}'
@@ -136,6 +199,18 @@
         return False
 
 
+class CIFAR100Handler(VisualDataHandler):
+    def __init__(self, name, n_val_samples=100, n_test_samples=100, sample_size=2000, random_state=0):
+        super().__init__(name=name, n_val_samples=n_val_samples, n_test_samples=n_test_samples, sample_size=sample_size, random_state=random_state)
+
+    @classmethod
+    def get_datasets(cls):
+        datasets = ['cifar100']
+        # datasets_feat = [f'{d}-f' for d in datasets]
+        datasets_feat = [f'{d}-l' for d in datasets]
+        return datasets_feat  # + datasets
+
+
 # LeQua multiclass tasks
 class LeQuaHandler(DatasetHandler):
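A minimal usage sketch of the new naming convention (illustration only; it assumes the defaults above): a '-f' or '-l' suffix on the dataset name selects 'features' or 'logits', and a bare name falls back to 'predictions':

    from datasets import VisualDataHandler

    for handler in VisualDataHandler.iter():
        # get_datasets() currently yields the '-l' variants, e.g. 'cifar10-l'
        print(handler.name, handler.modality)   # -> 'cifar10-l logits', ...
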
diff --git a/BayesianKDEy/error_expected.py b/BayesianKDEy/error_expected.py
new file mode 100644
index 0000000..1ca39be
--- /dev/null
+++ b/BayesianKDEy/error_expected.py
@@ -0,0 +1,9 @@
+import quapy as qp
+
+from quapy.error import mae
+import quapy.functional as F
+
+for n in range(2, 100, 5):
+    a = F.uniform_prevalence_sampling(n_classes=n, size=10_000)
+    b = F.uniform_prevalence_sampling(n_classes=n, size=10_000)
+    print(f'{n=} ae={mae(a, b):.5f}')
\ No newline at end of file
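This script estimates the chance-level MAE between two prevalence vectors drawn uniformly at random from the simplex, a useful floor when reading the error tables. For n=2 there is a closed form to check against: a uniform draw from the simplex is (p, 1-p) with p ~ Uniform(0,1), both components contribute the same |p - q|, and E|p - q| = 1/3 for independent uniforms. A quick standalone check (not part of the patch):

    import numpy as np

    rng = np.random.default_rng(0)
    p, q = rng.random(100_000), rng.random(100_000)
    print(np.abs(p - q).mean())   # ~0.3333, matching the n=2 row
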
diff --git a/BayesianKDEy/full_experiments.py b/BayesianKDEy/full_experiments.py
index ac5bb36..062eb09 100644
--- a/BayesianKDEy/full_experiments.py
+++ b/BayesianKDEy/full_experiments.py
@@ -7,7 +7,7 @@ from _bayeisan_kdey import BayesianKDEy
 from _bayesian_mapls import BayesianMAPLS
 from commons import experiment_path, KDEyCLR, RESULT_DIR, MockClassifierFromPosteriors
 # import datasets
-from datasets import LeQuaHandler, UCIMulticlassHandler, DatasetHandler, MNISTHandler
+from datasets import LeQuaHandler, UCIMulticlassHandler, DatasetHandler, VisualDataHandler, CIFAR100Handler
 from temperature_calibration import temp_calibration
 from build.lib.quapy.data import LabelledCollection
 from quapy.method.aggregative import DistributionMatchingY as DMy, AggregativeQuantifier, EMQ, CC
@@ -32,10 +32,11 @@ def methods(data_handler: DatasetHandler):
    - bayesian/bootstrap_constructor: is a function that instantiates the bayesian or bootstrap method
      with the quantifier's optimized hyperparameters
    """
-    if isinstance(data_handler, MNISTHandler):
+    if False:  # isinstance(data_handler, VisualDataHandler):
         Cls = MockClassifierFromPosteriors
         cls_hyper = {}
         val_split = data_handler.get_validation().Xy  # use this specific collection
+        pass
     else:
         Cls = LogisticRegression
         cls_hyper = {'classifier__C': np.logspace(-4,4,9), 'classifier__class_weight': ['balanced', None]}
@@ -69,9 +70,9 @@ def methods(data_handler: DatasetHandler):
     # --------------------------------------------------------------------------------------------------------
     # yield 'BayesianACC', acc, acc_hyper, lambda hyper: BayesianCC(Cls(), val_split=val_split, mcmc_seed=0), multiclass_method
     #yield 'BayesianHDy', hdy, hdy_hyper, lambda hyper: PQ(Cls(), val_split=val_split, stan_seed=0, **hyper), only_binary
-    yield f'BaKDE-Ait-numpyro', kde_ait, kdey_hyper_clr, lambda hyper: BayesianKDEy(Cls(), kernel='aitchison', mcmc_seed=0, engine='numpyro', val_split=val_split, **hyper), multiclass_method
+    # yield f'BaKDE-Ait-numpyro', kde_ait, kdey_hyper_clr, lambda hyper: BayesianKDEy(Cls(), kernel='aitchison', mcmc_seed=0, engine='numpyro', val_split=val_split, **hyper), multiclass_method
     yield f'BaKDE-Gau-numpyro', kde_gau, kdey_hyper, lambda hyper: BayesianKDEy(Cls(), kernel='gaussian', mcmc_seed=0, engine='numpyro', val_split=val_split, **hyper), multiclass_method
-    yield f'BaKDE-Ait-T*', kde_ait, kdey_hyper_clr, lambda hyper: BayesianKDEy(Cls(), kernel='aitchison', mcmc_seed=0, engine='numpyro', temperature=None, val_split=val_split, **hyper), multiclass_method
+    # yield f'BaKDE-Ait-T*', kde_ait, kdey_hyper_clr, lambda hyper: BayesianKDEy(Cls(), kernel='aitchison', mcmc_seed=0, engine='numpyro', temperature=None, val_split=val_split, **hyper), multiclass_method
     yield f'BaKDE-Gau-T*', kde_gau, kdey_hyper, lambda hyper: BayesianKDEy(Cls(), kernel='gaussian', mcmc_seed=0, engine='numpyro', temperature=None, val_split=val_split, **hyper), multiclass_method
     # yield 'BayEMQ', emq, acc_hyper, lambda hyper: BayesianMAPLS(Cls(), prior='uniform', temperature=1, exact_train_prev=False, val_split=val_split), multiclass_method
     # yield 'BayEMQ*', emq, acc_hyper, lambda hyper: BayesianMAPLS(Cls(), prior='uniform', temperature=None, exact_train_prev=False, val_split=val_split), multiclass_method
@@ -124,6 +125,7 @@ def experiment(dataset: DatasetHandler, point_quantifier: AggregativeQuantifier,
     best_hyperparams = qp.util.pickled_resource(
         hyper_choice_path, model_selection, dataset, cp(point_quantifier), grid
     )
+    print(f'{best_hyperparams=}')
 
     t_init = time()
     uncertainty_quantifier = uncertainty_quant_constructor(best_hyperparams)
@@ -177,7 +179,7 @@ if __name__ == '__main__':
 
     result_dir = RESULT_DIR
 
-    for data_handler in [MNISTHandler]:  # , UCIMulticlassHandler, LeQuaHandler]:
+    for data_handler in [CIFAR100Handler, VisualDataHandler]:  # , UCIMulticlassHandler, LeQuaHandler]:
         for dataset in data_handler.iter():
             qp.environ['SAMPLE_SIZE'] = dataset.sample_size
             print(f'dataset={dataset.name}')
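For context on the new print: qp.util.pickled_resource is quapy's disk-level memoization helper; it runs the given function and pickles the result on the first call, then loads the pickle on later runs, so model selection is only paid once per configuration. A sketch of the pattern (the path below is hypothetical):

    # first run: executes model_selection(...) and writes the pickle;
    # later runs: load the pickled result instead of recomputing
    best_hyperparams = qp.util.pickled_resource(
        './hyperparams/example.pkl',          # cache file (hypothetical)
        model_selection,                      # function to run on a cache miss
        dataset, cp(point_quantifier), grid,  # its positional arguments
    )
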
diff --git a/BayesianKDEy/generate_results.py b/BayesianKDEy/generate_results.py
index f4b013f..205b6b8 100644
--- a/BayesianKDEy/generate_results.py
+++ b/BayesianKDEy/generate_results.py
@@ -8,7 +8,7 @@ from glob import glob
 from pathlib import Path
 import quapy as qp
 from BayesianKDEy.commons import RESULT_DIR
-from BayesianKDEy.datasets import LeQuaHandler, UCIMulticlassHandler, MNISTHandler
+from BayesianKDEy.datasets import LeQuaHandler, UCIMulticlassHandler, VisualDataHandler, CIFAR100Handler
 from error import dist_aitchison
 from quapy.method.confidence import ConfidenceIntervals
 from quapy.method.confidence import ConfidenceEllipseSimplex, ConfidenceEllipseCLR, ConfidenceEllipseILR, ConfidenceIntervals, ConfidenceRegionABC
@@ -119,7 +119,7 @@ n_classes = {}
 tr_size = {}
 tr_prev = {}
 
-for dataset_handler in [UCIMulticlassHandler, LeQuaHandler, MNISTHandler]:
+for dataset_handler in [UCIMulticlassHandler, LeQuaHandler, VisualDataHandler, CIFAR100Handler]:
 
     problem_type = 'binary' if dataset_handler.is_binary() else 'multiclass'
     path = f'./{base_dir}/{problem_type}/*.pkl'
diff --git a/quapy/method/_kdey.py b/quapy/method/_kdey.py
index 2f12eb8..2f60c2b 100644
--- a/quapy/method/_kdey.py
+++ b/quapy/method/_kdey.py
@@ -61,7 +61,7 @@ class KDEBase:
         return KernelDensity(bandwidth=bandwidth).fit(X)
 
-    def pdf(self, kde, X, kernel):
+    def pdf(self, kde, X, kernel, log_densities=False):
         """
-        Wraps the density evalution of scikit-learn's KDE. Scikit-learn returns
-        log-scores (s), so this function returns :math:`e^{s}`
+        Wraps the density evaluation of scikit-learn's KDE. Scikit-learn returns
+        log-scores (s), so this function returns :math:`e^{s}`, or s if ``log_densities=True``
 
@@ -76,7 +76,11 @@ class KDEBase:
         elif kernel == 'ilr':
             X = self.ilr_transform(X)
 
-        return np.exp(kde.score_samples(X))
+        log_density = kde.score_samples(X)
+        if log_densities:
+            return log_density
+        else:
+            return np.exp(log_density)
 
     def get_mixture_components(self, X, y, classes, bandwidth, kernel):
         """
@@ -93,6 +97,6 @@ class KDEBase:
         class_cond_X = []
         for cat in classes:
             selX = X[y==cat]
             if selX.size==0:
-                selX = [F.uniform_prevalence(len(classes))]
+                raise ValueError(f'empty class {cat}: cannot fit a KDE on zero instances')
             class_cond_X.append(selX)
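Finally, a usage sketch for the new log_densities flag (standalone and synthetic; the KernelDensity construction mirrors the one shown above, and it assumes, as the elif chain suggests, that kernel='gaussian' applies no transform):

    import numpy as np
    from sklearn.neighbors import KernelDensity
    from quapy.method._kdey import KDEBase

    rng = np.random.default_rng(0)
    X_train = rng.dirichlet(alpha=np.ones(3), size=100)   # mock posterior probabilities
    X_test = rng.dirichlet(alpha=np.ones(3), size=10)

    base = KDEBase()
    kde = KernelDensity(bandwidth=0.1).fit(X_train)
    # log-scores straight from scikit-learn, as BayesianKDEy now consumes them
    logp = base.pdf(kde, X_test, kernel='gaussian', log_densities=True)
    # equivalent to the old behaviour (log_densities=False)
    assert np.allclose(np.exp(logp), base.pdf(kde, X_test, kernel='gaussian'))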