import os
|
|
from functools import lru_cache
|
|
from pathlib import Path
|
|
|
|
from jax import numpy as jnp
|
|
from sklearn.base import BaseEstimator
|
|
|
|
import error
|
|
import functional as F
|
|
|
|
import quapy as qp
|
|
import numpy as np
|
|
|
|
from method.aggregative import KDEyML
|
|
from quapy.functional import l1_norm, ILRtransformation
|
|
from scipy.stats import entropy
|
|
from abc import ABC, abstractmethod
|
|
|
|
|
|
# Toggle between fine-grained and standard experiment outputs.
FINEGRAINED = True
# Results are written under a directory that reflects the granularity mode.
RESULT_DIR = Path('results_finegrained' if FINEGRAINED else 'results')
|
|
|
|
|
|
class DatasetHandler(ABC):
    """
    Abstract interface for the dataset providers used in the experiments.

    A concrete handler knows how to fetch its dataset (lazily, where possible)
    and how to produce (training, sample-protocol) pairs both for final
    evaluation and for model selection.
    """

    def __init__(self, name: str, sample_size: int):
        """
        :param name: identifier of the dataset
        :param sample_size: number of instances per generated sample
        """
        self._name = name
        self._sample_size = sample_size

    @abstractmethod
    def get_training(self):
        """Return the training collection."""
        ...

    @abstractmethod
    def get_train_testprot_for_eval(self):
        """Return (training, test-sample protocol) for final evaluation."""
        ...

    @abstractmethod
    def get_train_valprot_for_modsel(self):
        """Return (training, validation-sample protocol) for model selection."""
        ...

    def sample_size(self):
        """Number of instances per generated sample."""
        return self._sample_size

    def name(self):
        """Dataset identifier."""
        return self._name

    @classmethod
    @abstractmethod
    def iter(cls):
        """Yield one handler instance per available dataset."""
        ...

    def __repr__(self):
        return self.__class__.__name__

    @classmethod
    @abstractmethod
    def is_binary(cls):
        # fixed: the first parameter of a @classmethod is `cls`, not `self`
        """Whether the handled datasets are binary (True) or multiclass (False)."""
        ...
|
|
|
|
|
|
class UCIMulticlassHandler(DatasetHandler):
    """
    Handler for the UCI multiclass datasets shipped with quapy.
    """

    DATASETS = qp.datasets.UCI_MULTICLASS_DATASETS.copy()

    def __init__(self, name, n_val_samples=100, n_test_samples=100):
        """
        :param name: UCI dataset name (one of DATASETS)
        :param n_val_samples: number of validation samples for model selection
        :param n_test_samples: number of test samples for evaluation
        """
        super().__init__(name, sample_size=1000)
        self._dataset = None  # lazy fetch; populated on first call to dataset()
        self.n_val_samples = n_val_samples
        self.n_test_samples = n_test_samples

    def get_training(self):
        """Return the training collection."""
        return self.dataset().training

    def get_train_testprot_for_eval(self):
        """Return (training, UPP test protocol); fixed seed for reproducibility."""
        training, test = self.dataset().train_test
        test_generator = qp.protocol.UPP(test, repeats=self.n_test_samples, random_state=0)
        return training, test_generator

    def get_train_valprot_for_modsel(self):
        """Split training 60/40 (stratified); return (reduced training, UPP val protocol)."""
        training = self.dataset().training
        training, val = training.split_stratified(train_prop=0.6, random_state=0)
        val_generator = qp.protocol.UPP(val, repeats=self.n_val_samples, random_state=0)
        return training, val_generator

    def dataset(self):
        """
        Fetch the dataset on first access and cache it on the instance.

        Fixed: the original additionally wrapped this in @lru_cache, which on an
        instance method keys the cache on `self` and keeps every handler alive
        for the cache's lifetime (ruff B019); the explicit `self._dataset` lazy
        caching below already provides the intended memoization.
        """
        if self._dataset is None:
            self._dataset = qp.datasets.fetch_UCIMulticlassDataset(self.name(), min_class_support=0.01)
        return self._dataset

    def __repr__(self):
        return ""  # self.dataset().__repr__()

    @classmethod
    def iter(cls):
        """Yield one handler per UCI multiclass dataset."""
        for name in cls.DATASETS:
            yield cls(name)

    @classmethod
    def is_binary(cls):
        # fixed: `cls`, not `self`, for a @classmethod
        return False
|
|
|
|
|
|
class LeQuaHandler(DatasetHandler):
    """
    Handler for the LeQua-2022 (task T1B) and LeQua-2024 (task T2)
    competition datasets.
    """

    DATASETS = ['LeQua2022', 'LeQua2024']

    def __init__(self, name):
        """
        :param name: one of DATASETS ('LeQua2022' or 'LeQua2024')
        """
        super().__init__(name, sample_size=1000)
        self._dataset = None  # lazy fetch; populated on first call to dataset()

    def get_training(self):
        """Return the training collection (first element of the fetched tuple)."""
        return self.dataset()[0]

    def get_train_testprot_for_eval(self):
        """Return (training, official test protocol)."""
        training, _, test_generator = self.dataset()
        return training, test_generator

    def get_train_valprot_for_modsel(self):
        """Return (training, official validation protocol)."""
        training, val_generator, _ = self.dataset()
        return training, val_generator

    def dataset(self):
        """
        Fetch the (training, val-protocol, test-protocol) tuple on first access
        and cache it on the instance.

        Fixed: the original additionally wrapped this in @lru_cache, which on an
        instance method keys the cache on `self` and keeps every handler alive
        (ruff B019); the explicit `self._dataset` caching below suffices.

        :raises ValueError: if the handler was built with an unexpected name
        """
        if self._dataset is None:
            if self.name() == 'LeQua2022':
                self._dataset = qp.datasets.fetch_lequa2022(task='T1B')
            elif self.name() == 'LeQua2024':
                self._dataset = qp.datasets.fetch_lequa2024(task='T2')
            else:
                raise ValueError(f'unexpected dataset name {self.name()}; valid ones are {self.DATASETS}')
        return self._dataset

    def __repr__(self):
        return self.dataset().__repr__()

    @classmethod
    def iter(cls):
        """Yield one handler per LeQua edition."""
        for name in cls.DATASETS:
            yield cls(name)

    @classmethod
    def is_binary(cls):
        # fixed: `cls`, not `self`, for a @classmethod
        return False
|
|
|
|
# def fetch_UCI_multiclass(data_name):
|
|
# return qp.datasets.fetch_UCIMulticlassDataset(data_name, min_class_support=0.01)
|
|
|
|
|
|
def fetch_UCI_binary(data_name):
    """Fetch the UCI binary dataset named `data_name` via quapy."""
    dataset = qp.datasets.fetch_UCIBinaryDataset(data_name)
    return dataset
|
|
|
|
# global configurations
|
|
|
|
# configuration bundle for the UCI binary experiments
binary = dict(
    datasets=qp.datasets.UCI_BINARY_DATASETS.copy(),
    fetch_fn=fetch_UCI_binary,
    sample_size=500,
)
|
|
|
|
# multiclass = {
|
|
# 'datasets': qp.datasets.UCI_MULTICLASS_DATASETS.copy(),
|
|
# 'fetch_fn': fetch_UCI_multiclass,
|
|
# 'sample_size': 1000
|
|
# }
|
|
# try:
|
|
# multiclass['datasets'].remove('poker_hand') # random performance
|
|
# multiclass['datasets'].remove('hcv') # random performance
|
|
# multiclass['datasets'].remove('letter') # many classes
|
|
# multiclass['datasets'].remove('isolet') # many classes
|
|
# except ValueError:
|
|
# pass
|
|
|
|
|
|
|
|
|
|
# utils
|
|
def experiment_path(dir: Path, dataset_name: str, method_name: str) -> Path:
    """
    Return the path of the pickled result file for a (dataset, method) pair,
    creating the results directory (including parents) if it does not exist.

    :param dir: results directory (Path or str; str accepted for convenience)
    :param dataset_name: name of the dataset
    :param method_name: name of the method
    :return: Path '<dir>/<dataset_name>__<method_name>.pkl'
    """
    dir = Path(dir)
    # pathlib idiom replaces os.makedirs; parents=True also covers nested dirs
    dir.mkdir(parents=True, exist_ok=True)
    return dir / f'{dataset_name}__{method_name}.pkl'
|
|
|
|
|
|
def normalized_entropy(p):
    """
    Normalized Shannon entropy in [0, 1].

    p: array-like, prevalence vector (sums to 1)

    Fixed: for a length-1 vector the original divided by H_max = log(1) = 0,
    producing a NaN/warning; a single-class distribution has zero entropy, so
    0.0 is returned in that degenerate case.
    """
    p = np.asarray(p)
    H = entropy(p)  # Shannon entropy (natural log)
    H_max = np.log(len(p))
    if H_max == 0:  # len(p) == 1: avoid 0/0
        return 0.0
    return np.clip(H / H_max, 0, 1)
|
|
|
|
|
|
def antagonistic_prevalence(p, strength=1):
    """
    Return a prevalence vector "antagonistic" to `p`: the vector obtained by
    mapping `p` to ILR space, reflecting it through the origin (scaled by
    `strength`), and mapping the result back onto the simplex.
    """
    transform = ILRtransformation()
    reflected = -strength * transform(p)
    return transform.inverse(reflected)
|
|
|
|
|
|
class KDEyCLR(KDEyML):
    """KDEyML fixed to the Aitchison (CLR-based) kernel; all other hyperparameters are forwarded unchanged."""

    def __init__(self, classifier: BaseEstimator=None, fit_classifier=True, val_split=5, bandwidth=1., random_state=None):
        super().__init__(
            classifier=classifier,
            fit_classifier=fit_classifier,
            val_split=val_split,
            bandwidth=bandwidth,
            kernel='aitchison',
            random_state=random_state,
        )
|
|
|
|
|
|
class KDEyILR(KDEyML):
    """KDEyML fixed to the ILR kernel; all other hyperparameters are forwarded unchanged."""

    def __init__(self, classifier: BaseEstimator=None, fit_classifier=True, val_split=5, bandwidth=1., random_state=None):
        super().__init__(
            classifier=classifier,
            fit_classifier=fit_classifier,
            val_split=val_split,
            bandwidth=bandwidth,
            kernel='ilr',
            random_state=random_state,
        )
|
|
|
|
|
|
class ILRtransformation(F.CompositionalTransformation):
    """
    Isometric log-ratio (ILR) transformation between the probability simplex
    and unconstrained (k-1)-dimensional real space, built on a Helmert basis.

    Supports both numpy and jax backends via `jax_mode` (jax arrays allow the
    transformation to be used inside differentiable code).
    """

    def __init__(self, jax_mode=False):
        # jax_mode: operate on jax arrays if True, numpy arrays otherwise
        self.jax_mode = jax_mode

    def array(self, X):
        """Convert X to an array of the active backend."""
        if self.jax_mode:
            return jnp.array(X)
        else:
            return np.asarray(X)

    def __call__(self, X):
        """
        Map composition(s) X (last axis sums to 1, length k) to ILR
        coordinates of shape (..., k-1).
        """
        X = self.array(X)
        # smooth to avoid log(0); EPSILON is presumably inherited from
        # F.CompositionalTransformation -- TODO confirm
        X = qp.error.smooth(X, self.EPSILON)
        k = X.shape[-1]
        V = self.array(self.get_V(k))
        logp = jnp.log(X) if self.jax_mode else np.log(X)
        return logp @ V.T

    def inverse(self, Z):
        """
        Map ILR coordinates Z (shape (..., k-1)) back onto the simplex
        (shape (..., k)).
        """
        Z = self.array(Z)
        k_minus_1 = Z.shape[-1]
        k = k_minus_1 + 1
        V = self.array(self.get_V(k))
        logp = Z @ V
        exp_fn = jnp.exp if self.jax_mode else np.exp
        sum_fn = jnp.sum if self.jax_mode else np.sum
        p = exp_fn(logp)
        p = p / sum_fn(p, axis=-1, keepdims=True)  # renormalize to sum to 1
        return p

    @staticmethod
    @lru_cache(maxsize=None)
    def get_V(k):
        """
        Return the (k-1, k) orthonormal ILR basis derived from the Helmert matrix.

        Fixed: the original applied @lru_cache directly to the instance method,
        which keys the cache on `self` and keeps every instance alive for the
        cache's lifetime (ruff B019); caching on a staticmethod keys on `k`
        alone. Instance calls (self.get_V(k)) keep working unchanged.
        """
        def helmert_matrix(k):
            """
            Returns the (k x k) Helmert matrix.
            """
            H = np.zeros((k, k))
            for i in range(1, k):
                H[i, :i] = 1
                H[i, i] = -(i)
                H[i] = H[i] / np.sqrt(i * (i + 1))
            # row 0 stays zeros; will be discarded
            return H

        def ilr_basis(k):
            """
            Constructs an orthonormal ILR basis using the Helmert submatrix.
            Output shape: (k-1, k)
            """
            H = helmert_matrix(k)
            V = H[1:, :]  # remove first row of zeros
            return V

        return ilr_basis(k)
|
|
|
|
|
|
def in_simplex(x):
    """Check that x is a valid probability vector: non-negative entries summing (approximately) to 1."""
    nonnegative = np.all(x >= 0)
    sums_to_one = np.isclose(x.sum(), 1)
    return nonnegative and sums_to_one
|