QuaPy/quapy/method/confidence.py

import numpy as np
from sklearn.base import BaseEstimator
from sklearn.metrics import confusion_matrix

import quapy as qp
import quapy.functional as F
from quapy.method import _bayesian
from quapy.method.aggregative import AggregativeCrispQuantifier
from quapy.data import LabelledCollection
from quapy.method.aggregative import AggregativeQuantifier
from scipy.stats import chi2
from sklearn.utils import resample
from abc import ABC, abstractmethod
from scipy.special import softmax, factorial
import copy
from functools import lru_cache

"""
This module provides implementation of different types of confidence regions, and the implementation of Bootstrap
for AggregativeQuantifiers.
"""

class ConfidenceRegionABC(ABC):
    """
    Abstract class of confidence regions
    """

    @abstractmethod
    def point_estimate(self) -> np.ndarray:
        """
        Returns the point estimate corresponding to a set of bootstrap estimates.

        :return: np.ndarray
        """
        ...

    def ndim(self) -> int:
        """
        Number of dimensions of the region. This number corresponds to the total number of classes. The dimensionality
        of the simplex is therefore ndim-1

        :return: int
        """
        return len(self.point_estimate())

    @abstractmethod
    def coverage(self, true_value) -> float:
        """
        Checks whether a value, or a sets of values, are contained in the confidence region. The method computes the
        fraction of these that are contained in the region, if more than one value is passed. If only one value is
        passed, then it either returns 1.0 or 0.0, for indicating the value is in the region or not, respectively.

        :param true_value: a np.ndarray of shape (n_classes,) or shape (n_values, n_classes,)
        :return: float in [0,1]
        """
        ...

    @lru_cache
    def simplex_portion(self):
        """
        Computes the fraction of the simplex which is covered by the region. This is not the volume of the region
        itself (which could lie outside the boundaries of the simplex), but the actual fraction of the simplex
        contained in the region. A default implementation, based on Monte Carlo approximation, is provided.

        :return: float, the fraction of the simplex covered by the region
        """
        return self.montecarlo_proportion()

    @lru_cache
    def montecarlo_proportion(self, n_trials=10_000):
        """
        Estimates, via a Monte Carlo approach, the fraction of the simplex covered by the region. This is carried
        out by returning the fraction of the `n_trials` points, uniformly drawn at random from the simplex, that
        are included in the region. The value is only computed once when multiple calls are made.

        :return: float in [0,1]
        """
        with qp.util.temp_seed(0):
            uniform_simplex = F.uniform_simplex_sampling(n_classes=self.ndim(), size=n_trials)
        proportion = np.clip(self.coverage(uniform_simplex), 0., 1.)
        return proportion


class WithConfidenceABC(ABC):
    """
    Abstract class for confidence regions.
    """
    METHODS = ['intervals', 'ellipse', 'ellipse-clr']

    @abstractmethod
    def quantify_conf(self, instances, confidence_level=None) -> (np.ndarray, ConfidenceRegionABC):
        """
        Adds the method `quantify_conf` to the interface. This method returns not only the point-estimate, but
        also the confidence region around it.

        :param instances: a np.ndarray of shape (n_instances, n_features,)
        :confidence_level: float in (0, 1)
        :return: a tuple (`point_estimate`, `conf_region`), where `point_estimate` is a np.ndarray of shape
            (n_classes,) and  `conf_region` is an object from :class:`ConfidenceRegionABC`
        """
        ...

    @classmethod
    def construct_region(cls, prev_estims, confidence_level=0.95, method='intervals'):
        """
        Construct a confidence region given many prevalence estimations.

        :param prev_estims: np.ndarray of shape (n_estims, n_classes)
        :param confidence_level: float, the confidence level for the region (default 0.95)
        :param method: str, indicates the method for constructing regions. Set to `intervals` for
            constructing confidence intervals (default), or to `ellipse` for constructing an
            ellipse in the probability simplex, or to `ellipse-clr` for constructing an ellipse
            in the Centered-Log Ratio (CLR) unconstrained space.
        """
        region = None
        if method == 'intervals':
            region = ConfidenceIntervals(prev_estims, confidence_level=confidence_level)
        elif method == 'ellipse':
            region = ConfidenceEllipseSimplex(prev_estims, confidence_level=confidence_level)
        elif method == 'ellipse-clr':
            region = ConfidenceEllipseCLR(prev_estims, confidence_level=confidence_level)

        if region is None:
            raise NotImplementedError(f'unknown method {method}')

        return region

def simplex_volume(n):
    """
    Computes the volume of the n-dimensional simplex. For n classes, the corresponding volume
    is :meth:`simplex_volume(n-1)` since the simplex has one degree of freedom less.

    :param n: int, the dimensionality of the simplex
    :return: float, the volume of the n-dimensional simplex
    """
    return 1 / factorial(n)


def within_ellipse_prop(values, mean, prec_matrix, chi2_critical):
    """
    Checks the proportion of values that belong to the ellipse with center `mean` and precision matrix `prec_matrix`
    at a distance `chi2_critical`.

    :param values: a np.ndarray of shape (n_dim,) or (n_values, n_dim,)
    :param mean: a np.ndarray of shape (n_dim,) with the center of the ellipse
    :param prec_matrix: a np.ndarray with the precision matrix (inverse of the
        covariance matrix) of the ellipse. If this inverse cannot be computed
        then None must be passed
    :param chi2_critical: float, the chi2 critical value

    :return: float in [0,1], the fraction of values that are contained in the ellipse
        defined by the mean (center), the precision matrix (shape), and the chi2_critical value (distance).
        If `values` is only one value, then either 0. (not contained) or 1. (contained) is returned.
    """
    if prec_matrix is None:
        return 0.

    diff = values - mean  # Mahalanobis distance

    d_M_squared = diff @ prec_matrix @ diff.T  # d_M^2
    if d_M_squared.ndim == 2:
        d_M_squared = np.diag(d_M_squared)

    within_elipse = (d_M_squared <= chi2_critical)

    if isinstance(within_elipse, np.ndarray):
        within_elipse = np.mean(within_elipse)

    return within_elipse * 1.0


class ConfidenceEllipseSimplex(ConfidenceRegionABC):
    """
    Instantiates a Confidence Ellipse in the probability simplex.

    :param X: np.ndarray of shape (n_bootstrap_samples, n_classes)
    :param confidence_level: float, the confidence level (default 0.95)
    """

    def __init__(self, X, confidence_level=0.95):

        assert 0. < confidence_level < 1., f'{confidence_level=} must be in range(0,1)'

        X = np.asarray(X)

        self.mean_ = X.mean(axis=0)
        self.cov_ = np.cov(X, rowvar=False, ddof=1)

        try:
            self.precision_matrix_ = np.linalg.inv(self.cov_)
        except:
            self.precision_matrix_ = None

        self.dim = X.shape[-1]
        self.ddof = self.dim - 1

        # critical chi-square value
        self.confidence_level = confidence_level
        self.chi2_critical_ = chi2.ppf(confidence_level, df=self.ddof)

    def point_estimate(self):
        """
        Returns the point estimate, the center of the ellipse.

        :return: np.ndarray of shape (n_classes,)
        """
        return self.mean_

    def coverage(self, true_value):
        """
        Checks whether a value, or a sets of values, are contained in the confidence region. The method computes the
        fraction of these that are contained in the region, if more than one value is passed. If only one value is
        passed, then it either returns 1.0 or 0.0, for indicating the value is in the region or not, respectively.

        :param true_value: a np.ndarray of shape (n_classes,) or shape (n_values, n_classes,)
        :return: float in [0,1]
        """
        return within_ellipse_prop(true_value, self.mean_, self.precision_matrix_, self.chi2_critical_)


class ConfidenceEllipseCLR(ConfidenceRegionABC):
    """
    Instantiates a Confidence Ellipse in the Centered-Log Ratio (CLR) space.

    :param X: np.ndarray of shape (n_bootstrap_samples, n_classes)
    :param confidence_level: float, the confidence level (default 0.95)
    """

    def __init__(self, X, confidence_level=0.95):
        self.clr = CLRtransformation()
        Z = self.clr(X)
        self.mean_ = np.mean(X, axis=0)
        self.conf_region_clr = ConfidenceEllipseSimplex(Z, confidence_level=confidence_level)

    def point_estimate(self):
        """
        Returns the point estimate, the center of the ellipse.

        :return: np.ndarray of shape (n_classes,)
        """
        # The inverse of the CLR does not coincide with the true mean, because the geometric mean
        # requires smoothing the prevalence vectors and this affects the softmax (inverse);
        # return self.clr.inverse(self.mean_) # <- does not coincide
        return self.mean_

    def coverage(self, true_value):
        """
        Checks whether a value, or a sets of values, are contained in the confidence region. The method computes the
        fraction of these that are contained in the region, if more than one value is passed. If only one value is
        passed, then it either returns 1.0 or 0.0, for indicating the value is in the region or not, respectively.

        :param true_value: a np.ndarray of shape (n_classes,) or shape (n_values, n_classes,)
        :return: float in [0,1]
        """
        transformed_values = self.clr(true_value)
        return self.conf_region_clr.coverage(transformed_values)


class ConfidenceIntervals(ConfidenceRegionABC):
    """
    Instantiates a region based on (independent) Confidence Intervals.

    :param X: np.ndarray of shape (n_bootstrap_samples, n_classes)
    :param confidence_level: float, the confidence level (default 0.95)
    """
    def __init__(self, X, confidence_level=0.95):
        assert 0 < confidence_level < 1, f'{confidence_level=} must be in range(0,1)'

        X = np.asarray(X)

        self.means_ = X.mean(axis=0)
        alpha = 1-confidence_level
        low_perc = (alpha/2.)*100
        high_perc = (1-alpha/2.)*100
        self.I_low, self.I_high = np.percentile(X, q=[low_perc, high_perc], axis=0)

    def point_estimate(self):
        """
        Returns the point estimate, the class-wise average of the bootstrapped estimates

        :return: np.ndarray of shape (n_classes,)
        """
        return self.means_

    def coverage(self, true_value):
        """
        Checks whether a value, or a sets of values, are contained in the confidence region. The method computes the
        fraction of these that are contained in the region, if more than one value is passed. If only one value is
        passed, then it either returns 1.0 or 0.0, for indicating the value is in the region or not, respectively.

        :param true_value: a np.ndarray of shape (n_classes,) or shape (n_values, n_classes,)
        :return: float in [0,1]
        """
        within_intervals = np.logical_and(self.I_low <= true_value, true_value <= self.I_high)
        within_all_intervals = np.all(within_intervals, axis=-1, keepdims=True)
        proportion = within_all_intervals.mean()

        return proportion


class CLRtransformation:
    """
    Centered log-ratio, from component analysis
    """
    def __call__(self, X, epsilon=1e-6):
        """
        Applies the CLR function to X thus mapping the instances, which are contained in `\\mathcal{R}^{n}` but
        actually lie on a `\\mathcal{R}^{n-1}` simplex, onto an unrestricted space in :math:`\\mathcal{R}^{n}`

        :param X: np.ndarray of (n_instances, n_dimensions) to be transformed
        :param epsilon: small float for prevalence smoothing
        :return: np.ndarray of (n_instances, n_dimensions), the CLR-transformed points
        """
        X = np.asarray(X)
        X = qp.error.smooth(X, epsilon)
        G = np.exp(np.mean(np.log(X), axis=-1, keepdims=True))  # geometric mean
        return np.log(X / G)

    def inverse(self, X):
        """
        Inverse function. However, clr.inverse(clr(X)) does not exactly coincide with X due to smoothing.

        :param X: np.ndarray of (n_instances, n_dimensions) to be transformed
        :return: np.ndarray of (n_instances, n_dimensions), the CLR-transformed points
        """
        return softmax(X, axis=-1)


class AggregativeBootstrap(WithConfidenceABC, AggregativeQuantifier):
    """
    Aggregative Bootstrap allows any AggregativeQuantifier to get confidence regions around
    point-estimates of class prevalence values. This method implements some optimizations for
    speeding up the computations, which are only possible due to the two phases of the aggregative
    quantifiers.

    During training, the bootstrap repetitions are only carried out over pre-classified training instances,
    after the classifier has been trained (only once), in order to train a series of aggregation
    functions (model-based approach).

    During inference, the bootstrap repetitions are applied to the pre-classified test instances.

    :param quantifier: an aggregative quantifier
    :para n_train_samples: int, the number of training resamplings (defaults to 1, set to > 1 to activate a
        model-based bootstrap approach)
    :para n_test_samples: int, the number of test resamplings (defaults to 500, set to > 1 to activate a
        population-based bootstrap approach)
    :param confidence_level: float, the confidence level for the confidence region (default 0.95)
    :param region: string, set to `intervals` for constructing confidence intervals (default), or to
        `ellipse` for constructing an ellipse in the probability simplex, or to `ellipse-clr` for
        constructing an ellipse in the Centered-Log Ratio (CLR) unconstrained space.
    :param random_state: int for replicating samples, None (default) for non-replicable samples
    """

    def __init__(self,
                 quantifier: AggregativeQuantifier,
                 n_train_samples=1,
                 n_test_samples=500,
                 confidence_level=0.95,
                 region='intervals',
                 random_state=None):

        assert isinstance(quantifier, AggregativeQuantifier), \
            f'base quantifier does not seem to be an instance of {AggregativeQuantifier.__name__}'
        assert n_train_samples >= 1, \
            f'{n_train_samples=} must be >= 1'
        assert n_test_samples >= 1, \
            f'{n_test_samples=} must be >= 1'
        assert n_test_samples>1 or n_train_samples>1, \
            f'either {n_test_samples=} or {n_train_samples=} must be >1'

        self.quantifier = quantifier
        self.n_train_samples = n_train_samples
        self.n_test_samples = n_test_samples
        self.confidence_level = confidence_level
        self.region = region
        self.random_state = random_state

    def aggregation_fit(self, classif_predictions, labels):
        data = LabelledCollection(classif_predictions, labels, classes=self.classes_)
        self.quantifiers = []
        if self.n_train_samples==1:
            self.quantifier.aggregation_fit(classif_predictions, labels)
            self.quantifiers.append(self.quantifier)
        else:
            # model-based bootstrap (only on the aggregative part)
            n_examples = len(data)
            full_index = np.arange(n_examples)
            with qp.util.temp_seed(self.random_state):
                for i in range(self.n_train_samples):
                    quantifier = copy.deepcopy(self.quantifier)
                    index = resample(full_index, n_samples=n_examples)
                    classif_predictions_i = classif_predictions.sampling_from_index(index)
                    data_i = data.sampling_from_index(index)
                    quantifier.aggregation_fit(classif_predictions_i, data_i)
                    self.quantifiers.append(quantifier)
        return self

    def aggregate(self, classif_predictions: np.ndarray):
        prev_mean, self.confidence = self.aggregate_conf(classif_predictions)
        return prev_mean

    def aggregate_conf(self, classif_predictions: np.ndarray, confidence_level=None):
        if confidence_level is None:
            confidence_level = self.confidence_level

        n_samples = classif_predictions.shape[0]
        prevs = []
        with qp.util.temp_seed(self.random_state):
            for quantifier in self.quantifiers:
                for i in range(self.n_test_samples):
                    sample_i = resample(classif_predictions, n_samples=n_samples)
                    prev_i = quantifier.aggregate(sample_i)
                    prevs.append(prev_i)

        conf = WithConfidenceABC.construct_region(prevs, confidence_level, method=self.region)
        prev_estim = conf.point_estimate()

        return prev_estim, conf

    def fit(self, X, y):
        self.quantifier._check_init_parameters()
        classif_predictions, labels = self.quantifier.classifier_fit_predict(X, y)
        self.aggregation_fit(classif_predictions, labels)
        return self

    def quantify_conf(self, instances, confidence_level=None) -> (np.ndarray, ConfidenceRegionABC):
        predictions = self.quantifier.classify(instances)
        return self.aggregate_conf(predictions, confidence_level=confidence_level)

    @property
    def classifier(self):
        return self.quantifier.classifier

    def _classifier_method(self):
        return self.quantifier._classifier_method()


class BayesianCC(AggregativeCrispQuantifier, WithConfidenceABC):
    """
    `Bayesian quantification <https://arxiv.org/abs/2302.09159>`_ method,
    which is a variant of :class:`ACC` that calculates the posterior probability distribution
    over the prevalence vectors, rather than providing a point estimate obtained
    by matrix inversion.

    Can be used to diagnose degeneracy in the predictions visible when the confusion
    matrix has high condition number or to quantify uncertainty around the point estimate.

    This method relies on extra dependencies, which have to be installed via:
    `$ pip install quapy[bayes]`

    :param classifier: a scikit-learn's BaseEstimator, or None, in which case the classifier is taken to be
        the one indicated in `qp.environ['DEFAULT_CLS']`
    :param fit_classifier: whether to train the learner (default is True). Set to False if the
        learner has been trained outside the quantifier.
    :param val_split: specifies the data used for generating classifier predictions. This specification
        can be made as float in (0, 1) indicating the proportion of stratified held-out validation set to
        be extracted from the training set; or as an integer (default 5), indicating that the predictions
        are to be generated in a `k`-fold cross-validation manner (with this integer indicating the value
        for `k`); or as a tuple (X,y) defining the specific set of data to use for validation.
        This hyperparameter is only meant to be used when the heuristics are to be applied, i.e., if a
        calibration is required. The default value is None (meaning the calibration is not required). In
        case this hyperparameter is set to a value other than None, but the calibration is not required
        (calib=None), a warning message will be raised.
    :param num_warmup: number of warmup iterations for the MCMC sampler (default 500)
    :param num_samples: number of samples to draw from the posterior (default 1000)
    :param mcmc_seed: random seed for the MCMC sampler (default 0)
    :param confidence_level: float in [0,1] to construct a confidence region around the point estimate (default 0.95)
    :param region: string, set to `intervals` for constructing confidence intervals (default), or to
        `ellipse` for constructing an ellipse in the probability simplex, or to `ellipse-clr` for
        constructing an ellipse in the Centered-Log Ratio (CLR) unconstrained space.
    """
    def __init__(self,
                 classifier: BaseEstimator=None,
                 fit_classifier=True,
                 val_split: int = 5,
                 num_warmup: int = 500,
                 num_samples: int = 1_000,
                 mcmc_seed: int = 0,
                 confidence_level: float = 0.95,
                 region: str = 'intervals'):

        if num_warmup <= 0:
            raise ValueError(f'parameter {num_warmup=} must be a positive integer')
        if num_samples <= 0:
            raise ValueError(f'parameter {num_samples=} must be a positive integer')

        # if (not isinstance(val_split, float)) or val_split <= 0 or val_split >= 1:
        #     raise ValueError(f'val_split must be a float in (0, 1), got {val_split}')

        if _bayesian.DEPENDENCIES_INSTALLED is False:
            raise ImportError("Auxiliary dependencies are required. Run `$ pip install quapy[bayes]` to install them.")

        super().__init__(classifier, fit_classifier, val_split)
        self.num_warmup = num_warmup
        self.num_samples = num_samples
        self.mcmc_seed = mcmc_seed
        self.confidence_level = confidence_level
        self.region = region

        # Array of shape (n_classes, n_predicted_classes,) where entry (y, c) is the number of instances
        # labeled as class y and predicted as class c.
        # By default, this array is set to None and later defined as part of the `aggregation_fit` phase
        self._n_and_c_labeled = None

        # Dictionary with posterior samples, set when `aggregate` is provided.
        self._samples = None

    def aggregation_fit(self, classif_predictions, labels):
        """
        Estimates the misclassification rates.

        :param classif_predictions: array-like with the label predictions returned by the classifier
        :param labels: array-like with the true labels associated to each classifier prediction
        """
        pred_labels = classif_predictions
        true_labels = labels
        self._n_and_c_labeled = confusion_matrix(y_true=true_labels, y_pred=pred_labels,
                                                 labels=self.classifier.classes_)

    def sample_from_posterior(self, classif_predictions):
        if self._n_and_c_labeled is None:
            raise ValueError("aggregation_fit must be called before sample_from_posterior")

        n_c_unlabeled = F.counts_from_labels(classif_predictions, self.classifier.classes_).astype(float)

        self._samples = _bayesian.sample_posterior(
            n_c_unlabeled=n_c_unlabeled,
            n_y_and_c_labeled=self._n_and_c_labeled,
            num_warmup=self.num_warmup,
            num_samples=self.num_samples,
            seed=self.mcmc_seed,
        )
        return self._samples

    def get_prevalence_samples(self):
        if self._samples is None:
            raise ValueError("sample_from_posterior must be called before get_prevalence_samples")
        return self._samples[_bayesian.P_TEST_Y]

    def get_conditional_probability_samples(self):
        if self._samples is None:
            raise ValueError("sample_from_posterior must be called before get_conditional_probability_samples")
        return self._samples[_bayesian.P_C_COND_Y]

    def aggregate(self, classif_predictions):
        samples = self.sample_from_posterior(classif_predictions)[_bayesian.P_TEST_Y]
        return np.asarray(samples.mean(axis=0), dtype=float)

    def quantify_conf(self, instances, confidence_level=None) -> (np.ndarray, ConfidenceRegionABC):
        classif_predictions = self.classify(instances)
        point_estimate = self.aggregate(classif_predictions)
        samples = self.get_prevalence_samples()  # available after calling "aggregate" function
        region = WithConfidenceABC.construct_region(samples, confidence_level=self.confidence_level, method=self.region)
        return point_estimate, region