diff --git a/examples/5.explicit_loss_minimization.py b/examples/17.explicit_loss_minimization.py similarity index 100% rename from examples/5.explicit_loss_minimization.py rename to examples/17.explicit_loss_minimization.py diff --git a/examples/4.using_pretrained_classifier.py b/examples/4.using_pretrained_classifier.py new file mode 100644 index 0000000..710048d --- /dev/null +++ b/examples/4.using_pretrained_classifier.py @@ -0,0 +1,76 @@ +""" +Aggregative quantifiers use an underlying classifier. Often, one has one pre-trained classifier available, and +needs to use this classifier at the basis of a quantification system. In such cases, the classifier should not +be retrained, but only used to issue classifier predictions for the quantifier. +In this example, we show how to instantiate a quantifier with a pre-trained classifier. +""" +from typing import List, Dict + +import quapy as qp +from quapy.method.aggregative import PACC +from sklearn.base import BaseEstimator, ClassifierMixin +from transformers import pipeline +import numpy as np +import quapy.functional as F + + +# A scikit-learn-style wrapper for a huggingface-based pre-trained transformer +class HFTextClassifier(BaseEstimator, ClassifierMixin): + def __init__(self, model_name='distilbert-base-uncased-finetuned-sst-2-english'): + self.model_name = model_name # kept as an attribute so that BaseEstimator.get_params()/clone() can read it back + self.pipe = pipeline("sentiment-analysis", model=model_name) + self.classes_ = np.asarray([0,1]) + + def fit(self, X, y=None): + return self + + def _binary_decisions(self, transformer_output: List[Dict]): + return np.array([(1 if p['label']=='POSITIVE' else 0) for p in transformer_output], dtype=int) + + def predict(self, X): + X = list(map(str, X)) + preds = self.pipe(X, truncation=True) + return self._binary_decisions(preds) + + def predict_proba(self, X): + X = list(map(str, X)) + n_examples = len(X) + preds = self.pipe(X, truncation=True) + decisions = self._binary_decisions(preds) + scores = np.array([p['score'] for p in preds], dtype=float) + probas = 
np.zeros(shape=(n_examples, 2), dtype=float) + probas[np.arange(n_examples),decisions] = scores + probas[np.arange(n_examples),1-decisions] = 1-scores # complement goes to the non-predicted class (explicit column index instead of bitwise ~) + return probas + +# load a sentiment dataset +dataset = qp.datasets.fetch_reviews('imdb', tfidf=False) # raw text +train, test = dataset.training, dataset.test + +# instantiate a pre-trained classifier +clf = HFTextClassifier() + +# Let us fit a quantifier based on our pre-trained classifier. +# Note that, since the classifier is already fit, we will use the entire training set for +# learning the aggregation function of the quantifier. +# To do so, we only need to indicate "fit_classifier"=False, as follows: +quantifier = PACC(clf, fit_classifier=False) # Probabilistic Classify & Count using a pre-trained model + +print('training PACC...') +quantifier.fit(*train.Xy) + +# let us simulate some shifted test data... +new_prevalence = [0.75, 0.25] +shifted_test = test.sampling(500, *new_prevalence, random_state=0) + +# and do some evaluation +print('predicting with PACC...') +estim_prevalence = quantifier.predict(shifted_test.X) + +print('Result:\n'+('='*20)) +print(f'training prevalence: {F.strprev(train.prevalence())}') +print(f'(shifted) test prevalence: {F.strprev(shifted_test.prevalence())}') +print(f'estimated prevalence: {F.strprev(estim_prevalence)}') + +absolute_error = qp.error.ae(new_prevalence, estim_prevalence) +print(f'absolute error={absolute_error:.4f}') \ No newline at end of file diff --git a/examples/4.lequa2022_experiments.py b/examples/5a.lequa2022_experiments.py similarity index 95% rename from examples/4.lequa2022_experiments.py rename to examples/5a.lequa2022_experiments.py index c9d1952..40632d5 100644 --- a/examples/4.lequa2022_experiments.py +++ b/examples/5a.lequa2022_experiments.py @@ -37,7 +37,7 @@ quantifier = EMQ(classifier=LogisticRegression(), val_split=5) param_grid = { 'classifier__C': np.logspace(-3, 3, 7), # classifier-dependent: inverse of regularization strength 
'classifier__class_weight': ['balanced', None], # classifier-dependent: weights of each class - 'calib': ['bcts', None] # quantifier-dependent: recalibration method (new in v0.1.7) + 'calib': ['bcts', None] # quantifier-dependent: recalibration method (new in v0.1.7) } model_selection = GridSearchQ(quantifier, param_grid, protocol=val_generator, error='mrae', refit=False, verbose=True) quantifier = model_selection.fit(Xtr, ytr) diff --git a/examples/4b.lequa2024_experiments.py b/examples/5b.lequa2024_experiments.py similarity index 100% rename from examples/4b.lequa2024_experiments.py rename to examples/5b.lequa2024_experiments.py diff --git a/quapy/error.py b/quapy/error.py index 201ab8f..eb42cd6 100644 --- a/quapy/error.py +++ b/quapy/error.py @@ -45,89 +45,95 @@ def acce(y_true, y_pred): return 1. - (y_true == y_pred).mean() -def mae(prevs, prevs_hat): +def mae(prevs_true, prevs_hat): """Computes the mean absolute error (see :meth:`quapy.error.ae`) across the sample pairs. - :param prevs: array-like of shape `(n_samples, n_classes,)` with the true prevalence values + :param prevs_true: array-like of shape `(n_samples, n_classes,)` with the true prevalence values :param prevs_hat: array-like of shape `(n_samples, n_classes,)` with the predicted prevalence values :return: mean absolute error """ - return ae(prevs, prevs_hat).mean() + return ae(prevs_true, prevs_hat).mean() -def ae(prevs, prevs_hat): +def ae(prevs_true, prevs_hat): """Computes the absolute error between the two prevalence vectors. Absolute error between two prevalence vectors :math:`p` and :math:`\\hat{p}` is computed as :math:`AE(p,\\hat{p})=\\frac{1}{|\\mathcal{Y}|}\\sum_{y\\in \\mathcal{Y}}|\\hat{p}(y)-p(y)|`, where :math:`\\mathcal{Y}` are the classes of interest. 
- :param prevs: array-like of shape `(n_classes,)` with the true prevalence values + :param prevs_true: array-like of shape `(n_classes,)` with the true prevalence values :param prevs_hat: array-like of shape `(n_classes,)` with the predicted prevalence values :return: absolute error """ - assert prevs.shape == prevs_hat.shape, f'wrong shape {prevs.shape} vs. {prevs_hat.shape}' - return abs(prevs_hat - prevs).mean(axis=-1) + prevs_true = np.asarray(prevs_true) + prevs_hat = np.asarray(prevs_hat) + assert prevs_true.shape == prevs_hat.shape, f'wrong shape {prevs_true.shape} vs. {prevs_hat.shape}' + return abs(prevs_hat - prevs_true).mean(axis=-1) -def nae(prevs, prevs_hat): +def nae(prevs_true, prevs_hat): """Computes the normalized absolute error between the two prevalence vectors. Normalized absolute error between two prevalence vectors :math:`p` and :math:`\\hat{p}` is computed as :math:`NAE(p,\\hat{p})=\\frac{AE(p,\\hat{p})}{z_{AE}}`, where :math:`z_{AE}=\\frac{2(1-\\min_{y\\in \\mathcal{Y}} p(y))}{|\\mathcal{Y}|}`, and :math:`\\mathcal{Y}` are the classes of interest. - :param prevs: array-like of shape `(n_classes,)` with the true prevalence values + :param prevs_true: array-like of shape `(n_classes,)` with the true prevalence values :param prevs_hat: array-like of shape `(n_classes,)` with the predicted prevalence values :return: normalized absolute error """ - assert prevs.shape == prevs_hat.shape, f'wrong shape {prevs.shape} vs. {prevs_hat.shape}' - return abs(prevs_hat - prevs).sum(axis=-1)/(2*(1-prevs.min(axis=-1))) + prevs_true = np.asarray(prevs_true) + prevs_hat = np.asarray(prevs_hat) + assert prevs_true.shape == prevs_hat.shape, f'wrong shape {prevs_true.shape} vs. {prevs_hat.shape}' + return abs(prevs_hat - prevs_true).sum(axis=-1)/(2 * (1 - prevs_true.min(axis=-1))) -def mnae(prevs, prevs_hat): +def mnae(prevs_true, prevs_hat): """Computes the mean normalized absolute error (see :meth:`quapy.error.nae`) across the sample pairs. 
- :param prevs: array-like of shape `(n_samples, n_classes,)` with the true prevalence values + :param prevs_true: array-like of shape `(n_samples, n_classes,)` with the true prevalence values :param prevs_hat: array-like of shape `(n_samples, n_classes,)` with the predicted prevalence values :return: mean normalized absolute error """ - return nae(prevs, prevs_hat).mean() + return nae(prevs_true, prevs_hat).mean() -def mse(prevs, prevs_hat): +def mse(prevs_true, prevs_hat): """Computes the mean squared error (see :meth:`quapy.error.se`) across the sample pairs. - :param prevs: array-like of shape `(n_samples, n_classes,)` with the + :param prevs_true: array-like of shape `(n_samples, n_classes,)` with the true prevalence values :param prevs_hat: array-like of shape `(n_samples, n_classes,)` with the predicted prevalence values :return: mean squared error """ - return se(prevs, prevs_hat).mean() + return se(prevs_true, prevs_hat).mean() -def se(prevs, prevs_hat): +def se(prevs_true, prevs_hat): """Computes the squared error between the two prevalence vectors. Squared error between two prevalence vectors :math:`p` and :math:`\\hat{p}` is computed as :math:`SE(p,\\hat{p})=\\frac{1}{|\\mathcal{Y}|}\\sum_{y\\in \\mathcal{Y}}(\\hat{p}(y)-p(y))^2`, where :math:`\\mathcal{Y}` are the classes of interest. - :param prevs: array-like of shape `(n_classes,)` with the true prevalence values + :param prevs_true: array-like of shape `(n_classes,)` with the true prevalence values :param prevs_hat: array-like of shape `(n_classes,)` with the predicted prevalence values - :return: absolute error + :return: squared error """ - return ((prevs_hat - prevs) ** 2).mean(axis=-1) + prevs_true = np.asarray(prevs_true) + prevs_hat = np.asarray(prevs_hat) + return ((prevs_hat - prevs_true) ** 2).mean(axis=-1) -def mkld(prevs, prevs_hat, eps=None): +def mkld(prevs_true, prevs_hat, eps=None): """Computes the mean Kullback-Leibler divergence (see :meth:`quapy.error.kld`) across the sample pairs. 
The distributions are smoothed using the `eps` factor (see :meth:`quapy.error.smooth`). - :param prevs: array-like of shape `(n_samples, n_classes,)` with the true + :param prevs_true: array-like of shape `(n_samples, n_classes,)` with the true prevalence values :param prevs_hat: array-like of shape `(n_samples, n_classes,)` with the predicted prevalence values @@ -137,10 +143,10 @@ def mkld(prevs, prevs_hat, eps=None): (which has thus to be set beforehand). - :return: mean Kullback-Leibler distribution + :return: mean Kullback-Leibler divergence """ - return kld(prevs, prevs_hat, eps).mean() + return kld(prevs_true, prevs_hat, eps).mean() -def kld(prevs, prevs_hat, eps=None): +def kld(prevs_true, prevs_hat, eps=None): """Computes the Kullback-Leibler divergence between the two prevalence distributions. Kullback-Leibler divergence between two prevalence distributions :math:`p` and :math:`\\hat{p}` is computed as @@ -149,7 +155,7 @@ def kld(prevs, prevs_hat, eps=None): where :math:`\\mathcal{Y}` are the classes of interest. The distributions are smoothed using the `eps` factor (see :meth:`quapy.error.smooth`). - :param prevs: array-like of shape `(n_classes,)` with the true prevalence values + :param prevs_true: array-like of shape `(n_classes,)` with the true prevalence values :param prevs_hat: array-like of shape `(n_classes,)` with the predicted prevalence values :param eps: smoothing factor. KLD is not defined in cases in which the distributions contain zeros; `eps` is typically set to be :math:`\\frac{1}{2T}`, with :math:`T` the sample size. 
@@ -158,17 +164,17 @@ def kld(prevs, prevs_hat, eps=None): :return: Kullback-Leibler divergence between the two distributions """ eps = __check_eps(eps) - smooth_prevs = smooth(prevs, eps) + smooth_prevs = smooth(prevs_true, eps) smooth_prevs_hat = smooth(prevs_hat, eps) return (smooth_prevs*np.log(smooth_prevs/smooth_prevs_hat)).sum(axis=-1) -def mnkld(prevs, prevs_hat, eps=None): +def mnkld(prevs_true, prevs_hat, eps=None): """Computes the mean Normalized Kullback-Leibler divergence (see :meth:`quapy.error.nkld`) across the sample pairs. The distributions are smoothed using the `eps` factor (see :meth:`quapy.error.smooth`). - :param prevs: array-like of shape `(n_samples, n_classes,)` with the true prevalence values + :param prevs_true: array-like of shape `(n_samples, n_classes,)` with the true prevalence values :param prevs_hat: array-like of shape `(n_samples, n_classes,)` with the predicted prevalence values :param eps: smoothing factor. NKLD is not defined in cases in which the distributions contain @@ -177,10 +183,10 @@ def mnkld(prevs, prevs_hat, eps=None): (which has thus to be set beforehand). - :return: mean Normalized Kullback-Leibler distribution + :return: mean Normalized Kullback-Leibler divergence """ - return nkld(prevs, prevs_hat, eps).mean() + return nkld(prevs_true, prevs_hat, eps).mean() -def nkld(prevs, prevs_hat, eps=None): +def nkld(prevs_true, prevs_hat, eps=None): """Computes the Normalized Kullback-Leibler divergence between the two prevalence distributions. Normalized Kullback-Leibler divergence between two prevalence distributions :math:`p` and :math:`\\hat{p}` is computed as @@ -189,7 +195,7 @@ def nkld(prevs, prevs_hat, eps=None): :math:`\\mathcal{Y}` are the classes of interest. The distributions are smoothed using the `eps` factor (see :meth:`quapy.error.smooth`). 
- :param prevs: array-like of shape `(n_classes,)` with the true prevalence values + :param prevs_true: array-like of shape `(n_classes,)` with the true prevalence values :param prevs_hat: array-like of shape `(n_classes,)` with the predicted prevalence values :param eps: smoothing factor. NKLD is not defined in cases in which the distributions contain zeros; `eps` is typically set to be :math:`\\frac{1}{2T}`, with :math:`T` the sample @@ -197,16 +203,16 @@ def nkld(prevs, prevs_hat, eps=None): `SAMPLE_SIZE` (which has thus to be set beforehand). :return: Normalized Kullback-Leibler divergence between the two distributions """ - ekld = np.exp(kld(prevs, prevs_hat, eps)) + ekld = np.exp(kld(prevs_true, prevs_hat, eps)) return 2. * ekld / (1 + ekld) - 1. -def mrae(prevs, prevs_hat, eps=None): +def mrae(prevs_true, prevs_hat, eps=None): """Computes the mean relative absolute error (see :meth:`quapy.error.rae`) across the sample pairs. The distributions are smoothed using the `eps` factor (see :meth:`quapy.error.smooth`). - :param prevs: array-like of shape `(n_samples, n_classes,)` with the true + :param prevs_true: array-like of shape `(n_samples, n_classes,)` with the true prevalence values :param prevs_hat: array-like of shape `(n_samples, n_classes,)` with the predicted prevalence values @@ -216,10 +222,10 @@ def mrae(prevs, prevs_hat, eps=None): the environment variable `SAMPLE_SIZE` (which has thus to be set beforehand). :return: mean relative absolute error """ - return rae(prevs, prevs_hat, eps).mean() + return rae(prevs_true, prevs_hat, eps).mean() -def rae(prevs, prevs_hat, eps=None): +def rae(prevs_true, prevs_hat, eps=None): """Computes the absolute relative error between the two prevalence vectors. Relative absolute error between two prevalence vectors :math:`p` and :math:`\\hat{p}` is computed as @@ -228,7 +234,7 @@ def rae(prevs, prevs_hat, eps=None): where :math:`\\mathcal{Y}` are the classes of interest. 
The distributions are smoothed using the `eps` factor (see :meth:`quapy.error.smooth`). - :param prevs: array-like of shape `(n_classes,)` with the true prevalence values + :param prevs_true: array-like of shape `(n_classes,)` with the true prevalence values :param prevs_hat: array-like of shape `(n_classes,)` with the predicted prevalence values :param eps: smoothing factor. `rae` is not defined in cases in which the true distribution contains zeros; `eps` is typically set to be :math:`\\frac{1}{2T}`, with :math:`T` the @@ -237,12 +243,12 @@ def rae(prevs, prevs_hat, eps=None): :return: relative absolute error """ eps = __check_eps(eps) - prevs = smooth(prevs, eps) + prevs_true = smooth(prevs_true, eps) prevs_hat = smooth(prevs_hat, eps) - return (abs(prevs - prevs_hat) / prevs).mean(axis=-1) + return (abs(prevs_true - prevs_hat) / prevs_true).mean(axis=-1) -def nrae(prevs, prevs_hat, eps=None): +def nrae(prevs_true, prevs_hat, eps=None): """Computes the normalized absolute relative error between the two prevalence vectors. Relative absolute error between two prevalence vectors :math:`p` and :math:`\\hat{p}` is computed as @@ -252,7 +258,7 @@ def nrae(prevs, prevs_hat, eps=None): and :math:`\\mathcal{Y}` are the classes of interest. The distributions are smoothed using the `eps` factor (see :meth:`quapy.error.smooth`). - :param prevs: array-like of shape `(n_classes,)` with the true prevalence values + :param prevs_true: array-like of shape `(n_classes,)` with the true prevalence values :param prevs_hat: array-like of shape `(n_classes,)` with the predicted prevalence values :param eps: smoothing factor. 
`nrae` is not defined in cases in which the true distribution contains zeros; `eps` is typically set to be :math:`\\frac{1}{2T}`, with :math:`T` the @@ -261,18 +267,18 @@ def nrae(prevs, prevs_hat, eps=None): :return: normalized relative absolute error """ eps = __check_eps(eps) - prevs = smooth(prevs, eps) + prevs_true = smooth(prevs_true, eps) prevs_hat = smooth(prevs_hat, eps) - min_p = prevs.min(axis=-1) - return (abs(prevs - prevs_hat) / prevs).sum(axis=-1)/(prevs.shape[-1]-1+(1-min_p)/min_p) + min_p = prevs_true.min(axis=-1) + return (abs(prevs_true - prevs_hat) / prevs_true).sum(axis=-1)/(prevs_true.shape[-1] - 1 + (1 - min_p) / min_p) -def mnrae(prevs, prevs_hat, eps=None): +def mnrae(prevs_true, prevs_hat, eps=None): """Computes the mean normalized relative absolute error (see :meth:`quapy.error.nrae`) across the sample pairs. The distributions are smoothed using the `eps` factor (see :meth:`quapy.error.smooth`). - :param prevs: array-like of shape `(n_samples, n_classes,)` with the true + :param prevs_true: array-like of shape `(n_samples, n_classes,)` with the true prevalence values :param prevs_hat: array-like of shape `(n_samples, n_classes,)` with the predicted prevalence values @@ -282,57 +288,61 @@ def mnrae(prevs, prevs_hat, eps=None): the environment variable `SAMPLE_SIZE` (which has thus to be set beforehand). :return: mean normalized relative absolute error """ - return nrae(prevs, prevs_hat, eps).mean() + return nrae(prevs_true, prevs_hat, eps).mean() -def nmd(prevs, prevs_hat): +def nmd(prevs_true, prevs_hat): """ Computes the Normalized Match Distance; which is the Normalized Distance multiplied by the factor `1/(n-1)` to guarantee the measure ranges between 0 (best prediction) and 1 (worst prediction). 
- :param prevs: array-like of shape `(n_classes,)` or `(n_instances, n_classes)` with the true prevalence values + :param prevs_true: array-like of shape `(n_classes,)` or `(n_instances, n_classes)` with the true prevalence values :param prevs_hat: array-like of shape `(n_classes,)` or `(n_instances, n_classes)` with the predicted prevalence values :return: float in [0,1] """ - n = prevs.shape[-1] - return (1./(n-1))*np.mean(match_distance(prevs, prevs_hat)) + prevs_true = np.asarray(prevs_true) + prevs_hat = np.asarray(prevs_hat) + n = prevs_true.shape[-1] + return (1./(n-1))*np.mean(match_distance(prevs_true, prevs_hat)) -def bias_binary(prevs, prevs_hat): +def bias_binary(prevs_true, prevs_hat): """ Computes the (positive) bias in a binary problem. The bias is simply the difference between the predicted positive value and the true positive value, so that a positive such value indicates the prediction has positive bias (i.e., it tends to overestimate) the true value, and negative otherwise. :math:`bias(p,\\hat{p})=\\hat{p}_1-p_1`, - :param prevs: array-like of shape `(n_samples, n_classes,)` with the true prevalence values + :param prevs_true: array-like of shape `(n_samples, n_classes,)` with the true prevalence values :param prevs_hat: array-like of shape `(n_samples, n_classes,)` with the predicted prevalence values :return: binary bias """ - assert prevs.shape[-1] == 2 and prevs.shape[-1] == 2, f'bias_binary can only be applied to binary problems' - return prevs_hat[...,1]-prevs[...,1] + prevs_true = np.asarray(prevs_true) + prevs_hat = np.asarray(prevs_hat) + assert prevs_true.shape[-1] == 2 and prevs_hat.shape[-1] == 2, f'bias_binary can only be applied to binary problems' + return prevs_hat[...,1]-prevs_true[...,1] -def mean_bias_binary(prevs, prevs_hat): +def mean_bias_binary(prevs_true, prevs_hat): """ Computes the mean of the (positive) bias in a binary problem. 
- :param prevs: array-like of shape `(n_classes,)` with the true prevalence values + :param prevs_true: array-like of shape `(n_classes,)` with the true prevalence values :param prevs_hat: array-like of shape `(n_classes,)` with the predicted prevalence values :return: mean binary bias """ - return np.mean(bias_binary(prevs, prevs_hat)) + return np.mean(bias_binary(prevs_true, prevs_hat)) -def md(prevs, prevs_hat, ERROR_TOL=1E-3): +def md(prevs_true, prevs_hat, ERROR_TOL=1E-3): """ Computes the Match Distance, under the assumption that the cost in mistaking class i with class i+1 is 1 in all cases. - :param prevs: array-like of shape `(n_classes,)` or `(n_instances, n_classes)` with the true prevalence values + :param prevs_true: array-like of shape `(n_classes,)` or `(n_instances, n_classes)` with the true prevalence values :param prevs_hat: array-like of shape `(n_classes,)` or `(n_instances, n_classes)` with the predicted prevalence values :return: float """ - P = np.cumsum(prevs, axis=-1) + P = np.cumsum(prevs_true, axis=-1) P_hat = np.cumsum(prevs_hat, axis=-1) assert np.all(np.isclose(P_hat[..., -1], 1.0, rtol=ERROR_TOL)), \ 'arg error in match_distance: the array does not represent a valid distribution' @@ -349,6 +359,7 @@ def smooth(prevs, eps): :param eps: smoothing factor :return: array-like of shape `(n_classes,)` with the smoothed distribution """ + prevs = np.asarray(prevs) n_classes = prevs.shape[-1] return (prevs + eps) / (eps * n_classes + 1)