diff --git a/TODO.txt b/TODO.txt
index e69de29..8d15651 100644
--- a/TODO.txt
+++ b/TODO.txt
@@ -0,0 +1,3 @@
+- Test the return_type="index" in protocols and finish the "distributing_samples.py" example
+- Add EDy (an implementation is available at quantificationlib)
+-
\ No newline at end of file
diff --git a/examples/0.basics.py b/examples/0.basics.py
index 6325434..aee7b5d 100644
--- a/examples/0.basics.py
+++ b/examples/0.basics.py
@@ -33,10 +33,8 @@ import quapy.functional as F # <- this module has some functional utilities, li
 print(f'training prevalence = {F.strprev(train.prevalence())}')
 
 # let us train one quantifier, for example, PACC using a sklearn's Logistic Regressor as the underlying classifier
-# classifier = LogisticRegression()
-
-# pacc = qp.method.aggregative.PACC(classifier)
-pacc = qp.method.aggregative.PACC()
+classifier = LogisticRegression()
+pacc = qp.method.aggregative.PACC(classifier)
 
 print(f'training {pacc}')
 pacc.fit(train)
diff --git a/examples/distributing_samples.py b/examples/distributing_samples.py
new file mode 100644
index 0000000..76a9731
--- /dev/null
+++ b/examples/distributing_samples.py
@@ -0,0 +1,38 @@
+"""
+Imagine we want to generate many samples out of a collection, that we want to distribute for others to run their
+own experiments in the very same test samples. One naive solution would come down to applying a given protocol to
+our collection (say the artificial prevalence protocol on the 'academic-success' UCI dataset), store all those samples
+on disk and make them available online. Distributing many such samples is undesirable.
+In this example, we generate the indexes that allow anyone to regenerate the samples out of the original collection.
+"""
+
+import quapy as qp
+from quapy.method.aggregative import PACC
+from quapy.protocol import UPP
+
+data = qp.datasets.fetch_UCIMulticlassDataset('academic-success')
+train, test = data.train_test
+
+# let us train a quantifier to check whether we can actually replicate the results
+quantifier = PACC()
+quantifier.fit(train)
+
+# let us simulate our experimental results
+protocol = UPP(test, sample_size=100, repeats=100, random_state=0)
+our_mae = qp.evaluation.evaluate(quantifier, protocol=protocol, error_metric='mae')
+
+print(f'We have obtained a MAE={our_mae:.3f}')
+
+# let us distribute the indexes; we specify that we want the indexes, not the samples
+protocol = UPP(test, sample_size=100, repeats=100, random_state=0, return_type='index')
+indexes = protocol.samples_parameters()
+
+# Imagine we distribute the indexes; now we show how to replicate our experiments.
+from quapy.protocol import ProtocolFromIndex
+data = qp.datasets.fetch_UCIMulticlassDataset('academic-success')
+train, test = data.train_test
+protocol = ProtocolFromIndex(data=test, indexes=indexes)
+their_mae = qp.evaluation.evaluate(quantifier, protocol=protocol, error_metric='mae')
+
+print(f'Another lab obtains a MAE={their_mae:.3f}')
+
diff --git a/quapy/error.py b/quapy/error.py
index f867d5c..1b2f745 100644
--- a/quapy/error.py
+++ b/quapy/error.py
@@ -298,6 +298,31 @@ def nmd(prevs, prevs_hat):
     return (1./(n-1))*np.mean(match_distance(prevs, prevs_hat))
 
 
+def bias_binary(prevs, prevs_hat):
+    """
+    Computes the (positive) bias in a binary problem. The bias is simply the difference between the
+    predicted positive value and the true positive value, so that a positive such value indicates the
+    prediction has positive bias (i.e., it tends to overestimate the true value), and negative otherwise.
+    :math:`bias(p,\\hat{p})=\\hat{p}_1-p_1`
+    :param prevs: array-like of shape `(n_samples, n_classes,)` with the true prevalence values
+    :param prevs_hat: array-like of shape `(n_samples, n_classes,)` with the predicted
+        prevalence values
+    :return: binary bias
+    """
+    assert prevs.shape[-1] == 2 and prevs_hat.shape[-1] == 2, 'bias_binary can only be applied to binary problems'
+    return prevs_hat[...,1]-prevs[...,1]
+
+
+def mean_bias_binary(prevs, prevs_hat):
+    """
+    Computes the mean of the (positive) bias in a binary problem, i.e., the average of :meth:`bias_binary`.
+    :param prevs: array-like of shape `(n_samples, n_classes,)` with the true prevalence values
+    :param prevs_hat: array-like of shape `(n_samples, n_classes,)` with the predicted prevalence values
+    :return: mean binary bias
+    """
+    return np.mean(bias_binary(prevs, prevs_hat))
+
+
 def md(prevs, prevs_hat, ERROR_TOL=1E-3):
     """
     Computes the Match Distance, under the assumption that the cost in mistaking class i with class i+1 is 1 in
@@ -338,8 +363,8 @@ def __check_eps(eps=None):
 
 
 CLASSIFICATION_ERROR = {f1e, acce}
-QUANTIFICATION_ERROR = {mae, mnae, mrae, mnrae, mse, mkld, mnkld}
-QUANTIFICATION_ERROR_SINGLE = {ae, nae, rae, nrae, se, kld, nkld}
+QUANTIFICATION_ERROR = {mae, mnae, mrae, mnrae, mse, mkld, mnkld, mean_bias_binary}
+QUANTIFICATION_ERROR_SINGLE = {ae, nae, rae, nrae, se, kld, nkld, bias_binary}
 QUANTIFICATION_ERROR_SMOOTH = {kld, nkld, rae, nrae, mkld, mnkld, mrae}
 CLASSIFICATION_ERROR_NAMES = {func.__name__ for func in CLASSIFICATION_ERROR}
 QUANTIFICATION_ERROR_NAMES = {func.__name__ for func in QUANTIFICATION_ERROR}
diff --git a/quapy/protocol.py b/quapy/protocol.py
index 36362a9..9a7e5c4 100644
--- a/quapy/protocol.py
+++ b/quapy/protocol.py
@@ -1,4 +1,6 @@
 from copy import deepcopy
+from typing import Iterable
+
 import quapy as qp
 import numpy as np
 import itertools
@@ -62,6 +64,36 @@ class IterateProtocol(AbstractProtocol):
         return len(self.samples)
 
 
+class ProtocolFromIndex(AbstractProtocol):
+    """
+    A protocol that regenerates samples out of a collection from a precomputed list of indexes
+
+    :param data: a :class:`quapy.data.base.LabelledCollection`
+    :param indexes: an iterable of indexes; each element selects the instances of one sample
+    """
+    def __init__(self, data: LabelledCollection, indexes: Iterable):
+        self.data = data
+        self.indexes = list(indexes)  # materialized: total() needs len() and __call__ may be re-iterated
+
+    def __call__(self):
+        """
+        Yields one sample at a time extracted using the indexes
+
+        :return: yields a tuple `(sample, prev)` at a time, where `sample` is a set of instances
+            and in which `prev` is an `nd.array` with the class prevalence values
+        """
+        for index in self.indexes:
+            yield self.data.sampling_from_index(index).Xp
+
+    def total(self):
+        """
+        Returns the number of samples in this protocol
+
+        :return: int
+        """
+        return len(self.indexes)
+
+
 class AbstractStochasticSeededProtocol(AbstractProtocol):
     """
     An `AbstractStochasticSeededProtocol` is a protocol that generates, via any random procedure (e.g.,
@@ -124,9 +156,9 @@ class AbstractStochasticSeededProtocol(AbstractProtocol):
             if self.random_state is not None:
                 stack.enter_context(qp.util.temp_seed(self.random_state))
             for params in self.samples_parameters():
-                yield self.collator(self.sample(params))
+                yield self.collator(self.sample(params), params)
 
-    def collator(self, sample, *args):
+    def collator(self, sample, params=None):
         """
         The collator prepares the sample to accommodate the desired output format before returning the output.
         This collator simply returns the sample as it is. Classes inheriting from this abstract class can
@@ -191,9 +223,11 @@ class OnLabelledCollectionProtocol:
         assert return_type in cls.RETURN_TYPES, \
             f'unknown return type passed as argument; valid ones are {cls.RETURN_TYPES}'
         if return_type=='sample_prev':
-            return lambda lc:lc.Xp
+            return lambda lc,params=None:lc.Xp
         elif return_type=='labelled_collection':
-            return lambda lc:lc
+            return lambda lc,params=None:lc
+        elif return_type=='index':
+            return lambda lc,params:params
 
 
 class APP(AbstractStochasticSeededProtocol, OnLabelledCollectionProtocol):