From ba18d003340aa9e6ee46f8395b35c0d2f02a50af Mon Sep 17 00:00:00 2001
From: Alex Moreo
Date: Mon, 20 Dec 2021 11:39:44 +0100
Subject: [PATCH 01/67] trying to figure out how to refactor protocols
 meaningfully
---
quapy/__init__.py | 2 +-
quapy/data/base.py | 85 ++++++----------------------------------
quapy/functional.py | 30 --------------
quapy/model_selection.py | 2 +-
4 files changed, 15 insertions(+), 104 deletions(-)
diff --git a/quapy/__init__.py b/quapy/__init__.py
index a1ccee4..ad69ae9 100644
--- a/quapy/__init__.py
+++ b/quapy/__init__.py
@@ -10,7 +10,7 @@ from . import model_selection
from . import classification
from quapy.method.base import isprobabilistic, isaggregative
-__version__ = '0.1.6'
+__version__ = '0.1.7'
environ = {
'SAMPLE_SIZE': None,
diff --git a/quapy/data/base.py b/quapy/data/base.py
index a59d8d2..cfe2891 100644
--- a/quapy/data/base.py
+++ b/quapy/data/base.py
@@ -3,7 +3,7 @@ from scipy.sparse import issparse
from scipy.sparse import vstack
from sklearn.model_selection import train_test_split, RepeatedStratifiedKFold
-from quapy.functional import artificial_prevalence_sampling, strprev
+from quapy.functional import strprev
class LabelledCollection:
@@ -120,21 +120,24 @@ class LabelledCollection:
assert len(prevs) == self.n_classes, 'unexpected number of prevalences'
assert sum(prevs) == 1, f'prevalences ({prevs}) wrong range (sum={sum(prevs)})'
- taken = 0
- indexes_sample = []
- for i, class_ in enumerate(self.classes_):
- if i == self.n_classes - 1:
- n_requested = size - taken
- else:
- n_requested = int(size * prevs[i])
+        # Decide how many instances should be taken from each class in order to satisfy the requested prevalences
+        # accurately, and the total number of instances in the sample exactly. If int(size * prevs[i]) (which is
+        # <= size * prevs[i]) examples are drawn from class i, there may be a remainder of instances left to take
+        # in order to satisfy the size constraint. The remainder is distributed across the classes with
+        # probability = prevs. (This avoids placing the remainder in a class whose requested prevalence is 0.)
+ n_requests = {class_: int(size * prevs[i]) for i, class_ in enumerate(self.classes_)}
+ remainder = size - sum(n_requests.values())
+ for rand_class in np.random.choice(self.classes_, size=remainder, p=prevs):
+ n_requests[rand_class] += 1
+ indexes_sample = []
+ for class_, n_requested in n_requests.items():
n_candidates = len(self.index[class_])
index_sample = self.index[class_][
np.random.choice(n_candidates, size=n_requested, replace=(n_requested > n_candidates))
] if n_requested > 0 else []
indexes_sample.append(index_sample)
- taken += n_requested
indexes_sample = np.concatenate(indexes_sample).astype(int)
@@ -152,7 +155,7 @@ class LabelledCollection:
:param size: integer, the size of the uniform sample
:return: a np.ndarray of shape `(size)` with the indexes
"""
- return np.random.choice(len(self), size, replace=False)
+ return np.random.choice(len(self), size, replace=size > len(self))
def sampling(self, size, *prevs, shuffle=True):
"""
@@ -212,68 +215,6 @@ class LabelledCollection:
random_state=random_state)
return LabelledCollection(tr_docs, tr_labels), LabelledCollection(te_docs, te_labels)
- def artificial_sampling_generator(self, sample_size, n_prevalences=101, repeats=1):
- """
- A generator of samples that implements the artificial prevalence protocol (APP).
- The APP consists of exploring a grid of prevalence values containing `n_prevalences` points (e.g.,
- [0, 0.05, 0.1, 0.15, ..., 1], if `n_prevalences=21`), and generating all valid combinations of
- prevalence values for all classes (e.g., for 3 classes, samples with [0, 0, 1], [0, 0.05, 0.95], ...,
- [1, 0, 0] prevalence values of size `sample_size` will be yielded). The number of samples for each valid
- combination of prevalence values is indicated by `repeats`.
-
- :param sample_size: the number of instances in each sample
- :param n_prevalences: the number of prevalence points to be taken from the [0,1] interval (including the
- limits {0,1}). E.g., if `n_prevalences=11`, then the prevalence points to take are [0, 0.1, 0.2, ..., 1]
- :param repeats: the number of samples to generate for each valid combination of prevalence values (default 1)
- :return: yield samples generated at artificially controlled prevalence values
- """
- dimensions = self.n_classes
- for prevs in artificial_prevalence_sampling(dimensions, n_prevalences, repeats):
- yield self.sampling(sample_size, *prevs)
-
- def artificial_sampling_index_generator(self, sample_size, n_prevalences=101, repeats=1):
- """
- A generator of sample indexes implementing the artificial prevalence protocol (APP).
- The APP consists of exploring
- a grid of prevalence values (e.g., [0, 0.05, 0.1, 0.15, ..., 1]), and generating all valid combinations of
- prevalence values for all classes (e.g., for 3 classes, samples with [0, 0, 1], [0, 0.05, 0.95], ...,
- [1, 0, 0] prevalence values of size `sample_size` will be yielded). The number of sample indexes for each valid
- combination of prevalence values is indicated by `repeats`
-
- :param sample_size: the number of instances in each sample (i.e., length of each index)
- :param n_prevalences: the number of prevalence points to be taken from the [0,1] interval (including the
- limits {0,1}). E.g., if `n_prevalences=11`, then the prevalence points to take are [0, 0.1, 0.2, ..., 1]
- :param repeats: the number of samples to generate for each valid combination of prevalence values (default 1)
- :return: yield the indexes that generate the samples according to APP
- """
- dimensions = self.n_classes
- for prevs in artificial_prevalence_sampling(dimensions, n_prevalences, repeats):
- yield self.sampling_index(sample_size, *prevs)
-
- def natural_sampling_generator(self, sample_size, repeats=100):
- """
- A generator of samples that implements the natural prevalence protocol (NPP). The NPP consists of drawing
- samples uniformly at random, therefore approximately preserving the natural prevalence of the collection.
-
- :param sample_size: integer, the number of instances in each sample
- :param repeats: the number of samples to generate
- :return: yield instances of :class:`LabelledCollection`
- """
- for _ in range(repeats):
- yield self.uniform_sampling(sample_size)
-
- def natural_sampling_index_generator(self, sample_size, repeats=100):
- """
- A generator of sample indexes according to the natural prevalence protocol (NPP). The NPP consists of drawing
- samples uniformly at random, therefore approximately preserving the natural prevalence of the collection.
-
- :param sample_size: integer, the number of instances in each sample (i.e., the length of each index)
- :param repeats: the number of indexes to generate
- :return: yield `repeats` instances of np.ndarray with shape `(sample_size,)`
- """
- for _ in range(repeats):
- yield self.uniform_sampling_index(sample_size)
-
def __add__(self, other):
"""
Returns a new :class:`LabelledCollection` as the union of this collection with another collection
diff --git a/quapy/functional.py b/quapy/functional.py
index a8b17f6..e42d743 100644
--- a/quapy/functional.py
+++ b/quapy/functional.py
@@ -4,36 +4,6 @@ import scipy
import numpy as np
-def artificial_prevalence_sampling(dimensions, n_prevalences=21, repeat=1, return_constrained_dim=False):
- """
- Generates vectors of prevalence values artificially drawn from an exhaustive grid of prevalence values. The
- number of prevalence values explored for each dimension depends on `n_prevalences`, so that, if, for example,
- `n_prevalences=11` then the prevalence values of the grid are taken from [0, 0.1, 0.2, ..., 0.9, 1]. Only
- valid prevalence distributions are returned, i.e., vectors of prevalence values that sum up to 1. For each
- valid vector of prevalence values, `repeat` copies are returned. The vector of prevalence values can be
- implicit (by setting `return_constrained_dim=False`), meaning that the last dimension (which is constrained
- to 1 - sum of the rest) is not returned (note that, quite obviously, in this case the vector does not sum up to 1).
-
- :param dimensions: the number of classes
- :param n_prevalences: the number of equidistant prevalence points to extract from the [0,1] interval for the grid
- (default is 21)
- :param repeat: number of copies for each valid prevalence vector (default is 1)
- :param return_constrained_dim: set to True to return all dimensions, or to False (default) for ommitting the
- constrained dimension
- :return: a `np.ndarray` of shape `(n, dimensions)` if `return_constrained_dim=True` or of shape `(n, dimensions-1)`
- if `return_constrained_dim=False`, where `n` is the number of valid combinations found in the grid multiplied
- by `repeat`
- """
- s = np.linspace(0., 1., n_prevalences, endpoint=True)
- s = [s] * (dimensions - 1)
- prevs = [p for p in itertools.product(*s, repeat=1) if sum(p)<=1]
- if return_constrained_dim:
- prevs = [p+(1-sum(p),) for p in prevs]
- prevs = np.asarray(prevs).reshape(len(prevs), -1)
- if repeat>1:
- prevs = np.repeat(prevs, repeat, axis=0)
- return prevs
-
def prevalence_linspace(n_prevalences=21, repeats=1, smooth_limits_epsilon=0.01):
"""
diff --git a/quapy/model_selection.py b/quapy/model_selection.py
index 5af4b2f..eef811b 100644
--- a/quapy/model_selection.py
+++ b/quapy/model_selection.py
@@ -21,7 +21,7 @@ class GridSearchQ(BaseQuantifier):
:param model: the quantifier to optimize
:type model: BaseQuantifier
:param param_grid: a dictionary with keys the parameter names and values the list of values to explore
- :param sample_size: the size of the samples to extract from the validation set (ignored if protocl='gen')
+ :param sample_size: the size of the samples to extract from the validation set (ignored if protocol='gen')
:param protocol: either 'app' for the artificial prevalence protocol, 'npp' for the natural prevalence
protocol, or 'gen' for using a custom sampling generator function
:param n_prevpoints: if specified, indicates the number of equally distant points to extract from the interval
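A note on the allocation scheme introduced in patch 01 above: it can be illustrated in isolation as follows (a minimal sketch with simplified, hypothetical names; the actual logic lives inside LabelledCollection.sampling_index):

import numpy as np

def allocate_counts(size, prevs, classes):
    # take the integer part of each class' share first
    n_requests = {c: int(size * p) for c, p in zip(classes, prevs)}
    # distribute the remainder at random with probability = prevs, so that a class
    # whose requested prevalence is 0 never receives any of the leftover instances
    remainder = size - sum(n_requests.values())
    for c in np.random.choice(classes, size=remainder, p=prevs):
        n_requests[c] += 1
    return n_requests

# e.g., size=10 with prevs=[0.33, 0.33, 0.34] first allocates {0: 3, 1: 3, 2: 3}
# and then assigns the single remaining instance to a class drawn with p=prevs
print(allocate_counts(10, [0.33, 0.33, 0.34], classes=[0, 1, 2]))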
From be7a126c9492969a41e925507e4d67a7bcda7552 Mon Sep 17 00:00:00 2001
From: Alejandro Moreo
Date: Thu, 7 Apr 2022 16:48:31 +0200
Subject: [PATCH 02/67] update todo things
---
TODO.txt | 6 +++++-
1 file changed, 5 insertions(+), 1 deletion(-)
diff --git a/TODO.txt b/TODO.txt
index 8a674a7..c20e901 100644
--- a/TODO.txt
+++ b/TODO.txt
@@ -1,7 +1,11 @@
+sample_size should not be mandatory when qp.environ['SAMPLE_SIZE'] has been specified
+clean all the cumbersome methods that have to be implemented for new quantifiers (e.g., n_classes_ prop, etc.)
+make GridSearchQ truly parallel
+abstract protocols
+
Packaging:
==========================================
-Documentation with sphinx
Document methods with paper references
unit-tests
clean wiki_examples!
From fd339839a541bb13eea4462423acbd3d4e101eef Mon Sep 17 00:00:00 2001
From: Alejandro Moreo
Date: Thu, 19 May 2022 13:43:32 +0200
Subject: [PATCH 03/67] removing redundant code
---
quapy/method/aggregative.py | 8 --------
1 file changed, 8 deletions(-)
diff --git a/quapy/method/aggregative.py b/quapy/method/aggregative.py
index 19969c6..bb71525 100644
--- a/quapy/method/aggregative.py
+++ b/quapy/method/aggregative.py
@@ -442,14 +442,6 @@ class PACC(AggregativeProbabilisticQuantifier):
classes = val_data.classes_
self.pcc = PCC(self.learner)
-
- # estimate the matrix with entry (i,j) being the estimate of P(yi|yj), that is, the probability that a
- # document that belongs to yj ends up being classified as belonging to yi
- n_classes = len(classes)
- confusion = np.empty(shape=(n_classes, n_classes))
- for i, class_ in enumerate(classes):
- confusion[i] = y_[y == class_].mean(axis=0)
-
self.Pte_cond_estim_ = self.getPteCondEstim(classes, y, y_)
return self
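For context, the block deleted above duplicated what getPteCondEstim already computes. A sketch of that computation, mirroring the removed lines (illustrative only; the library's actual implementation may differ, e.g., in the orientation of the matrix):

import numpy as np

def pte_cond_estim(classes, y, y_):
    # row i holds the average posterior vector over the validation documents whose
    # true class is classes[i], estimating how documents of class i get classified
    confusion = np.empty(shape=(len(classes), len(classes)))
    for i, class_ in enumerate(classes):
        confusion[i] = y_[y == class_].mean(axis=0)
    return confusion

# toy check: two classes, four validation documents with posterior probabilities y_
y = np.array([0, 0, 1, 1])
y_ = np.array([[.9, .1], [.8, .2], [.3, .7], [.2, .8]])
print(pte_cond_estim([0, 1], y, y_))  # approx. [[.85, .15], [.25, .75]]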
From b453c8fcbc89a9c69718c0f82916b4292e573fd1 Mon Sep 17 00:00:00 2001
From: Alejandro Moreo
Date: Fri, 20 May 2022 16:48:46 +0200
Subject: [PATCH 04/67] first commit protocols
---
quapy/functional.py | 21 ++++
quapy/newprotocol.py | 244 +++++++++++++++++++++++++++++++++++++++++++
quapy/protocol.py | 179 +++++++++++++++++++++++++++++++
3 files changed, 444 insertions(+)
create mode 100644 quapy/newprotocol.py
create mode 100644 quapy/protocol.py
diff --git a/quapy/functional.py b/quapy/functional.py
index e42d743..215d89f 100644
--- a/quapy/functional.py
+++ b/quapy/functional.py
@@ -239,3 +239,24 @@ def get_nprevpoints_approximation(combinations_budget:int, n_classes:int, n_repe
else:
n_prevpoints += 1
+
+def check_prevalence_vector(p, raise_exception=False, tolerance=1e-08):
+    """
+    Checks that `p` is a valid prevalence vector, i.e., that it contains values in [0,1] and that these values
+    sum up to 1.
+
+    :param p: the prevalence vector to check
+    :param raise_exception: if True, raises a ValueError describing the problem instead of returning False
+    :param tolerance: numerical tolerance with which the sum of prevalence values is compared against 1
+    :return: True if `p` is valid, False otherwise
+    """
+    p = np.asarray(p)
+    if not all(p >= 0):
+        if raise_exception:
+            raise ValueError('the prevalence vector contains negative numbers')
+        return False
+    if not all(p <= 1):
+        if raise_exception:
+            raise ValueError('the prevalence vector contains values >1')
+        return False
+    if not np.isclose(p.sum(), 1, atol=tolerance):
+        if raise_exception:
+            raise ValueError('the prevalence vector does not sum up to 1')
+        return False
+    return True
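A quick usage sketch of the new helper (hypothetical calls, assuming quapy.functional is importable as F):

import quapy.functional as F

print(F.check_prevalence_vector([0.2, 0.3, 0.5]))   # True: a valid distribution
print(F.check_prevalence_vector([0.2, 0.3, 0.6]))   # False: sums to 1.1
print(F.check_prevalence_vector([-0.1, 0.5, 0.6]))  # False: negative entry
F.check_prevalence_vector([0.7, 0.7], raise_exception=True)  # raises ValueError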
diff --git a/quapy/newprotocol.py b/quapy/newprotocol.py
new file mode 100644
index 0000000..799f79b
--- /dev/null
+++ b/quapy/newprotocol.py
@@ -0,0 +1,244 @@
+import itertools
+from collections.abc import Generator
+from contextlib import ExitStack
+from abc import ABCMeta, abstractmethod
+
+from quapy.data import LabelledCollection
+import quapy.functional as F
+
+
+# 0.1.7
+# change the LabelledCollection API (removing protocol-related samplings)
+# need to change the two references to the above in the wiki / doc, and code examples...
+# removed artificial_prevalence_sampling from functional
+
+
+# class AbstractProtocol(metaclass=ABCMeta):
+# def __call__(self):
+# for g in self.gen():
+# yield g
+#
+# @abstractmethod
+# def gen(self):
+# ...
+
+
+class AbstractStochasticProtocol(metaclass=ABCMeta):
+ def __init__(self, seed=None):
+ self.random_seed = seed
+
+ @property
+ def random_seed(self):
+ return self._random_seed
+
+ @random_seed.setter
+ def random_seed(self, seed):
+ self._random_seed = seed
+
+ @abstractmethod
+ def samples_parameters(self):
+ """
+        This function has to return all the necessary parameters to replicate the samples
+
+        :return: a list of parameters, each of which serves to deterministically generate a sample
+ """
+ ...
+
+ @abstractmethod
+ def sample(self, params):
+ """
+ Extract one sample determined by the given parameters
+
+ :param params: all the necessary parameters to generate a sample
+ :return: one sample (the same sample has to be generated for the same parameters)
+ """
+ ...
+
+ def __call__(self):
+ with ExitStack() as stack:
+ if self.random_seed is not None:
+ stack.enter_context(qp.util.temp_seed(self.random_seed))
+ for params in self.samples_parameters():
+ yield self.sample(params)
+
+
+class APP(AbstractStochasticProtocol):
+ """
+ Implementation of the artificial prevalence protocol (APP).
+ The APP consists of exploring a grid of prevalence values containing `n_prevalences` points (e.g.,
+ [0, 0.05, 0.1, 0.15, ..., 1], if `n_prevalences=21`), and generating all valid combinations of
+ prevalence values for all classes (e.g., for 3 classes, samples with [0, 0, 1], [0, 0.05, 0.95], ...,
+ [1, 0, 0] prevalence values of size `sample_size` will be yielded). The number of samples for each valid
+ combination of prevalence values is indicated by `repeats`.
+
+ :param sample_size: integer, number of instances in each sample
+ :param n_prevalences: the number of equidistant prevalence points to extract from the [0,1] interval for the
+ grid (default is 21)
+ :param repeats: number of copies for each valid prevalence vector (default is 1)
+ :param random_seed: allows replicating samples across runs (default None)
+ """
+
+ def __init__(self, data:LabelledCollection, sample_size, n_prevalences=21, repeats=1, random_seed=None):
+ super(APP, self).__init__(random_seed)
+ self.data = data
+ self.sample_size = sample_size
+ self.n_prevalences = n_prevalences
+ self.repeats = repeats
+
+ def prevalence_grid(self, dimensions):
+ """
+ Generates vectors of prevalence values from an exhaustive grid of prevalence values. The
+ number of prevalence values explored for each dimension depends on `n_prevalences`, so that, if, for example,
+ `n_prevalences=11` then the prevalence values of the grid are taken from [0, 0.1, 0.2, ..., 0.9, 1]. Only
+ valid prevalence distributions are returned, i.e., vectors of prevalence values that sum up to 1. For each
+        valid vector of prevalence values, `repeats` copies are returned. The last dimension (which is constrained
+        to be 1 minus the sum of the rest) is not returned (note that, for this reason, the vectors in the grid do
+        not sum up to 1). Note that this method is deterministic, i.e., there is no random sampling anywhere.
+
+        :param dimensions: the number of classes
+        :return: a `np.ndarray` of shape `(n, dimensions-1)`, where `n` is the number of valid combinations found
+        in the grid multiplied by `repeats`
+ """
+ s = np.linspace(0., 1., self.n_prevalences, endpoint=True)
+ s = [s] * (dimensions - 1)
+ prevs = [p for p in itertools.product(*s, repeat=1) if sum(p) <= 1]
+ prevs = np.asarray(prevs).reshape(len(prevs), -1)
+ if self.repeats > 1:
+ prevs = np.repeat(prevs, self.repeats, axis=0)
+ return prevs
+
+ def samples_parameters(self):
+ indexes = []
+ for prevs in self.prevalence_grid(dimensions=self.data.n_classes):
+            index = self.data.sampling_index(self.sample_size, *prevs)
+ indexes.append(index)
+ return indexes
+
+ def sample(self, index):
+ return self.data.sampling_from_index(index)
+
+
+class NPP(AbstractStochasticProtocol):
+ """
+ A generator of samples that implements the natural prevalence protocol (NPP). The NPP consists of drawing
+ samples uniformly at random, therefore approximately preserving the natural prevalence of the collection.
+
+ :param sample_size: integer, the number of instances in each sample
+ :param repeats: the number of samples to generate
+ """
+
+ def __init__(self, data:LabelledCollection, sample_size, repeats=1, random_seed=None):
+ super(NPP, self).__init__(random_seed)
+ self.data = data
+ self.sample_size = sample_size
+ self.repeats = repeats
+ self.random_seed = random_seed
+
+ def samples_parameters(self):
+ indexes = []
+ for _ in range(self.repeats):
+            index = self.data.uniform_sampling_index(self.sample_size)
+ indexes.append(index)
+ return indexes
+
+ def sample(self, index):
+ return self.data.sampling_from_index(index)
+
+
+class USimplexPP(AbstractStochasticProtocol):
+
+ def __init__(self, data: LabelledCollection, sample_size, repeats=1, random_seed=None):
+ super(USimplexPP, self).__init__(random_seed)
+ self.data = data
+ self.sample_size = sample_size
+ self.repeats = repeats
+ self.random_seed = random_seed
+
+ def samples_parameters(self):
+ indexes = []
+        for prevs in F.uniform_simplex_sampling(n_classes=self.data.n_classes, size=self.repeats):
+            index = self.data.sampling_index(self.sample_size, *prevs)
+ indexes.append(index)
+ return indexes
+
+ def sample(self, index):
+ return self.data.sampling_from_index(index)
+
+
+class CovariateShift(AbstractStochasticProtocol):
+ """
+ Generates mixtures of two domains (A and B) at controlled rates, but preserving the original class prevalence.
+
+    :param domainA: a :class:`LabelledCollection` representing domain A
+    :param domainB: a :class:`LabelledCollection` representing domain B
+    :param sample_size: integer, the number of instances in each sample
+    :param repeats: the number of samples to generate for each mixture point
+    :param prevalence: the prevalence to preserve along the mixtures. If specified, should be an array containing
+        one prevalence value (positive float) for each class and summing up to one. If not specified, the prevalence
+        will be taken from domain A (default).
+    :param mixture_points: an integer indicating the number of points to take from a linear scale (e.g., 21 will
+        generate the mixture points [1, 0.95, 0.9, ..., 0]), or the array of mixture values itself
+    :param random_seed: allows replicating samples across runs (default None)
+ """
+
+ def __init__(
+ self,
+ domainA: LabelledCollection,
+ domainB: LabelledCollection,
+ sample_size,
+ repeats=1,
+ prevalence=None,
+ mixture_points=11,
+ random_seed=None):
+ super(CovariateShift, self).__init__(random_seed)
+        self.A = domainA
+        self.B = domainB
+ self.sample_size = sample_size
+ self.repeats = repeats
+ if prevalence is None:
+ self.prevalence = domainA.prevalence()
+ else:
+ self.prevalence = np.asarray(prevalence)
+ assert len(self.prevalence) == domainA.n_classes, \
+ f'wrong shape for the vector prevalence (expected {domainA.n_classes})'
+ assert F.check_prevalence_vector(self.prevalence), \
+ f'the prevalence vector is not valid (either it contains values outside [0,1] or does not sum up to 1)'
+        if isinstance(mixture_points, int):
+            self.mixture_points = np.linspace(0, 1, mixture_points)[::-1]
+        else:
+            self.mixture_points = np.asarray(mixture_points)
+            assert all(np.logical_and(self.mixture_points >= 0, self.mixture_points <= 1)), \
+                'mixture_points datatype not understood (expected int or a sequence of real values in [0,1])'
+ self.random_seed = random_seed
+
+    def samples_parameters(self):
+        # placeholder: for now this draws uniform samples from domain A only; the actual mixture
+        # of the two domains at controlled rates is still to be implemented
+        indexes = []
+        for _ in range(self.repeats):
+            index = self.A.uniform_sampling_index(self.sample_size)
+            indexes.append(index)
+        return indexes
+
+    def sample(self, index):
+        return self.A.sampling_from_index(index)
+
+
+if __name__=='__main__':
+ import numpy as np
+ import quapy as qp
+
+ y = [0]*25 + [1]*25 + [2]*25 + [3]*25
+ X = [str(i)+'-'+str(yi) for i, yi in enumerate(y)]
+
+ data = LabelledCollection(X, y, classes_=sorted(np.unique(y)))
+
+ # p=CounterExample(1, 8, 10, 5)
+
+ # p = APP(data, sample_size=10, n_prevalences=11, random_seed=42)
+ # p = NPP(data, sample_size=10, repeats=10, random_seed=42)
+ # p = NPP(data, sample_size=10, repeats=10)
+ p = USimplexPP(data, sample_size=10, repeats=10)
+
+ for _ in range(2):
+ print('init generator', p.__class__.__name__)
+ for i in p():
+ # print(i)
+ print(i.instances, i.labels, i.prevalence())
+
+ print('done')
+
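The pattern above (samples_parameters draws all the randomness once; sample is then fully deterministic) is what makes the seeding guarantee work. A self-contained sketch of the same idea, with a stand-in for qp.util.temp_seed (assumed here to be a context manager that seeds numpy and later restores the previous random state):

import numpy as np
from contextlib import ExitStack, contextmanager

@contextmanager
def temp_seed(seed):
    # seed numpy for the duration of the block, then restore the prior state
    state = np.random.get_state()
    np.random.seed(seed)
    try:
        yield
    finally:
        np.random.set_state(state)

class TinyProtocol:
    def __init__(self, seed=None):
        self.random_seed = seed

    def samples_parameters(self):
        # the only stochastic step: draw the parameters that identify each sample
        return [np.random.randint(0, 100, size=3) for _ in range(2)]

    def sample(self, params):
        # deterministic given its parameters
        return params * 2

    def __call__(self):
        with ExitStack() as stack:
            if self.random_seed is not None:
                stack.enter_context(temp_seed(self.random_seed))
            for params in self.samples_parameters():
                yield self.sample(params)

p = TinyProtocol(seed=42)
print([s.tolist() for s in p()])  # the same sequence is reproduced...
print([s.tolist() for s in p()])  # ...on every call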
diff --git a/quapy/protocol.py b/quapy/protocol.py
new file mode 100644
index 0000000..99f2522
--- /dev/null
+++ b/quapy/protocol.py
@@ -0,0 +1,179 @@
+import itertools
+from collections.abc import Generator
+from contextlib import ExitStack
+from abc import ABCMeta, abstractmethod
+
+from quapy.data import LabelledCollection
+import quapy.functional as F
+
+
+# 0.1.7
+# change the LabelledCollection API (removing protocol-related samplings)
+# need to change the two references to the above in the wiki / doc, and code examples...
+# removed artificial_prevalence_sampling from functional
+
+
+class NewAbstractProtocol(Generator):
+ @abstractmethod
+ def send(self, value):
+ """Send a value into the generator.
+ Return next yielded value or raise StopIteration.
+ """
+ raise StopIteration
+
+ @abstractmethod
+ def throw(self, typ, val=None, tb=None):
+ """Raise an exception in the generator.
+ Return next yielded value or raise StopIteration.
+ """
+ if val is None:
+ if tb is None:
+ raise typ
+ val = typ()
+ if tb is not None:
+ val = val.with_traceback(tb)
+ raise val
+
+
+
+class AbstractProtocol(metaclass=ABCMeta):
+ """
+ Abstract class for sampling protocols.
+ A sampling protocol defines how to generate samples out of some dataset.
+ """
+
+    def __call__(self, data: LabelledCollection):
+        """
+        A generator that yields one sample at each iteration
+
+        :param data: the set of data from which samples are to be drawn
+        :return: yield one sample (instance of :class:`quapy.data.LabelledCollection`) at each iteration
+ """
+ for index in self.indexes(data):
+ yield data.sampling_from_index(index)
+
+ def indexes(self, data: LabelledCollection):
+ """
+ A generator that yields one sample index at each iteration.
+ (This function is mainly a generic decorator that sets, if requested, the local random seed; the real
+ sampling is implemented by :meth:`_indexes`.)
+
+ :param data: the set of data from which samples' indexes are to be drawn
+ :return: one sample index (instance of `np.ndarray`) at each iteration
+ """
+ with ExitStack() as stack:
+ if self.random_seed is not None:
+ stack.enter_context(qp.util.temp_seed(self.random_seed))
+ for index in self._indexes(data):
+ yield index
+
+ @abstractmethod
+ def _indexes(self, data: LabelledCollection):
+ ...
+
+
+class APP(AbstractProtocol):
+ """
+ Implementation of the artificial prevalence protocol (APP).
+ The APP consists of exploring a grid of prevalence values containing `n_prevalences` points (e.g.,
+ [0, 0.05, 0.1, 0.15, ..., 1], if `n_prevalences=21`), and generating all valid combinations of
+ prevalence values for all classes (e.g., for 3 classes, samples with [0, 0, 1], [0, 0.05, 0.95], ...,
+ [1, 0, 0] prevalence values of size `sample_size` will be yielded). The number of samples for each valid
+ combination of prevalence values is indicated by `repeats`.
+
+ :param sample_size: integer, number of instances in each sample
+ :param n_prevalences: the number of equidistant prevalence points to extract from the [0,1] interval for the
+ grid (default is 21)
+ :param repeats: number of copies for each valid prevalence vector (default is 1)
+ :param random_seed: allows replicating samples across runs (default None)
+ """
+
+ def __init__(self, data:LabelledCollection, sample_size, n_prevalences=21, repeats=1, random_seed=None):
+ self.data = data
+ self.sample_size = sample_size
+ self.n_prevalences = n_prevalences
+ self.repeats = repeats
+ self.random_seed = random_seed
+
+ def _indexes(self, data: LabelledCollection):
+ for prevs in self.prevalence_grid(dimensions=data.n_classes):
+ yield data.sampling_index(self.sample_size, *prevs)
+
+ def prevalence_grid(self, dimensions, return_constrained_dim=False):
+ """
+ Generates vectors of prevalence values from an exhaustive grid of prevalence values. The
+ number of prevalence values explored for each dimension depends on `n_prevalences`, so that, if, for example,
+ `n_prevalences=11` then the prevalence values of the grid are taken from [0, 0.1, 0.2, ..., 0.9, 1]. Only
+ valid prevalence distributions are returned, i.e., vectors of prevalence values that sum up to 1. For each
+ valid vector of prevalence values, `repeat` copies are returned. The vector of prevalence values can be
+ implicit (by setting `return_constrained_dim=False`), meaning that the last dimension (which is constrained
+ to 1 - sum of the rest) is not returned (note that, quite obviously, in this case the vector does not sum up to
+ 1). Note that this method is deterministic, i.e., there is no random sampling anywhere.
+
+ :param dimensions: the number of classes
+        :param return_constrained_dim: set to True to return all dimensions, or to False (default) for omitting the
+ constrained dimension
+ :return: a `np.ndarray` of shape `(n, dimensions)` if `return_constrained_dim=True` or of shape
+ `(n, dimensions-1)` if `return_constrained_dim=False`, where `n` is the number of valid combinations found
+ in the grid multiplied by `repeat`
+ """
+ s = np.linspace(0., 1., self.n_prevalences, endpoint=True)
+ s = [s] * (dimensions - 1)
+ prevs = [p for p in itertools.product(*s, repeat=1) if sum(p) <= 1]
+ if return_constrained_dim:
+ prevs = [p + (1 - sum(p),) for p in prevs]
+ prevs = np.asarray(prevs).reshape(len(prevs), -1)
+ if self.repeats > 1:
+ prevs = np.repeat(prevs, self.repeats, axis=0)
+ return prevs
+
+
+class NPP(AbstractProtocol):
+ """
+ A generator of samples that implements the natural prevalence protocol (NPP). The NPP consists of drawing
+ samples uniformly at random, therefore approximately preserving the natural prevalence of the collection.
+
+ :param sample_size: integer, the number of instances in each sample
+ :param repeats: the number of samples to generate
+ """
+
+ def __init__(self, sample_size, repeats=1, random_seed=None):
+ self.sample_size = sample_size
+ self.repeats = repeats
+ self.random_seed = random_seed
+
+ def _indexes(self, data: LabelledCollection):
+ for _ in range(self.repeats):
+ yield data.uniform_sampling_index(self.sample_size)
+
+
+class USimplexPP(AbstractProtocol):
+
+ def __init__(self, sample_size, repeats=1, random_seed=None):
+ self.sample_size = sample_size
+ self.repeats = repeats
+ self.random_seed = random_seed
+
+ def _indexes(self, data: LabelledCollection):
+ for prevs in F.uniform_simplex_sampling(n_classes=data.n_classes, size=self.repeats):
+ yield data.sampling_index(self.sample_size, *prevs)
+
+
+
+if __name__=='__main__':
+ import numpy as np
+ import quapy as qp
+
+ y = [0]*25 + [1]*25 + [2]*25 + [3]*25
+ X = [str(i)+'-'+str(yi) for i, yi in enumerate(y)]
+
+ data = LabelledCollection(X, y, classes_=sorted(np.unique(y)))
+
+ # p = APP(10, n_prevalences=11, random_seed=42)
+ # p = NPP(10, repeats=10, random_seed=42)
+ p = USimplexPP(10, repeats=10, random_seed=42)
+
+ for i in p(data):
+ print(i.instances, i.classes, i.prevalence())
+
+ print('done')
+
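To visualize what prevalence_grid produces, its core can be run standalone (a sketch; as in the method above, the constrained last dimension is left implicit):

import itertools
import numpy as np

n_prevalences, n_classes = 5, 3   # grid [0, 0.25, 0.5, 0.75, 1] for a 3-class problem
s = np.linspace(0., 1., n_prevalences, endpoint=True)
grid = np.asarray([p for p in itertools.product(*([s] * (n_classes - 1))) if sum(p) <= 1])
print(len(grid))              # 15 valid combinations out of 5**2 = 25 grid points
print(1 - grid.sum(axis=1))   # the implicit prevalence of the last class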
From 46e3632200e2e7d54814bd2b1a4d91c944f32a0a Mon Sep 17 00:00:00 2001
From: Alex Moreo
Date: Mon, 23 May 2022 00:20:08 +0200
Subject: [PATCH 05/67] ongoing protocols
---
quapy/{evaluation.py => depr_evaluation.py} | 0
quapy/newprotocol.py | 244 ----------------
quapy/protocol.py | 293 +++++++++++++++-----
3 files changed, 223 insertions(+), 314 deletions(-)
rename quapy/{evaluation.py => depr_evaluation.py} (100%)
delete mode 100644 quapy/newprotocol.py
diff --git a/quapy/evaluation.py b/quapy/depr_evaluation.py
similarity index 100%
rename from quapy/evaluation.py
rename to quapy/depr_evaluation.py
diff --git a/quapy/newprotocol.py b/quapy/newprotocol.py
deleted file mode 100644
index 799f79b..0000000
--- a/quapy/newprotocol.py
+++ /dev/null
@@ -1,244 +0,0 @@
-import itertools
-from collections.abc import Generator
-from contextlib import ExitStack
-from abc import ABCMeta, abstractmethod
-
-from quapy.data import LabelledCollection
-import quapy.functional as F
-
-
-# 0.1.7
-# change the LabelledCollection API (removing protocol-related samplings)
-# need to change the two references to the above in the wiki / doc, and code examples...
-# removed artificial_prevalence_sampling from functional
-
-
-# class AbstractProtocol(metaclass=ABCMeta):
-# def __call__(self):
-# for g in self.gen():
-# yield g
-#
-# @abstractmethod
-# def gen(self):
-# ...
-
-
-class AbstractStochasticProtocol(metaclass=ABCMeta):
- def __init__(self, seed=None):
- self.random_seed = seed
-
- @property
- def random_seed(self):
- return self._random_seed
-
- @random_seed.setter
- def random_seed(self, seed):
- self._random_seed = seed
-
- @abstractmethod
- def samples_parameters(self):
- """
- This function has to return all the necessary parameters to replicate the samples
- :return: a list of parameters, each of which serves to deterministically generate a sample
- """
- ...
-
- @abstractmethod
- def sample(self, params):
- """
- Extract one sample determined by the given parameters
-
- :param params: all the necessary parameters to generate a sample
- :return: one sample (the same sample has to be generated for the same parameters)
- """
- ...
-
- def __call__(self):
- with ExitStack() as stack:
- if self.random_seed is not None:
- stack.enter_context(qp.util.temp_seed(self.random_seed))
- for params in self.samples_parameters():
- yield self.sample(params)
-
-
-class APP(AbstractStochasticProtocol):
- """
- Implementation of the artificial prevalence protocol (APP).
- The APP consists of exploring a grid of prevalence values containing `n_prevalences` points (e.g.,
- [0, 0.05, 0.1, 0.15, ..., 1], if `n_prevalences=21`), and generating all valid combinations of
- prevalence values for all classes (e.g., for 3 classes, samples with [0, 0, 1], [0, 0.05, 0.95], ...,
- [1, 0, 0] prevalence values of size `sample_size` will be yielded). The number of samples for each valid
- combination of prevalence values is indicated by `repeats`.
-
- :param sample_size: integer, number of instances in each sample
- :param n_prevalences: the number of equidistant prevalence points to extract from the [0,1] interval for the
- grid (default is 21)
- :param repeats: number of copies for each valid prevalence vector (default is 1)
- :param random_seed: allows replicating samples across runs (default None)
- """
-
- def __init__(self, data:LabelledCollection, sample_size, n_prevalences=21, repeats=1, random_seed=None):
- super(APP, self).__init__(random_seed)
- self.data = data
- self.sample_size = sample_size
- self.n_prevalences = n_prevalences
- self.repeats = repeats
-
- def prevalence_grid(self, dimensions):
- """
- Generates vectors of prevalence values from an exhaustive grid of prevalence values. The
- number of prevalence values explored for each dimension depends on `n_prevalences`, so that, if, for example,
- `n_prevalences=11` then the prevalence values of the grid are taken from [0, 0.1, 0.2, ..., 0.9, 1]. Only
- valid prevalence distributions are returned, i.e., vectors of prevalence values that sum up to 1. For each
- valid vector of prevalence values, `repeat` copies are returned. The vector of prevalence values can be
- implicit (by setting `return_constrained_dim=False`), meaning that the last dimension (which is constrained
- to 1 - sum of the rest) is not returned (note that, quite obviously, in this case the vector does not sum up to
- 1). Note that this method is deterministic, i.e., there is no random sampling anywhere.
-
- :param dimensions: the number of classes
- :return: a `np.ndarray` of shape `(n, dimensions)` if `return_constrained_dim=True` or of shape
- `(n, dimensions-1)` if `return_constrained_dim=False`, where `n` is the number of valid combinations found
- in the grid multiplied by `repeat`
- """
- s = np.linspace(0., 1., self.n_prevalences, endpoint=True)
- s = [s] * (dimensions - 1)
- prevs = [p for p in itertools.product(*s, repeat=1) if sum(p) <= 1]
- prevs = np.asarray(prevs).reshape(len(prevs), -1)
- if self.repeats > 1:
- prevs = np.repeat(prevs, self.repeats, axis=0)
- return prevs
-
- def samples_parameters(self):
- indexes = []
- for prevs in self.prevalence_grid(dimensions=self.data.n_classes):
- index = data.sampling_index(self.sample_size, *prevs)
- indexes.append(index)
- return indexes
-
- def sample(self, index):
- return self.data.sampling_from_index(index)
-
-
-class NPP(AbstractStochasticProtocol):
- """
- A generator of samples that implements the natural prevalence protocol (NPP). The NPP consists of drawing
- samples uniformly at random, therefore approximately preserving the natural prevalence of the collection.
-
- :param sample_size: integer, the number of instances in each sample
- :param repeats: the number of samples to generate
- """
-
- def __init__(self, data:LabelledCollection, sample_size, repeats=1, random_seed=None):
- super(NPP, self).__init__(random_seed)
- self.data = data
- self.sample_size = sample_size
- self.repeats = repeats
- self.random_seed = random_seed
-
- def samples_parameters(self):
- indexes = []
- for _ in range(self.repeats):
- index = data.uniform_sampling_index(self.sample_size)
- indexes.append(index)
- return indexes
-
- def sample(self, index):
- return self.data.sampling_from_index(index)
-
-
-class USimplexPP(AbstractStochasticProtocol):
-
- def __init__(self, data: LabelledCollection, sample_size, repeats=1, random_seed=None):
- super(USimplexPP, self).__init__(random_seed)
- self.data = data
- self.sample_size = sample_size
- self.repeats = repeats
- self.random_seed = random_seed
-
- def samples_parameters(self):
- indexes = []
- for prevs in F.uniform_simplex_sampling(n_classes=data.n_classes, size=self.repeats):
- index = data.sampling_index(self.sample_size, *prevs)
- indexes.append(index)
- return indexes
-
- def sample(self, index):
- return self.data.sampling_from_index(index)
-
-
-class CovariateShift(AbstractStochasticProtocol):
- """
- Generates mixtures of two domains (A and B) at controlled rates, but preserving the original class prevalence.
-
- :param domainA:
- :param domainB:
- :param sample_size:
- :param repeats:
- :param prevalence: the prevalence to preserv along the mixtures. If specified, should be an array containing
- one prevalence value (positive float) for each class and summing up to one. If not specified, the prevalence
- will be taken from the domain A (default).
- :param mixture_points: an integer indicating the number of points to take from a linear scale (e.g., 21 will
- generate the mixture points [1, 0.95, 0.9, ..., 0]), or the array of mixture values itself.
- the specific points
- :param random_seed:
- """
-
- def __init__(
- self,
- domainA: LabelledCollection,
- domainB: LabelledCollection,
- sample_size,
- repeats=1,
- prevalence=None,
- mixture_points=11,
- random_seed=None):
- super(CovariateShift, self).__init__(random_seed)
- self.data = data
- self.sample_size = sample_size
- self.repeats = repeats
- if prevalence is None:
- self.prevalence = domainA.prevalence()
- else:
- self.prevalence = np.asarray(prevalence)
- assert len(self.prevalence) == domainA.n_classes, \
- f'wrong shape for the vector prevalence (expected {domainA.n_classes})'
- assert F.check_prevalence_vector(self.prevalence), \
- f'the prevalence vector is not valid (either it contains values outside [0,1] or does not sum up to 1)'
- assert isinstance(mixture_points, int) or
- self.random_seed = random_seed
-
- def samples_parameters(self):
- indexes = []
- for _ in range(self.repeats):
- index = data.uniform_sampling_index(self.sample_size)
- indexes.append(index)
- return indexes
-
- def sample(self, index):
- return self.data.sampling_from_index(index)
-
-
-if __name__=='__main__':
- import numpy as np
- import quapy as qp
-
- y = [0]*25 + [1]*25 + [2]*25 + [3]*25
- X = [str(i)+'-'+str(yi) for i, yi in enumerate(y)]
-
- data = LabelledCollection(X, y, classes_=sorted(np.unique(y)))
-
- # p=CounterExample(1, 8, 10, 5)
-
- # p = APP(data, sample_size=10, n_prevalences=11, random_seed=42)
- # p = NPP(data, sample_size=10, repeats=10, random_seed=42)
- # p = NPP(data, sample_size=10, repeats=10)
- p = USimplexPP(data, sample_size=10, repeats=10)
-
- for _ in range(2):
- print('init generator', p.__class__.__name__)
- for i in p():
- # print(i)
- print(i.instances, i.labels, i.prevalence())
-
- print('done')
-
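USimplexPP (below) relies on F.uniform_simplex_sampling, whose implementation does not appear in this series. The Kraemer construction it alludes to can be sketched as follows (an assumption about what that function does, not a quotation of it):

import numpy as np

def uniform_simplex_sampling(n_classes, size=1):
    # Kraemer's trick: drop n_classes-1 uniform points into [0,1], sort them, and take
    # the lengths of the resulting segments; these lengths are uniformly distributed
    # over the unit (n_classes-1)-simplex
    u = np.sort(np.random.rand(size, n_classes - 1), axis=-1)
    bounded = np.concatenate([np.zeros((size, 1)), u, np.ones((size, 1))], axis=-1)
    return np.diff(bounded, axis=-1)

prevs = uniform_simplex_sampling(n_classes=4, size=3)
print(prevs)              # three prevalence vectors for a 4-class problem
print(prevs.sum(axis=1))  # each sums to 1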
diff --git a/quapy/protocol.py b/quapy/protocol.py
index 99f2522..43bb0ef 100644
--- a/quapy/protocol.py
+++ b/quapy/protocol.py
@@ -1,3 +1,4 @@
+import numpy as np
import itertools
from collections.abc import Generator
from contextlib import ExitStack
@@ -5,6 +6,7 @@ from abc import ABCMeta, abstractmethod
from quapy.data import LabelledCollection
import quapy.functional as F
+from tqdm import tqdm
# 0.1.7
@@ -12,66 +14,92 @@ import quapy.functional as F
# need to change the two references to the above in the wiki / doc, and code examples...
# removed artificial_prevalence_sampling from functional
+# maybe add some parameters in the init of the protocols (or maybe only for IndexableWhateverProtocols
+# indicating that the protocol should return indexes, and not samples themselves?)
+# also: some parameters in the init could be used to indicate that the method should return a tuple with
+# unlabelled instances and the vector of prevalence values (and not a LabelledCollection).
+# Or: this can be done in a different function; i.e., we use one function (now __call__) to return
+# LabelledCollections, and another new one for returning the other output, which is more general for
+# evaluation purposes.
-class NewAbstractProtocol(Generator):
- @abstractmethod
- def send(self, value):
- """Send a value into the generator.
- Return next yielded value or raise StopIteration.
- """
- raise StopIteration
-
- @abstractmethod
- def throw(self, typ, val=None, tb=None):
- """Raise an exception in the generator.
- Return next yielded value or raise StopIteration.
- """
- if val is None:
- if tb is None:
- raise typ
- val = typ()
- if tb is not None:
- val = val.with_traceback(tb)
- raise val
-
+# the so-called "gen" function has to be implemented as a protocol. The problem here is that this function
+# should be able to return only unlabelled instances plus a vector of prevalences (and not LabelledCollections).
+# This was coded as different functions in 0.1.6
class AbstractProtocol(metaclass=ABCMeta):
- """
- Abstract class for sampling protocols.
- A sampling protocol defines how to generate samples out of some dataset.
- """
+ @abstractmethod
def __call__(self):
"""
- A generator that yields one sample at each iteration
+ Implements the protocol. Yields one sample at a time
- :return: yield one sample (instance of :class:`quapy.data.LabelledCollection`) at each iteration
+ :return: yields one sample at a time
"""
- for index in self.indexes(data):
- yield data.sampling_from_index(index)
+ ...
- def indexes(self, data: LabelledCollection):
+ def total(self):
"""
- A generator that yields one sample index at each iteration.
- (This function is mainly a generic decorator that sets, if requested, the local random seed; the real
- sampling is implemented by :meth:`_indexes`.)
+ Indicates the total number of samples that the protocol generates.
- :param data: the set of data from which samples' indexes are to be drawn
- :return: one sample index (instance of `np.ndarray`) at each iteration
+ :return: The number of samples to generate if known, or `None` otherwise.
"""
+ return None
+
+
+class AbstractStochasticSeededProtocol(AbstractProtocol):
+ """
+    An AbstractStochasticSeededProtocol is a protocol that generates, via some random procedure (e.g.,
+    via random sampling), sequences of `LabelledCollection` samples. The protocol abstraction requires
+    the object to be instantiated with a seed, so that the sequence can be fully replicated.
+ In order to make this functionality possible, the classes extending this abstraction need to
+ implement only two functions, :meth:`samples_parameters` which generates all the parameters
+ needed for extracting the samples, and :meth:`sample` that, given some parameters as input,
+ deterministically generates a sample.
+
+ :param seed: the seed for allowing to replicate any sequence of samples. Default is None, meaning that
+ the sequence will be different every time the protocol is called.
+ """
+
+ def __init__(self, seed=None):
+ self.random_seed = seed
+
+ @property
+ def random_seed(self):
+ return self._random_seed
+
+ @random_seed.setter
+ def random_seed(self, seed):
+ self._random_seed = seed
+
+ @abstractmethod
+ def samples_parameters(self):
+ """
+ This function has to return all the necessary parameters to replicate the samples
+
+ :return: a list of parameters, each of which serves to deterministically generate a sample
+ """
+ ...
+
+ @abstractmethod
+ def sample(self, params):
+ """
+ Extract one sample determined by the given parameters
+
+ :param params: all the necessary parameters to generate a sample
+ :return: one sample (the same sample has to be generated for the same parameters)
+ """
+ ...
+
+ def __call__(self):
with ExitStack() as stack:
if self.random_seed is not None:
stack.enter_context(qp.util.temp_seed(self.random_seed))
- for index in self._indexes(data):
- yield index
-
- @abstractmethod
- def _indexes(self, data: LabelledCollection):
- ...
+ for params in self.samples_parameters():
+ yield self.sample(params)
-class APP(AbstractProtocol):
+class APP(AbstractStochasticSeededProtocol):
"""
Implementation of the artificial prevalence protocol (APP).
The APP consists of exploring a grid of prevalence values containing `n_prevalences` points (e.g.,
@@ -80,25 +108,22 @@ class APP(AbstractProtocol):
[1, 0, 0] prevalence values of size `sample_size` will be yielded). The number of samples for each valid
combination of prevalence values is indicated by `repeats`.
+ :param data: a `LabelledCollection` from which the samples will be drawn
:param sample_size: integer, number of instances in each sample
:param n_prevalences: the number of equidistant prevalence points to extract from the [0,1] interval for the
grid (default is 21)
- :param repeats: number of copies for each valid prevalence vector (default is 1)
+ :param repeats: number of copies for each valid prevalence vector (default is 10)
:param random_seed: allows replicating samples across runs (default None)
"""
- def __init__(self, data:LabelledCollection, sample_size, n_prevalences=21, repeats=1, random_seed=None):
+ def __init__(self, data:LabelledCollection, sample_size, n_prevalences=21, repeats=10, random_seed=None):
+ super(APP, self).__init__(random_seed)
self.data = data
self.sample_size = sample_size
self.n_prevalences = n_prevalences
self.repeats = repeats
- self.random_seed = random_seed
- def _indexes(self, data: LabelledCollection):
- for prevs in self.prevalence_grid(dimensions=data.n_classes):
- yield data.sampling_index(self.sample_size, *prevs)
-
- def prevalence_grid(self, dimensions, return_constrained_dim=False):
+ def prevalence_grid(self, dimensions):
"""
Generates vectors of prevalence values from an exhaustive grid of prevalence values. The
number of prevalence values explored for each dimension depends on `n_prevalences`, so that, if, for example,
@@ -110,8 +135,6 @@ class APP(AbstractProtocol):
1). Note that this method is deterministic, i.e., there is no random sampling anywhere.
:param dimensions: the number of classes
- :param return_constrained_dim: set to True to return all dimensions, or to False (default) for ommitting the
- constrained dimension
:return: a `np.ndarray` of shape `(n, dimensions-1)`, where `n` is the number of valid combinations found
in the grid multiplied by `repeats`
@@ -119,43 +142,163 @@ class APP(AbstractProtocol):
s = np.linspace(0., 1., self.n_prevalences, endpoint=True)
s = [s] * (dimensions - 1)
prevs = [p for p in itertools.product(*s, repeat=1) if sum(p) <= 1]
- if return_constrained_dim:
- prevs = [p + (1 - sum(p),) for p in prevs]
prevs = np.asarray(prevs).reshape(len(prevs), -1)
if self.repeats > 1:
prevs = np.repeat(prevs, self.repeats, axis=0)
return prevs
+ def samples_parameters(self):
+ indexes = []
+ for prevs in self.prevalence_grid(dimensions=self.data.n_classes):
+            index = self.data.sampling_index(self.sample_size, *prevs)
+ indexes.append(index)
+ return indexes
-class NPP(AbstractProtocol):
+ def sample(self, index):
+ return self.data.sampling_from_index(index)
+
+ def total(self):
+ return F.num_prevalence_combinations(self.n_prevalences, self.data.n_classes, self.repeats)
+
+
+class NPP(AbstractStochasticSeededProtocol):
"""
A generator of samples that implements the natural prevalence protocol (NPP). The NPP consists of drawing
samples uniformly at random, therefore approximately preserving the natural prevalence of the collection.
+ :param data: a `LabelledCollection` from which the samples will be drawn
:param sample_size: integer, the number of instances in each sample
- :param repeats: the number of samples to generate
+ :param repeats: the number of samples to generate. Default is 100.
+ :param random_seed: allows replicating samples across runs (default None)
"""
- def __init__(self, sample_size, repeats=1, random_seed=None):
+ def __init__(self, data:LabelledCollection, sample_size, repeats=100, random_seed=None):
+ super(NPP, self).__init__(random_seed)
+ self.data = data
self.sample_size = sample_size
self.repeats = repeats
self.random_seed = random_seed
- def _indexes(self, data: LabelledCollection):
+ def samples_parameters(self):
+ indexes = []
for _ in range(self.repeats):
- yield data.uniform_sampling_index(self.sample_size)
+            index = self.data.uniform_sampling_index(self.sample_size)
+ indexes.append(index)
+ return indexes
+
+ def sample(self, index):
+ return self.data.sampling_from_index(index)
+
+ def total(self):
+ return self.repeats
-class USimplexPP(AbstractProtocol):
- def __init__(self, sample_size, repeats=1, random_seed=None):
+class USimplexPP(AbstractStochasticSeededProtocol):
+ """
+    A variant of :class:`APP` that, instead of using a grid of equidistant prevalence values,
+    relies on the Kraemer algorithm for sampling the unit (k-1)-simplex uniformly at random, with
+    k the number of classes. This protocol covers the entire range of prevalence values in a
+    statistical sense, i.e., unlike APP there is no guarantee that every region of the simplex is
+    visited the same number of times for every class; it is preferred in cases in which the number
+    of possible combinations of the APP grid values makes that protocol intractable.
+
+ :param data: a `LabelledCollection` from which the samples will be drawn
+ :param sample_size: integer, the number of instances in each sample
+ :param repeats: the number of samples to generate. Default is 100.
+ :param random_seed: allows replicating samples across runs (default None)
+ """
+
+ def __init__(self, data: LabelledCollection, sample_size, repeats=100, random_seed=None):
+ super(USimplexPP, self).__init__(random_seed)
+ self.data = data
self.sample_size = sample_size
self.repeats = repeats
self.random_seed = random_seed
- def _indexes(self, data: LabelledCollection):
+ def samples_parameters(self):
+ indexes = []
-        for prevs in F.uniform_simplex_sampling(n_classes=data.n_classes, size=self.repeats):
-            yield data.sampling_index(self.sample_size, *prevs)
+        for prevs in F.uniform_simplex_sampling(n_classes=self.data.n_classes, size=self.repeats):
+            index = self.data.sampling_index(self.sample_size, *prevs)
+ indexes.append(index)
+ return indexes
+
+ def sample(self, index):
+ return self.data.sampling_from_index(index)
+
+ def total(self):
+ return self.repeats
+
+
+
+class CovariateShiftPP(AbstractStochasticSeededProtocol):
+ """
+ Generates mixtures of two domains (A and B) at controlled rates, but preserving the original class prevalence.
+
+    :param domainA: a :class:`LabelledCollection` representing domain A
+    :param domainB: a :class:`LabelledCollection` representing domain B
+    :param sample_size: integer, the number of instances in each sample
+    :param repeats: the number of samples to generate for each mixture point
+    :param prevalence: the prevalence to preserve along the mixtures. If specified, should be an array containing
+        one prevalence value (positive float) for each class and summing up to one. If not specified, the prevalence
+        will be taken from domain A (default).
+    :param mixture_points: an integer indicating the number of points to take from a linear scale (e.g., 21 will
+        generate the mixture points [1, 0.95, 0.9, ..., 0]), or the array of mixture values itself
+    :param random_seed: allows replicating samples across runs (default None)
+ """
+
+ def __init__(
+ self,
+ domainA: LabelledCollection,
+ domainB: LabelledCollection,
+ sample_size,
+ repeats=1,
+ prevalence=None,
+ mixture_points=11,
+ random_seed=None):
+ super(CovariateShiftPP, self).__init__(random_seed)
+ self.A = domainA
+ self.B = domainB
+ self.sample_size = sample_size
+ self.repeats = repeats
+ if prevalence is None:
+ self.prevalence = domainA.prevalence()
+ else:
+ self.prevalence = np.asarray(prevalence)
+ assert len(self.prevalence) == domainA.n_classes, \
+ f'wrong shape for the vector prevalence (expected {domainA.n_classes})'
+ assert F.check_prevalence_vector(self.prevalence), \
+ f'the prevalence vector is not valid (either it contains values outside [0,1] or does not sum up to 1)'
+ if isinstance(mixture_points, int):
+ self.mixture_points = np.linspace(0, 1, mixture_points)[::-1]
+ else:
+ self.mixture_points = np.asarray(mixture_points)
+ assert all(np.logical_and(self.mixture_points >= 0, self.mixture_points<=1)), \
+            'mixture_points datatype not understood (expected int or a sequence of real values in [0,1])'
+ self.random_seed = random_seed
+
+ def samples_parameters(self):
+ indexesA, indexesB = [], []
+ for propA in self.mixture_points:
+ for _ in range(self.repeats):
+ nA = int(np.round(self.sample_size * propA))
+ nB = self.sample_size-nA
+ sampleAidx = self.A.sampling_index(nA, *self.prevalence)
+ sampleBidx = self.B.sampling_index(nB, *self.prevalence)
+ indexesA.append(sampleAidx)
+ indexesB.append(sampleBidx)
+ return list(zip(indexesA, indexesB))
+
+ def sample(self, indexes):
+ indexesA, indexesB = indexes
+ sampleA = self.A.sampling_from_index(indexesA)
+ sampleB = self.B.sampling_from_index(indexesB)
+ return sampleA+sampleB
+
+ def total(self):
+ return self.repeats * len(self.mixture_points)
+
@@ -163,17 +306,27 @@ if __name__=='__main__':
import numpy as np
import quapy as qp
+ # domainA
y = [0]*25 + [1]*25 + [2]*25 + [3]*25
- X = [str(i)+'-'+str(yi) for i, yi in enumerate(y)]
-
+ X = ['A:'+str(i)+'-'+str(yi) for i, yi in enumerate(y)]
data = LabelledCollection(X, y, classes_=sorted(np.unique(y)))
- # p = APP(10, n_prevalences=11, random_seed=42)
- # p = NPP(10, repeats=10, random_seed=42)
- p = USimplexPP(10, repeats=10, random_seed=42)
+ # domain B
+ y = [0]*25 + [1]*25 + [2]*25 + [3]*25
+ X = ['B:'+str(i)+'-'+str(yi) for i, yi in enumerate(y)]
+ dataB = LabelledCollection(X, y, classes_=sorted(np.unique(y)))
- for i in p(data):
- print(i.instances, i.classes, i.prevalence())
+ # p = APP(data, sample_size=10, n_prevalences=11, random_seed=42)
+ # p = NPP(data, sample_size=10, repeats=10, random_seed=42)
+ # p = NPP(data, sample_size=10, repeats=10)
+ # p = USimplexPP(data, sample_size=10, repeats=10)
+ p = CovariateShiftPP(data, dataB, sample_size=10, mixture_points=11, random_seed=1)
+
+ for _ in range(2):
+ print('init generator', p.__class__.__name__)
+ for i in tqdm(p(), total=p.total()):
+ # print(i)
+ print(i.instances, i.labels, i.prevalence())
print('done')
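The size bookkeeping in CovariateShiftPP.samples_parameters above amounts to the following schedule (a standalone sketch; both sub-samples are drawn at the same class prevalence, so only the domain proportion shifts):

import numpy as np

sample_size = 10
mixture_points = np.linspace(0, 1, 11)[::-1]   # [1.0, 0.9, ..., 0.0], as in __init__ above
for propA in mixture_points:
    nA = int(np.round(sample_size * propA))    # instances to draw from domain A
    nB = sample_size - nA                      # instances to draw from domain B
    print(f'propA={propA:.1f} -> {nA} from A + {nB} from B')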
From 4bc9d196358b6b3722eb245549317721f6fd1fb9 Mon Sep 17 00:00:00 2001
From: Alejandro Moreo
Date: Wed, 25 May 2022 19:14:33 +0200
Subject: [PATCH 06/67] many changes, see change log
---
quapy/CHANGE_LOG.txt | 34 +++
quapy/__init__.py | 7 +-
quapy/data/base.py | 20 +-
quapy/evaluation.py | 102 +++++++++
quapy/method/aggregative.py | 373 +++++++++++++++++----------------
quapy/method/base.py | 88 --------
quapy/method/meta.py | 13 --
quapy/method/neural.py | 2 +-
quapy/model_selection.py | 206 ++++++------------
quapy/protocol.py | 96 +++++----
quapy/tests/test_evaluation.py | 57 +++++
quapy/tests/test_hierarchy.py | 32 +++
quapy/tests/test_modsel.py | 77 +++++++
quapy/tests/test_protocols.py | 139 ++++++++++++
14 files changed, 754 insertions(+), 492 deletions(-)
create mode 100644 quapy/CHANGE_LOG.txt
create mode 100644 quapy/evaluation.py
create mode 100644 quapy/tests/test_evaluation.py
create mode 100644 quapy/tests/test_hierarchy.py
create mode 100644 quapy/tests/test_modsel.py
create mode 100644 quapy/tests/test_protocols.py
diff --git a/quapy/CHANGE_LOG.txt b/quapy/CHANGE_LOG.txt
new file mode 100644
index 0000000..a372109
--- /dev/null
+++ b/quapy/CHANGE_LOG.txt
@@ -0,0 +1,34 @@
+# main changes in 0.1.7
+
+- Protocols are now an abstraction, AbstractProtocol. There is a new class extending AbstractProtocol called
+  AbstractStochasticSeededProtocol, which implements a seeding policy that allows replicating the series of samplings.
+  There are some examples of protocols: APP, NPP, USimplexPP, CovariateShiftPP (experimental).
+  The idea is to start the sampling by simply calling the __call__ method.
+  This change has a great impact on the framework, since many functions in qp.evaluation, qp.model_selection,
+  and the sampling functions in LabelledCollection made use of the old functions.
+
+- ACC, PACC, Forman's threshold variants have been parallelized.
+
+
+Things to fix:
+- eval budget policy?
+- clean functions like binary, aggregative, probabilistic, etc; those should be resolved via isinstance()
+- clean classes_ and n_classes from methods (maybe not from aggregative ones, but those have to be used only
+ internally and not imposed in any abstract class)
+- optimize "qp.evaluation.prediction" for aggregative methods (pre-classification)
+- update unit tests
+- Policies should be able to set their output to "labelled_collection" or "instances_prevalence" or something similar.
+- Policies should implement the "gen()" one, taking a reader function as an input, and a folder path maybe
+- Review all documentation, redo the Sphinx doc, update Wikis...
+- Resolve the OneVsAll thing (it is in base.py and in aggregative.py)
+- Better handle the environment (e.g., with n_jobs)
+- test cross_generate_predictions and cancel cross_generate_predictions_depr
+- Add a proper log?
+- test LoadSamplesFromDirectory (in protocols.py)
+- improve plots?
+- I have removed the distinction between "classify" and "posterior_probabilities" in the Aggregative quantifiers,
+  so that probabilistic classifiers now return posterior probabilities, while non-probabilistic quantifiers
+  return crisp decisions instead. The idea was to unify the quantification function (i.e., it is now always
+  classify & aggregate, irrespective of the class). However, this has caused a problem with OneVsAll; this has to
+  be checked, since it is now unnecessarily complicated (it also keeps old references to .probabilistic, and all this
+  stuff).
\ No newline at end of file
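In terms of API, the first change listed above means a protocol instance is now consumed by calling it directly. A hypothetical usage sketch, reusing the toy collection from the __main__ demo in protocol.py (this reflects the work-in-progress API of this series, not a released one):

import numpy as np
from quapy.data import LabelledCollection
from quapy.protocol import APP

y = [0]*25 + [1]*25 + [2]*25 + [3]*25
X = [str(i) + '-' + str(yi) for i, yi in enumerate(y)]
data = LabelledCollection(X, y, classes_=sorted(np.unique(y)))

prot = APP(data, sample_size=10, n_prevalences=11, repeats=1, random_seed=42)
print(prot.total())    # the number of samples the protocol will generate
for sample in prot():  # __call__ starts the sampling, yielding one LabelledCollection at a time
    print(sample.prevalence())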
diff --git a/quapy/__init__.py b/quapy/__init__.py
index ad69ae9..2ef4c5c 100644
--- a/quapy/__init__.py
+++ b/quapy/__init__.py
@@ -2,13 +2,13 @@ from . import error
from . import data
from quapy.data import datasets
from . import functional
-from . import method
+# from . import method
from . import evaluation
+from . import protocol
from . import plot
from . import util
from . import model_selection
from . import classification
-from quapy.method.base import isprobabilistic, isaggregative
__version__ = '0.1.7'
@@ -21,5 +21,4 @@ environ = {
'SVMPERF_HOME': './svm_perf_quantification'
}
-def isbinary(x):
- return x.binary
\ No newline at end of file
+
diff --git a/quapy/data/base.py b/quapy/data/base.py
index cfe2891..c555692 100644
--- a/quapy/data/base.py
+++ b/quapy/data/base.py
@@ -210,10 +210,12 @@ class LabelledCollection:
:return: two instances of :class:`LabelledCollection`, the first one with `train_prop` elements, and the
second one with `1-train_prop` elements
"""
- tr_docs, te_docs, tr_labels, te_labels = \
- train_test_split(self.instances, self.labels, train_size=train_prop, stratify=self.labels,
- random_state=random_state)
- return LabelledCollection(tr_docs, tr_labels), LabelledCollection(te_docs, te_labels)
+ tr_docs, te_docs, tr_labels, te_labels = train_test_split(
+ self.instances, self.labels, train_size=train_prop, stratify=self.labels, random_state=random_state
+ )
+ training = LabelledCollection(tr_docs, tr_labels, classes_=self.classes_)
+ test = LabelledCollection(te_docs, te_labels, classes_=self.classes_)
+ return training, test
def __add__(self, other):
"""
@@ -418,13 +420,3 @@ class Dataset:
yield Dataset(train, test, name=f'fold {(i % nfolds) + 1}/{nfolds} (round={(i // nfolds) + 1})')
-def isbinary(data):
- """
- Returns True if `data` is either a binary :class:`Dataset` or a binary :class:`LabelledCollection`
-
- :param data: a :class:`Dataset` or a :class:`LabelledCollection` object
- :return: True if labelled according to two classes
- """
- if isinstance(data, Dataset) or isinstance(data, LabelledCollection):
- return data.binary
- return False
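
Since both splits now receive classes_ explicitly, a class that happens to receive no examples in one of the splits is still tracked. A quick sketch (again assuming an existing LabelledCollection `data`):

    import numpy as np

    training, test = data.split_stratified(train_prop=0.7, random_state=0)
    assert np.array_equal(training.classes_, data.classes_)
    assert np.array_equal(test.classes_, data.classes_)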
diff --git a/quapy/evaluation.py b/quapy/evaluation.py
new file mode 100644
index 0000000..0ea417d
--- /dev/null
+++ b/quapy/evaluation.py
@@ -0,0 +1,102 @@
+from typing import Union, Callable, Iterable
+import numpy as np
+from tqdm import tqdm
+import inspect
+import quapy as qp
+from quapy.protocol import AbstractProtocol, OnLabelledCollectionProtocol
+from quapy.data import LabelledCollection
+from quapy.method.base import BaseQuantifier
+from quapy.util import temp_seed
+import quapy.functional as F
+import pandas as pd
+
+
+def prediction(model: BaseQuantifier, protocol: AbstractProtocol, verbose=False):
+ sout = lambda x: print(x) if verbose else None
+    from quapy.method.aggregative import AggregativeQuantifier
+ if isinstance(model, AggregativeQuantifier) and isinstance(protocol, OnLabelledCollectionProtocol):
+ sout('speeding up the prediction for the aggregative quantifier')
+ pre_classified = model.classify(protocol.get_labelled_collection().instances)
+ return __prediction_helper(model.aggregate, protocol.on_preclassified_instances(pre_classified), verbose)
+ else:
+ sout(f'the method is not aggregative, or the protocol is not an instance of '
+ f'{OnLabelledCollectionProtocol.__name__}, so no optimization can be carried out')
+ return __prediction_helper(model.quantify, protocol, verbose)
+
+
+def __prediction_helper(quantification_fn, protocol: AbstractProtocol, verbose=False):
+ true_prevs, estim_prevs = [], []
+ for sample in tqdm(protocol(), total=protocol.total()) if verbose else protocol():
+ estim_prevs.append(quantification_fn(sample.instances))
+ true_prevs.append(sample.prevalence())
+
+ true_prevs = np.asarray(true_prevs)
+ estim_prevs = np.asarray(estim_prevs)
+
+ return true_prevs, estim_prevs
+
+
+def evaluation_report(model: BaseQuantifier,
+                      protocol: AbstractProtocol,
+                      error_metrics: Iterable[Union[str, Callable]] = 'mae',
+                      verbose=False):
+
+ true_prevs, estim_prevs = prediction(model, protocol, verbose)
+ return _prevalence_report(true_prevs, estim_prevs, error_metrics)
+
+
+def _prevalence_report(true_prevs, estim_prevs, error_metrics: Iterable[Union[str, Callable]] = 'mae'):
+
+ if isinstance(error_metrics, str):
+ error_metrics = [error_metrics]
+
+ error_funcs = [qp.error.from_name(e) if isinstance(e, str) else e for e in error_metrics]
+ assert all(hasattr(e, '__call__') for e in error_funcs), 'invalid error functions'
+ error_names = [e.__name__ for e in error_funcs]
+
+ df = pd.DataFrame(columns=['true-prev', 'estim-prev'] + error_names)
+ for true_prev, estim_prev in zip(true_prevs, estim_prevs):
+ series = {'true-prev': true_prev, 'estim-prev': estim_prev}
+ for error_name, error_metric in zip(error_names, error_funcs):
+ score = error_metric(true_prev, estim_prev)
+ series[error_name] = score
+ df = df.append(series, ignore_index=True)
+
+ return df
+
+
+def evaluate(model: BaseQuantifier, protocol: AbstractProtocol, error_metric: Union[str, Callable], verbose=False):
+ if isinstance(error_metric, str):
+ error_metric = qp.error.from_name(error_metric)
+ true_prevs, estim_prevs = prediction(model, protocol, verbose)
+ return error_metric(true_prevs, estim_prevs)
+
+
+
+def _check_num_evals(n_classes, n_prevpoints=None, eval_budget=None, repeats=1, verbose=False):
+ if n_prevpoints is None and eval_budget is None:
+ raise ValueError('either n_prevpoints or eval_budget has to be specified')
+ elif n_prevpoints is None:
+ assert eval_budget > 0, 'eval_budget must be a positive integer'
+ n_prevpoints = F.get_nprevpoints_approximation(eval_budget, n_classes, repeats)
+ eval_computations = F.num_prevalence_combinations(n_prevpoints, n_classes, repeats)
+ if verbose:
+ print(f'setting n_prevpoints={n_prevpoints} so that the number of '
+ f'evaluations ({eval_computations}) does not exceed the evaluation '
+ f'budget ({eval_budget})')
+ elif eval_budget is None:
+ eval_computations = F.num_prevalence_combinations(n_prevpoints, n_classes, repeats)
+ if verbose:
+ print(f'{eval_computations} evaluations will be performed for each '
+ f'combination of hyper-parameters')
+ else:
+ eval_computations = F.num_prevalence_combinations(n_prevpoints, n_classes, repeats)
+        if eval_computations > eval_budget:
+            new_n_prevpoints = F.get_nprevpoints_approximation(eval_budget, n_classes, repeats)
+            new_eval_computations = F.num_prevalence_combinations(new_n_prevpoints, n_classes, repeats)
+            if verbose:
+                print(f'the budget of evaluations would be exceeded with '
+                      f'n_prevpoints={n_prevpoints}. Changing to n_prevpoints={new_n_prevpoints}. This will produce '
+                      f'{new_eval_computations} evaluation computations for each hyper-parameter combination.')
+            n_prevpoints, eval_computations = new_n_prevpoints, new_eval_computations
+ return n_prevpoints, eval_computations
+
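For reference, a short sketch of how the new evaluation entry point is meant to be used (`model` and `protocol` are placeholders here, e.g., a fitted quantifier and an APP instance):

    import quapy as qp
    import quapy.functional as F

    # for an aggregative quantifier on an OnLabelledCollectionProtocol, evaluate()
    # pre-classifies the instances once and only re-runs aggregate() per sample
    mae = qp.evaluation.evaluate(model, protocol, error_metric='mae')

    # the budget logic above rests on counting valid prevalence combinations:
    # with n_prevpoints=5 and 3 classes the grid contains 15 vectors
    # ([0, 0, 1], [0, 0.25, 0.75], ..., [1, 0, 0]), so eval_budget=20 selects n_prevpoints=5
    assert F.num_prevalence_combinations(5, 3, 1) == 15
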
diff --git a/quapy/method/aggregative.py b/quapy/method/aggregative.py
index c0280a2..ea9cbc0 100644
--- a/quapy/method/aggregative.py
+++ b/quapy/method/aggregative.py
@@ -1,15 +1,13 @@
from abc import abstractmethod
from copy import deepcopy
from typing import Union
-
import numpy as np
from joblib import Parallel, delayed
from sklearn.base import BaseEstimator
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import confusion_matrix
-from sklearn.model_selection import StratifiedKFold
+from sklearn.model_selection import StratifiedKFold, cross_val_predict
from tqdm import tqdm
-
import quapy as qp
import quapy.functional as F
from quapy.classification.svmperf import SVMperf
@@ -61,7 +59,9 @@ class AggregativeQuantifier(BaseQuantifier):
def classify(self, instances):
"""
- Provides the label predictions for the given instances.
+ Provides the label predictions for the given instances. The predictions should respect the format expected by
+ :meth:`aggregate`, i.e., posterior probabilities for probabilistic quantifiers, or crisp predictions for
+ non-probabilistic quantifiers
:param instances: array-like
:return: np.ndarray of shape `(n_instances,)` with label predictions
@@ -118,16 +118,6 @@ class AggregativeQuantifier(BaseQuantifier):
"""
return self.learner.classes_
- @property
- def aggregative(self):
- """
- Returns True, indicating the quantifier is of type aggregative.
-
- :return: True
- """
-
- return True
-
class AggregativeProbabilisticQuantifier(AggregativeQuantifier):
"""
@@ -137,28 +127,25 @@ class AggregativeProbabilisticQuantifier(AggregativeQuantifier):
probabilities.
"""
- def posterior_probabilities(self, instances):
+ def classify(self, instances):
return self.learner.predict_proba(instances)
- def predict_proba(self, instances):
- return self.posterior_probabilities(instances)
-
- def quantify(self, instances):
- classif_posteriors = self.posterior_probabilities(instances)
- return self.aggregate(classif_posteriors)
-
def set_params(self, **parameters):
if isinstance(self.learner, CalibratedClassifierCV):
parameters = {'base_estimator__' + k: v for k, v in parameters.items()}
self.learner.set_params(**parameters)
- @property
- def probabilistic(self):
- return True
-
# Helper
# ------------------------------------
+def _ensure_probabilistic(learner):
+ if not hasattr(learner, 'predict_proba'):
+ print(f'The learner {learner.__class__.__name__} does not seem to be probabilistic. '
+ f'The learner will be calibrated.')
+ learner = CalibratedClassifierCV(learner, cv=5)
+ return learner
+
+
def _training_helper(learner,
data: LabelledCollection,
fit_learner: bool = True,
@@ -180,10 +167,7 @@ def _training_helper(learner,
"""
if fit_learner:
if ensure_probabilistic:
- if not hasattr(learner, 'predict_proba'):
- print(f'The learner {learner.__class__.__name__} does not seem to be probabilistic. '
- f'The learner will be calibrated.')
- learner = CalibratedClassifierCV(learner, cv=5)
+ learner = _ensure_probabilistic(learner)
if val_split is not None:
if isinstance(val_split, float):
if not (0 < val_split < 1):
@@ -214,6 +198,89 @@ def _training_helper(learner,
return learner, unused
+def cross_generate_predictions(
+ data,
+ learner,
+ val_split,
+ probabilistic,
+ fit_learner,
+ n_jobs
+):
+
+ if isinstance(val_split, int):
+ assert fit_learner == True, \
+ 'the parameters for the adjustment cannot be estimated with kFCV with fit_learner=False'
+
+ if probabilistic:
+ learner = _ensure_probabilistic(learner)
+ predict = 'predict_proba'
+ else:
+ predict = 'predict'
+        y_pred = cross_val_predict(learner, *data.Xy, cv=val_split, n_jobs=n_jobs, method=predict)
+        y = data.labels  # cross_val_predict returns predictions in the original order of the data
+        class_count = data.counts()
+
+ # fit the learner on all data
+ learner.fit(*data.Xy)
+ classes = data.classes_
+ else:
+ learner, val_data = _training_helper(
+ learner, data, fit_learner, ensure_probabilistic=probabilistic, val_split=val_split
+ )
+ y_pred = learner.predict_proba(val_data.instances) if probabilistic else learner.predict(val_data.instances)
+ y = val_data.labels
+ classes = val_data.classes_
+ class_count = val_data.counts()
+
+ return learner, y, y_pred, classes, class_count
+
+
+def cross_generate_predictions_depr(
+ data,
+ learner,
+ val_split,
+ probabilistic,
+ fit_learner,
+ method_name=''
+):
+ predict = learner.predict_proba if probabilistic else learner.predict
+ if isinstance(val_split, int):
+ assert fit_learner == True, \
+ 'the parameters for the adjustment cannot be estimated with kFCV with fit_learner=False'
+ # kFCV estimation of parameters
+ y, y_ = [], []
+ kfcv = StratifiedKFold(n_splits=val_split)
+ pbar = tqdm(kfcv.split(*data.Xy), total=val_split)
+ for k, (training_idx, validation_idx) in enumerate(pbar):
+ pbar.set_description(f'{method_name}\tfitting fold {k}')
+ training = data.sampling_from_index(training_idx)
+ validation = data.sampling_from_index(validation_idx)
+ learner, val_data = _training_helper(
+ learner, training, fit_learner, ensure_probabilistic=probabilistic, val_split=validation
+ )
+ y_.append(predict(val_data.instances))
+ y.append(val_data.labels)
+
+ y = np.concatenate(y)
+ y_ = np.concatenate(y_)
+ class_count = data.counts()
+
+ # fit the learner on all data
+ learner, _ = _training_helper(
+ learner, data, fit_learner, ensure_probabilistic=probabilistic, val_split=None
+ )
+ classes = data.classes_
+
+ else:
+ learner, val_data = _training_helper(
+ learner, data, fit_learner, ensure_probabilistic=probabilistic, val_split=val_split
+ )
+ y_ = predict(val_data.instances)
+ y = val_data.labels
+ classes = val_data.classes_
+ class_count = val_data.counts()
+
+ return learner, y, y_, classes, class_count
+
# Methods
# ------------------------------------
class CC(AggregativeQuantifier):
@@ -264,9 +331,10 @@ class ACC(AggregativeQuantifier):
:class:`quapy.data.base.LabelledCollection` (the split itself).
"""
- def __init__(self, learner: BaseEstimator, val_split=0.4):
+ def __init__(self, learner: BaseEstimator, val_split=0.4, n_jobs=1):
self.learner = learner
self.val_split = val_split
+ self.n_jobs = n_jobs
def fit(self, data: LabelledCollection, fit_learner=True, val_split: Union[float, int, LabelledCollection] = None):
"""
@@ -280,44 +348,33 @@ class ACC(AggregativeQuantifier):
cross validation to estimate the parameters
:return: self
"""
+
if val_split is None:
val_split = self.val_split
- if isinstance(val_split, int):
- assert fit_learner == True, \
- 'the parameters for the adjustment cannot be estimated with kFCV with fit_learner=False'
- # kFCV estimation of parameters
- y, y_ = [], []
- kfcv = StratifiedKFold(n_splits=val_split)
- pbar = tqdm(kfcv.split(*data.Xy), total=val_split)
- for k, (training_idx, validation_idx) in enumerate(pbar):
- pbar.set_description(f'{self.__class__.__name__} fitting fold {k}')
- training = data.sampling_from_index(training_idx)
- validation = data.sampling_from_index(validation_idx)
- learner, val_data = _training_helper(self.learner, training, fit_learner, val_split=validation)
- y_.append(learner.predict(val_data.instances))
- y.append(val_data.labels)
- y = np.concatenate(y)
- y_ = np.concatenate(y_)
- class_count = data.counts()
-
- # fit the learner on all data
- self.learner, _ = _training_helper(self.learner, data, fit_learner, val_split=None)
-
- else:
- self.learner, val_data = _training_helper(self.learner, data, fit_learner, val_split=val_split)
- y_ = self.learner.predict(val_data.instances)
- y = val_data.labels
- class_count = val_data.counts()
+ self.learner, y, y_, classes, class_count = cross_generate_predictions(
+ data, self.learner, val_split, probabilistic=False, fit_learner=fit_learner, n_jobs=self.n_jobs
+ )
self.cc = CC(self.learner)
-
- # estimate the matrix with entry (i,j) being the estimate of P(yi|yj), that is, the probability that a
- # document that belongs to yj ends up being classified as belonging to yi
- self.Pte_cond_estim_ = confusion_matrix(y, y_).T / class_count
+ self.Pte_cond_estim_ = self.getPteCondEstim(data.classes_, y, y_)
return self
+ @classmethod
+ def getPteCondEstim(cls, classes, y, y_):
+ # estimate the matrix with entry (i,j) being the estimate of P(yi|yj), that is, the probability that a
+ # document that belongs to yj ends up being classified as belonging to yi
+ conf = confusion_matrix(y, y_, labels=classes).T
+        conf = conf.astype(float)
+ class_counts = conf.sum(axis=0)
+ for i, _ in enumerate(classes):
+ if class_counts[i] == 0:
+ conf[i, i] = 1
+ else:
+ conf[:, i] /= class_counts[i]
+ return conf
+
def classify(self, data):
return self.cc.classify(data)
@@ -380,9 +437,10 @@ class PACC(AggregativeProbabilisticQuantifier):
:class:`quapy.data.base.LabelledCollection` (the split itself).
"""
- def __init__(self, learner: BaseEstimator, val_split=0.4):
+ def __init__(self, learner: BaseEstimator, val_split=0.4, n_jobs=1):
self.learner = learner
self.val_split = val_split
+ self.n_jobs = n_jobs
def fit(self, data: LabelledCollection, fit_learner=True, val_split: Union[float, int, LabelledCollection] = None):
"""
@@ -396,52 +454,31 @@ class PACC(AggregativeProbabilisticQuantifier):
to estimate the parameters
:return: self
"""
+
if val_split is None:
val_split = self.val_split
- if isinstance(val_split, int):
- assert fit_learner == True, \
- 'the parameters for the adjustment cannot be estimated with kFCV with fit_learner=False'
- # kFCV estimation of parameters
- y, y_ = [], []
- kfcv = StratifiedKFold(n_splits=val_split)
- pbar = tqdm(kfcv.split(*data.Xy), total=val_split)
- for k, (training_idx, validation_idx) in enumerate(pbar):
- pbar.set_description(f'{self.__class__.__name__} fitting fold {k}')
- training = data.sampling_from_index(training_idx)
- validation = data.sampling_from_index(validation_idx)
- learner, val_data = _training_helper(
- self.learner, training, fit_learner, ensure_probabilistic=True, val_split=validation)
- y_.append(learner.predict_proba(val_data.instances))
- y.append(val_data.labels)
-
- y = np.concatenate(y)
- y_ = np.vstack(y_)
-
- # fit the learner on all data
- self.learner, _ = _training_helper(self.learner, data, fit_learner, ensure_probabilistic=True,
- val_split=None)
- classes = data.classes_
-
- else:
- self.learner, val_data = _training_helper(
- self.learner, data, fit_learner, ensure_probabilistic=True, val_split=val_split)
- y_ = self.learner.predict_proba(val_data.instances)
- y = val_data.labels
- classes = val_data.classes_
+ self.learner, y, y_, classes, class_count = cross_generate_predictions(
+ data, self.learner, val_split, probabilistic=True, fit_learner=fit_learner, n_jobs=self.n_jobs
+ )
self.pcc = PCC(self.learner)
+ self.Pte_cond_estim_ = self.getPteCondEstim(classes, y, y_)
+ return self
+
+ @classmethod
+ def getPteCondEstim(cls, classes, y, y_):
# estimate the matrix with entry (i,j) being the estimate of P(yi|yj), that is, the probability that a
# document that belongs to yj ends up being classified as belonging to yi
n_classes = len(classes)
- confusion = np.empty(shape=(n_classes, n_classes))
+ confusion = np.eye(n_classes)
for i, class_ in enumerate(classes):
- confusion[i] = y_[y == class_].mean(axis=0)
+ idx = y == class_
+ if idx.any():
+ confusion[i] = y_[idx].mean(axis=0)
- self.Pte_cond_estim_ = confusion.T
-
- return self
+ return confusion.T
def aggregate(self, classif_posteriors):
prevs_estim = self.pcc.aggregate(classif_posteriors)
@@ -557,7 +594,7 @@ class HDy(AggregativeProbabilisticQuantifier, BinaryQuantifier):
self._check_binary(data, self.__class__.__name__)
self.learner, validation = _training_helper(
self.learner, data, fit_learner, ensure_probabilistic=True, val_split=val_split)
- Px = self.posterior_probabilities(validation.instances)[:, 1] # takes only the P(y=+1|x)
+ Px = self.classify(validation.instances)[:, 1] # takes only the P(y=+1|x)
self.Pxy1 = Px[validation.labels == self.learner.classes_[1]]
self.Pxy0 = Px[validation.labels == self.learner.classes_[0]]
# pre-compute the histogram for positive and negative examples
@@ -732,44 +769,24 @@ class ThresholdOptimization(AggregativeQuantifier, BinaryQuantifier):
:class:`quapy.data.base.LabelledCollection` (the split itself).
"""
- def __init__(self, learner: BaseEstimator, val_split=0.4):
+ def __init__(self, learner: BaseEstimator, val_split=0.4, n_jobs=1):
self.learner = learner
self.val_split = val_split
+ self.n_jobs = n_jobs
def fit(self, data: LabelledCollection, fit_learner=True, val_split: Union[float, int, LabelledCollection] = None):
self._check_binary(data, "Threshold Optimization")
if val_split is None:
val_split = self.val_split
- if isinstance(val_split, int):
- assert fit_learner == True, \
- 'the parameters for the adjustment cannot be estimated with kFCV with fit_learner=False'
- # kFCV estimation of parameters
- y, probabilities = [], []
- kfcv = StratifiedKFold(n_splits=val_split)
- pbar = tqdm(kfcv.split(*data.Xy), total=val_split)
- for k, (training_idx, validation_idx) in enumerate(pbar):
- pbar.set_description(f'{self.__class__.__name__} fitting fold {k}')
- training = data.sampling_from_index(training_idx)
- validation = data.sampling_from_index(validation_idx)
- learner, val_data = _training_helper(self.learner, training, fit_learner, val_split=validation)
- probabilities.append(learner.predict_proba(val_data.instances))
- y.append(val_data.labels)
- y = np.concatenate(y)
- probabilities = np.concatenate(probabilities)
-
- # fit the learner on all data
- self.learner, _ = _training_helper(self.learner, data, fit_learner, val_split=None)
-
- else:
- self.learner, val_data = _training_helper(self.learner, data, fit_learner, val_split=val_split)
- probabilities = self.learner.predict_proba(val_data.instances)
- y = val_data.labels
+ self.learner, y, y_, classes, class_count = cross_generate_predictions(
+ data, self.learner, val_split, probabilistic=True, fit_learner=fit_learner, n_jobs=self.n_jobs
+ )
self.cc = CC(self.learner)
- self.tpr, self.fpr = self._optimize_threshold(y, probabilities)
+ self.tpr, self.fpr = self._optimize_threshold(y, y_)
return self
@@ -828,7 +845,7 @@ class ThresholdOptimization(AggregativeQuantifier, BinaryQuantifier):
def _compute_tpr(self, TP, FP):
if TP + FP == 0:
- return 0
+ return 1
return TP / (TP + FP)
def _compute_fpr(self, FP, TN):
@@ -1022,54 +1039,59 @@ class OneVsAll(AggregativeQuantifier):
def classify(self, instances):
"""
- Returns a matrix of shape `(n,m,)` with `n` the number of instances and `m` the number of classes. The entry
- `(i,j)` is a binary value indicating whether instance `i `belongs to class `j`. The binary classifications are
- independent of each other, meaning that an instance can end up be attributed to 0, 1, or more classes.
+ If the base quantifier is not probabilistic, returns a matrix of shape `(n,m,)` with `n` the number of
+ instances and `m` the number of classes. The entry `(i,j)` is a binary value indicating whether instance
+        `i` belongs to class `j`. The binary classifications are independent of each other, meaning that an instance
+        can end up being attributed to 0, 1, or more classes.
+ If the base quantifier is probabilistic, returns a matrix of shape `(n,m,2)` with `n` the number of instances
+ and `m` the number of classes. The entry `(i,j,1)` (resp. `(i,j,0)`) is a value in [0,1] indicating the
+ posterior probability that instance `i` belongs (resp. does not belong) to class `j`. The posterior
+ probabilities are independent of each other, meaning that, in general, they do not sum up to one.
:param instances: array-like
:return: `np.ndarray`
"""
- classif_predictions_bin = self.__parallel(self._delayed_binary_classification, instances)
- return classif_predictions_bin.T
-
- def posterior_probabilities(self, instances):
- """
- Returns a matrix of shape `(n,m,2)` with `n` the number of instances and `m` the number of classes. The entry
- `(i,j,1)` (resp. `(i,j,0)`) is a value in [0,1] indicating the posterior probability that instance `i` belongs
- (resp. does not belong) to class `j`.
- The posterior probabilities are independent of each other, meaning that, in general, they do not sum
- up to one.
-
- :param instances: array-like
- :return: `np.ndarray`
- """
-
- if not self.binary_quantifier.probabilistic:
- raise NotImplementedError(f'{self.__class__.__name__} does not implement posterior_probabilities because '
- f'the base quantifier {self.binary_quantifier.__class__.__name__} is not '
- f'probabilistic')
- posterior_predictions_bin = self.__parallel(self._delayed_binary_posteriors, instances)
- return np.swapaxes(posterior_predictions_bin, 0, 1)
-
- def aggregate(self, classif_predictions_bin):
- if self.probabilistic:
- assert classif_predictions_bin.shape[1] == self.n_classes and classif_predictions_bin.shape[2] == 2, \
- 'param classif_predictions_bin does not seem to be a valid matrix (ndarray) of posterior ' \
- 'probabilities (2 dimensions) for each document (row) and class (columns)'
+ classif_predictions = self.__parallel(self._delayed_binary_classification, instances)
+ if isinstance(self.binary_quantifier, AggregativeProbabilisticQuantifier):
+ return np.swapaxes(classif_predictions, 0, 1)
else:
- assert set(np.unique(classif_predictions_bin)).issubset({0, 1}), \
- 'param classif_predictions_bin does not seem to be a valid matrix (ndarray) of binary ' \
- 'predictions for each document (row) and class (columns)'
- prevalences = self.__parallel(self._delayed_binary_aggregate, classif_predictions_bin)
+ return classif_predictions.T
+ #
+ # def posterior_probabilities(self, instances):
+ # """
+ # Returns a matrix of shape `(n,m,2)` with `n` the number of instances and `m` the number of classes. The entry
+ # `(i,j,1)` (resp. `(i,j,0)`) is a value in [0,1] indicating the posterior probability that instance `i` belongs
+ # (resp. does not belong) to class `j`.
+ # The posterior probabilities are independent of each other, meaning that, in general, they do not sum
+ # up to one.
+ #
+ # :param instances: array-like
+ # :return: `np.ndarray`
+ # """
+ #
+ # if not isinstance(self.binary_quantifier, AggregativeProbabilisticQuantifier):
+ # raise NotImplementedError(f'{self.__class__.__name__} does not implement posterior_probabilities because '
+ # f'the base quantifier {self.binary_quantifier.__class__.__name__} is not '
+ # f'probabilistic')
+ # posterior_predictions_bin = self.__parallel(self._delayed_binary_posteriors, instances)
+ # return np.swapaxes(posterior_predictions_bin, 0, 1)
+
+ def aggregate(self, classif_predictions):
+ # if self.probabilistic:
+ # assert classif_predictions.shape[1] == self.n_classes and classif_predictions.shape[2] == 2, \
+ # 'param classif_predictions_bin does not seem to be a valid matrix (ndarray) of posterior ' \
+ # 'probabilities (2 dimensions) for each document (row) and class (columns)'
+ # else:
+ # assert set(np.unique(classif_predictions)).issubset({0, 1}), \
+ # 'param classif_predictions_bin does not seem to be a valid matrix (ndarray) of binary ' \
+ # 'predictions for each document (row) and class (columns)'
+ prevalences = self.__parallel(self._delayed_binary_aggregate, classif_predictions)
return F.normalize_prevalence(prevalences)
- def quantify(self, X):
- if self.probabilistic:
- predictions = self.posterior_probabilities(X)
- else:
- predictions = self.classify(X)
- return self.aggregate(predictions)
+ # def quantify(self, X):
+ # predictions = self.classify(X)
+ # return self.aggregate(predictions)
def __parallel(self, func, *args, **kwargs):
return np.asarray(
@@ -1093,9 +1115,6 @@ class OneVsAll(AggregativeQuantifier):
def _delayed_binary_classification(self, c, X):
return self.dict_binary_quantifiers[c].classify(X)
- def _delayed_binary_posteriors(self, c, X):
- return self.dict_binary_quantifiers[c].posterior_probabilities(X)
-
def _delayed_binary_aggregate(self, c, classif_predictions):
# the estimation for the positive class prevalence
return self.dict_binary_quantifiers[c].aggregate(classif_predictions[:, c])[1]
@@ -1104,21 +1123,3 @@ class OneVsAll(AggregativeQuantifier):
bindata = LabelledCollection(data.instances, data.labels == c, classes_=[False, True])
self.dict_binary_quantifiers[c].fit(bindata)
- @property
- def binary(self):
- """
- Informs that the classifier is not binary
-
- :return: False
- """
- return False
-
- @property
- def probabilistic(self):
- """
- Indicates if the classifier is probabilistic or not (depending on the nature of the base classifier).
-
- :return: boolean
- """
-
- return self.binary_quantifier.probabilistic
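
A self-contained numeric check of the quantity that ACC.getPteCondEstim estimates (illustrative only; it ignores the zero-count guard of the actual implementation):

    import numpy as np
    from sklearn.metrics import confusion_matrix

    y  = np.array([0, 0, 0, 1, 1, 1])    # true labels
    y_ = np.array([0, 0, 1, 1, 1, 0])    # classifier predictions
    conf = confusion_matrix(y, y_, labels=[0, 1]).T.astype(float)
    conf /= conf.sum(axis=0)   # column j holds the distribution of predictions for true class j
    # conf == [[2/3, 1/3], [1/3, 2/3]], i.e., entry (i, j) estimates P(classified as i | belongs to j)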
diff --git a/quapy/method/base.py b/quapy/method/base.py
index 4a4962a..55e18c7 100644
--- a/quapy/method/base.py
+++ b/quapy/method/base.py
@@ -51,56 +51,6 @@ class BaseQuantifier(metaclass=ABCMeta):
"""
...
- @property
- @abstractmethod
- def classes_(self):
- """
- Class labels, in the same order in which class prevalence values are to be computed.
-
- :return: array-like
- """
- ...
-
- @property
- def n_classes(self):
- """
- Returns the number of classes
-
- :return: integer
- """
- return len(self.classes_)
-
- # these methods allows meta-learners to reimplement the decision based on their constituents, and not
- # based on class structure
- @property
- def binary(self):
- """
- Indicates whether the quantifier is binary or not.
-
- :return: False (to be overridden)
- """
- return False
-
- @property
- def aggregative(self):
- """
- Indicates whether the quantifier is of type aggregative or not
-
- :return: False (to be overridden)
- """
-
- return False
-
- @property
- def probabilistic(self):
- """
- Indicates whether the quantifier is of type probabilistic or not
-
- :return: False (to be overridden)
- """
-
- return False
-
class BinaryQuantifier(BaseQuantifier):
"""
@@ -112,46 +62,8 @@ class BinaryQuantifier(BaseQuantifier):
assert data.binary, f'{quantifier_name} works only on problems of binary classification. ' \
f'Use the class OneVsAll to enable {quantifier_name} work on single-label data.'
- @property
- def binary(self):
- """
- Informs that the quantifier is binary
-
- :return: True
- """
- return True
-def isbinary(model:BaseQuantifier):
- """
- Alias for property `binary`
-
- :param model: the model
- :return: True if the model is binary, False otherwise
- """
- return model.binary
-
-
-def isaggregative(model:BaseQuantifier):
- """
- Alias for property `aggregative`
-
- :param model: the model
- :return: True if the model is aggregative, False otherwise
- """
-
- return model.aggregative
-
-
-def isprobabilistic(model:BaseQuantifier):
- """
- Alias for property `probabilistic`
-
- :param model: the model
- :return: True if the model is probabilistic, False otherwise
- """
-
- return model.probabilistic
# class OneVsAll:
diff --git a/quapy/method/meta.py b/quapy/method/meta.py
index 3504301..3e57652 100644
--- a/quapy/method/meta.py
+++ b/quapy/method/meta.py
@@ -234,19 +234,6 @@ class Ensemble(BaseQuantifier):
order = np.argsort(dist)
return _select_k(predictions, order, k=self.red_size)
- @property
- def classes_(self):
- return self.base_quantifier.classes_
-
- @property
- def binary(self):
- """
- Returns a boolean indicating whether the base quantifiers are binary or not
-
- :return: boolean
- """
- return self.base_quantifier.binary
-
@property
def aggregative(self):
"""
diff --git a/quapy/method/neural.py b/quapy/method/neural.py
index bf1f375..0665634 100644
--- a/quapy/method/neural.py
+++ b/quapy/method/neural.py
@@ -191,7 +191,7 @@ class QuaNetTrainer(BaseQuantifier):
label_predictions = np.argmax(posteriors, axis=-1)
prevs_estim = []
for quantifier in self.quantifiers.values():
- predictions = posteriors if quantifier.probabilistic else label_predictions
+ predictions = posteriors if isinstance(quantifier, AggregativeProbabilisticQuantifier) else label_predictions
prevs_estim.extend(quantifier.aggregate(predictions))
# there is no real need for adding static estims like the TPR or FPR from training since those are constant
diff --git a/quapy/model_selection.py b/quapy/model_selection.py
index eef811b..c1fa817 100644
--- a/quapy/model_selection.py
+++ b/quapy/model_selection.py
@@ -2,14 +2,12 @@ import itertools
import signal
from copy import deepcopy
from typing import Union, Callable
-
-import numpy as np
-
+from quapy import evaluation
import quapy as qp
+from quapy.protocol import AbstractProtocol, OnLabelledCollectionProtocol
from quapy.data.base import LabelledCollection
-from quapy.evaluation import artificial_prevalence_prediction, natural_prevalence_prediction, gen_prevalence_prediction
from quapy.method.aggregative import BaseQuantifier
-import inspect
+from time import time
class GridSearchQ(BaseQuantifier):
@@ -21,33 +19,11 @@ class GridSearchQ(BaseQuantifier):
:param model: the quantifier to optimize
:type model: BaseQuantifier
:param param_grid: a dictionary with keys the parameter names and values the list of values to explore
- :param sample_size: the size of the samples to extract from the validation set (ignored if protocol='gen')
- :param protocol: either 'app' for the artificial prevalence protocol, 'npp' for the natural prevalence
- protocol, or 'gen' for using a custom sampling generator function
- :param n_prevpoints: if specified, indicates the number of equally distant points to extract from the interval
- [0,1] in order to define the prevalences of the samples; e.g., if n_prevpoints=5, then the prevalences for
- each class will be explored in [0.00, 0.25, 0.50, 0.75, 1.00]. If not specified, then eval_budget is requested.
- Ignored if protocol!='app'.
- :param n_repetitions: the number of repetitions for each combination of prevalences. This parameter is ignored
- for the protocol='app' if eval_budget is set and is lower than the number of combinations that would be
- generated using the value assigned to n_prevpoints (for the current number of classes and n_repetitions).
- Ignored for protocol='npp' and protocol='gen' (use eval_budget for setting a maximum number of samples in
- those cases).
- :param eval_budget: if specified, sets a ceil on the number of evaluations to perform for each hyper-parameter
- combination. For example, if protocol='app', there are 3 classes, n_repetitions=1 and eval_budget=20, then
- n_prevpoints will be set to 5, since this will generate 15 different prevalences, i.e., [0, 0, 1],
- [0, 0.25, 0.75], [0, 0.5, 0.5] ... [1, 0, 0], and since setting it to 6 would generate more than
- 20. When protocol='gen', indicates the maximum number of samples to generate, but less samples will be
- generated if the generator yields less samples.
+    :param protocol: a sample generation protocol, an instance of :class:`quapy.protocol.AbstractProtocol`
:param error: an error function (callable) or a string indicating the name of an error function (valid ones
are those in qp.error.QUANTIFICATION_ERROR
:param refit: whether or not to refit the model on the whole labelled collection (training+validation) with
the best chosen hyperparameter combination. Ignored if protocol='gen'
- :param val_split: either a LabelledCollection on which to test the performance of the different settings, or
- a float in [0,1] indicating the proportion of labelled data to extract from the training set, or a callable
- returning a generator function each time it is invoked (only for protocol='gen').
- :param n_jobs: number of parallel jobs
- :param random_seed: set the seed of the random generator to replicate experiments. Ignored if protocol='gen'.
:param timeout: establishes a timer (in seconds) for each of the hyperparameters configurations being tested.
Whenever a run takes longer than this timer, that configuration will be ignored. If all configurations end up
being ignored, a TimeoutError exception is raised. If -1 (default) then no time bound is set.
@@ -57,65 +33,27 @@ class GridSearchQ(BaseQuantifier):
def __init__(self,
model: BaseQuantifier,
param_grid: dict,
- sample_size: Union[int, None],
- protocol='app',
- n_prevpoints: int = None,
- n_repetitions: int = 1,
- eval_budget: int = None,
+ protocol: AbstractProtocol,
error: Union[Callable, str] = qp.error.mae,
refit=True,
- val_split=0.4,
- n_jobs=1,
- random_seed=42,
timeout=-1,
+ n_jobs=1,
verbose=False):
self.model = model
self.param_grid = param_grid
- self.sample_size = sample_size
- self.protocol = protocol.lower()
- self.n_prevpoints = n_prevpoints
- self.n_repetitions = n_repetitions
- self.eval_budget = eval_budget
+ self.protocol = protocol
self.refit = refit
- self.val_split = val_split
- self.n_jobs = n_jobs
- self.random_seed = random_seed
self.timeout = timeout
+ self.n_jobs = n_jobs
self.verbose = verbose
self.__check_error(error)
- assert self.protocol in {'app', 'npp', 'gen'}, \
- 'unknown protocol: valid ones are "app" or "npp" for the "artificial" or the "natural" prevalence ' \
- 'protocols. Use protocol="gen" when passing a generator function thorough val_split that yields a ' \
- 'sample (instances) and their prevalence (ndarray) at each iteration.'
- assert self.eval_budget is None or isinstance(self.eval_budget, int)
- if self.protocol in ['npp', 'gen']:
- if self.protocol=='npp' and (self.eval_budget is None or self.eval_budget <= 0):
- raise ValueError(f'when protocol="npp" the parameter eval_budget should be '
- f'indicated (and should be >0).')
- if self.n_repetitions != 1:
- print('[warning] n_repetitions has been set and will be ignored for the selected protocol')
+ assert isinstance(protocol, AbstractProtocol), 'unknown protocol'
def _sout(self, msg):
if self.verbose:
print(f'[{self.__class__.__name__}]: {msg}')
- def __check_training_validation(self, training, validation):
- if isinstance(validation, LabelledCollection):
- return training, validation
- elif isinstance(validation, float):
- assert 0. < validation < 1., 'validation proportion should be in (0,1)'
- training, validation = training.split_stratified(train_prop=1 - validation)
- return training, validation
- elif self.protocol=='gen' and inspect.isgenerator(validation()):
- return training, validation
- else:
- raise ValueError(f'"validation" must either be a LabelledCollection or a float in (0,1) indicating the'
- f'proportion of training documents to extract (type found: {type(validation)}). '
- f'Optionally, "validation" can be a callable function returning a generator that yields '
- f'the sample instances along with their true prevalence at each iteration by '
- f'setting protocol="gen".')
-
def __check_error(self, error):
if error in qp.error.QUANTIFICATION_ERROR:
self.error = error
@@ -127,96 +65,86 @@ class GridSearchQ(BaseQuantifier):
raise ValueError(f'unexpected error type; must either be a callable function or a str representing\n'
f'the name of an error function in {qp.error.QUANTIFICATION_ERROR_NAMES}')
- def __generate_predictions(self, model, val_split):
- commons = {
- 'n_repetitions': self.n_repetitions,
- 'n_jobs': self.n_jobs,
- 'random_seed': self.random_seed,
- 'verbose': False
- }
- if self.protocol == 'app':
- return artificial_prevalence_prediction(
- model, val_split, self.sample_size,
- n_prevpoints=self.n_prevpoints,
- eval_budget=self.eval_budget,
- **commons
- )
- elif self.protocol == 'npp':
- return natural_prevalence_prediction(
- model, val_split, self.sample_size,
- **commons)
- elif self.protocol == 'gen':
- return gen_prevalence_prediction(model, gen_fn=val_split, eval_budget=self.eval_budget)
- else:
- raise ValueError('unknown protocol')
-
- def fit(self, training: LabelledCollection, val_split: Union[LabelledCollection, float, Callable] = None):
+ def fit(self, training: LabelledCollection):
""" Learning routine. Fits methods with all combinations of hyperparameters and selects the one minimizing
the error metric.
:param training: the training set on which to optimize the hyperparameters
- :param val_split: either a LabelledCollection on which to test the performance of the different settings, or
- a float in [0,1] indicating the proportion of labelled data to extract from the training set
:return: self
"""
- if val_split is None:
- val_split = self.val_split
- training, val_split = self.__check_training_validation(training, val_split)
- if self.protocol != 'gen':
- assert isinstance(self.sample_size, int) and self.sample_size > 0, 'sample_size must be a positive integer'
-
params_keys = list(self.param_grid.keys())
params_values = list(self.param_grid.values())
- model = self.model
-
- if self.timeout > 0:
- def handler(signum, frame):
- self._sout('timeout reached')
- raise TimeoutError()
-
- signal.signal(signal.SIGALRM, handler)
+ protocol = self.protocol
+ n_jobs = self.n_jobs
self.param_scores_ = {}
self.best_score_ = None
- some_timeouts = False
- for values in itertools.product(*params_values):
- params = dict({k: values[i] for i, k in enumerate(params_keys)})
- if self.timeout > 0:
- signal.alarm(self.timeout)
+ hyper = [dict({k: values[i] for i, k in enumerate(params_keys)}) for values in itertools.product(*params_values)]
+ scores = qp.util.parallel(self._delayed_eval, ((params, training) for params in hyper), n_jobs=n_jobs)
- try:
- # overrides default parameters with the parameters being explored at this iteration
- model.set_params(**params)
- model.fit(training)
- true_prevalences, estim_prevalences = self.__generate_predictions(model, val_split)
- score = self.error(true_prevalences, estim_prevalences)
-
- self._sout(f'checking hyperparams={params} got {self.error.__name__} score {score:.5f}')
+ for params, score, model in scores:
+ if score is not None:
if self.best_score_ is None or score < self.best_score_:
self.best_score_ = score
self.best_params_ = params
- self.best_model_ = deepcopy(model)
+ self.best_model_ = model
self.param_scores_[str(params)] = score
+ else:
+ self.param_scores_[str(params)] = 'timeout'
- if self.timeout > 0:
- signal.alarm(0)
- except TimeoutError:
- print(f'timeout reached for config {params}')
- some_timeouts = True
-
- if self.best_score_ is None and some_timeouts:
+ if self.best_score_ is None:
raise TimeoutError('all jobs took more than the timeout time to end')
self._sout(f'optimization finished: best params {self.best_params_} (score={self.best_score_:.5f})')
if self.refit:
- self._sout(f'refitting on the whole development set')
- self.best_model_.fit(training + val_split)
+ if isinstance(protocol, OnLabelledCollectionProtocol):
+ self._sout(f'refitting on the whole development set')
+ self.best_model_.fit(training + protocol.get_labelled_collection())
+ else:
+ raise RuntimeWarning(f'"refit" was requested, but the protocol does not '
+ f'implement the {OnLabelledCollectionProtocol.__name__} interface')
return self
+ def _delayed_eval(self, args):
+ params, training = args
+
+ protocol = self.protocol
+ error = self.error
+
+ if self.timeout > 0:
+ def handler(signum, frame):
+ raise TimeoutError()
+
+ signal.signal(signal.SIGALRM, handler)
+
+ tinit = time()
+
+ if self.timeout > 0:
+ signal.alarm(self.timeout)
+
+ try:
+ model = deepcopy(self.model)
+ # overrides default parameters with the parameters being explored at this iteration
+ model.set_params(**params)
+ model.fit(training)
+ score = evaluation.evaluate(model, protocol=protocol, error_metric=error)
+
+ ttime = time()-tinit
+ self._sout(f'hyperparams={params}\t got {error.__name__} score {score:.5f} [took {ttime:.4f}s]')
+
+ if self.timeout > 0:
+ signal.alarm(0)
+ except TimeoutError:
+ self._sout(f'timeout ({self.timeout}s) reached for config {params}')
+ score = None
+
+ return params, score, model
+
+
def quantify(self, instances):
"""Estimate class prevalence values using the best model found after calling the :meth:`fit` method.
@@ -227,14 +155,6 @@ class GridSearchQ(BaseQuantifier):
assert hasattr(self, 'best_model_'), 'quantify called before fit'
return self.best_model().quantify(instances)
- @property
- def classes_(self):
- """
- Classes on which the quantifier has been trained on.
- :return: a ndarray of shape `(n_classes)` with the class identifiers
- """
- return self.best_model().classes_
-
def set_params(self, **parameters):
"""Sets the hyper-parameters to explore.
@@ -260,3 +180,5 @@ class GridSearchQ(BaseQuantifier):
if hasattr(self, 'best_model_'):
return self.best_model_
raise ValueError('best_model called before fit')
+
+
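A minimal sketch of the refactored GridSearchQ API (the model, the grid values, and the training/validation split are placeholders; see also the tests below):

    from quapy.model_selection import GridSearchQ
    from quapy.protocol import APP

    app = APP(validation, sample_size=100, random_seed=1)
    gs = GridSearchQ(model, param_grid={'C': [1, 10, 100]}, protocol=app,
                     error='mae', refit=True, n_jobs=-1).fit(training)
    best = gs.best_model()
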
diff --git a/quapy/protocol.py b/quapy/protocol.py
index 43bb0ef..70a98d9 100644
--- a/quapy/protocol.py
+++ b/quapy/protocol.py
@@ -1,12 +1,16 @@
+from copy import deepcopy
+
+import quapy as qp
import numpy as np
import itertools
from collections.abc import Generator
from contextlib import ExitStack
from abc import ABCMeta, abstractmethod
-
from quapy.data import LabelledCollection
import quapy.functional as F
from tqdm import tqdm
+from os.path import exists, join
+from glob import glob
# 0.1.7
@@ -61,6 +65,8 @@ class AbstractStochasticSeededProtocol(AbstractProtocol):
the sequence will be different every time the protocol is called.
"""
+ _random_seed = -1 # means "not set"
+
def __init__(self, seed=None):
self.random_seed = seed
@@ -93,13 +99,47 @@ class AbstractStochasticSeededProtocol(AbstractProtocol):
def __call__(self):
with ExitStack() as stack:
+ if self.random_seed == -1:
+ raise ValueError('The random seed has never been initialized. '
+                                 'Set it to None to avoid imposing replicability.')
if self.random_seed is not None:
stack.enter_context(qp.util.temp_seed(self.random_seed))
for params in self.samples_parameters():
yield self.sample(params)
-class APP(AbstractStochasticSeededProtocol):
+class OnLabelledCollectionProtocol:
+ def get_labelled_collection(self):
+ return self.data
+
+ def on_preclassified_instances(self, pre_classifications, in_place=False):
+ assert len(pre_classifications) == len(self.data), \
+ f'error: the pre-classified data has different shape ' \
+ f'(expected {len(self.data)}, found {len(pre_classifications)})'
+ if in_place:
+ self.data.instances = pre_classifications
+ return self
+ else:
+ new = deepcopy(self)
+ return new.on_preclassified_instances(pre_classifications, in_place=True)
+
+
+class LoadSamplesFromDirectory(AbstractProtocol):
+
+ def __init__(self, folder_path, loader_fn, classes=None, **loader_kwargs):
+ assert exists(folder_path), f'folder {folder_path} does not exist'
+ assert callable(loader_fn), f'the passed load_fn does not seem to be callable'
+ self.folder_path = folder_path
+ self.loader_fn = loader_fn
+ self.classes = classes
+ self.loader_kwargs = loader_kwargs
+
+ def __call__(self):
+        for file in sorted(glob(join(self.folder_path, '*'))):
+ yield LabelledCollection.load(file, loader_func=self.loader_fn, classes=self.classes, **self.loader_kwargs)
+
+
+class APP(AbstractStochasticSeededProtocol, OnLabelledCollectionProtocol):
"""
Implementation of the artificial prevalence protocol (APP).
The APP consists of exploring a grid of prevalence values containing `n_prevalences` points (e.g.,
@@ -123,7 +163,7 @@ class APP(AbstractStochasticSeededProtocol):
self.n_prevalences = n_prevalences
self.repeats = repeats
- def prevalence_grid(self, dimensions):
+ def prevalence_grid(self):
"""
Generates vectors of prevalence values from an exhaustive grid of prevalence values. The
number of prevalence values explored for each dimension depends on `n_prevalences`, so that, if, for example,
@@ -134,14 +174,14 @@ class APP(AbstractStochasticSeededProtocol):
to 1 - sum of the rest) is not returned (note that, quite obviously, in this case the vector does not sum up to
1). Note that this method is deterministic, i.e., there is no random sampling anywhere.
- :param dimensions: the number of classes
:return: a `np.ndarray` of shape `(n, dimensions)` if `return_constrained_dim=True` or of shape
`(n, dimensions-1)` if `return_constrained_dim=False`, where `n` is the number of valid combinations found
in the grid multiplied by `repeat`
"""
+ dimensions = self.data.n_classes
s = np.linspace(0., 1., self.n_prevalences, endpoint=True)
s = [s] * (dimensions - 1)
- prevs = [p for p in itertools.product(*s, repeat=1) if sum(p) <= 1]
+ prevs = [p for p in itertools.product(*s, repeat=1) if (sum(p) <= 1.0)]
prevs = np.asarray(prevs).reshape(len(prevs), -1)
if self.repeats > 1:
prevs = np.repeat(prevs, self.repeats, axis=0)
@@ -149,8 +189,8 @@ class APP(AbstractStochasticSeededProtocol):
def samples_parameters(self):
indexes = []
- for prevs in self.prevalence_grid(dimensions=self.data.n_classes):
- index = data.sampling_index(self.sample_size, *prevs)
+ for prevs in self.prevalence_grid():
+ index = self.data.sampling_index(self.sample_size, *prevs)
indexes.append(index)
return indexes
@@ -161,7 +201,7 @@ class APP(AbstractStochasticSeededProtocol):
return F.num_prevalence_combinations(self.n_prevalences, self.data.n_classes, self.repeats)
-class NPP(AbstractStochasticSeededProtocol):
+class NPP(AbstractStochasticSeededProtocol, OnLabelledCollectionProtocol):
"""
A generator of samples that implements the natural prevalence protocol (NPP). The NPP consists of drawing
samples uniformly at random, therefore approximately preserving the natural prevalence of the collection.
@@ -182,7 +222,7 @@ class NPP(AbstractStochasticSeededProtocol):
def samples_parameters(self):
indexes = []
for _ in range(self.repeats):
- index = data.uniform_sampling_index(self.sample_size)
+ index = self.data.uniform_sampling_index(self.sample_size)
indexes.append(index)
return indexes
@@ -193,8 +233,7 @@ class NPP(AbstractStochasticSeededProtocol):
return self.repeats
-
-class USimplexPP(AbstractStochasticSeededProtocol):
+class USimplexPP(AbstractStochasticSeededProtocol, OnLabelledCollectionProtocol):
"""
A variant of :class:`APP` that, instead of using a grid of equidistant prevalence values,
relies on the Kraemer algorithm for sampling unit (k-1)-simplex uniformly at random, with
@@ -218,8 +257,8 @@ class USimplexPP(AbstractStochasticSeededProtocol):
def samples_parameters(self):
indexes = []
- for prevs in F.uniform_simplex_sampling(n_classes=data.n_classes, size=self.repeats):
- index = data.sampling_index(self.sample_size, *prevs)
+ for prevs in F.uniform_simplex_sampling(n_classes=self.data.n_classes, size=self.repeats):
+ index = self.data.sampling_index(self.sample_size, *prevs)
indexes.append(index)
return indexes
@@ -230,7 +269,6 @@ class USimplexPP(AbstractStochasticSeededProtocol):
return self.repeats
-
class CovariateShiftPP(AbstractStochasticSeededProtocol):
"""
Generates mixtures of two domains (A and B) at controlled rates, but preserving the original class prevalence.
@@ -300,33 +338,3 @@ class CovariateShiftPP(AbstractStochasticSeededProtocol):
return self.repeats * len(self.mixture_points)
-
-
-if __name__=='__main__':
- import numpy as np
- import quapy as qp
-
- # domainA
- y = [0]*25 + [1]*25 + [2]*25 + [3]*25
- X = ['A:'+str(i)+'-'+str(yi) for i, yi in enumerate(y)]
- data = LabelledCollection(X, y, classes_=sorted(np.unique(y)))
-
- # domain B
- y = [0]*25 + [1]*25 + [2]*25 + [3]*25
- X = ['B:'+str(i)+'-'+str(yi) for i, yi in enumerate(y)]
- dataB = LabelledCollection(X, y, classes_=sorted(np.unique(y)))
-
- # p = APP(data, sample_size=10, n_prevalences=11, random_seed=42)
- # p = NPP(data, sample_size=10, repeats=10, random_seed=42)
- # p = NPP(data, sample_size=10, repeats=10)
- # p = USimplexPP(data, sample_size=10, repeats=10)
- p = CovariateShiftPP(data, dataB, sample_size=10, mixture_points=11, random_seed=1)
-
- for _ in range(2):
- print('init generator', p.__class__.__name__)
- for i in tqdm(p(), total=p.total()):
- # print(i)
- print(i.instances, i.labels, i.prevalence())
-
- print('done')
-
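
For completeness, a sketch of a custom stochastic protocol (a hypothetical class, not in the patch); the key point is forwarding the seed to super().__init__, whose omission the tests below exercise in the negative case:

    import numpy as np
    from quapy.protocol import AbstractStochasticSeededProtocol

    class RandomSubsetProtocol(AbstractStochasticSeededProtocol):
        """draws `repeats` random subsets of `sample_size` instances (without replacement)"""
        def __init__(self, data, sample_size, repeats=10, seed=None):
            super().__init__(seed)   # registers the seed; omitting this raises ValueError on __call__
            self.data = data
            self.sample_size = sample_size
            self.repeats = repeats

        def samples_parameters(self):
            # one array of sampling indexes per repetition
            return [np.random.choice(len(self.data), self.sample_size, replace=False)
                    for _ in range(self.repeats)]

        def sample(self, index):
            return self.data.sampling_from_index(index)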
diff --git a/quapy/tests/test_evaluation.py b/quapy/tests/test_evaluation.py
new file mode 100644
index 0000000..de6603b
--- /dev/null
+++ b/quapy/tests/test_evaluation.py
@@ -0,0 +1,57 @@
+import unittest
+import quapy as qp
+from sklearn.linear_model import LogisticRegression
+from time import time
+from quapy.method.aggregative import EMQ
+from quapy.method.base import BaseQuantifier
+
+
+class EvalTestCase(unittest.TestCase):
+ def test_eval_speedup(self):
+
+ data = qp.datasets.fetch_reviews('hp', tfidf=True, min_df=10, pickle=True)
+ train, test = data.training, data.test
+
+ protocol = qp.protocol.APP(test, sample_size=1000, n_prevalences=21, repeats=1, random_seed=1)
+
+ class SlowLR(LogisticRegression):
+ def predict_proba(self, X):
+ import time
+ time.sleep(1)
+ return super().predict_proba(X)
+
+ emq = EMQ(SlowLR()).fit(train)
+
+ tinit = time()
+ score = qp.evaluation.evaluate(emq, protocol, error_metric='mae', verbose=True)
+ tend_optim = time()-tinit
+ print(f'evaluation (with optimization) took {tend_optim}s [MAE={score:.4f}]')
+
+ class NonAggregativeEMQ(BaseQuantifier):
+
+ def __init__(self, cls):
+ self.emq = EMQ(cls)
+
+ def quantify(self, instances):
+ return self.emq.quantify(instances)
+
+ def fit(self, data):
+ self.emq.fit(data)
+ return self
+
+ def set_params(self, **parameters): pass
+ def get_params(self, deep=True): pass
+
+
+ emq = NonAggregativeEMQ(SlowLR()).fit(train)
+
+ tinit = time()
+ score = qp.evaluation.evaluate(emq, protocol, error_metric='mae', verbose=True)
+ tend_no_optim = time() - tinit
+ print(f'evaluation (w/o optimization) took {tend_no_optim}s [MAE={score:.4f}]')
+
+ self.assertEqual(tend_no_optim>tend_optim, True)
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/quapy/tests/test_hierarchy.py b/quapy/tests/test_hierarchy.py
new file mode 100644
index 0000000..21af4b6
--- /dev/null
+++ b/quapy/tests/test_hierarchy.py
@@ -0,0 +1,32 @@
+import unittest
+
+from sklearn.linear_model import LogisticRegression
+
+import quapy as qp
+from quapy.method.aggregative import *
+
+
+
+class HierarchyTestCase(unittest.TestCase):
+
+ def test_aggregative(self):
+ lr = LogisticRegression()
+ for m in [CC(lr), PCC(lr), ACC(lr), PACC(lr)]:
+ self.assertEqual(isinstance(m, AggregativeQuantifier), True)
+
+ def test_binary(self):
+ lr = LogisticRegression()
+ for m in [HDy(lr)]:
+ self.assertEqual(isinstance(m, BinaryQuantifier), True)
+
+ def test_probabilistic(self):
+ lr = LogisticRegression()
+ for m in [CC(lr), ACC(lr)]:
+ self.assertEqual(isinstance(m, AggregativeProbabilisticQuantifier), False)
+ for m in [PCC(lr), PACC(lr)]:
+ self.assertEqual(isinstance(m, AggregativeProbabilisticQuantifier), True)
+
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/quapy/tests/test_modsel.py b/quapy/tests/test_modsel.py
new file mode 100644
index 0000000..637f831
--- /dev/null
+++ b/quapy/tests/test_modsel.py
@@ -0,0 +1,77 @@
+import unittest
+
+import numpy as np
+from sklearn.linear_model import LogisticRegression
+from sklearn.svm import SVC
+
+import quapy as qp
+from quapy.method.aggregative import PACC
+from quapy.model_selection import GridSearchQ
+from quapy.protocol import APP
+
+
+class ModselTestCase(unittest.TestCase):
+
+ def test_modsel(self):
+
+ q = PACC(LogisticRegression(random_state=1, max_iter=5000))
+
+ data = qp.datasets.fetch_reviews('imdb', tfidf=True, min_df=10)
+ training, validation = data.training.split_stratified(0.7, random_state=1)
+ # test = data.test
+
+ param_grid = {'C': np.logspace(-3,3,7)}
+ app = APP(validation, sample_size=100, random_seed=1)
+ q = GridSearchQ(
+ q, param_grid, protocol=app, error='mae', refit=True, timeout=-1, verbose=True
+ ).fit(training)
+ print('best params', q.best_params_)
+ print('best score', q.best_score_)
+
+ self.assertEqual(q.best_params_['C'], 10.0)
+ self.assertEqual(q.best_model().get_params()['C'], 10.0)
+
+ def test_modsel_parallel(self):
+
+ q = PACC(LogisticRegression(random_state=1, max_iter=5000))
+
+ data = qp.datasets.fetch_reviews('imdb', tfidf=True, min_df=10)
+ training, validation = data.training.split_stratified(0.7, random_state=1)
+ # test = data.test
+
+ param_grid = {'C': np.logspace(-3,3,7)}
+ app = APP(validation, sample_size=100, random_seed=1)
+ q = GridSearchQ(
+ q, param_grid, protocol=app, error='mae', refit=True, timeout=-1, n_jobs=-1, verbose=True
+ ).fit(training)
+ print('best params', q.best_params_)
+ print('best score', q.best_score_)
+
+ self.assertEqual(q.best_params_['C'], 10.0)
+ self.assertEqual(q.best_model().get_params()['C'], 10.0)
+
+ def test_modsel_timeout(self):
+
+ class SlowLR(LogisticRegression):
+ def fit(self, X, y, sample_weight=None):
+ import time
+ time.sleep(10)
+ super(SlowLR, self).fit(X, y, sample_weight)
+
+ q = PACC(SlowLR())
+
+ data = qp.datasets.fetch_reviews('imdb', tfidf=True, min_df=10)
+ training, validation = data.training.split_stratified(0.7, random_state=1)
+ # test = data.test
+
+ param_grid = {'C': np.logspace(-3,3,7)}
+ app = APP(validation, sample_size=100, random_seed=1)
+ q = GridSearchQ(
+ q, param_grid, protocol=app, error='mae', refit=True, timeout=3, n_jobs=-1, verbose=True
+ )
+ with self.assertRaises(TimeoutError):
+ q.fit(training)
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/quapy/tests/test_protocols.py b/quapy/tests/test_protocols.py
new file mode 100644
index 0000000..bf92ce5
--- /dev/null
+++ b/quapy/tests/test_protocols.py
@@ -0,0 +1,139 @@
+import unittest
+import numpy as np
+from quapy.data import LabelledCollection
+from quapy.protocol import APP, NPP, USimplexPP, CovariateShiftPP, AbstractStochasticSeededProtocol
+
+
+def mock_labelled_collection(prefix=''):
+ y = [0] * 250 + [1] * 250 + [2] * 250 + [3] * 250
+ X = [prefix + str(i) + '-' + str(yi) for i, yi in enumerate(y)]
+ return LabelledCollection(X, y, classes_=sorted(np.unique(y)))
+
+
+def samples_to_str(protocol):
+ samples_str = ""
+ for sample in protocol():
+ samples_str += f'{sample.instances}\t{sample.labels}\t{sample.prevalence()}\n'
+ return samples_str
+
+
+class TestProtocols(unittest.TestCase):
+
+ def test_app_replicate(self):
+ data = mock_labelled_collection()
+ p = APP(data, sample_size=5, n_prevalences=11, random_seed=42)
+
+ samples1 = samples_to_str(p)
+ samples2 = samples_to_str(p)
+
+ self.assertEqual(samples1, samples2)
+
+ def test_app_not_replicate(self):
+ data = mock_labelled_collection()
+ p = APP(data, sample_size=5, n_prevalences=11)
+
+ samples1 = samples_to_str(p)
+ samples2 = samples_to_str(p)
+
+ self.assertNotEqual(samples1, samples2)
+
+ def test_app_number(self):
+ data = mock_labelled_collection()
+ p = APP(data, sample_size=100, n_prevalences=10, repeats=1)
+
+        # surprisingly enough, for some n_prevalences the test fails, even though
+        # everything is correct. The problem is that in APP.prevalence_grid()
+        # a rounding error sometimes accumulates until the sum surpasses 1.0
+        # (by a very small float value, 0.0000000000002 or the like), so these
+        # tuples are mistakenly removed... I have tried np.isclose and other
+        # workarounds, but then some negative probability eventually reaches
+        # the sampling function...
+
+ count = 0
+ for _ in p():
+ count+=1
+
+ self.assertEqual(count, p.total())
+
+ def test_npp_replicate(self):
+ data = mock_labelled_collection()
+ p = NPP(data, sample_size=5, repeats=5, random_seed=42)
+
+ samples1 = samples_to_str(p)
+ samples2 = samples_to_str(p)
+
+ self.assertEqual(samples1, samples2)
+
+ def test_npp_not_replicate(self):
+ data = mock_labelled_collection()
+ p = NPP(data, sample_size=5, repeats=5)
+
+ samples1 = samples_to_str(p)
+ samples2 = samples_to_str(p)
+
+ self.assertNotEqual(samples1, samples2)
+
+ def test_kraemer_replicate(self):
+ data = mock_labelled_collection()
+ p = USimplexPP(data, sample_size=5, repeats=10, random_seed=42)
+
+ samples1 = samples_to_str(p)
+ samples2 = samples_to_str(p)
+
+ self.assertEqual(samples1, samples2)
+
+ def test_kraemer_not_replicate(self):
+ data = mock_labelled_collection()
+ p = USimplexPP(data, sample_size=5, repeats=10)
+
+ samples1 = samples_to_str(p)
+ samples2 = samples_to_str(p)
+
+ self.assertNotEqual(samples1, samples2)
+
+ def test_covariate_shift_replicate(self):
+ dataA = mock_labelled_collection('domA')
+ dataB = mock_labelled_collection('domB')
+ p = CovariateShiftPP(dataA, dataB, sample_size=10, mixture_points=11, random_seed=1)
+
+ samples1 = samples_to_str(p)
+ samples2 = samples_to_str(p)
+
+ self.assertEqual(samples1, samples2)
+
+ def test_covariate_shift_not_replicate(self):
+ dataA = mock_labelled_collection('domA')
+ dataB = mock_labelled_collection('domB')
+ p = CovariateShiftPP(dataA, dataB, sample_size=10, mixture_points=11)
+
+ samples1 = samples_to_str(p)
+ samples2 = samples_to_str(p)
+
+ self.assertNotEqual(samples1, samples2)
+
+ def test_no_seed_init(self):
+ class NoSeedInit(AbstractStochasticSeededProtocol):
+ def __init__(self):
+ self.data = mock_labelled_collection()
+
+ def samples_parameters(self):
+ # return a matrix containing sampling indexes in the rows
+ return np.random.randint(0, len(self.data), 10*10).reshape(10, 10)
+
+ def sample(self, params):
+ index = np.unique(params)
+ return self.data.sampling_from_index(index)
+
+ p = NoSeedInit()
+
+        # this should raise a ValueError, since the class extends AbstractStochasticSeededProtocol but
+        # the random_seed has never been passed up via super(NoSeedInit, self).__init__(random_seed)
+ with self.assertRaises(ValueError):
+ for sample in p():
+ pass
+ print('done')
+
+
+
+if __name__ == '__main__':
+ unittest.main()
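For reference, a minimal sketch (not part of the patch; names hypothetical) of a correctly seeded subclass, which forwards the seed as the test above expects:

import numpy as np
from quapy.protocol import AbstractStochasticSeededProtocol

class RandomIndexProtocol(AbstractStochasticSeededProtocol):
    def __init__(self, data, repeats=10, sample_size=10, random_seed=42):
        super(RandomIndexProtocol, self).__init__(random_seed)  # the call that NoSeedInit omits
        self.data = data
        self.repeats = repeats
        self.sample_size = sample_size

    def samples_parameters(self):
        # one row of sampling indexes per sample
        return np.random.randint(0, len(self.data), (self.repeats, self.sample_size))

    def sample(self, params):
        return self.data.sampling_from_index(params)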
From eba6fd8123a0ba4354df4cc508f724667d473a8a Mon Sep 17 00:00:00 2001
From: Alex Moreo
Date: Thu, 26 May 2022 17:59:23 +0200
Subject: [PATCH 07/67] optimization conditional in the prediction function
---
quapy/CHANGE_LOG.txt | 23 +++++++--
quapy/evaluation.py | 74 ++++++++++++++---------------
quapy/functional.py | 1 -
quapy/method/aggregative.py | 40 +++-------------
quapy/method/base.py | 93 ++++++++++++++++++-------------------
quapy/protocol.py | 30 +++++-------
6 files changed, 119 insertions(+), 142 deletions(-)
diff --git a/quapy/CHANGE_LOG.txt b/quapy/CHANGE_LOG.txt
index a372109..fe39fc3 100644
--- a/quapy/CHANGE_LOG.txt
+++ b/quapy/CHANGE_LOG.txt
@@ -1,9 +1,9 @@
# main changes in 0.1.7
-- Protocols is now an abstraction, AbstractProtocol. There is a new class extending AbstractProtocol called
+- Protocols are now abstracted as AbstractProtocol. There is a new class extending AbstractProtocol called
AbstractStochasticSeededProtocol, which implements a seeding policy to allow replicating the series of samplings.
There are some examples of protocols, APP, NPP, USimplexPP, CovariateShiftPP (experimental).
- The idea is to start the sampling by simpli calling the __call__ method.
+ The idea is to start the sampling by simply calling the __call__ method.
This change has a great impact on the framework, since many functions in qp.evaluation, qp.model_selection,
and sampling functions in LabelledCollection make use of the old functions.
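A hedged usage sketch of the new protocol API (assuming data is a LabelledCollection; at this point in the series each iteration of the __call__ generator yields one sample, and the series is replicable if a seed is given):

from quapy.protocol import APP

prot = APP(data, sample_size=100, n_prevalences=21, repeats=10, random_seed=0)
for sample in prot():
    pass  # quantify and evaluate on each sample here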
@@ -11,7 +11,6 @@
Things to fix:
-- eval budget policy?
- clean functions like binary, aggregative, probabilistic, etc; those should be resolved via isinstance()
- clean classes_ and n_classes from methods (maybe not from aggregative ones, but those have to be used only
internally and not imposed in any abstract class)
@@ -31,4 +30,20 @@ Things to fix:
return instead crisp decisions. The idea was to unify the quantification function (i.e., now it is always
classify & aggregate, irrespective of the class). However, this has caused a problem with OneVsAll. This has to
be checked, since it is now unnecessarily complicated (it also has old references to .probabilistic, and all this
- stuff).
\ No newline at end of file
+ stuff).
+- Check method def __parallel(self, func, *args, **kwargs) in aggregative.OneVsAll
+
+# 0.1.7
+# change the LabelledCollection API (removing protocol-related samplings)
+# need to change the two references to the above in the wiki / doc, and code examples...
+# removed artificial_prevalence_sampling from functional
+
+# also: some parameters in the init could be used to indicate that the method should return a tuple with
+# unlabelled instances and the vector of prevalence values (and not a LabelledCollection).
+# Or: this can be done in a different function; i.e., we use one function (now __call__) to return
+# LabelledCollections, and another new one for returning the other output, which is more general for
+# evaluation purposes.
+
+# the so-called "gen" function has to be implemented as a protocol. The problem here is that this function
+# should be able to return only unlabelled instances plus a vector of prevalences (and not LabelledCollections).
+# This was coded as different functions in 0.1.6
diff --git a/quapy/evaluation.py b/quapy/evaluation.py
index 0ea417d..d32cfb7 100644
--- a/quapy/evaluation.py
+++ b/quapy/evaluation.py
@@ -11,16 +11,35 @@ import quapy.functional as F
import pandas as pd
-def prediction(model: BaseQuantifier, protocol: AbstractProtocol, verbose=False):
+def prediction(model: BaseQuantifier, protocol: AbstractProtocol, aggr_speedup='auto', verbose=False):
+ assert aggr_speedup in [False, True, 'auto', 'force'], 'invalid value for aggr_speedup'
+
sout = lambda x: print(x) if verbose else None
- from method.aggregative import AggregativeQuantifier
- if isinstance(model, AggregativeQuantifier) and isinstance(protocol, OnLabelledCollectionProtocol):
- sout('speeding up the prediction for the aggregative quantifier')
+
+ apply_optimization = False
+
+ if aggr_speedup in [True, 'auto', 'force']:
+ # checks whether the prediction can be made more efficiently; this check consists in verifying if the model is
+ # of type aggregative, if the protocol is based on LabelledCollection, and if the total number of documents to
+ # classify using the protocol would exceed the number of test documents in the original collection
+ from method.aggregative import AggregativeQuantifier
+ if isinstance(model, AggregativeQuantifier) and isinstance(protocol, OnLabelledCollectionProtocol):
+ if aggr_speedup == 'force':
+ apply_optimization = True
+ sout(f'forcing aggregative speedup')
+ elif hasattr(protocol, 'sample_size'):
+ nD = len(protocol.get_labelled_collection())
+ samplesD = protocol.total() * protocol.sample_size
+ if nD < samplesD:
+ apply_optimization = True
+ sout(f'speeding up the prediction for the aggregative quantifier, '
+ f'total classifications {nD} instead of {samplesD}')
+
+ if apply_optimization:
pre_classified = model.classify(protocol.get_labelled_collection().instances)
- return __prediction_helper(model.aggregate, protocol.on_preclassified_instances(pre_classified), verbose)
+ protocol_with_predictions = protocol.on_preclassified_instances(pre_classified)
+ return __prediction_helper(model.aggregate, protocol_with_predictions, verbose)
else:
- sout(f'the method is not aggregative, or the protocol is not an instance of '
- f'{OnLabelledCollectionProtocol.__name__}, so no optimization can be carried out')
return __prediction_helper(model.quantify, protocol, verbose)
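To make the trade-off concrete: on binary data with a 1000-document collection, an APP protocol with 21 prevalence points and 10 repeats yields 210 samples of size 100, i.e., 21000 classifications, so pre-classifying the 1000 documents once and only aggregating per sample is cheaper, and 'auto' applies the speedup. A hedged call sketch (model and prot assumed):

from quapy.evaluation import prediction

true_prevs, estim_prevs = prediction(model, prot, aggr_speedup='auto', verbose=True)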
@@ -38,10 +57,11 @@ def __prediction_helper(quantification_fn, protocol: AbstractProtocol, verbose=F
def evaluation_report(model: BaseQuantifier,
protocol: AbstractProtocol,
- error_metrics:Iterable[Union[str,Callable]]='mae',
+ error_metrics: Iterable[Union[str,Callable]] = 'mae',
+ aggr_speedup='auto',
verbose=False):
- true_prevs, estim_prevs = prediction(model, protocol, verbose)
+ true_prevs, estim_prevs = prediction(model, protocol, aggr_speedup=aggr_speedup, verbose=verbose)
return _prevalence_report(true_prevs, estim_prevs, error_metrics)
@@ -65,38 +85,18 @@ def _prevalence_report(true_prevs, estim_prevs, error_metrics: Iterable[Union[st
return df
-def evaluate(model: BaseQuantifier, protocol: AbstractProtocol, error_metric:Union[str, Callable], verbose=False):
+def evaluate(
+ model: BaseQuantifier,
+ protocol: AbstractProtocol,
+ error_metric:Union[str, Callable],
+ aggr_speedup='auto',
+ verbose=False):
+
if isinstance(error_metric, str):
error_metric = qp.error.from_name(error_metric)
- true_prevs, estim_prevs = prediction(model, protocol, verbose)
+ true_prevs, estim_prevs = prediction(model, protocol, aggr_speedup=aggr_speedup, verbose=verbose)
return error_metric(true_prevs, estim_prevs)
-def _check_num_evals(n_classes, n_prevpoints=None, eval_budget=None, repeats=1, verbose=False):
- if n_prevpoints is None and eval_budget is None:
- raise ValueError('either n_prevpoints or eval_budget has to be specified')
- elif n_prevpoints is None:
- assert eval_budget > 0, 'eval_budget must be a positive integer'
- n_prevpoints = F.get_nprevpoints_approximation(eval_budget, n_classes, repeats)
- eval_computations = F.num_prevalence_combinations(n_prevpoints, n_classes, repeats)
- if verbose:
- print(f'setting n_prevpoints={n_prevpoints} so that the number of '
- f'evaluations ({eval_computations}) does not exceed the evaluation '
- f'budget ({eval_budget})')
- elif eval_budget is None:
- eval_computations = F.num_prevalence_combinations(n_prevpoints, n_classes, repeats)
- if verbose:
- print(f'{eval_computations} evaluations will be performed for each '
- f'combination of hyper-parameters')
- else:
- eval_computations = F.num_prevalence_combinations(n_prevpoints, n_classes, repeats)
- if eval_computations > eval_budget:
- n_prevpoints = F.get_nprevpoints_approximation(eval_budget, n_classes, repeats)
- new_eval_computations = F.num_prevalence_combinations(n_prevpoints, n_classes, repeats)
- if verbose:
- print(f'the budget of evaluations would be exceeded with '
- f'n_prevpoints={n_prevpoints}. Chaning to n_prevpoints={n_prevpoints}. This will produce '
- f'{new_eval_computations} evaluation computations for each hyper-parameter combination.')
- return n_prevpoints, eval_computations
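With _check_num_evals gone (the protocol object now determines the number of samples by itself), evaluating a quantifier reduces to a single call; a hedged sketch (model and prot assumed):

from quapy.evaluation import evaluate

mae = evaluate(model, prot, error_metric='mae', aggr_speedup='auto')  # 'mae' is resolved via qp.error.from_name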
diff --git a/quapy/functional.py b/quapy/functional.py
index 215d89f..e44dacf 100644
--- a/quapy/functional.py
+++ b/quapy/functional.py
@@ -4,7 +4,6 @@ import scipy
import numpy as np
-
def prevalence_linspace(n_prevalences=21, repeats=1, smooth_limits_epsilon=0.01):
"""
Produces an array of uniformly separated values of prevalence.
diff --git a/quapy/method/aggregative.py b/quapy/method/aggregative.py
index ea9cbc0..ca4b25c 100644
--- a/quapy/method/aggregative.py
+++ b/quapy/method/aggregative.py
@@ -1023,15 +1023,18 @@ class OneVsAll(AggregativeQuantifier):
"""
def __init__(self, binary_quantifier, n_jobs=-1):
+        assert isinstance(binary_quantifier, BaseQuantifier), \
+            f'{binary_quantifier} does not seem to be a Quantifier'
+        assert isinstance(binary_quantifier, AggregativeQuantifier), \
+            f'{binary_quantifier} does not seem to be of type Aggregative'
self.binary_quantifier = binary_quantifier
self.n_jobs = n_jobs
def fit(self, data: LabelledCollection, fit_learner=True):
assert not data.binary, \
f'{self.__class__.__name__} expect non-binary data'
- assert isinstance(self.binary_quantifier, BaseQuantifier), \
- f'{self.binary_quantifier} does not seem to be a Quantifier'
- assert fit_learner == True, 'fit_learner must be True'
+ assert fit_learner == True, \
+ 'fit_learner must be True'
self.dict_binary_quantifiers = {c: deepcopy(self.binary_quantifier) for c in data.classes_}
self.__parallel(self._delayed_binary_fit, data)
@@ -1057,42 +1060,11 @@ class OneVsAll(AggregativeQuantifier):
return np.swapaxes(classif_predictions, 0, 1)
else:
return classif_predictions.T
- #
- # def posterior_probabilities(self, instances):
- # """
- # Returns a matrix of shape `(n,m,2)` with `n` the number of instances and `m` the number of classes. The entry
- # `(i,j,1)` (resp. `(i,j,0)`) is a value in [0,1] indicating the posterior probability that instance `i` belongs
- # (resp. does not belong) to class `j`.
- # The posterior probabilities are independent of each other, meaning that, in general, they do not sum
- # up to one.
- #
- # :param instances: array-like
- # :return: `np.ndarray`
- # """
- #
- # if not isinstance(self.binary_quantifier, AggregativeProbabilisticQuantifier):
- # raise NotImplementedError(f'{self.__class__.__name__} does not implement posterior_probabilities because '
- # f'the base quantifier {self.binary_quantifier.__class__.__name__} is not '
- # f'probabilistic')
- # posterior_predictions_bin = self.__parallel(self._delayed_binary_posteriors, instances)
- # return np.swapaxes(posterior_predictions_bin, 0, 1)
def aggregate(self, classif_predictions):
- # if self.probabilistic:
- # assert classif_predictions.shape[1] == self.n_classes and classif_predictions.shape[2] == 2, \
- # 'param classif_predictions_bin does not seem to be a valid matrix (ndarray) of posterior ' \
- # 'probabilities (2 dimensions) for each document (row) and class (columns)'
- # else:
- # assert set(np.unique(classif_predictions)).issubset({0, 1}), \
- # 'param classif_predictions_bin does not seem to be a valid matrix (ndarray) of binary ' \
- # 'predictions for each document (row) and class (columns)'
prevalences = self.__parallel(self._delayed_binary_aggregate, classif_predictions)
return F.normalize_prevalence(prevalences)
- # def quantify(self, X):
- # predictions = self.classify(X)
- # return self.aggregate(predictions)
-
def __parallel(self, func, *args, **kwargs):
return np.asarray(
# some quantifiers (in particular, ELM-based ones) cannot be run with multiprocess, since the temp dir they
diff --git a/quapy/method/base.py b/quapy/method/base.py
index 55e18c7..6c2a0c5 100644
--- a/quapy/method/base.py
+++ b/quapy/method/base.py
@@ -1,4 +1,8 @@
from abc import ABCMeta, abstractmethod
+from copy import deepcopy
+import numpy as np
+from joblib import Parallel, delayed
+import quapy.functional as F
from quapy.data import LabelledCollection
@@ -62,52 +63,50 @@ class BinaryQuantifier(BaseQuantifier):
assert data.binary, f'{quantifier_name} works only on problems of binary classification. ' \
f'Use the class OneVsAll to enable {quantifier_name} work on single-label data.'
-
-
-
-
-# class OneVsAll:
-# """
-# Allows any binary quantifier to perform quantification on single-label datasets. The method maintains one binary
-# quantifier for each class, and then l1-normalizes the outputs so that the class prevelences sum up to 1.
-# """
-#
-# def __init__(self, binary_method, n_jobs=-1):
-# self.binary_method = binary_method
-# self.n_jobs = n_jobs
-#
-# def fit(self, data: LabelledCollection, **kwargs):
-# assert not data.binary, f'{self.__class__.__name__} expect non-binary data'
-# assert isinstance(self.binary_method, BaseQuantifier), f'{self.binary_method} does not seem to be a Quantifier'
-# self.class_method = {c: deepcopy(self.binary_method) for c in data.classes_}
-# Parallel(n_jobs=self.n_jobs, backend='threading')(
-# delayed(self._delayed_binary_fit)(c, self.class_method, data, **kwargs) for c in data.classes_
-# )
-# return self
-#
-# def quantify(self, X, *args):
-# prevalences = np.asarray(
-# Parallel(n_jobs=self.n_jobs, backend='threading')(
-# delayed(self._delayed_binary_predict)(c, self.class_method, X) for c in self.classes
-# )
-# )
-# return F.normalize_prevalence(prevalences)
-#
-# @property
-# def classes(self):
-# return sorted(self.class_method.keys())
-#
-# def set_params(self, **parameters):
-# self.binary_method.set_params(**parameters)
-#
-# def get_params(self, deep=True):
-# return self.binary_method.get_params()
-#
-# def _delayed_binary_predict(self, c, learners, X):
-# return learners[c].quantify(X)[:,1] # the mean is the estimation for the positive class prevalence
-#
-# def _delayed_binary_fit(self, c, learners, data, **kwargs):
-# bindata = LabelledCollection(data.instances, data.labels == c, n_classes=2)
-# learners[c].fit(bindata, **kwargs)
+class OneVsAllGeneric:
+ """
+ Allows any binary quantifier to perform quantification on single-label datasets. The method maintains one binary
+    quantifier for each class, and then l1-normalizes the outputs so that the class prevalences sum up to 1.
+ """
+
+ def __init__(self, binary_quantifier, n_jobs=1):
+ assert isinstance(binary_quantifier, BaseQuantifier), \
+ f'{binary_quantifier} does not seem to be a Quantifier'
+ self.binary_quantifier = binary_quantifier
+ self.n_jobs = n_jobs
+
+ def fit(self, data: LabelledCollection, **kwargs):
+ assert not data.binary, \
+ f'{self.__class__.__name__} expect non-binary data'
+        self.class_quantifier = {c: deepcopy(self.binary_quantifier) for c in data.classes_}
+ Parallel(n_jobs=self.n_jobs, backend='threading')(
+            delayed(self._delayed_binary_fit)(c, self.class_quantifier, data, **kwargs) for c in data.classes_
+ )
+ return self
+
+ def quantify(self, X, *args):
+ prevalences = np.asarray(
+ Parallel(n_jobs=self.n_jobs, backend='threading')(
+                delayed(self._delayed_binary_predict)(c, self.class_quantifier, X) for c in self.classes
+ )
+ )
+ return F.normalize_prevalence(prevalences)
+
+ @property
+ def classes(self):
+        return sorted(self.class_quantifier.keys())
+
+ def set_params(self, **parameters):
+ self.binary_quantifier.set_params(**parameters)
+
+ def get_params(self, deep=True):
+ return self.binary_quantifier.get_params()
+
+ def _delayed_binary_predict(self, c, learners, X):
+        return learners[c].quantify(X)[1]  # the estimated prevalence of the positive class
+
+ def _delayed_binary_fit(self, c, learners, data, **kwargs):
+        bindata = LabelledCollection(data.instances, data.labels == c, classes_=[False, True])
+ learners[c].fit(bindata, **kwargs)
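A hedged usage sketch of the recovered class (data assumed to be a non-binary LabelledCollection; any BaseQuantifier that works on binary data can serve as the base):

from sklearn.linear_model import LogisticRegression
from quapy.method.aggregative import PACC
from quapy.method.base import OneVsAllGeneric

ova = OneVsAllGeneric(PACC(LogisticRegression()), n_jobs=1)
ova.fit(data)                                 # fits one binary quantifier per class
prevalences = ova.quantify(data.instances)    # l1-normalized across classes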
diff --git a/quapy/protocol.py b/quapy/protocol.py
index 70a98d9..d74e797 100644
--- a/quapy/protocol.py
+++ b/quapy/protocol.py
@@ -13,24 +13,6 @@ from os.path import exists
from glob import glob
-# 0.1.7
-# change the LabelledCollection API (removing protocol-related samplings)
-# need to change the two references to the above in the wiki / doc, and code examples...
-# removed artificial_prevalence_sampling from functional
-
-# maybe add some parameters in the init of the protocols (or maybe only for IndexableWhateverProtocols
-# indicating that the protocol should return indexes, and not samples themselves?
-# also: some parameters in the init could be used to indicate that the method should return a tuple with
-# unlabelled instances and the vector of prevalence values (and not a LabelledCollection).
-# Or: this can be done in a different function; i.e., we use one function (now __call__) to return
-# LabelledCollections, and another new one for returning the other output, which is more general for
-# evaluation purposes.
-
-# the so-called "gen" function has to be implemented as a protocol. The problem here is that this function
-# should be able to return only unlabelled instances plus a vector of prevalences (and not LabelledCollections).
-# This was coded as different functions in 0.1.6
-
-
class AbstractProtocol(metaclass=ABCMeta):
@abstractmethod
@@ -133,11 +115,21 @@ class LoadSamplesFromDirectory(AbstractProtocol):
self.loader_fn = loader_fn
self.classes = classes
self.loader_kwargs = loader_kwargs
+ self._list_files = None
def __call__(self):
- for file in sorted(glob(self.folder_path, '*')):
+ for file in self.list_files:
yield LabelledCollection.load(file, loader_func=self.loader_fn, classes=self.classes, **self.loader_kwargs)
+ @property
+ def list_files(self):
+ if self._list_files is None:
+ self._list_files = sorted(glob(self.folder_path, '*'))
+ return self._list_files
+
+ def total(self):
+ return len(self.list_files)
+
class APP(AbstractStochasticSeededProtocol, OnLabelledCollectionProtocol):
"""
From 45642ad7789d37b96262102da3b3f3a9dbfb9d97 Mon Sep 17 00:00:00 2001
From: Alejandro Moreo
Date: Wed, 1 Jun 2022 18:28:59 +0200
Subject: [PATCH 08/67] lequa as dataset
---
quapy/CHANGE_LOG.txt | 16 ++++++++-
quapy/data/datasets.py | 53 ++++++++++++++++++++++++++++-
quapy/evaluation.py | 12 +++----
quapy/method/meta.py | 2 +-
quapy/model_selection.py | 7 +++-
quapy/protocol.py | 62 ++++++++++++++++++----------------
quapy/tests/test_datasets.py | 13 ++++++-
quapy/tests/test_evaluation.py | 10 +++---
quapy/tests/test_modsel.py | 33 +++++++++++++++++-
quapy/tests/test_protocols.py | 5 ++-
quapy/util.py | 1 +
11 files changed, 163 insertions(+), 51 deletions(-)
diff --git a/quapy/CHANGE_LOG.txt b/quapy/CHANGE_LOG.txt
index fe39fc3..ab03b01 100644
--- a/quapy/CHANGE_LOG.txt
+++ b/quapy/CHANGE_LOG.txt
@@ -9,9 +9,19 @@
- ACC, PACC, Forman's threshold variants have been parallelized.
+- Exploration of hyperparameters in model selection can now be run in parallel (there was an n_jobs argument in
+  QuaPy 0.1.6, but only the evaluation part for one specific hyperparameter combination was run in parallel).
+
+- The prediction function has been refactored, so that the optimization for aggregative quantifiers (which
+  consists of pre-classifying all instances once, and then only invoking aggregate on each sample) is applied
+  only in cases in which the total number of classifications would be smaller than with the standard procedure.
+  The user can now specify "force", "auto", True, or False, in order to actively decide whether to apply the
+  optimization or not.
Things to fix:
-- clean functions like binary, aggregative, probabilistic, etc; those should be resolved via isinstance()
+- clean functions like binary, aggregative, probabilistic, etc; those should be resolved via isinstance():
+  this is not working; I don't know how to make the isinstance checks work. It looks like there is some problem
+  with the import path of the class wrt the path of the class that arrives from another module...
- clean classes_ and n_classes from methods (maybe not from aggregative ones, but those have to be used only
internally and not imposed in any abstract class)
- optimize "qp.evaluation.prediction" for aggregative methods (pre-classification)
@@ -33,6 +43,10 @@ Things to fix:
stuff).
- Check method def __parallel(self, func, *args, **kwargs) in aggregative.OneVsAll
+New features:
+- Add LeQua2022 to datasets (everything automatic, and with proper protocols "gen")
+- Add an "experimental room", with scripts to quickly test new ideas and see results.
+
# 0.1.7
# change the LabelledCollection API (removing protocol-related samplings)
# need to change the two references to the above in the wiki / doc, and code examples...
diff --git a/quapy/data/datasets.py b/quapy/data/datasets.py
index 74e2a3e..06ba3d0 100644
--- a/quapy/data/datasets.py
+++ b/quapy/data/datasets.py
@@ -43,6 +43,8 @@ UCI_DATASETS = ['acute.a', 'acute.b',
'wine-q-red', 'wine-q-white',
'yeast']
+LEQUA2022_TASKS = ['T1A', 'T1B', 'T2A', 'T2B']
+
def fetch_reviews(dataset_name, tfidf=False, min_df=None, data_home=None, pickle=False) -> Dataset:
"""
@@ -532,4 +534,53 @@ def fetch_UCILabelledCollection(dataset_name, data_home=None, verbose=False) ->
def _df_replace(df, col, repl={'yes': 1, 'no':0}, astype=float):
- df[col] = df[col].apply(lambda x:repl[x]).astype(astype, copy=False)
\ No newline at end of file
+ df[col] = df[col].apply(lambda x:repl[x]).astype(astype, copy=False)
+
+
+def fetch_lequa2022(task, data_home=None):
+    """
+    Loads the official datasets of the LeQua 2022 competition for the given task. The data are downloaded
+    (and cached) the first time the function is invoked. Returns a tuple (train, val_gen, test_gen), in which
+    train is a LabelledCollection with the training data, and val_gen and test_gen are generators of
+    validation and test samples, respectively.
+
+    :param task: a string identifying the task; valid ones are listed in LEQUA2022_TASKS
+    :param data_home: the home directory in which the data will be dumped (None to use the quapy default)
+    :return: a tuple (train, val_gen, test_gen)
+    """
+ from quapy.data._lequa2022 import load_raw_documents, load_vector_documents, SamplesFromDir
+
+ assert task in LEQUA2022_TASKS, \
+ f'Unknown task {task}. Valid ones are {LEQUA2022_TASKS}'
+ if data_home is None:
+ data_home = get_quapy_home()
+
+ URL_TRAINDEV=f'https://zenodo.org/record/6546188/files/{task}.train_dev.zip'
+ URL_TEST=f'https://zenodo.org/record/6546188/files/{task}.test.zip'
+ URL_TEST_PREV=f'https://zenodo.org/record/6546188/files/{task}.test_prevalences.zip'
+
+ lequa_dir = join(data_home, 'lequa2022')
+ os.makedirs(lequa_dir, exist_ok=True)
+
+ def download_unzip_and_remove(unzipped_path, url):
+ tmp_path = join(lequa_dir, task + '_tmp.zip')
+ download_file_if_not_exists(url, tmp_path)
+ with zipfile.ZipFile(tmp_path) as file:
+ file.extractall(unzipped_path)
+ os.remove(tmp_path)
+
+ if not os.path.exists(join(lequa_dir, task)):
+ download_unzip_and_remove(lequa_dir, URL_TRAINDEV)
+ download_unzip_and_remove(lequa_dir, URL_TEST)
+ download_unzip_and_remove(lequa_dir, URL_TEST_PREV)
+
+ if task in ['T1A', 'T1B']:
+ load_fn = load_vector_documents
+ elif task in ['T2A', 'T2B']:
+ load_fn = load_raw_documents
+
+ tr_path = join(lequa_dir, task, 'public', 'training_data.txt')
+ train = LabelledCollection.load(tr_path, loader_func=load_fn)
+
+ val_samples_path = join(lequa_dir, task, 'public', 'dev_samples')
+ val_true_prev_path = join(lequa_dir, task, 'public', 'dev_prevalences.txt')
+ val_gen = SamplesFromDir(val_samples_path, val_true_prev_path, load_fn=load_fn)
+
+ test_samples_path = join(lequa_dir, task, 'public', 'dev_samples')
+ test_true_prev_path = join(lequa_dir, task, 'public', 'test_prevalences.txt')
+ test_gen = SamplesFromDir(val_samples_path, val_true_prev_path, load_fn=load_fn)
+
+ return train, val_gen, test_gen
+
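A hedged usage sketch (task names as in LEQUA2022_TASKS; the generators are meant to feed the evaluation protocols, and are assumed to expose total() like the other protocols do):

import quapy as qp

train, val_gen, test_gen = qp.datasets.fetch_lequa2022('T1A')
print(train.stats())
print('validation samples:', val_gen.total())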
diff --git a/quapy/evaluation.py b/quapy/evaluation.py
index d32cfb7..57c2ed1 100644
--- a/quapy/evaluation.py
+++ b/quapy/evaluation.py
@@ -1,13 +1,9 @@
from typing import Union, Callable, Iterable
import numpy as np
from tqdm import tqdm
-import inspect
import quapy as qp
from quapy.protocol import AbstractProtocol, OnLabelledCollectionProtocol
-from quapy.data import LabelledCollection
from quapy.method.base import BaseQuantifier
-from quapy.util import temp_seed
-import quapy.functional as F
import pandas as pd
@@ -22,7 +18,7 @@ def prediction(model: BaseQuantifier, protocol: AbstractProtocol, aggr_speedup='
# checks whether the prediction can be made more efficiently; this check consists in verifying if the model is
# of type aggregative, if the protocol is based on LabelledCollection, and if the total number of documents to
# classify using the protocol would exceed the number of test documents in the original collection
- from method.aggregative import AggregativeQuantifier
+ from quapy.method.aggregative import AggregativeQuantifier
if isinstance(model, AggregativeQuantifier) and isinstance(protocol, OnLabelledCollectionProtocol):
if aggr_speedup == 'force':
apply_optimization = True
@@ -45,9 +41,9 @@ def prediction(model: BaseQuantifier, protocol: AbstractProtocol, aggr_speedup='
def __prediction_helper(quantification_fn, protocol: AbstractProtocol, verbose=False):
true_prevs, estim_prevs = [], []
- for sample in tqdm(protocol(), total=protocol.total()) if verbose else protocol():
- estim_prevs.append(quantification_fn(sample.instances))
- true_prevs.append(sample.prevalence())
+ for sample_instances, sample_prev in tqdm(protocol(), total=protocol.total()) if verbose else protocol():
+ estim_prevs.append(quantification_fn(sample_instances))
+ true_prevs.append(sample_prev)
true_prevs = np.asarray(true_prevs)
estim_prevs = np.asarray(estim_prevs)
diff --git a/quapy/method/meta.py b/quapy/method/meta.py
index 3e57652..d5e8c2a 100644
--- a/quapy/method/meta.py
+++ b/quapy/method/meta.py
@@ -9,7 +9,6 @@ from tqdm import tqdm
import quapy as qp
from quapy import functional as F
from quapy.data import LabelledCollection
-from quapy.evaluation import evaluate
from quapy.model_selection import GridSearchQ
try:
@@ -176,6 +175,7 @@ class Ensemble(BaseQuantifier):
For each model in the ensemble, the performance is measured in terms of _error_name_ on the quantification of
the samples used for training the rest of the models in the ensemble.
"""
+ from quapy.evaluation import evaluate
error = qp.error.from_name(error_name)
tests = [m[3] for m in self.ensemble]
scores = []
diff --git a/quapy/model_selection.py b/quapy/model_selection.py
index c1fa817..7d71023 100644
--- a/quapy/model_selection.py
+++ b/quapy/model_selection.py
@@ -81,6 +81,8 @@ class GridSearchQ(BaseQuantifier):
self.param_scores_ = {}
self.best_score_ = None
+ tinit = time()
+
hyper = [dict({k: values[i] for i, k in enumerate(params_keys)}) for values in itertools.product(*params_values)]
scores = qp.util.parallel(self._delayed_eval, ((params, training) for params in hyper), n_jobs=n_jobs)
@@ -94,10 +96,13 @@ class GridSearchQ(BaseQuantifier):
else:
self.param_scores_[str(params)] = 'timeout'
+ tend = time()-tinit
+
if self.best_score_ is None:
raise TimeoutError('all jobs took more than the timeout time to end')
- self._sout(f'optimization finished: best params {self.best_params_} (score={self.best_score_:.5f})')
+ self._sout(f'optimization finished: best params {self.best_params_} (score={self.best_score_:.5f}) '
+ f'[took {tend:.4f}s]')
if self.refit:
if isinstance(protocol, OnLabelledCollectionProtocol):
diff --git a/quapy/protocol.py b/quapy/protocol.py
index d74e797..f539830 100644
--- a/quapy/protocol.py
+++ b/quapy/protocol.py
@@ -1,14 +1,11 @@
from copy import deepcopy
-
import quapy as qp
import numpy as np
import itertools
-from collections.abc import Generator
from contextlib import ExitStack
from abc import ABCMeta, abstractmethod
from quapy.data import LabelledCollection
import quapy.functional as F
-from tqdm import tqdm
from os.path import exists
from glob import glob
@@ -87,10 +84,14 @@ class AbstractStochasticSeededProtocol(AbstractProtocol):
if self.random_seed is not None:
stack.enter_context(qp.util.temp_seed(self.random_seed))
for params in self.samples_parameters():
- yield self.sample(params)
+ yield self.collator_fn(self.sample(params))
+
+ def set_collator(self, collator_fn):
+ self.collator_fn = collator_fn
class OnLabelledCollectionProtocol:
+
def get_labelled_collection(self):
return self.data
@@ -106,31 +107,6 @@ class OnLabelledCollectionProtocol:
return new.on_preclassified_instances(pre_classifications, in_place=True)
-class LoadSamplesFromDirectory(AbstractProtocol):
-
- def __init__(self, folder_path, loader_fn, classes=None, **loader_kwargs):
- assert exists(folder_path), f'folder {folder_path} does not exist'
- assert callable(loader_fn), f'the passed load_fn does not seem to be callable'
- self.folder_path = folder_path
- self.loader_fn = loader_fn
- self.classes = classes
- self.loader_kwargs = loader_kwargs
- self._list_files = None
-
- def __call__(self):
- for file in self.list_files:
- yield LabelledCollection.load(file, loader_func=self.loader_fn, classes=self.classes, **self.loader_kwargs)
-
- @property
- def list_files(self):
- if self._list_files is None:
- self._list_files = sorted(glob(self.folder_path, '*'))
- return self._list_files
-
- def total(self):
- return len(self.list_files)
-
-
class APP(AbstractStochasticSeededProtocol, OnLabelledCollectionProtocol):
"""
Implementation of the artificial prevalence protocol (APP).
@@ -154,6 +130,7 @@ class APP(AbstractStochasticSeededProtocol, OnLabelledCollectionProtocol):
self.sample_size = sample_size
self.n_prevalences = n_prevalences
self.repeats = repeats
+ self.set_collator(collator_fn=lambda x: (x.instances, x.prevalence()))
def prevalence_grid(self):
"""
@@ -210,6 +187,7 @@ class NPP(AbstractStochasticSeededProtocol, OnLabelledCollectionProtocol):
self.sample_size = sample_size
self.repeats = repeats
self.random_seed = random_seed
+ self.set_collator(collator_fn=lambda x: (x.instances, x.prevalence()))
def samples_parameters(self):
indexes = []
@@ -246,6 +224,7 @@ class USimplexPP(AbstractStochasticSeededProtocol, OnLabelledCollectionProtocol)
self.sample_size = sample_size
self.repeats = repeats
self.random_seed = random_seed
+ self.set_collator(collator_fn=lambda x: (x.instances, x.prevalence()))
def samples_parameters(self):
indexes = []
@@ -261,6 +240,31 @@ class USimplexPP(AbstractStochasticSeededProtocol, OnLabelledCollectionProtocol)
return self.repeats
+# class LoadSamplesFromDirectory(AbstractProtocol):
+#
+# def __init__(self, folder_path, loader_fn, classes=None, **loader_kwargs):
+# assert exists(folder_path), f'folder {folder_path} does not exist'
+# assert callable(loader_fn), f'the passed load_fn does not seem to be callable'
+# self.folder_path = folder_path
+# self.loader_fn = loader_fn
+# self.classes = classes
+# self.loader_kwargs = loader_kwargs
+# self._list_files = None
+#
+# def __call__(self):
+# for file in self.list_files:
+# yield LabelledCollection.load(file, loader_func=self.loader_fn, classes=self.classes, **self.loader_kwargs)
+#
+# @property
+# def list_files(self):
+# if self._list_files is None:
+# self._list_files = sorted(glob(self.folder_path, '*'))
+# return self._list_files
+#
+# def total(self):
+# return len(self.list_files)
+
+
class CovariateShiftPP(AbstractStochasticSeededProtocol):
"""
Generates mixtures of two domains (A and B) at controlled rates, but preserving the original class prevalence.
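A hedged usage sketch (dataA and dataB assumed to be LabelledCollections drawn from two different domains):

from quapy.protocol import CovariateShiftPP

prot = CovariateShiftPP(dataA, dataB, sample_size=100, mixture_points=11, random_seed=0)
for sample in prot():
    pass  # 11 mixture rates, from pure A to pure B, each keeping the original class prevalence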
diff --git a/quapy/tests/test_datasets.py b/quapy/tests/test_datasets.py
index 88209e8..8d70fe9 100644
--- a/quapy/tests/test_datasets.py
+++ b/quapy/tests/test_datasets.py
@@ -1,7 +1,8 @@
import pytest
from quapy.data.datasets import REVIEWS_SENTIMENT_DATASETS, TWITTER_SENTIMENT_DATASETS_TEST, \
- TWITTER_SENTIMENT_DATASETS_TRAIN, UCI_DATASETS, fetch_reviews, fetch_twitter, fetch_UCIDataset
+ TWITTER_SENTIMENT_DATASETS_TRAIN, UCI_DATASETS, LEQUA2022_TASKS, \
+ fetch_reviews, fetch_twitter, fetch_UCIDataset, fetch_lequa2022
@pytest.mark.parametrize('dataset_name', REVIEWS_SENTIMENT_DATASETS)
@@ -41,3 +42,13 @@ def test_fetch_UCIDataset(dataset_name):
print('Training set stats')
dataset.training.stats()
print('Test set stats')
+
+
+@pytest.mark.parametrize('dataset_name', LEQUA2022_TASKS)
+def test_fetch_lequa2022(dataset_name):
+ fetch_lequa2022(dataset_name)
+ # dataset = fetch_lequa2022(dataset_name)
+ # print(f'Dataset {dataset_name}')
+ # print('Training set stats')
+ # dataset.training.stats()
+ # print('Test set stats')
\ No newline at end of file
diff --git a/quapy/tests/test_evaluation.py b/quapy/tests/test_evaluation.py
index de6603b..73dc485 100644
--- a/quapy/tests/test_evaluation.py
+++ b/quapy/tests/test_evaluation.py
@@ -2,8 +2,8 @@ import unittest
import quapy as qp
from sklearn.linear_model import LogisticRegression
from time import time
-from method.aggregative import EMQ
-from method.base import BaseQuantifier
+from quapy.method.aggregative import EMQ
+from quapy.method.base import BaseQuantifier
class EvalTestCase(unittest.TestCase):
@@ -12,7 +12,7 @@ class EvalTestCase(unittest.TestCase):
data = qp.datasets.fetch_reviews('hp', tfidf=True, min_df=10, pickle=True)
train, test = data.training, data.test
- protocol = qp.protocol.APP(test, sample_size=1000, n_prevalences=21, repeats=1, random_seed=1)
+ protocol = qp.protocol.APP(test, sample_size=1000, n_prevalences=11, repeats=1, random_seed=1)
class SlowLR(LogisticRegression):
def predict_proba(self, X):
@@ -23,7 +23,7 @@ class EvalTestCase(unittest.TestCase):
emq = EMQ(SlowLR()).fit(train)
tinit = time()
- score = qp.evaluation.evaluate(emq, protocol, error_metric='mae', verbose=True)
+ score = qp.evaluation.evaluate(emq, protocol, error_metric='mae', verbose=True, aggr_speedup='force')
tend_optim = time()-tinit
print(f'evaluation (with optimization) took {tend_optim}s [MAE={score:.4f}]')
@@ -50,7 +50,7 @@ class EvalTestCase(unittest.TestCase):
tend_no_optim = time() - tinit
print(f'evaluation (w/o optimization) took {tend_no_optim}s [MAE={score:.4f}]')
- self.assertEqual(tend_no_optim>tend_optim, True)
+ self.assertEqual(tend_no_optim>(tend_optim/2), True)
if __name__ == '__main__':
diff --git a/quapy/tests/test_modsel.py b/quapy/tests/test_modsel.py
index 637f831..9c6604a 100644
--- a/quapy/tests/test_modsel.py
+++ b/quapy/tests/test_modsel.py
@@ -8,6 +8,7 @@ import quapy as qp
from method.aggregative import PACC
from model_selection import GridSearchQ
from protocol import APP
+import time
class ModselTestCase(unittest.TestCase):
@@ -18,7 +19,6 @@ class ModselTestCase(unittest.TestCase):
data = qp.datasets.fetch_reviews('imdb', tfidf=True, min_df=10)
training, validation = data.training.split_stratified(0.7, random_state=1)
- # test = data.test
param_grid = {'C': np.logspace(-3,3,7)}
app = APP(validation, sample_size=100, random_seed=1)
@@ -50,6 +50,37 @@ class ModselTestCase(unittest.TestCase):
self.assertEqual(q.best_params_['C'], 10.0)
self.assertEqual(q.best_model().get_params()['C'], 10.0)
+ def test_modsel_parallel_speedup(self):
+ class SlowLR(LogisticRegression):
+ def fit(self, X, y, sample_weight=None):
+ time.sleep(1)
+ return super(SlowLR, self).fit(X, y, sample_weight)
+
+ q = PACC(SlowLR(random_state=1, max_iter=5000))
+
+ data = qp.datasets.fetch_reviews('imdb', tfidf=True, min_df=10)
+ training, validation = data.training.split_stratified(0.7, random_state=1)
+
+ param_grid = {'C': np.logspace(-3, 3, 7)}
+ app = APP(validation, sample_size=100, random_seed=1)
+
+ tinit = time.time()
+ GridSearchQ(
+ q, param_grid, protocol=app, error='mae', refit=False, timeout=-1, n_jobs=1, verbose=True
+ ).fit(training)
+ tend_nooptim = time.time()-tinit
+
+ tinit = time.time()
+ GridSearchQ(
+ q, param_grid, protocol=app, error='mae', refit=False, timeout=-1, n_jobs=-1, verbose=True
+ ).fit(training)
+ tend_optim = time.time() - tinit
+
+ print(f'parallel training took {tend_optim:.4f}s')
+ print(f'sequential training took {tend_nooptim:.4f}s')
+
+ self.assertEqual(tend_optim < (0.5*tend_nooptim), True)
+
def test_modsel_timeout(self):
class SlowLR(LogisticRegression):
diff --git a/quapy/tests/test_protocols.py b/quapy/tests/test_protocols.py
index bf92ce5..b68567b 100644
--- a/quapy/tests/test_protocols.py
+++ b/quapy/tests/test_protocols.py
@@ -1,7 +1,7 @@
import unittest
import numpy as np
-from data import LabelledCollection
-from protocol import APP, NPP, USimplexPP, CovariateShiftPP, AbstractStochasticSeededProtocol
+from quapy.data import LabelledCollection
+from quapy.protocol import APP, NPP, USimplexPP, CovariateShiftPP, AbstractStochasticSeededProtocol
def mock_labelled_collection(prefix=''):
@@ -134,6 +134,5 @@ class TestProtocols(unittest.TestCase):
print('done')
-
if __name__ == '__main__':
unittest.main()
diff --git a/quapy/util.py b/quapy/util.py
index 9d44633..952c2da 100644
--- a/quapy/util.py
+++ b/quapy/util.py
@@ -46,6 +46,7 @@ def parallel(func, args, n_jobs):
that takes the `quapy.environ` variable as input silently
"""
+ print('n_jobs',n_jobs)
def func_dec(environ, *args):
qp.environ = environ
return func(*args)
From bfe4b8b51a42ce29bba812697f22aed451a9ec58 Mon Sep 17 00:00:00 2001
From: Alejandro Moreo
Date: Fri, 3 Jun 2022 13:51:22 +0200
Subject: [PATCH 09/67] updating properties of labelled collection
---
quapy/data/base.py | 38 ++++++++++++++++++++++++++++++++++++++
quapy/protocol.py | 17 ++++++++++++++---
2 files changed, 52 insertions(+), 3 deletions(-)
diff --git a/quapy/data/base.py b/quapy/data/base.py
index c555692..b22a71f 100644
--- a/quapy/data/base.py
+++ b/quapy/data/base.py
@@ -63,6 +63,7 @@ class LabelledCollection:
"""
return self.instances.shape[0]
+ @property
def prevalence(self):
"""
Returns the prevalence, or relative frequency, of the classes of interest.
@@ -248,6 +249,43 @@ class LabelledCollection:
"""
return self.instances, self.labels
+ @property
+ def Xp(self):
+ """
+ Gets the instances and the true prevalence. This is useful when implementing evaluation protocols
+
+ :return: a tuple `(instances, prevalence)` from this collection
+ """
+ return self.instances, self.prevalence()
+
+ @property
+ def X(self):
+ """
+ An alias to self.instances
+
+ :return: self.instances
+ """
+ return self.instances
+
+ @property
+ def y(self):
+ """
+ An alias to self.labels
+
+ :return: self.labels
+ """
+ return self.labels
+
+ @property
+ def p(self):
+ """
+ An alias to self.prevalence()
+
+ :return: self.prevalence()
+ """
+ return self.prevalence()
+
+
def stats(self, show=True):
"""
Returns (and eventually prints) a dictionary with some stats of this collection. E.g.,:
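A hedged sketch of the new aliases in use (sample assumed to be a LabelledCollection):

X, prev = sample.Xp   # the instances together with the true prevalence vector
X = sample.X          # alias for sample.instances
y = sample.y          # alias for sample.labels
p = sample.p          # alias for the prevalence vector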
diff --git a/quapy/protocol.py b/quapy/protocol.py
index f539830..c55c3ef 100644
--- a/quapy/protocol.py
+++ b/quapy/protocol.py
@@ -84,14 +84,16 @@ class AbstractStochasticSeededProtocol(AbstractProtocol):
if self.random_seed is not None:
stack.enter_context(qp.util.temp_seed(self.random_seed))
for params in self.samples_parameters():
- yield self.collator_fn(self.sample(params))
+ yield self.collator(self.sample(params))
- def set_collator(self, collator_fn):
- self.collator_fn = collator_fn
+ def collator(self, sample, *args):
+ return sample
class OnLabelledCollectionProtocol:
+ RETURN_TYPES = ['sample_prev', 'labelled_collection']
+
def get_labelled_collection(self):
return self.data
@@ -106,6 +108,15 @@ class OnLabelledCollectionProtocol:
new = deepcopy(self)
return new.on_preclassified_instances(pre_classifications, in_place=True)
+ @classmethod
+ def get_collator(cls, return_type='sample_prev'):
+ assert return_type in cls.RETURN_TYPES, \
+ f'unknown return type passed as argument; valid ones are {cls.RETURN_TYPES}'
+        if return_type == 'sample_prev':
+            return lambda lc: lc.Xp
+        elif return_type == 'labelled_collection':
+            return lambda lc: lc
+
class APP(AbstractStochasticSeededProtocol, OnLabelledCollectionProtocol):
"""
From 82a01478ec80eeb1fead5c320bb5e329a7ee9441 Mon Sep 17 00:00:00 2001
From: Alejandro Moreo
Date: Fri, 3 Jun 2022 18:02:52 +0200
Subject: [PATCH 10/67] collator functions in protocols for preparing the
outputs
---
quapy/data/base.py | 18 ++++++++++++++----
quapy/protocol.py | 14 +++++++-------
quapy/tests/test_datasets.py | 10 ++++------
quapy/tests/test_protocols.py | 4 ++--
4 files changed, 27 insertions(+), 19 deletions(-)
diff --git a/quapy/data/base.py b/quapy/data/base.py
index b22a71f..4601c15 100644
--- a/quapy/data/base.py
+++ b/quapy/data/base.py
@@ -63,10 +63,9 @@ class LabelledCollection:
"""
return self.instances.shape[0]
- @property
def prevalence(self):
"""
- Returns the prevalence, or relative frequency, of the classes of interest.
+ Returns the prevalence, or relative frequency, of the classes in the codeframe.
:return: a np.ndarray of shape `(n_classes)` with the relative frequencies of each class, in the same order
as listed by `self.classes_`
@@ -75,7 +74,7 @@ class LabelledCollection:
def counts(self):
"""
- Returns the number of instances for each of the classes of interest.
+ Returns the number of instances for each of the classes in the codeframe.
:return: a np.ndarray of shape `(n_classes)` with the number of instances of each class, in the same order
as listed by `self.classes_`
@@ -252,7 +251,8 @@ class LabelledCollection:
@property
def Xp(self):
"""
- Gets the instances and the true prevalence. This is useful when implementing evaluation protocols
+ Gets the instances and the true prevalence. This is useful when implementing evaluation protocols from
+ a `LabelledCollection` object.
:return: a tuple `(instances, prevalence)` from this collection
"""
@@ -420,6 +420,16 @@ class Dataset:
"""
return len(self.vocabulary)
+ @property
+ def train_test(self):
+ """
+ Alias to `self.training` and `self.test`
+
+    :return: the training and test collections
+ """
+ return self.training, self.test
+
def stats(self, show):
"""
Returns (and eventually prints) a dictionary with some stats of this dataset. E.g.,:
diff --git a/quapy/protocol.py b/quapy/protocol.py
index c55c3ef..fec37ca 100644
--- a/quapy/protocol.py
+++ b/quapy/protocol.py
@@ -135,13 +135,13 @@ class APP(AbstractStochasticSeededProtocol, OnLabelledCollectionProtocol):
:param random_seed: allows replicating samples across runs (default None)
"""
- def __init__(self, data:LabelledCollection, sample_size, n_prevalences=21, repeats=10, random_seed=None):
+ def __init__(self, data:LabelledCollection, sample_size, n_prevalences=21, repeats=10, random_seed=None, return_type='sample_prev'):
super(APP, self).__init__(random_seed)
self.data = data
self.sample_size = sample_size
self.n_prevalences = n_prevalences
self.repeats = repeats
- self.set_collator(collator_fn=lambda x: (x.instances, x.prevalence()))
+ self.collator = OnLabelledCollectionProtocol.get_collator(return_type)
def prevalence_grid(self):
"""
@@ -192,13 +192,13 @@ class NPP(AbstractStochasticSeededProtocol, OnLabelledCollectionProtocol):
:param random_seed: allows replicating samples across runs (default None)
"""
- def __init__(self, data:LabelledCollection, sample_size, repeats=100, random_seed=None):
+ def __init__(self, data:LabelledCollection, sample_size, repeats=100, random_seed=None, return_type='sample_prev'):
super(NPP, self).__init__(random_seed)
self.data = data
self.sample_size = sample_size
self.repeats = repeats
self.random_seed = random_seed
- self.set_collator(collator_fn=lambda x: (x.instances, x.prevalence()))
+ self.collator = OnLabelledCollectionProtocol.get_collator(return_type)
def samples_parameters(self):
indexes = []
@@ -229,13 +229,13 @@ class USimplexPP(AbstractStochasticSeededProtocol, OnLabelledCollectionProtocol)
:param random_seed: allows replicating samples across runs (default None)
"""
- def __init__(self, data: LabelledCollection, sample_size, repeats=100, random_seed=None):
+ def __init__(self, data: LabelledCollection, sample_size, repeats=100, random_seed=None, return_type='sample_prev'):
super(USimplexPP, self).__init__(random_seed)
self.data = data
self.sample_size = sample_size
self.repeats = repeats
self.random_seed = random_seed
- self.set_collator(collator_fn=lambda x: (x.instances, x.prevalence()))
+ self.collator = OnLabelledCollectionProtocol.get_collator(return_type)
def samples_parameters(self):
indexes = []
@@ -339,7 +339,7 @@ class CovariateShiftPP(AbstractStochasticSeededProtocol):
indexesA, indexesB = indexes
sampleA = self.A.sampling_from_index(indexesA)
sampleB = self.B.sampling_from_index(indexesB)
- return sampleA+sampleB
+ return (sampleA+sampleB).Xp
def total(self):
return self.repeats * len(self.mixture_points)
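A hedged sketch of the return_type switch introduced above (valid values as in OnLabelledCollectionProtocol.RETURN_TYPES):

from quapy.protocol import APP

prot = APP(data, sample_size=100, random_seed=0, return_type='sample_prev')
for instances, prev in prot():   # the default: (instances, prevalence) pairs
    pass

prot = APP(data, sample_size=100, random_seed=0, return_type='labelled_collection')
for lc in prot():                # whole LabelledCollection objects instead
    pass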
diff --git a/quapy/tests/test_datasets.py b/quapy/tests/test_datasets.py
index 8d70fe9..b0c2f7a 100644
--- a/quapy/tests/test_datasets.py
+++ b/quapy/tests/test_datasets.py
@@ -46,9 +46,7 @@ def test_fetch_UCIDataset(dataset_name):
@pytest.mark.parametrize('dataset_name', LEQUA2022_TASKS)
def test_fetch_lequa2022(dataset_name):
- fetch_lequa2022(dataset_name)
- # dataset = fetch_lequa2022(dataset_name)
- # print(f'Dataset {dataset_name}')
- # print('Training set stats')
- # dataset.training.stats()
- # print('Test set stats')
\ No newline at end of file
+ train, gen_val, gen_test = fetch_lequa2022(dataset_name)
+ print(train.stats())
+ print('Val:', gen_val.total())
+ print('Test:', gen_test.total())
diff --git a/quapy/tests/test_protocols.py b/quapy/tests/test_protocols.py
index b68567b..aeb1f4e 100644
--- a/quapy/tests/test_protocols.py
+++ b/quapy/tests/test_protocols.py
@@ -12,8 +12,8 @@ def mock_labelled_collection(prefix=''):
def samples_to_str(protocol):
samples_str = ""
- for sample in protocol():
- samples_str += f'{sample.instances}\t{sample.labels}\t{sample.prevalence()}\n'
+ for instances, prev in protocol():
+ samples_str += f'{instances}\t{prev}\n'
return samples_str
From 2cc7db60ccad5c1b70b9a8d2e7d492fd6d8b5357 Mon Sep 17 00:00:00 2001
From: Alejandro Moreo
Date: Tue, 14 Jun 2022 09:35:39 +0200
Subject: [PATCH 11/67] updating parallel policy to take n_jobs from
environment (not yet tested)
---
quapy/CHANGE_LOG.txt | 4 +++-
quapy/__init__.py | 9 ++++++++-
quapy/data/preprocessing.py | 5 +++--
quapy/method/aggregative.py | 18 ++++++++++--------
quapy/method/base.py | 7 ++++---
quapy/method/meta.py | 4 ++--
quapy/model_selection.py | 7 +++----
quapy/util.py | 6 ++++--
8 files changed, 37 insertions(+), 23 deletions(-)
diff --git a/quapy/CHANGE_LOG.txt b/quapy/CHANGE_LOG.txt
index ab03b01..095bb76 100644
--- a/quapy/CHANGE_LOG.txt
+++ b/quapy/CHANGE_LOG.txt
@@ -18,6 +18,8 @@
The user can now specify "force", "auto", True, or False, in order to actively decide whether to apply the
optimization or not.
+- n_jobs is now taken from the environment if set to None
+
Things to fix:
- clean functions like binary, aggregative, probabilistic, etc; those should be resolved via isinstance():
this is not working; I don't know how to make the isinstance work. Looks like there is some problem with the
@@ -29,7 +31,7 @@ Things to fix:
- Policies should be able to set their output to "labelled_collection" or "instances_prevalence" or something similar.
- Policies should implement the "gen()" one, taking a reader function as an input, and a folder path maybe
- Review all documentation, redo the Sphinx doc, update Wikis...
-- Resolve the OneVsAll thing (it is in base.py and in aggregative.py
+- Resolve the OneVsAll thing (it is in base.py and in aggregative.py)
- Better handle the environment (e.g., with n_jobs)
- test cross_generate_predictions and cancel cross_generate_predictions_depr
- Add a proper log?
diff --git a/quapy/__init__.py b/quapy/__init__.py
index 2ef4c5c..54b1603 100644
--- a/quapy/__init__.py
+++ b/quapy/__init__.py
@@ -18,7 +18,14 @@ environ = {
'UNK_INDEX': 0,
'PAD_TOKEN': '[PAD]',
'PAD_INDEX': 1,
- 'SVMPERF_HOME': './svm_perf_quantification'
+ 'SVMPERF_HOME': './svm_perf_quantification',
+ 'N_JOBS': 1
}
+def get_njobs(n_jobs):
+ return environ['N_JOBS'] if n_jobs is None else n_jobs
+
+
+
+
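A hedged sketch of the new environment-driven policy:

import quapy as qp

qp.environ['N_JOBS'] = 8
qp.get_njobs(None)   # -> 8, taken from the environment
qp.get_njobs(2)      # -> 2, an explicit value still wins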
diff --git a/quapy/data/preprocessing.py b/quapy/data/preprocessing.py
index f04f010..a987900 100644
--- a/quapy/data/preprocessing.py
+++ b/quapy/data/preprocessing.py
@@ -169,7 +169,7 @@ class IndexTransformer:
self.pad = self.add_word(qp.environ['PAD_TOKEN'], qp.environ['PAD_INDEX'])
return self
- def transform(self, X, n_jobs=-1):
+ def transform(self, X, n_jobs=None):
"""
Transforms the strings in `X` as lists of numerical ids
@@ -179,6 +179,7 @@ class IndexTransformer:
"""
# given the number of tasks and the number of jobs, generates the slices for the parallel processes
assert self.unk != -1, 'transform called before fit'
+ n_jobs = qp.get_njobs(n_jobs)
indexed = map_parallel(func=self._index, args=X, n_jobs=n_jobs)
return np.asarray(indexed)
@@ -186,7 +187,7 @@ class IndexTransformer:
vocab = self.vocabulary_.copy()
return [[vocab.get(word, self.unk) for word in self.analyzer(doc)] for doc in tqdm(documents, 'indexing')]
- def fit_transform(self, X, n_jobs=-1):
+ def fit_transform(self, X, n_jobs=None):
"""
Fits the transform on `X` and transforms it.
diff --git a/quapy/method/aggregative.py b/quapy/method/aggregative.py
index ca4b25c..c2f4717 100644
--- a/quapy/method/aggregative.py
+++ b/quapy/method/aggregative.py
@@ -207,6 +207,8 @@ def cross_generate_predictions(
n_jobs
):
+ n_jobs = qp.get_njobs(n_jobs)
+
if isinstance(val_split, int):
assert fit_learner == True, \
'the parameters for the adjustment cannot be estimated with kFCV with fit_learner=False'
@@ -331,10 +333,10 @@ class ACC(AggregativeQuantifier):
:class:`quapy.data.base.LabelledCollection` (the split itself).
"""
- def __init__(self, learner: BaseEstimator, val_split=0.4, n_jobs=1):
+ def __init__(self, learner: BaseEstimator, val_split=0.4, n_jobs=None):
self.learner = learner
self.val_split = val_split
- self.n_jobs = n_jobs
+ self.n_jobs = qp.get_njobs(n_jobs)
def fit(self, data: LabelledCollection, fit_learner=True, val_split: Union[float, int, LabelledCollection] = None):
"""
@@ -437,10 +439,10 @@ class PACC(AggregativeProbabilisticQuantifier):
:class:`quapy.data.base.LabelledCollection` (the split itself).
"""
- def __init__(self, learner: BaseEstimator, val_split=0.4, n_jobs=1):
+ def __init__(self, learner: BaseEstimator, val_split=0.4, n_jobs=None):
self.learner = learner
self.val_split = val_split
- self.n_jobs = n_jobs
+ self.n_jobs = qp.get_njobs(n_jobs)
def fit(self, data: LabelledCollection, fit_learner=True, val_split: Union[float, int, LabelledCollection] = None):
"""
@@ -769,10 +771,10 @@ class ThresholdOptimization(AggregativeQuantifier, BinaryQuantifier):
:class:`quapy.data.base.LabelledCollection` (the split itself).
"""
- def __init__(self, learner: BaseEstimator, val_split=0.4, n_jobs=1):
+ def __init__(self, learner: BaseEstimator, val_split=0.4, n_jobs=None):
self.learner = learner
self.val_split = val_split
- self.n_jobs = n_jobs
+ self.n_jobs = qp.get_njobs(n_jobs)
def fit(self, data: LabelledCollection, fit_learner=True, val_split: Union[float, int, LabelledCollection] = None):
self._check_binary(data, "Threshold Optimization")
@@ -1022,13 +1024,13 @@ class OneVsAll(AggregativeQuantifier):
:param n_jobs: number of parallel workers
"""
- def __init__(self, binary_quantifier, n_jobs=-1):
+ def __init__(self, binary_quantifier, n_jobs=None):
assert isinstance(binary_quantifier, BaseQuantifier), \
    f'{binary_quantifier} does not seem to be a Quantifier'
assert isinstance(binary_quantifier, AggregativeQuantifier), \
    f'{binary_quantifier} does not seem to be of type Aggregative'
self.binary_quantifier = binary_quantifier
- self.n_jobs = n_jobs
+ self.n_jobs = qp.get_njobs(n_jobs)
def fit(self, data: LabelledCollection, fit_learner=True):
assert not data.binary, \
diff --git a/quapy/method/base.py b/quapy/method/base.py
index 6c2a0c5..c935735 100644
--- a/quapy/method/base.py
+++ b/quapy/method/base.py
@@ -1,8 +1,9 @@
from abc import ABCMeta, abstractmethod
from copy import deepcopy
+import quapy as qp
import numpy as np
from joblib import Parallel, delayed
import quapy.functional as F
from quapy.data import LabelledCollection
@@ -63,17 +63,18 @@ class BinaryQuantifier(BaseQuantifier):
assert data.binary, f'{quantifier_name} works only on problems of binary classification. ' \
f'Use the class OneVsAll to enable {quantifier_name} work on single-label data.'
+
class OneVsAllGeneric:
"""
Allows any binary quantifier to perform quantification on single-label datasets. The method maintains one binary
quantifier for each class, and then l1-normalizes the outputs so that the class prevalences sum up to 1.
"""
- def __init__(self, binary_quantifier, n_jobs=1):
+ def __init__(self, binary_quantifier, n_jobs=None):
assert isinstance(binary_quantifier, BaseQuantifier), \
f'{binary_quantifier} does not seem to be a Quantifier'
self.binary_quantifier = binary_quantifier
- self.n_jobs = n_jobs
+ self.n_jobs = qp.get_njobs(n_jobs)
def fit(self, data: LabelledCollection, **kwargs):
assert not data.binary, \
diff --git a/quapy/method/meta.py b/quapy/method/meta.py
index d5e8c2a..5e084e5 100644
--- a/quapy/method/meta.py
+++ b/quapy/method/meta.py
@@ -72,7 +72,7 @@ class Ensemble(BaseQuantifier):
policy='ave',
max_sample_size=None,
val_split:Union[qp.data.LabelledCollection, float]=None,
- n_jobs=1,
+ n_jobs=None,
verbose=False):
assert policy in Ensemble.VALID_POLICIES, \
f'unknown policy={policy}; valid are {Ensemble.VALID_POLICIES}'
@@ -84,7 +84,7 @@ class Ensemble(BaseQuantifier):
self.red_size = red_size
self.policy = policy
self.val_split = val_split
- self.n_jobs = n_jobs
+ self.n_jobs = qp.get_njobs(n_jobs)
self.post_proba_fn = None
self.verbose = verbose
self.max_sample_size = max_sample_size
diff --git a/quapy/model_selection.py b/quapy/model_selection.py
index 7d71023..c227db8 100644
--- a/quapy/model_selection.py
+++ b/quapy/model_selection.py
@@ -37,7 +37,7 @@ class GridSearchQ(BaseQuantifier):
error: Union[Callable, str] = qp.error.mae,
refit=True,
timeout=-1,
- n_jobs=1,
+ n_jobs=None,
verbose=False):
self.model = model
@@ -45,7 +45,7 @@ class GridSearchQ(BaseQuantifier):
self.protocol = protocol
self.refit = refit
self.timeout = timeout
- self.n_jobs = n_jobs
+ self.n_jobs = qp.get_njobs(n_jobs)
self.verbose = verbose
self.__check_error(error)
assert isinstance(protocol, AbstractProtocol), 'unknown protocol'
@@ -76,7 +76,6 @@ class GridSearchQ(BaseQuantifier):
params_values = list(self.param_grid.values())
protocol = self.protocol
- n_jobs = self.n_jobs
self.param_scores_ = {}
self.best_score_ = None
@@ -84,7 +83,7 @@ class GridSearchQ(BaseQuantifier):
tinit = time()
hyper = [dict({k: values[i] for i, k in enumerate(params_keys)}) for values in itertools.product(*params_values)]
- scores = qp.util.parallel(self._delayed_eval, ((params, training) for params in hyper), n_jobs=n_jobs)
+ scores = qp.util.parallel(self._delayed_eval, ((params, training) for params in hyper), n_jobs=self.n_jobs)
for params, score, model in scores:
if score is not None:
diff --git a/quapy/util.py b/quapy/util.py
index 952c2da..259178e 100644
--- a/quapy/util.py
+++ b/quapy/util.py
@@ -11,7 +11,7 @@ import numpy as np
from joblib import Parallel, delayed
-def _get_parallel_slices(n_tasks, n_jobs=-1):
+def _get_parallel_slices(n_tasks, n_jobs):
if n_jobs == -1:
n_jobs = multiprocessing.cpu_count()
batch = int(n_tasks / n_jobs)
@@ -48,7 +48,9 @@ def parallel(func, args, n_jobs):
"""
print('n_jobs',n_jobs)
def func_dec(environ, *args):
- qp.environ = environ
+ qp.environ = environ.copy()
+ qp.environ['N_JOBS'] = 1
+ print(f'setting n_jobs from {environ["N_JOBS"]} to 1')
return func(*args)
return Parallel(n_jobs=n_jobs)(
delayed(func_dec)(qp.environ, args_i) for args_i in args
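A minimal sketch (not part of the patch) of the l1-normalization step that the OneVsAllGeneric docstring above describes: each binary quantifier contributes a positive-class estimate for its own class, and the estimates are rescaled to sum to 1. The numbers are made up for illustration.

import numpy as np

# made-up positive-class estimates, one per binary quantifier
binary_estimates = np.asarray([0.2, 0.5, 0.4])

# l1-normalization: rescale the estimates so that they sum up to 1
prevalences = binary_estimates / binary_estimates.sum()
print(prevalences)  # [0.1818... 0.4545... 0.3636...]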
From 789b9d5fbc963cb0e7e8c01ea1ee2338dc72fe1f Mon Sep 17 00:00:00 2001
From: Alejandro Moreo
Date: Wed, 15 Jun 2022 14:36:02 +0200
Subject: [PATCH 12/67] pathfix in lequa2022 datasets
---
quapy/data/datasets.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/quapy/data/datasets.py b/quapy/data/datasets.py
index 06ba3d0..8e58540 100644
--- a/quapy/data/datasets.py
+++ b/quapy/data/datasets.py
@@ -580,7 +580,7 @@ def fetch_lequa2022(task, data_home=None):
test_samples_path = join(lequa_dir, task, 'public', 'dev_samples')
test_true_prev_path = join(lequa_dir, task, 'public', 'test_prevalences.txt')
- test_gen = SamplesFromDir(val_samples_path, val_true_prev_path, load_fn=load_fn)
+ test_gen = SamplesFromDir(test_samples_path, test_true_prev_path, load_fn=load_fn)
return train, val_gen, test_gen
From c795404e7f0db29939d6b33093799c3482fbc7ab Mon Sep 17 00:00:00 2001
From: Alejandro Moreo
Date: Wed, 15 Jun 2022 16:54:42 +0200
Subject: [PATCH 13/67] import fix
---
quapy/model_selection.py | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/quapy/model_selection.py b/quapy/model_selection.py
index c227db8..d627649 100644
--- a/quapy/model_selection.py
+++ b/quapy/model_selection.py
@@ -2,9 +2,9 @@ import itertools
import signal
from copy import deepcopy
from typing import Union, Callable
-import evaluation
import quapy as qp
-from protocol import AbstractProtocol, OnLabelledCollectionProtocol
+from quapy import evaluation
+from quapy.protocol import AbstractProtocol, OnLabelledCollectionProtocol
from quapy.data.base import LabelledCollection
from quapy.method.aggregative import BaseQuantifier
from time import time
From a7c768bb40b5f2d56743c7b5f9881dd79376346c Mon Sep 17 00:00:00 2001
From: Alejandro Moreo
Date: Thu, 16 Jun 2022 16:38:34 +0200
Subject: [PATCH 14/67] param fix
---
quapy/data/base.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/quapy/data/base.py b/quapy/data/base.py
index 4601c15..1125449 100644
--- a/quapy/data/base.py
+++ b/quapy/data/base.py
@@ -430,7 +430,7 @@ class Dataset:
"""
return self.training, self.test
- def stats(self, show):
+ def stats(self, show=True):
"""
Returns (and optionally prints) a dictionary with some stats of this dataset. E.g.,:
From c0c37f0a178164aacbb626181a1fe43bd3973d37 Mon Sep 17 00:00:00 2001
From: Alejandro Moreo
Date: Thu, 16 Jun 2022 16:54:15 +0200
Subject: [PATCH 15/67] return type in covariate protocol
---
quapy/protocol.py | 6 ++++--
1 file changed, 4 insertions(+), 2 deletions(-)
diff --git a/quapy/protocol.py b/quapy/protocol.py
index fec37ca..f8b828f 100644
--- a/quapy/protocol.py
+++ b/quapy/protocol.py
@@ -301,7 +301,8 @@ class CovariateShiftPP(AbstractStochasticSeededProtocol):
repeats=1,
prevalence=None,
mixture_points=11,
- random_seed=None):
+ random_seed=None,
+ return_type='sample_prev'):
super(CovariateShiftPP, self).__init__(random_seed)
self.A = domainA
self.B = domainB
@@ -322,6 +323,7 @@ class CovariateShiftPP(AbstractStochasticSeededProtocol):
assert all(np.logical_and(self.mixture_points >= 0, self.mixture_points<=1)), \
'mixture_model datatype not understood (expected int or a sequence of real values in [0,1])'
self.random_seed = random_seed
+ self.collator = OnLabelledCollectionProtocol.get_collator(return_type)
def samples_parameters(self):
indexesA, indexesB = [], []
@@ -339,7 +341,7 @@ class CovariateShiftPP(AbstractStochasticSeededProtocol):
indexesA, indexesB = indexes
sampleA = self.A.sampling_from_index(indexesA)
sampleB = self.B.sampling_from_index(indexesB)
- return (sampleA+sampleB).Xp
+ return self.collator(sampleA+sampleB)
def total(self):
return self.repeats * len(self.mixture_points)
From cf0bd14cf193c13da0328c12b78236b5408072ef Mon Sep 17 00:00:00 2001
From: Alex Moreo
Date: Fri, 17 Jun 2022 12:51:52 +0200
Subject: [PATCH 16/67] bug fix in covariate shift protocol
---
quapy/protocol.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/quapy/protocol.py b/quapy/protocol.py
index f8b828f..ac9680f 100644
--- a/quapy/protocol.py
+++ b/quapy/protocol.py
@@ -341,7 +341,7 @@ class CovariateShiftPP(AbstractStochasticSeededProtocol):
indexesA, indexesB = indexes
sampleA = self.A.sampling_from_index(indexesA)
sampleB = self.B.sampling_from_index(indexesB)
- return self.collator(sampleA+sampleB)
+ return sampleA+sampleB
def total(self):
return self.repeats * len(self.mixture_points)
From f4a2a94ba503ff11b13f61b5401918d1c89f8fd6 Mon Sep 17 00:00:00 2001
From: Alejandro Moreo
Date: Tue, 21 Jun 2022 10:27:06 +0200
Subject: [PATCH 17/67] fixing random_state in base and in protocols
---
quapy/data/base.py | 16 ++++++----
quapy/protocol.py | 54 +++++++++++++++++-----------------
quapy/tests/test_evaluation.py | 2 +-
quapy/tests/test_modsel.py | 8 ++---
quapy/tests/test_protocols.py | 8 ++---
quapy/util.py | 1 -
6 files changed, 47 insertions(+), 42 deletions(-)
diff --git a/quapy/data/base.py b/quapy/data/base.py
index 1125449..3c9bb67 100644
--- a/quapy/data/base.py
+++ b/quapy/data/base.py
@@ -2,7 +2,7 @@ import numpy as np
from scipy.sparse import issparse
from scipy.sparse import vstack
from sklearn.model_selection import train_test_split, RepeatedStratifiedKFold
-
+from numpy.random import RandomState
from quapy.functional import strprev
@@ -146,16 +146,21 @@ class LabelledCollection:
return indexes_sample
- def uniform_sampling_index(self, size):
+ def uniform_sampling_index(self, size, random_state=None):
"""
Returns an index to be used to extract a uniform sample of desired size. The sampling is drawn
with replacement if the requested size is greater than the number of instances, or without replacement
otherwise.
:param size: integer, the size of the uniform sample
+ :param random_state: if specified, guarantees reproducibility of the sample.
:return: a np.ndarray of shape `(size)` with the indexes
"""
- return np.random.choice(len(self), size, replace=size > len(self))
+ if random_state is not None:
+ ng = RandomState(seed=random_state)
+ else:
+ ng = np.random
+ return ng.choice(len(self), size, replace=size > len(self))
def sampling(self, size, *prevs, shuffle=True):
"""
@@ -174,16 +179,17 @@ class LabelledCollection:
prev_index = self.sampling_index(size, *prevs, shuffle=shuffle)
return self.sampling_from_index(prev_index)
- def uniform_sampling(self, size):
+ def uniform_sampling(self, size, random_state=None):
"""
Returns a uniform sample (an instance of :class:`LabelledCollection`) of desired size. The sampling is drawn
with replacement if the requested size is greater than the number of instances, or without replacement
otherwise.
:param size: integer, the requested size
+ :param random_state: if specified, guarantees reproducibility of the sample.
:return: an instance of :class:`LabelledCollection` with length == `size`
"""
- unif_index = self.uniform_sampling_index(size)
+ unif_index = self.uniform_sampling_index(size, random_state=random_state)
return self.sampling_from_index(unif_index)
def sampling_from_index(self, index):
diff --git a/quapy/protocol.py b/quapy/protocol.py
index f8b828f..c232ebc 100644
--- a/quapy/protocol.py
+++ b/quapy/protocol.py
@@ -40,22 +40,22 @@ class AbstractStochasticSeededProtocol(AbstractProtocol):
needed for extracting the samples, and :meth:`sample` that, given some parameters as input,
deterministically generates a sample.
- :param seed: the seed for allowing to replicate any sequence of samples. Default is None, meaning that
+ :param random_state: the seed that allows replicating any sequence of samples. Default is None, meaning that
the sequence will be different every time the protocol is called.
"""
- _random_seed = -1 # means "not set"
+ _random_state = -1 # means "not set"
- def __init__(self, seed=None):
- self.random_seed = seed
+ def __init__(self, random_state=None):
+ self.random_state = random_state
@property
- def random_seed(self):
- return self._random_seed
+ def random_state(self):
+ return self._random_state
- @random_seed.setter
- def random_seed(self, seed):
- self._random_seed = seed
+ @random_state.setter
+ def random_state(self, random_state):
+ self._random_state = random_state
@abstractmethod
def samples_parameters(self):
@@ -78,11 +78,11 @@ class AbstractStochasticSeededProtocol(AbstractProtocol):
def __call__(self):
with ExitStack() as stack:
- if self.random_seed == -1:
+ if self.random_state == -1:
raise ValueError('The random seed has never been initialized. '
'Set it to None if you do not wish to impose replicability.')
- if self.random_seed is not None:
- stack.enter_context(qp.util.temp_seed(self.random_seed))
+ if self.random_state is not None:
+ stack.enter_context(qp.util.temp_seed(self.random_state))
for params in self.samples_parameters():
yield self.collator(self.sample(params))
@@ -132,11 +132,11 @@ class APP(AbstractStochasticSeededProtocol, OnLabelledCollectionProtocol):
:param n_prevalences: the number of equidistant prevalence points to extract from the [0,1] interval for the
grid (default is 21)
:param repeats: number of copies for each valid prevalence vector (default is 10)
- :param random_seed: allows replicating samples across runs (default None)
+ :param random_state: allows replicating samples across runs (default None)
"""
- def __init__(self, data:LabelledCollection, sample_size, n_prevalences=21, repeats=10, random_seed=None, return_type='sample_prev'):
- super(APP, self).__init__(random_seed)
+ def __init__(self, data:LabelledCollection, sample_size, n_prevalences=21, repeats=10, random_state=None, return_type='sample_prev'):
+ super(APP, self).__init__(random_state)
self.data = data
self.sample_size = sample_size
self.n_prevalences = n_prevalences
@@ -189,15 +189,15 @@ class NPP(AbstractStochasticSeededProtocol, OnLabelledCollectionProtocol):
:param data: a `LabelledCollection` from which the samples will be drawn
:param sample_size: integer, the number of instances in each sample
:param repeats: the number of samples to generate. Default is 100.
- :param random_seed: allows replicating samples across runs (default None)
+ :param random_state: allows replicating samples across runs (default None)
"""
- def __init__(self, data:LabelledCollection, sample_size, repeats=100, random_seed=None, return_type='sample_prev'):
- super(NPP, self).__init__(random_seed)
+ def __init__(self, data:LabelledCollection, sample_size, repeats=100, random_state=None, return_type='sample_prev'):
+ super(NPP, self).__init__(random_state)
self.data = data
self.sample_size = sample_size
self.repeats = repeats
- self.random_seed = random_seed
+ self.random_state = random_state
self.collator = OnLabelledCollectionProtocol.get_collator(return_type)
def samples_parameters(self):
@@ -226,15 +226,15 @@ class USimplexPP(AbstractStochasticSeededProtocol, OnLabelledCollectionProtocol)
:param data: a `LabelledCollection` from which the samples will be drawn
:param sample_size: integer, the number of instances in each sample
:param repeats: the number of samples to generate. Default is 100.
- :param random_seed: allows replicating samples across runs (default None)
+ :param random_state: allows replicating samples across runs (default None)
"""
- def __init__(self, data: LabelledCollection, sample_size, repeats=100, random_seed=None, return_type='sample_prev'):
- super(USimplexPP, self).__init__(random_seed)
+ def __init__(self, data: LabelledCollection, sample_size, repeats=100, random_state=None, return_type='sample_prev'):
+ super(USimplexPP, self).__init__(random_state)
self.data = data
self.sample_size = sample_size
self.repeats = repeats
- self.random_seed = random_seed
+ self.random_state = random_state
self.collator = OnLabelledCollectionProtocol.get_collator(return_type)
def samples_parameters(self):
@@ -290,7 +290,7 @@ class CovariateShiftPP(AbstractStochasticSeededProtocol):
:param mixture_points: an integer indicating the number of points to take from a linear scale (e.g., 21 will
generate the mixture points [1, 0.95, 0.9, ..., 0]), or the array of mixture values itself.
- :param random_seed:
+ :param random_state:
"""
def __init__(
@@ -301,9 +301,9 @@ class CovariateShiftPP(AbstractStochasticSeededProtocol):
repeats=1,
prevalence=None,
mixture_points=11,
- random_seed=None,
+ random_state=None,
return_type='sample_prev'):
- super(CovariateShiftPP, self).__init__(random_seed)
+ super(CovariateShiftPP, self).__init__(random_state)
self.A = domainA
self.B = domainB
self.sample_size = sample_size
@@ -322,7 +322,7 @@ class CovariateShiftPP(AbstractStochasticSeededProtocol):
self.mixture_points = np.asarray(mixture_points)
assert all(np.logical_and(self.mixture_points >= 0, self.mixture_points<=1)), \
'mixture_model datatype not understood (expected int or a sequence of real values in [0,1])'
- self.random_seed = random_seed
+ self.random_state = random_state
self.collator = OnLabelledCollectionProtocol.get_collator(return_type)
def samples_parameters(self):
diff --git a/quapy/tests/test_evaluation.py b/quapy/tests/test_evaluation.py
index 73dc485..9a77867 100644
--- a/quapy/tests/test_evaluation.py
+++ b/quapy/tests/test_evaluation.py
@@ -12,7 +12,7 @@ class EvalTestCase(unittest.TestCase):
data = qp.datasets.fetch_reviews('hp', tfidf=True, min_df=10, pickle=True)
train, test = data.training, data.test
- protocol = qp.protocol.APP(test, sample_size=1000, n_prevalences=11, repeats=1, random_seed=1)
+ protocol = qp.protocol.APP(test, sample_size=1000, n_prevalences=11, repeats=1, random_state=1)
class SlowLR(LogisticRegression):
def predict_proba(self, X):
diff --git a/quapy/tests/test_modsel.py b/quapy/tests/test_modsel.py
index 9c6604a..d54dcbe 100644
--- a/quapy/tests/test_modsel.py
+++ b/quapy/tests/test_modsel.py
@@ -21,7 +21,7 @@ class ModselTestCase(unittest.TestCase):
training, validation = data.training.split_stratified(0.7, random_state=1)
param_grid = {'C': np.logspace(-3,3,7)}
- app = APP(validation, sample_size=100, random_seed=1)
+ app = APP(validation, sample_size=100, random_state=1)
q = GridSearchQ(
q, param_grid, protocol=app, error='mae', refit=True, timeout=-1, verbose=True
).fit(training)
@@ -40,7 +40,7 @@ class ModselTestCase(unittest.TestCase):
# test = data.test
param_grid = {'C': np.logspace(-3,3,7)}
- app = APP(validation, sample_size=100, random_seed=1)
+ app = APP(validation, sample_size=100, random_state=1)
q = GridSearchQ(
q, param_grid, protocol=app, error='mae', refit=True, timeout=-1, n_jobs=-1, verbose=True
).fit(training)
@@ -62,7 +62,7 @@ class ModselTestCase(unittest.TestCase):
training, validation = data.training.split_stratified(0.7, random_state=1)
param_grid = {'C': np.logspace(-3, 3, 7)}
- app = APP(validation, sample_size=100, random_seed=1)
+ app = APP(validation, sample_size=100, random_state=1)
tinit = time.time()
GridSearchQ(
@@ -96,7 +96,7 @@ class ModselTestCase(unittest.TestCase):
# test = data.test
param_grid = {'C': np.logspace(-3,3,7)}
- app = APP(validation, sample_size=100, random_seed=1)
+ app = APP(validation, sample_size=100, random_state=1)
q = GridSearchQ(
q, param_grid, protocol=app, error='mae', refit=True, timeout=3, n_jobs=-1, verbose=True
)
diff --git a/quapy/tests/test_protocols.py b/quapy/tests/test_protocols.py
index aeb1f4e..dea3290 100644
--- a/quapy/tests/test_protocols.py
+++ b/quapy/tests/test_protocols.py
@@ -21,7 +21,7 @@ class TestProtocols(unittest.TestCase):
def test_app_replicate(self):
data = mock_labelled_collection()
- p = APP(data, sample_size=5, n_prevalences=11, random_seed=42)
+ p = APP(data, sample_size=5, n_prevalences=11, random_state=42)
samples1 = samples_to_str(p)
samples2 = samples_to_str(p)
@@ -57,7 +57,7 @@ class TestProtocols(unittest.TestCase):
def test_npp_replicate(self):
data = mock_labelled_collection()
- p = NPP(data, sample_size=5, repeats=5, random_seed=42)
+ p = NPP(data, sample_size=5, repeats=5, random_state=42)
samples1 = samples_to_str(p)
samples2 = samples_to_str(p)
@@ -75,7 +75,7 @@ class TestProtocols(unittest.TestCase):
def test_kraemer_replicate(self):
data = mock_labelled_collection()
- p = USimplexPP(data, sample_size=5, repeats=10, random_seed=42)
+ p = USimplexPP(data, sample_size=5, repeats=10, random_state=42)
samples1 = samples_to_str(p)
samples2 = samples_to_str(p)
@@ -94,7 +94,7 @@ class TestProtocols(unittest.TestCase):
def test_covariate_shift_replicate(self):
dataA = mock_labelled_collection('domA')
dataB = mock_labelled_collection('domB')
- p = CovariateShiftPP(dataA, dataB, sample_size=10, mixture_points=11, random_seed=1)
+ p = CovariateShiftPP(dataA, dataB, sample_size=10, mixture_points=11, random_state=1)
samples1 = samples_to_str(p)
samples2 = samples_to_str(p)
diff --git a/quapy/util.py b/quapy/util.py
index 259178e..049ebed 100644
--- a/quapy/util.py
+++ b/quapy/util.py
@@ -50,7 +50,6 @@ def parallel(func, args, n_jobs):
def func_dec(environ, *args):
qp.environ = environ.copy()
qp.environ['N_JOBS'] = 1
- print(f'setting n_jobs from {environ["N_JOBS"]} to 1')
return func(*args)
return Parallel(n_jobs=n_jobs)(
delayed(func_dec)(qp.environ, args_i) for args_i in args
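A minimal sketch, not part of the patch, of the reproducibility pattern the uniform_sampling_index hunk above introduces: draw from a dedicated RandomState when a seed is given, and from the global numpy generator otherwise, with replacement only when the requested size exceeds the population.

import numpy as np
from numpy.random import RandomState

def choice(n_instances, size, random_state=None):
    # dedicated generator if a seed is given; global numpy state otherwise
    ng = RandomState(seed=random_state) if random_state is not None else np.random
    # sampling with replacement is only needed when size exceeds the population
    return ng.choice(n_instances, size, replace=size > n_instances)

print(choice(10, 5, random_state=42))  # identical across runs
print(choice(10, 5))                   # differs across runs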
From 8f6aa629b83dc9ca1c689551cb31cb59f5b2c400 Mon Sep 17 00:00:00 2001
From: Alejandro Moreo
Date: Tue, 21 Jun 2022 10:49:30 +0200
Subject: [PATCH 18/67] param seed changed to random_state
---
quapy/util.py | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/quapy/util.py b/quapy/util.py
index 049ebed..cb1eab3 100644
--- a/quapy/util.py
+++ b/quapy/util.py
@@ -57,17 +57,17 @@ def parallel(func, args, n_jobs):
@contextlib.contextmanager
-def temp_seed(seed):
+def temp_seed(random_state):
"""
Can be used in a "with" context to set a temporary seed without modifying numpy's outer random state. E.g.:
>>> with temp_seed(random_seed):
>>> pass # do any computation depending on np.random functionality
- :param seed: the seed to set within the "with" context
+ :param random_state: the seed to set within the "with" context
"""
state = np.random.get_state()
- np.random.seed(seed)
+ np.random.seed(random_state)
try:
yield
finally:
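A short usage sketch of temp_seed as renamed above, assuming only what the function body shows: the outer numpy state is saved, the seed is set inside the context, and the state is restored on exit.

import numpy as np
import quapy as qp

with qp.util.temp_seed(0):
    a = np.random.rand()   # seeded draw
with qp.util.temp_seed(0):
    b = np.random.rand()   # same seed, same draw
assert a == b              # replicable inside the context
c = np.random.rand()       # the outer state was restored, so this draw is unaffected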
From cf7d37c7934dba6590f56c876f94643386114d80 Mon Sep 17 00:00:00 2001
From: Pablo González
Date: Tue, 21 Jun 2022 11:07:00 +0200
Subject: [PATCH 19/67] removing log message
---
quapy/util.py | 1 -
1 file changed, 1 deletion(-)
diff --git a/quapy/util.py b/quapy/util.py
index cb1eab3..2ccf06d 100644
--- a/quapy/util.py
+++ b/quapy/util.py
@@ -46,7 +46,6 @@ def parallel(func, args, n_jobs):
that takes the `quapy.environ` variable as input silently
"""
- print('n_jobs',n_jobs)
def func_dec(environ, *args):
qp.environ = environ.copy()
qp.environ['N_JOBS'] = 1
From 02dd2846ff4db54a6e6eedb35b15ffc98dad38bc Mon Sep 17 00:00:00 2001
From: Pablo González
Date: Fri, 24 Jun 2022 14:05:47 +0200
Subject: [PATCH 20/67] changing app to use prevalence_linspace function with
smooth limits
---
quapy/protocol.py | 6 ++++--
1 file changed, 4 insertions(+), 2 deletions(-)
diff --git a/quapy/protocol.py b/quapy/protocol.py
index 69b99ad..7652eeb 100644
--- a/quapy/protocol.py
+++ b/quapy/protocol.py
@@ -132,15 +132,17 @@ class APP(AbstractStochasticSeededProtocol, OnLabelledCollectionProtocol):
:param n_prevalences: the number of equidistant prevalence points to extract from the [0,1] interval for the
grid (default is 21)
:param repeats: number of copies for each valid prevalence vector (default is 10)
+ :param smooth_limits_epsilon: the quantity to add to the lower limit 0 and to subtract from the upper limit 1
:param random_state: allows replicating samples across runs (default None)
"""
- def __init__(self, data:LabelledCollection, sample_size, n_prevalences=21, repeats=10, random_state=None, return_type='sample_prev'):
+ def __init__(self, data:LabelledCollection, sample_size, n_prevalences=21, repeats=10, smooth_limits_epsilon=0, random_state=None, return_type='sample_prev'):
super(APP, self).__init__(random_state)
self.data = data
self.sample_size = sample_size
self.n_prevalences = n_prevalences
self.repeats = repeats
+ self.smooth_limits_epsilon = smooth_limits_epsilon
self.collator = OnLabelledCollectionProtocol.get_collator(return_type)
def prevalence_grid(self):
@@ -159,7 +161,7 @@ class APP(AbstractStochasticSeededProtocol, OnLabelledCollectionProtocol):
in the grid multiplied by `repeat`
"""
dimensions = self.data.n_classes
- s = np.linspace(0., 1., self.n_prevalences, endpoint=True)
+ s = F.prevalence_linspace(self.n_prevalences, repeats=1, smooth_limits_epsilon=self.smooth_limits_epsilon)
s = [s] * (dimensions - 1)
prevs = [p for p in itertools.product(*s, repeat=1) if (sum(p) <= 1.0)]
prevs = np.asarray(prevs).reshape(len(prevs), -1)
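A rough numpy sketch, under the assumption that smoothing simply nudges the grid extremes inwards by epsilon, of what the switch from np.linspace to F.prevalence_linspace buys: prevalence values of exactly 0 or 1 are avoided.

import numpy as np

def smoothed_grid(n_prevalences, smooth_limits_epsilon):
    # equidistant grid in [0, 1] with the extremes nudged inwards by epsilon
    s = np.linspace(0., 1., n_prevalences, endpoint=True)
    return np.clip(s, smooth_limits_epsilon, 1 - smooth_limits_epsilon)

print(smoothed_grid(5, 0.01))  # [0.01 0.25 0.5  0.75 0.99]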
From 750814ef2a4a74e60f5ecd857d784211813d6caf Mon Sep 17 00:00:00 2001
From: Pablo González
Date: Fri, 24 Jun 2022 14:20:08 +0200
Subject: [PATCH 21/67] fixing bug in ACC when using cross validation
---
quapy/method/aggregative.py | 1 +
1 file changed, 1 insertion(+)
diff --git a/quapy/method/aggregative.py b/quapy/method/aggregative.py
index c2f4717..759a853 100644
--- a/quapy/method/aggregative.py
+++ b/quapy/method/aggregative.py
@@ -223,6 +223,7 @@ def cross_generate_predictions(
# fit the learner on all data
learner.fit(*data.Xy)
+ y = data.y
classes = data.classes_
else:
learner, val_data = _training_helper(
From 46e294002f3f3fc43149c41f08198ed810a4e33a Mon Sep 17 00:00:00 2001
From: Pablo González
Date: Mon, 11 Jul 2022 12:21:49 +0200
Subject: [PATCH 22/67] dys implementation
---
quapy/functional.py | 6 +++
quapy/method/__init__.py | 1 +
quapy/method/aggregative.py | 79 ++++++++++++++++++++++++++++++++++++-
3 files changed, 84 insertions(+), 2 deletions(-)
diff --git a/quapy/functional.py b/quapy/functional.py
index e44dacf..8cf0312 100644
--- a/quapy/functional.py
+++ b/quapy/functional.py
@@ -78,6 +78,12 @@ def HellingerDistance(P, Q):
"""
return np.sqrt(np.sum((np.sqrt(P) - np.sqrt(Q))**2))
+def TopsoeDistance(P, Q, epsilon=1e-20):
+ """ Topsoe
+ """
+ return np.sum(P*np.log((2*P+epsilon)/(P+Q+epsilon)) +
+ Q*np.log((2*Q+epsilon)/(P+Q+epsilon)))
+
def uniform_prevalence_sampling(n_classes, size=1):
"""
diff --git a/quapy/method/__init__.py b/quapy/method/__init__.py
index ddd7b26..8a30451 100644
--- a/quapy/method/__init__.py
+++ b/quapy/method/__init__.py
@@ -19,6 +19,7 @@ AGGREGATIVE_METHODS = {
aggregative.PACC,
aggregative.EMQ,
aggregative.HDy,
+ aggregative.DyS,
aggregative.X,
aggregative.T50,
aggregative.MAX,
diff --git a/quapy/method/aggregative.py b/quapy/method/aggregative.py
index 759a853..ac6fdc3 100644
--- a/quapy/method/aggregative.py
+++ b/quapy/method/aggregative.py
@@ -1,6 +1,7 @@
from abc import abstractmethod
from copy import deepcopy
-from typing import Union
+import string
+from typing import Callable, Union
import numpy as np
from joblib import Parallel, delayed
from sklearn.base import BaseEstimator
@@ -172,7 +173,7 @@ def _training_helper(learner,
if isinstance(val_split, float):
if not (0 < val_split < 1):
raise ValueError(f'train/val split {val_split} out of range, must be in (0,1)')
- train, unused = data.split_stratified(train_prop=1 - val_split)
+ train, unused = data.split_stratified(train_prop=1 - val_split,random_state=0)
elif isinstance(val_split, LabelledCollection):
train = data
unused = val_split
@@ -637,6 +638,80 @@ class HDy(AggregativeProbabilisticQuantifier, BinaryQuantifier):
return np.asarray([1 - class1_prev, class1_prev])
+class DyS(AggregativeProbabilisticQuantifier, BinaryQuantifier):
+ """
+ `DyS framework `_ (DyS).
+ DyS is a generalization of the HDy method that uses ternary search to find the prevalence that
+ minimizes the distance between distributions.
+ Details of the ternary search were taken from
+
+ :param learner: a sklearn's Estimator that generates a binary classifier
+ :param val_split: a float in range (0,1) indicating the proportion of data to be used as a stratified held-out
+ validation distribution, or a :class:`quapy.data.base.LabelledCollection` (the split itself).
+ :param n_bins: an int with the number of bins to use to compute the histograms.
+ :param distance: a str naming a distance already included in the library ('HD' or 'topsoe'), or a function
+ that computes the distance between two distributions.
+ :param tol: a float with the tolerance for the ternary search algorithm.
+ """
+
+ def __init__(self, learner: BaseEstimator, val_split=0.4, n_bins=8, distance: Union[str, Callable]='HD', tol=1e-05):
+ self.learner = learner
+ self.val_split = val_split
+ self.tol = tol
+ self.distance = distance
+ self.n_bins = n_bins
+
+ def _ternary_search(self, f, left, right, tol):
+ """
+ Find the minimum of the unimodal function f() within [left, right]
+ """
+ while abs(right - left) >= tol:
+ left_third = left + (right - left) / 3
+ right_third = right - (right - left) / 3
+
+ if f(left_third) > f(right_third):
+ left = left_third
+ else:
+ right = right_third
+
+ # Left and right are the current bounds; the minimum is between them
+ return (left + right) / 2
+
+ def _compute_distance(self, Px_train, Px_test, distance: Union[str, Callable]='HD'):
+ if distance=='HD':
+ return F.HellingerDistance(Px_train, Px_test)
+ elif distance=='topsoe':
+ return F.TopsoeDistance(Px_train, Px_test)
+ else:
+ return distance(Px_train, Px_test)
+
+ def fit(self, data: LabelledCollection, fit_learner=True, val_split: Union[float, LabelledCollection] = None):
+ if val_split is None:
+ val_split = self.val_split
+
+ self._check_binary(data, self.__class__.__name__)
+ self.learner, validation = _training_helper(
+ self.learner, data, fit_learner, ensure_probabilistic=True, val_split=val_split)
+ Px = self.classify(validation.instances)[:, 1] # takes only the P(y=+1|x)
+ self.Pxy1 = Px[validation.labels == self.learner.classes_[1]]
+ self.Pxy0 = Px[validation.labels == self.learner.classes_[0]]
+ self.Pxy1_density = np.histogram(self.Pxy1, bins=self.n_bins, range=(0, 1), density=True)[0]
+ self.Pxy0_density = np.histogram(self.Pxy0, bins=self.n_bins, range=(0, 1), density=True)[0]
+ return self
+
+ def aggregate(self, classif_posteriors):
+ Px = classif_posteriors[:, 1] # takes only the P(y=+1|x)
+
+ Px_test = np.histogram(Px, bins=self.n_bins, range=(0, 1), density=True)[0]
+
+ def distribution_distance(prev):
+ Px_train = prev * self.Pxy1_density + (1 - prev) * self.Pxy0_density
+ return self._compute_distance(Px_train,Px_test,self.distance)
+
+ class1_prev = self._ternary_search(f=distribution_distance, left=0, right=1, tol=self.tol)
+ return np.asarray([1 - class1_prev, class1_prev])
+
+
class ELM(AggregativeQuantifier, BinaryQuantifier):
"""
Class of Explicit Loss Minimization (ELM) quantifiers.
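A standalone, self-contained copy of the ternary-search logic added in this patch, runnable outside the class so the convergence behaviour is easy to check; the quadratic objective is an arbitrary stand-in for the distribution distance.

def ternary_search_min(f, left, right, tol=1e-5):
    # shrink [left, right] by thirds, keeping the side containing the minimum
    while abs(right - left) >= tol:
        left_third = left + (right - left) / 3
        right_third = right - (right - left) / 3
        if f(left_third) > f(right_third):
            left = left_third
        else:
            right = right_third
    return (left + right) / 2

# a unimodal stand-in for the distribution distance, minimized at prevalence 0.3
print(ternary_search_min(lambda p: (p - 0.3) ** 2, 0, 1))  # ~0.3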
From ecd0ad7ec7c40db811c045aea0b7ee3c71e97594 Mon Sep 17 00:00:00 2001
From: Alejandro Moreo
Date: Mon, 11 Jul 2022 14:00:25 +0200
Subject: [PATCH 23/67] unit test for replicability based on qp.util.temp_seed
---
quapy/method/aggregative.py | 1 +
quapy/tests/test_replicability.py | 30 ++++++++++++++++++++++++++++++
2 files changed, 31 insertions(+)
create mode 100644 quapy/tests/test_replicability.py
diff --git a/quapy/method/aggregative.py b/quapy/method/aggregative.py
index 759a853..e40e96c 100644
--- a/quapy/method/aggregative.py
+++ b/quapy/method/aggregative.py
@@ -438,6 +438,7 @@ class PACC(AggregativeProbabilisticQuantifier):
validation data, or as an integer, indicating that the misclassification rates should be estimated via
`k`-fold cross validation (this integer stands for the number of folds `k`), or as a
:class:`quapy.data.base.LabelledCollection` (the split itself).
+ :param n_jobs: number of parallel workers
"""
def __init__(self, learner: BaseEstimator, val_split=0.4, n_jobs=None):
diff --git a/quapy/tests/test_replicability.py b/quapy/tests/test_replicability.py
new file mode 100644
index 0000000..329ac32
--- /dev/null
+++ b/quapy/tests/test_replicability.py
@@ -0,0 +1,30 @@
+import unittest
+import quapy as qp
+from quapy.functional import strprev
+from sklearn.linear_model import LogisticRegression
+
+from method.aggregative import PACC
+
+
+class MyTestCase(unittest.TestCase):
+ def test_replicability(self):
+
+ dataset = qp.datasets.fetch_UCIDataset('yeast')
+
+ with qp.util.temp_seed(0):
+ lr = LogisticRegression(random_state=0, max_iter=10000)
+ pacc = PACC(lr)
+ prev = pacc.fit(dataset.training).quantify(dataset.test.X)
+ str_prev1 = strprev(prev, prec=5)
+
+ with qp.util.temp_seed(0):
+ lr = LogisticRegression(random_state=0, max_iter=10000)
+ pacc = PACC(lr)
+ prev2 = pacc.fit(dataset.training).quantify(dataset.test.X)
+ str_prev2 = strprev(prev2, prec=5)
+
+ self.assertEqual(str_prev1, str_prev2) # add assertion here
+
+
+if __name__ == '__main__':
+ unittest.main()
From 428f10fb2d09021b34cc7bf2c8d40199f5943f4b Mon Sep 17 00:00:00 2001
From: Pablo González
Date: Mon, 11 Jul 2022 14:04:28 +0200
Subject: [PATCH 24/67] adding SMM
---
quapy/method/aggregative.py | 41 ++++++++++++++++++++++++++++++++++++-
1 file changed, 40 insertions(+), 1 deletion(-)
diff --git a/quapy/method/aggregative.py b/quapy/method/aggregative.py
index ac6fdc3..a2e03ae 100644
--- a/quapy/method/aggregative.py
+++ b/quapy/method/aggregative.py
@@ -173,7 +173,7 @@ def _training_helper(learner,
if isinstance(val_split, float):
if not (0 < val_split < 1):
raise ValueError(f'train/val split {val_split} out of range, must be in (0,1)')
- train, unused = data.split_stratified(train_prop=1 - val_split,random_state=0)
+ train, unused = data.split_stratified(train_prop=1 - val_split)
elif isinstance(val_split, LabelledCollection):
train = data
unused = val_split
@@ -712,6 +712,45 @@ class DyS(AggregativeProbabilisticQuantifier, BinaryQuantifier):
return np.asarray([1 - class1_prev, class1_prev])
+class SMM(AggregativeProbabilisticQuantifier, BinaryQuantifier):
+ """
+ `SMM method `_ (SMM).
+ SMM is a simplification of distribution-matching methods in which the examples are represented
+ by the mean of their posterior probabilities instead of by a histogram.
+
+ :param learner: a sklearn's Estimator that generates a binary classifier.
+ :param val_split: a float in range (0,1) indicating the proportion of data to be used as a stratified held-out
+ validation distribution, or a :class:`quapy.data.base.LabelledCollection` (the split itself).
+ """
+
+ def __init__(self, learner: BaseEstimator, val_split=0.4):
+ self.learner = learner
+ self.val_split = val_split
+
+ def fit(self, data: LabelledCollection, fit_learner=True, val_split: Union[float, LabelledCollection] = None):
+ if val_split is None:
+ val_split = self.val_split
+
+ self._check_binary(data, self.__class__.__name__)
+ self.learner, validation = _training_helper(
+ self.learner, data, fit_learner, ensure_probabilistic=True, val_split=val_split)
+ Px = self.classify(validation.instances)[:, 1] # takes only the P(y=+1|x)
+ self.Pxy1 = Px[validation.labels == self.learner.classes_[1]]
+ self.Pxy0 = Px[validation.labels == self.learner.classes_[0]]
+ self.Pxy1_mean = np.mean(self.Pxy1)
+ self.Pxy0_mean = np.mean(self.Pxy0)
+ return self
+
+ def aggregate(self, classif_posteriors):
+ Px = classif_posteriors[:, 1] # takes only the P(y=+1|x)
+ Px_mean = np.mean(Px)
+
+ class1_prev = (Px_mean - self.Pxy0_mean)/(self.Pxy1_mean - self.Pxy0_mean)
+ class1_prev = np.clip(class1_prev, 0, 1)
+
+ return np.asarray([1 - class1_prev, class1_prev])
+
+
class ELM(AggregativeQuantifier, BinaryQuantifier):
"""
Class of Explicit Loss Minimization (ELM) quantifiers.
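The aggregate step of SMM added above reduces to a single linear expression; a worked numeric example with made-up means:

import numpy as np

Pxy0_mean, Pxy1_mean = 0.2, 0.8   # mean posteriors of validation negatives/positives (made up)
Px_mean = 0.5                     # mean posterior over a test sample (made up)

class1_prev = (Px_mean - Pxy0_mean) / (Pxy1_mean - Pxy0_mean)  # (0.5-0.2)/(0.8-0.2) = 0.5
class1_prev = np.clip(class1_prev, 0, 1)
print(np.asarray([1 - class1_prev, class1_prev]))  # [0.5 0.5]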
From c91961cff5b12bc8631602ff367d7d3cce4d2904 Mon Sep 17 00:00:00 2001
From: Pablo González
Date: Mon, 11 Jul 2022 14:10:04 +0200
Subject: [PATCH 25/67] adding to __init__.py
---
quapy/method/__init__.py | 1 +
quapy/method/aggregative.py | 1 -
2 files changed, 1 insertion(+), 1 deletion(-)
diff --git a/quapy/method/__init__.py b/quapy/method/__init__.py
index 8a30451..01c19bc 100644
--- a/quapy/method/__init__.py
+++ b/quapy/method/__init__.py
@@ -20,6 +20,7 @@ AGGREGATIVE_METHODS = {
aggregative.EMQ,
aggregative.HDy,
aggregative.DyS,
+ aggregative.SMM,
aggregative.X,
aggregative.T50,
aggregative.MAX,
diff --git a/quapy/method/aggregative.py b/quapy/method/aggregative.py
index a2e03ae..7ab73fb 100644
--- a/quapy/method/aggregative.py
+++ b/quapy/method/aggregative.py
@@ -1,6 +1,5 @@
from abc import abstractmethod
from copy import deepcopy
-import string
from typing import Callable, Union
import numpy as np
from joblib import Parallel, delayed
From a4584b79dbf30517f86effccc6208ed28c36d396 Mon Sep 17 00:00:00 2001
From: Pablo González
Date: Mon, 11 Jul 2022 16:27:02 +0200
Subject: [PATCH 26/67] changing gridsearchQ to ensure reproducibility
---
quapy/model_selection.py | 3 ++-
quapy/util.py | 19 ++++++++++++++-----
2 files changed, 16 insertions(+), 6 deletions(-)
diff --git a/quapy/model_selection.py b/quapy/model_selection.py
index d627649..41a7a19 100644
--- a/quapy/model_selection.py
+++ b/quapy/model_selection.py
@@ -83,7 +83,8 @@ class GridSearchQ(BaseQuantifier):
tinit = time()
hyper = [dict({k: values[i] for i, k in enumerate(params_keys)}) for values in itertools.product(*params_values)]
- scores = qp.util.parallel(self._delayed_eval, ((params, training) for params in hyper), n_jobs=self.n_jobs)
+ # pass a seed to parallel so it is set in child processes
+ scores = qp.util.parallel(self._delayed_eval, ((params, training) for params in hyper), seed=qp.environ.get('_R_SEED', None), n_jobs=self.n_jobs)
for params, score, model in scores:
if score is not None:
diff --git a/quapy/util.py b/quapy/util.py
index 2ccf06d..94187e6 100644
--- a/quapy/util.py
+++ b/quapy/util.py
@@ -5,6 +5,7 @@ import os
import pickle
import urllib
from pathlib import Path
+from contextlib import ExitStack
import quapy as qp
import numpy as np
@@ -36,7 +37,7 @@ def map_parallel(func, args, n_jobs):
return list(itertools.chain.from_iterable(results))
-def parallel(func, args, n_jobs):
+def parallel(func, args, n_jobs, seed = None):
"""
A wrapper of multiprocessing:
@@ -44,14 +45,20 @@ def parallel(func, args, n_jobs):
>>> delayed(func)(args_i) for args_i in args
>>> )
- that takes the `quapy.environ` variable as input silently
+ that takes the `quapy.environ` variable as input silently.
+ Seeds the child processes to ensure reproducibility when n_jobs>1
"""
- def func_dec(environ, *args):
+ def func_dec(environ, seed, *args):
qp.environ = environ.copy()
qp.environ['N_JOBS'] = 1
- return func(*args)
+ # set a context with a temporary seed to ensure results are reproducible in parallel
+ with ExitStack() as stack:
+ if seed is not None:
+ stack.enter_context(qp.util.temp_seed(seed))
+ return func(*args)
+
return Parallel(n_jobs=n_jobs)(
- delayed(func_dec)(qp.environ, args_i) for args_i in args
+ delayed(func_dec)(qp.environ, None if seed is None else seed+i, args_i) for i, args_i in enumerate(args)
)
@@ -66,6 +73,8 @@ def temp_seed(random_state):
:param random_state: the seed to set within the "with" context
"""
state = np.random.get_state()
+ # save the seed just in case it is needed (for instance, for setting the seed in child processes)
+ qp.environ['_R_SEED'] = random_state
np.random.seed(random_state)
try:
yield
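A reduced sketch of the seeding scheme introduced above, outside QuaPy: each task i receives seed+i, so concurrent workers draw different numbers while the whole run remains replicable.

import numpy as np
from joblib import Parallel, delayed

def task(seed, i):
    rng = np.random.RandomState(seed + i)  # per-task seed: replicable yet distinct
    return rng.rand()

seed = 42
print(Parallel(n_jobs=2)(delayed(task)(seed, i) for i in range(4)))  # same list every run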
From 1890e2005736c296d498a22b581bd4ef20a3ef84 Mon Sep 17 00:00:00 2001
From: Alejandro Moreo
Date: Mon, 29 Aug 2022 12:03:03 +0200
Subject: [PATCH 27/67] Update README.md
---
README.md | 14 ++++++++++++++
1 file changed, 14 insertions(+)
diff --git a/README.md b/README.md
index 10c769f..5f0a73f 100644
--- a/README.md
+++ b/README.md
@@ -22,6 +22,20 @@ for facilitating the analysis and interpretation of the experimental results.
pip install quapy
```
+### Cite QuaPy
+
+If you find QuaPy useful (and we hope you will), please consider citing the original paper in your research:
+
+```
+@inproceedings{moreo2021quapy,
+ title={QuaPy: a python-based framework for quantification},
+ author={Moreo, Alejandro and Esuli, Andrea and Sebastiani, Fabrizio},
+ booktitle={Proceedings of the 30th ACM International Conference on Information \& Knowledge Management},
+ pages={4534--4543},
+ year={2021}
+}
+```
+
## A quick example:
The following script fetches a dataset of tweets, trains, applies, and evaluates a quantifier based on the
From 3af7c70a5387d4a9a916850c6a8000c0f9669880 Mon Sep 17 00:00:00 2001
From: Alejandro Moreo
Date: Tue, 4 Oct 2022 09:12:51 +0200
Subject: [PATCH 28/67] restoring the default legend in diag plot
---
quapy/plot.py | 18 +++++++++---------
1 file changed, 9 insertions(+), 9 deletions(-)
diff --git a/quapy/plot.py b/quapy/plot.py
index cdb9b1e..6491256 100644
--- a/quapy/plot.py
+++ b/quapy/plot.py
@@ -49,9 +49,9 @@ def binary_diagonal(method_names, true_prevs, estim_prevs, pos_class=1, title=No
table = {method_name:[true_prev, estim_prev] for method_name, true_prev, estim_prev in order}
order = [(method_name, *table[method_name]) for method_name in method_order]
- cm = plt.get_cmap('tab20')
+ cm = plt.get_cmap('tab10')
NUM_COLORS = len(method_names)
- ax.set_prop_cycle(color=[cm(1. * i / NUM_COLORS) for i in range(NUM_COLORS)])
+ # ax.set_prop_cycle(color=[cm(1. * i / NUM_COLORS) for i in range(NUM_COLORS)])
for method, true_prev, estim_prev in order:
true_prev = true_prev[:,pos_class]
estim_prev = estim_prev[:,pos_class]
@@ -74,13 +74,13 @@ def binary_diagonal(method_names, true_prevs, estim_prevs, pos_class=1, title=No
ax.set_xlim(0, 1)
if legend:
- # box = ax.get_position()
- # ax.set_position([box.x0, box.y0, box.width * 0.8, box.height])
- # ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))
- # ax.set_position([box.x0, box.y0, box.width * 0.8, box.height])
- ax.legend(loc='lower center',
- bbox_to_anchor=(1, -0.5),
- ncol=(len(method_names)+1)//2)
+ box = ax.get_position()
+ ax.set_position([box.x0, box.y0, box.width * 0.8, box.height])
+ ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))
+ ax.set_position([box.x0, box.y0, box.width * 0.8, box.height])
+ # ax.legend(loc='lower center',
+ # bbox_to_anchor=(1, -0.5),
+ # ncol=(len(method_names)+1)//2)
_save_or_show(savepath)
From e40c40960987da110e402d918020d690266e1d6f Mon Sep 17 00:00:00 2001
From: Alejandro Moreo
Date: Tue, 4 Oct 2022 11:03:08 +0200
Subject: [PATCH 29/67] bugfix in NeuralClassifierTrainer; it was only
configured to work well in binary problems
---
quapy/classification/neural.py | 4 ++--
quapy/data/preprocessing.py | 2 +-
quapy/method/aggregative.py | 5 ++++-
quapy/method/neural.py | 1 +
4 files changed, 8 insertions(+), 4 deletions(-)
diff --git a/quapy/classification/neural.py b/quapy/classification/neural.py
index 0d576c5..18fd646 100644
--- a/quapy/classification/neural.py
+++ b/quapy/classification/neural.py
@@ -42,7 +42,7 @@ class NeuralClassifierTrainer:
batch_size=64,
batch_size_test=512,
padding_length=300,
- device='cpu',
+ device='cuda',
checkpointpath='../checkpoint/classifier_net.dat'):
super().__init__()
@@ -62,7 +62,6 @@ class NeuralClassifierTrainer:
}
self.learner_hyperparams = self.net.get_params()
self.checkpointpath = checkpointpath
- self.classes_ = np.asarray([0, 1])
print(f'[NeuralNetwork running on {device}]')
os.makedirs(Path(checkpointpath).parent, exist_ok=True)
@@ -174,6 +173,7 @@ class NeuralClassifierTrainer:
:return:
"""
train, val = LabelledCollection(instances, labels).split_stratified(1-val_split)
+ self.classes_ = train.classes_
opt = self.trainer_hyperparams
checkpoint = self.checkpointpath
self.reset_net_params(self.vocab_size, train.n_classes)
diff --git a/quapy/data/preprocessing.py b/quapy/data/preprocessing.py
index f04f010..99a267b 100644
--- a/quapy/data/preprocessing.py
+++ b/quapy/data/preprocessing.py
@@ -184,7 +184,7 @@ class IndexTransformer:
def _index(self, documents):
vocab = self.vocabulary_.copy()
- return [[vocab.prevalence(word, self.unk) for word in self.analyzer(doc)] for doc in tqdm(documents, 'indexing')]
+ return [[vocab.get(word, self.unk) for word in self.analyzer(doc)] for doc in tqdm(documents, 'indexing')]
def fit_transform(self, X, n_jobs=-1):
"""
diff --git a/quapy/method/aggregative.py b/quapy/method/aggregative.py
index bb71525..c0029ac 100644
--- a/quapy/method/aggregative.py
+++ b/quapy/method/aggregative.py
@@ -282,6 +282,7 @@ class ACC(AggregativeQuantifier):
"""
if val_split is None:
val_split = self.val_split
+ classes = data.classes_
if isinstance(val_split, int):
assert fit_learner == True, \
'the parameters for the adjustment cannot be estimated with kFCV with fit_learner=False'
@@ -300,6 +301,7 @@ class ACC(AggregativeQuantifier):
y = np.concatenate(y)
y_ = np.concatenate(y_)
class_count = data.counts()
+ classes = data.classes_
# fit the learner on all data
self.learner, _ = _training_helper(self.learner, data, fit_learner, val_split=None)
@@ -308,10 +310,11 @@ class ACC(AggregativeQuantifier):
self.learner, val_data = _training_helper(self.learner, data, fit_learner, val_split=val_split)
y_ = self.learner.predict(val_data.instances)
y = val_data.labels
+ classes = val_data.classes_
self.cc = CC(self.learner)
- self.Pte_cond_estim_ = self.getPteCondEstim(data.classes_, y, y_)
+ self.Pte_cond_estim_ = self.getPteCondEstim(classes, y, y_)
return self
diff --git a/quapy/method/neural.py b/quapy/method/neural.py
index bf1f375..b42ada7 100644
--- a/quapy/method/neural.py
+++ b/quapy/method/neural.py
@@ -82,6 +82,7 @@ class QuaNetTrainer(BaseQuantifier):
assert hasattr(learner, 'predict_proba'), \
f'the learner {learner.__class__.__name__} does not seem to be able to produce posterior probabilities ' \
f'since it does not implement the method "predict_proba"'
+ assert sample_size is not None, 'sample_size cannot be None'
self.learner = learner
self.sample_size = sample_size
self.n_epochs = n_epochs
From a4c33a8e4dd7d426c050e94e8e62b62c045d18a6 Mon Sep 17 00:00:00 2001
From: Alejandro Moreo
Date: Tue, 4 Oct 2022 17:44:16 +0200
Subject: [PATCH 30/67] import fix
---
quapy/model_selection.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/quapy/model_selection.py b/quapy/model_selection.py
index 86e79f3..0cd5d4e 100644
--- a/quapy/model_selection.py
+++ b/quapy/model_selection.py
@@ -11,7 +11,7 @@ from quapy.evaluation import artificial_prevalence_prediction, natural_prevalenc
from quapy.method.aggregative import BaseQuantifier
import inspect
-from util import _check_sample_size
+from quapy.util import _check_sample_size
class GridSearchQ(BaseQuantifier):
From f2550fdb829b8c324053af488fc5019dadd70537 Mon Sep 17 00:00:00 2001
From: Alejandro Moreo
Date: Fri, 4 Nov 2022 15:04:36 +0100
Subject: [PATCH 31/67] full example of training, model selection, and
evaluation using the lequa2022 dataset with the new protocols
---
examples/lequa2022_experiments.py | 26 ++++++++++++++++++++++++++
quapy/data/datasets.py | 17 ++++++++++++++++-
quapy/error.py | 10 +++++-----
quapy/evaluation.py | 2 +-
4 files changed, 48 insertions(+), 7 deletions(-)
create mode 100644 examples/lequa2022_experiments.py
diff --git a/examples/lequa2022_experiments.py b/examples/lequa2022_experiments.py
new file mode 100644
index 0000000..790e2c1
--- /dev/null
+++ b/examples/lequa2022_experiments.py
@@ -0,0 +1,26 @@
+import numpy as np
+from sklearn.linear_model import LogisticRegression
+import quapy as qp
+from data.datasets import LEQUA2022_SAMPLE_SIZE, fetch_lequa2022
+from evaluation import evaluation_report
+from method.aggregative import EMQ
+from model_selection import GridSearchQ
+
+
+task = 'T1A'
+
+qp.environ['SAMPLE_SIZE']=LEQUA2022_SAMPLE_SIZE[task]
+training, val_generator, test_generator = fetch_lequa2022(task=task)
+
+# define the quantifier
+quantifier = EMQ(learner=LogisticRegression())
+
+# model selection
+param_grid = {'C': np.logspace(-3, 3, 7), 'class_weight': ['balanced', None]}
+model_selection = GridSearchQ(quantifier, param_grid, protocol=val_generator, n_jobs=-1, refit=False, verbose=True)
+quantifier = model_selection.fit(training)
+
+# evaluation
+report = evaluation_report(quantifier, protocol=test_generator, error_metrics=['mae', 'mrae'], verbose=True)
+
+print(report)
diff --git a/quapy/data/datasets.py b/quapy/data/datasets.py
index 8e58540..b35343b 100644
--- a/quapy/data/datasets.py
+++ b/quapy/data/datasets.py
@@ -12,6 +12,7 @@ from quapy.data.preprocessing import text2tfidf, reduce_columns
from quapy.data.reader import *
from quapy.util import download_file_if_not_exists, download_file, get_quapy_home, pickled_resource
+
REVIEWS_SENTIMENT_DATASETS = ['hp', 'kindle', 'imdb']
TWITTER_SENTIMENT_DATASETS_TEST = ['gasp', 'hcr', 'omd', 'sanders',
'semeval13', 'semeval14', 'semeval15', 'semeval16',
@@ -45,6 +46,20 @@ UCI_DATASETS = ['acute.a', 'acute.b',
LEQUA2022_TASKS = ['T1A', 'T1B', 'T2A', 'T2B']
+_TXA_SAMPLE_SIZE = 250
+_TXB_SAMPLE_SIZE = 1000
+
+LEQUA2022_SAMPLE_SIZE = {
+ 'TXA': _TXA_SAMPLE_SIZE,
+ 'TXB': _TXB_SAMPLE_SIZE,
+ 'T1A': _TXA_SAMPLE_SIZE,
+ 'T1B': _TXB_SAMPLE_SIZE,
+ 'T2A': _TXA_SAMPLE_SIZE,
+ 'T2B': _TXB_SAMPLE_SIZE,
+ 'binary': _TXA_SAMPLE_SIZE,
+ 'multiclass': _TXB_SAMPLE_SIZE
+}
+
def fetch_reviews(dataset_name, tfidf=False, min_df=None, data_home=None, pickle=False) -> Dataset:
"""
@@ -578,7 +593,7 @@ def fetch_lequa2022(task, data_home=None):
val_true_prev_path = join(lequa_dir, task, 'public', 'dev_prevalences.txt')
val_gen = SamplesFromDir(val_samples_path, val_true_prev_path, load_fn=load_fn)
- test_samples_path = join(lequa_dir, task, 'public', 'dev_samples')
+ test_samples_path = join(lequa_dir, task, 'public', 'test_samples')
test_true_prev_path = join(lequa_dir, task, 'public', 'test_prevalences.txt')
test_gen = SamplesFromDir(test_samples_path, test_true_prev_path, load_fn=load_fn)
diff --git a/quapy/error.py b/quapy/error.py
index 3375470..2047929 100644
--- a/quapy/error.py
+++ b/quapy/error.py
@@ -11,11 +11,11 @@ def from_name(err_name):
"""
assert err_name in ERROR_NAMES, f'unknown error {err_name}'
callable_error = globals()[err_name]
- if err_name in QUANTIFICATION_ERROR_SMOOTH_NAMES:
- eps = __check_eps()
- def bound_callable_error(y_true, y_pred):
- return callable_error(y_true, y_pred, eps)
- return bound_callable_error
+ # if err_name in QUANTIFICATION_ERROR_SMOOTH_NAMES:
+ # eps = __check_eps()
+ # def bound_callable_error(y_true, y_pred):
+ # return callable_error(y_true, y_pred, eps)
+ # return bound_callable_error
return callable_error
diff --git a/quapy/evaluation.py b/quapy/evaluation.py
index 57c2ed1..95193aa 100644
--- a/quapy/evaluation.py
+++ b/quapy/evaluation.py
@@ -41,7 +41,7 @@ def prediction(model: BaseQuantifier, protocol: AbstractProtocol, aggr_speedup='
def __prediction_helper(quantification_fn, protocol: AbstractProtocol, verbose=False):
true_prevs, estim_prevs = [], []
- for sample_instances, sample_prev in tqdm(protocol(), total=protocol.total()) if verbose else protocol():
+ for sample_instances, sample_prev in tqdm(protocol(), total=protocol.total(), desc='predicting') if verbose else protocol():
estim_prevs.append(quantification_fn(sample_instances))
true_prevs.append(sample_prev)
From 6cb9f388e0ecc05ed5f19cdb8bfee147237b04c5 Mon Sep 17 00:00:00 2001
From: Alejandro Moreo
Date: Fri, 4 Nov 2022 15:06:08 +0100
Subject: [PATCH 32/67] full example of training, model selection, and
evaluation using the lequa2022 dataset with the new protocols
---
examples/lequa2022_experiments.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/examples/lequa2022_experiments.py b/examples/lequa2022_experiments.py
index 790e2c1..91849e5 100644
--- a/examples/lequa2022_experiments.py
+++ b/examples/lequa2022_experiments.py
@@ -9,7 +9,7 @@ from model_selection import GridSearchQ
task = 'T1A'
-qp.environ['SAMPLE_SIZE']=LEQUA2022_SAMPLE_SIZE[task]
+qp.environ['SAMPLE_SIZE'] = LEQUA2022_SAMPLE_SIZE[task]
training, val_generator, test_generator = fetch_lequa2022(task=task)
# define the quantifier
From eafc82c96a943f43a3207665ab93f2c831ce81a9 Mon Sep 17 00:00:00 2001
From: Alejandro Moreo
Date: Fri, 4 Nov 2022 15:15:12 +0100
Subject: [PATCH 33/67] full example of training, model selection, and
evaluation using the lequa2022 dataset with the new protocols
---
examples/lequa2022_experiments.py | 5 ++++-
1 file changed, 4 insertions(+), 1 deletion(-)
diff --git a/examples/lequa2022_experiments.py b/examples/lequa2022_experiments.py
index 91849e5..0df7d15 100644
--- a/examples/lequa2022_experiments.py
+++ b/examples/lequa2022_experiments.py
@@ -5,6 +5,7 @@ from data.datasets import LEQUA2022_SAMPLE_SIZE, fetch_lequa2022
from evaluation import evaluation_report
from method.aggregative import EMQ
from model_selection import GridSearchQ
+import pandas as pd
task = 'T1A'
@@ -21,6 +22,8 @@ model_selection = GridSearchQ(quantifier, param_grid, protocol=val_generator, n_
quantifier = model_selection.fit(training)
# evaluation
-report = evaluation_report(quantifier, protocol=test_generator, error_metrics=['mae', 'mrae'], verbose=True)
+report = evaluation_report(quantifier, protocol=test_generator, error_metrics=['mae', 'mrae', 'mkld'], verbose=True)
+pd.set_option('display.max_columns', None)
+pd.set_option('display.width', 1000)
print(report)
From fb79a292042ad71e9a694224c744a61e1ab55a65 Mon Sep 17 00:00:00 2001
From: Alejandro Moreo
Date: Tue, 8 Nov 2022 16:36:52 +0100
Subject: [PATCH 34/67] todos and change log
---
TODO.txt | 3 ++-
quapy/CHANGE_LOG.txt | 4 +++-
2 files changed, 5 insertions(+), 2 deletions(-)
diff --git a/TODO.txt b/TODO.txt
index c20e901..90f3301 100644
--- a/TODO.txt
+++ b/TODO.txt
@@ -1,7 +1,8 @@
sample_size should not be mandatory when qp.environ['SAMPLE_SIZE'] has been specified
clean all the cumbersome methods that have to be implemented for new quantifiers (e.g., n_classes_ prop, etc.)
make truly parallel the GridSearchQ
-abstract protocols
+make more examples in the "examples" directory
+merge with master, because I had to fix some problems with QuaNet due to an issue reported via GitHub!
Packaging:
diff --git a/quapy/CHANGE_LOG.txt b/quapy/CHANGE_LOG.txt
index 095bb76..6bef8b0 100644
--- a/quapy/CHANGE_LOG.txt
+++ b/quapy/CHANGE_LOG.txt
@@ -9,7 +9,7 @@
- ACC, PACC, Forman's threshold variants have been parallelized.
-- Exploration of hyperparameters in Model selection can now be run in parallel (it was a n_jobs argument in
+- Exploration of hyperparameters in model selection can now be run in parallel (there was an n_jobs argument in
QuaPy 0.1.6 but only the evaluation part for one specific hyperparameter was run in parallel).
- The prediction function has been refactored, so it applies the optimization for aggregative quantifiers (that
@@ -20,6 +20,8 @@
- n_jobs is now taken from the environment if set to None
+- examples directory created!
+
Things to fix:
- clean functions like binary, aggregative, probabilistic, etc; those should be resolved via isinstance():
this is not working; I don't know how to make the isinstance work. Looks like there is some problem with the
From 643a19228bc073e2a74c74967fddf18e67caefd6 Mon Sep 17 00:00:00 2001
From: Alejandro Moreo
Date: Mon, 28 Nov 2022 12:02:08 +0100
Subject: [PATCH 35/67] data reader for lequa 2022 competition
---
quapy/data/_lequa2022.py | 169 +++++++++++++++++++++++++++++++++++++++
1 file changed, 169 insertions(+)
create mode 100644 quapy/data/_lequa2022.py
diff --git a/quapy/data/_lequa2022.py b/quapy/data/_lequa2022.py
new file mode 100644
index 0000000..79ccccc
--- /dev/null
+++ b/quapy/data/_lequa2022.py
@@ -0,0 +1,169 @@
+from typing import Tuple, Union
+import pandas as pd
+import numpy as np
+import os
+
+from quapy.protocol import AbstractProtocol
+
+DEV_SAMPLES = 1000
+TEST_SAMPLES = 5000
+
+ERROR_TOL = 1E-3
+
+
+def load_category_map(path):
+ cat2code = {}
+ with open(path, 'rt') as fin:
+ for line in fin:
+ category, code = line.split()
+ cat2code[category] = int(code)
+ code2cat = [cat for cat, code in sorted(cat2code.items(), key=lambda x: x[1])]
+ return cat2code, code2cat
+
+
+def load_raw_documents(path):
+ df = pd.read_csv(path)
+ documents = list(df["text"].values)
+ labels = None
+ if "label" in df.columns:
+ labels = df["label"].values.astype(np.int)
+ return documents, labels
+
+
+def load_vector_documents(path):
+ D = pd.read_csv(path).to_numpy(dtype=np.float)
+ labelled = D.shape[1] == 301
+ if labelled:
+ X, y = D[:, 1:], D[:, 0].astype(np.int).flatten()
+ else:
+ X, y = D, None
+ return X, y
+
+
+class SamplesFromDir(AbstractProtocol):
+
+ def __init__(self, path_dir:str, ground_truth_path:str, load_fn):
+ self.path_dir = path_dir
+ self.load_fn = load_fn
+ self.true_prevs = ResultSubmission.load(ground_truth_path)
+
+ def __call__(self):
+ for id, prevalence in self.true_prevs.iterrows():
+ sample, _ = self.load_fn(os.path.join(self.path_dir, f'{id}.txt'))
+ yield sample, prevalence
+
+
+class ResultSubmission:
+
+ def __init__(self):
+ self.df = None
+
+ def __init_df(self, categories: int):
+ if not isinstance(categories, int) or categories < 2:
+ raise TypeError('wrong format for categories: an int (>=2) was expected')
+ df = pd.DataFrame(columns=list(range(categories)))
+ df.index.set_names('id', inplace=True)
+ self.df = df
+
+ @property
+ def n_categories(self):
+ return len(self.df.columns.values)
+
+ def add(self, sample_id: int, prevalence_values: np.ndarray):
+ if not isinstance(sample_id, int):
+ raise TypeError(f'error: expected int for sample_id, found {type(sample_id)}')
+ if not isinstance(prevalence_values, np.ndarray):
+ raise TypeError(f'error: expected np.ndarray for prevalence_values, found {type(prevalence_values)}')
+ if self.df is None:
+ self.__init_df(categories=len(prevalence_values))
+ if sample_id in self.df.index.values:
+ raise ValueError(f'error: prevalence values for "{sample_id}" already added')
+ if prevalence_values.ndim != 1 or prevalence_values.size != self.n_categories:
+ raise ValueError(f'error: wrong shape found for prevalence vector {prevalence_values}')
+ if (prevalence_values < 0).any() or (prevalence_values > 1).any():
+ raise ValueError(f'error: prevalence values out of range [0,1] for "{sample_id}"')
+ if np.abs(prevalence_values.sum() - 1) > ERROR_TOL:
+ raise ValueError(f'error: prevalence values do not sum up to one for "{sample_id}"'
+ f'(error tolerance {ERROR_TOL})')
+
+ self.df.loc[sample_id] = prevalence_values
+
+ def __len__(self):
+ return len(self.df)
+
+ @classmethod
+ def load(cls, path: str) -> 'ResultSubmission':
+ df = ResultSubmission.check_file_format(path)
+ r = ResultSubmission()
+ r.df = df
+ return r
+
+ def dump(self, path: str):
+ ResultSubmission.check_dataframe_format(self.df)
+ self.df.to_csv(path)
+
+ def prevalence(self, sample_id: int):
+ sel = self.df.loc[sample_id]
+ if sel.empty:
+ return None
+ else:
+ return sel.values.flatten()
+
+ def iterrows(self):
+ for index, row in self.df.iterrows():
+ prevalence = row.values.flatten()
+ yield index, prevalence
+
+ @classmethod
+ def check_file_format(cls, path) -> Union[pd.DataFrame, Tuple[pd.DataFrame, str]]:
+ try:
+ df = pd.read_csv(path, index_col=0)
+ except Exception as e:
+ print(f'the file {path} does not seem to be a valid csv file. ')
+ print(e)
+ return ResultSubmission.check_dataframe_format(df, path=path)
+
+ @classmethod
+ def check_dataframe_format(cls, df, path=None) -> Union[pd.DataFrame, Tuple[pd.DataFrame, str]]:
+ hint_path = '' # if given, show the data path in the error message
+ if path is not None:
+ hint_path = f' in {path}'
+
+ if df.index.name != 'id' or len(df.columns) < 2:
+ raise ValueError(f'wrong header{hint_path}, '
+ f'the format of the header should be "id,0,...,n-1", '
+ f'where n is the number of categories')
+ if [int(ci) for ci in df.columns.values] != list(range(len(df.columns))):
+ raise ValueError(f'wrong header{hint_path}, category ids should be 0,1,2,...,n-1, '
+ f'where n is the number of categories')
+ if df.empty:
+ raise ValueError(f'error{hint_path}: results file is empty')
+ elif len(df) != DEV_SAMPLES and len(df) != TEST_SAMPLES:
+ raise ValueError(f'wrong number of prevalence values found{hint_path}; '
+ f'expected {DEV_SAMPLES} for development sets and '
+ f'{TEST_SAMPLES} for test sets; found {len(df)}')
+
+ ids = set(df.index.values)
+ expected_ids = set(range(len(df)))
+ if ids != expected_ids:
+ missing = expected_ids - ids
+ if missing:
+ raise ValueError(f'there are {len(missing)} missing ids{hint_path}: {sorted(missing)}')
+ unexpected = ids - expected_ids
+ if unexpected:
+ raise ValueError(f'there are {len(unexpected)} unexpected ids{hint_path}: {sorted(unexpected)}')
+
+ for category_id in df.columns:
+ if (df[category_id] < 0).any() or (df[category_id] > 1).any():
+ raise ValueError(f'error{hint_path} column "{category_id}" contains values out of range [0,1]')
+
+ prevs = df.values
+ round_errors = np.abs(prevs.sum(axis=-1) - 1.) > ERROR_TOL
+ if round_errors.any():
+ raise ValueError(f'error{hint_path}: prevalence values in rows with id {np.where(round_errors)[0].tolist()} '
+ f'do not sum up to 1 (error tolerance {ERROR_TOL}), '
+ f'probably due to some rounding errors.')
+
+ return df
+
+
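A minimal usage sketch of the ResultSubmission class added above (a hypothetical standalone snippet, not part of the patch; DEV_SAMPLES and ERROR_TOL are the module-level constants the class already references):

    import numpy as np

    # build a submission with one prevalence vector per development sample
    submission = ResultSubmission()
    for sample_id in range(DEV_SAMPLES):
        prev = np.random.dirichlet(np.ones(2))  # any vector summing to 1 within ERROR_TOL
        submission.add(sample_id, prev)
    submission.dump('dev_prevalences.csv')

    # reload and validate the file format in one step
    reloaded = ResultSubmission.load('dev_prevalences.csv')
    print(len(reloaded), reloaded.prevalence(0))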
From eb860e9678c396d5ce5bcba703fb8e36a4ad0403 Mon Sep 17 00:00:00 2001
From: Alejandro Moreo
Date: Mon, 12 Dec 2022 09:34:09 +0100
Subject: [PATCH 36/67] adding the possibility to estimate the training
prevalence, instead of using the true training prevalence, as a starting
 point in EMQ
---
examples/lequa2022_experiments.py | 13 ++++++++++---
quapy/method/aggregative.py | 12 ++++++++++--
2 files changed, 20 insertions(+), 5 deletions(-)
diff --git a/examples/lequa2022_experiments.py b/examples/lequa2022_experiments.py
index 0df7d15..31ec651 100644
--- a/examples/lequa2022_experiments.py
+++ b/examples/lequa2022_experiments.py
@@ -1,6 +1,8 @@
import numpy as np
+from sklearn.calibration import CalibratedClassifierCV
from sklearn.linear_model import LogisticRegression
import quapy as qp
+import quapy.functional as F
from data.datasets import LEQUA2022_SAMPLE_SIZE, fetch_lequa2022
from evaluation import evaluation_report
from method.aggregative import EMQ
@@ -14,7 +16,8 @@ qp.environ['SAMPLE_SIZE'] = LEQUA2022_SAMPLE_SIZE[task]
training, val_generator, test_generator = fetch_lequa2022(task=task)
# define the quantifier
-quantifier = EMQ(learner=LogisticRegression())
+learner = CalibratedClassifierCV(LogisticRegression())
+quantifier = EMQ(learner=learner)
# model selection
param_grid = {'C': np.logspace(-3, 3, 7), 'class_weight': ['balanced', None]}
@@ -24,6 +27,10 @@ quantifier = model_selection.fit(training)
# evaluation
report = evaluation_report(quantifier, protocol=test_generator, error_metrics=['mae', 'mrae', 'mkld'], verbose=True)
-pd.set_option('display.max_columns', None)
-pd.set_option('display.width', 1000)
+# printing results
+pd.set_option('display.expand_frame_repr', False)
+report['estim-prev'] = report['estim-prev'].map(F.strprev)
print(report)
+
+print('Averaged values:')
+print(report.mean())
diff --git a/quapy/method/aggregative.py b/quapy/method/aggregative.py
index 19d365b..202b5dd 100644
--- a/quapy/method/aggregative.py
+++ b/quapy/method/aggregative.py
@@ -501,17 +501,25 @@ class EMQ(AggregativeProbabilisticQuantifier):
maximum-likelihood estimation, in a mutually recursive way, until convergence.
:param learner: a sklearn's Estimator that generates a classifier
+ :param exact_train_prev: set to True (default) for using, as the initial observation, the true training prevalence;
+ or set to False for computing the training prevalence as an estimate, akin to PCC, i.e., as the expected
+ value of the posterior probabilities of the trianing documents as suggested in
+ `Alexandari et al. paper `_:
"""
MAX_ITER = 1000
EPSILON = 1e-4
- def __init__(self, learner: BaseEstimator):
+ def __init__(self, learner: BaseEstimator, exact_train_prev=True):
self.learner = learner
+ self.exact_train_prev = exact_train_prev
def fit(self, data: LabelledCollection, fit_learner=True):
self.learner, _ = _training_helper(self.learner, data, fit_learner, ensure_probabilistic=True)
- self.train_prevalence = F.prevalence_from_labels(data.labels, self.classes_)
+ if self.exact_train_prev:
+ self.train_prevalence = F.prevalence_from_labels(data.labels, self.classes_)
+ else:
+ self.train_prevalence = PCC(learner=self.learner).fit(data, fit_learner=False).quantify(data.X)
return self
def aggregate(self, classif_posteriors, epsilon=EPSILON):
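A quick usage sketch of the new flag (hypothetical snippet, not part of the patch; `training` stands for a LabelledCollection and `test_instances` for its instances):

    from sklearn.linear_model import LogisticRegression
    from quapy.method.aggregative import EMQ

    # start EM from an estimated (PCC-style) training prevalence instead of the true one
    quantifier = EMQ(learner=LogisticRegression(), exact_train_prev=False)
    quantifier.fit(training)
    estim_prev = quantifier.quantify(test_instances)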
From c20d9d5ea415d1fd67551f9744f797f840e20221 Mon Sep 17 00:00:00 2001
From: Alejandro Moreo
Date: Mon, 12 Dec 2022 17:32:30 +0100
Subject: [PATCH 37/67] the heuristic exact_train_prev is performed via kFCV,
using a new function qp.model_selection.cross_val_predict
---
TODO.txt | 2 ++
quapy/method/aggregative.py | 11 ++++++++---
quapy/model_selection.py | 29 +++++++++++++++++++++++++++++
3 files changed, 39 insertions(+), 3 deletions(-)
diff --git a/TODO.txt b/TODO.txt
index 90f3301..6cef78c 100644
--- a/TODO.txt
+++ b/TODO.txt
@@ -3,6 +3,8 @@ clean all the cumbersome methods that have to be implemented for new quantifiers
make GridSearchQ truly parallel
make more examples in the "examples" directory
merge with master, because I had to fix some problems with QuaNet due to an issue reported via GitHub!
+added cross_val_predict in qp.model_selection (i.e., a cross_val_predict for quantification) --would be nice to have
+ it parallelized
Packaging:
diff --git a/quapy/method/aggregative.py b/quapy/method/aggregative.py
index 202b5dd..4cec2cd 100644
--- a/quapy/method/aggregative.py
+++ b/quapy/method/aggregative.py
@@ -3,7 +3,7 @@ from copy import deepcopy
from typing import Callable, Union
import numpy as np
from joblib import Parallel, delayed
-from sklearn.base import BaseEstimator
+from sklearn.base import BaseEstimator, clone
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import StratifiedKFold, cross_val_predict
@@ -503,7 +503,7 @@ class EMQ(AggregativeProbabilisticQuantifier):
:param learner: a sklearn's Estimator that generates a classifier
:param exact_train_prev: set to True (default) for using, as the initial observation, the true training prevalence;
or set to False for computing the training prevalence as an estimate, akin to PCC, i.e., as the expected
- value of the posterior probabilities of the trianing documents as suggested in
+ value of the posterior probabilities of the training instances as suggested in
`Alexandari et al. paper `_:
"""
@@ -519,7 +519,12 @@ class EMQ(AggregativeProbabilisticQuantifier):
if self.exact_train_prev:
self.train_prevalence = F.prevalence_from_labels(data.labels, self.classes_)
else:
- self.train_prevalence = PCC(learner=self.learner).fit(data, fit_learner=False).quantify(data.X)
+ self.train_prevalence = qp.model_selection.cross_val_predict(
+ quantifier=PCC(clone(self.learner)),
+ data=data,
+ nfolds=3,
+ random_state=0
+ )
return self
def aggregate(self, classif_posteriors, epsilon=EPSILON):
diff --git a/quapy/model_selection.py b/quapy/model_selection.py
index 41a7a19..f7c5b94 100644
--- a/quapy/model_selection.py
+++ b/quapy/model_selection.py
@@ -2,6 +2,10 @@ import itertools
import signal
from copy import deepcopy
from typing import Union, Callable
+
+import numpy as np
+from sklearn import clone
+
import quapy as qp
from quapy import evaluation
from quapy.protocol import AbstractProtocol, OnLabelledCollectionProtocol
@@ -187,3 +191,28 @@ class GridSearchQ(BaseQuantifier):
raise ValueError('best_model called before fit')
+
+
+def cross_val_predict(quantifier: BaseQuantifier, data: LabelledCollection, nfolds=3, random_state=0):
+ """
+ Akin to `scikit-learn's cross_val_predict `_
+ but for quantification.
+
+ :param quantifier: a quantifier issuing class prevalence values
+ :param data: a labelled collection
+ :param nfolds: number of folds for k-fold cross validation generation
+ :param random_state: random seed for reproducibility
+ :return: a vector of class prevalence values
+ """
+
+ total_prev = np.zeros(shape=data.n_classes)
+
+ for train, test in data.kFCV(nfolds=nfolds, random_state=random_state):
+ quantifier.fit(train)
+ fold_prev = quantifier.quantify(test.X)
+ rel_size = len(test)/len(data)
+ total_prev += fold_prev*rel_size
+
+ return total_prev
+
+
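A usage sketch of the new helper (hypothetical snippet, not part of the patch; any quantifier and LabelledCollection will do):

    import quapy as qp
    from sklearn.linear_model import LogisticRegression
    from quapy.method.aggregative import PCC

    data = qp.datasets.fetch_reviews('kindle', tfidf=True).training
    # out-of-sample estimate of the class prevalence of 'data' itself
    estim = qp.model_selection.cross_val_predict(PCC(LogisticRegression()), data, nfolds=3, random_state=0)
    print(estim)  # should lie close to data.prevalence()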
From bb7a77c7c094f847020d54d6b12fb47f5ae44de7 Mon Sep 17 00:00:00 2001
From: Alejandro Moreo
Date: Tue, 13 Dec 2022 16:57:11 +0100
Subject: [PATCH 38/67] missing param in documentation of some protocols
---
quapy/protocol.py | 6 ++++++
1 file changed, 6 insertions(+)
diff --git a/quapy/protocol.py b/quapy/protocol.py
index 7652eeb..b30165f 100644
--- a/quapy/protocol.py
+++ b/quapy/protocol.py
@@ -134,6 +134,8 @@ class APP(AbstractStochasticSeededProtocol, OnLabelledCollectionProtocol):
:param repeats: number of copies for each valid prevalence vector (default is 10)
:param smooth_limits_epsilon: the quantity to add and subtract to the limits 0 and 1
:param random_state: allows replicating samples across runs (default None)
+ :param return_type: set to "sample_prev" (default) to get the pairs of (sample, prevalence) at each iteration, or
+ to "labelled_collection" to get instead instances of LabelledCollection
"""
def __init__(self, data:LabelledCollection, sample_size, n_prevalences=21, repeats=10, smooth_limits_epsilon=0, random_state=None, return_type='sample_prev'):
@@ -192,6 +194,8 @@ class NPP(AbstractStochasticSeededProtocol, OnLabelledCollectionProtocol):
:param sample_size: integer, the number of instances in each sample
:param repeats: the number of samples to generate. Default is 100.
:param random_state: allows replicating samples across runs (default None)
+ :param return_type: set to "sample_prev" (default) to get the pairs of (sample, prevalence) at each iteration, or
+ to "labelled_collection" to get instead instances of LabelledCollection
"""
def __init__(self, data:LabelledCollection, sample_size, repeats=100, random_state=None, return_type='sample_prev'):
@@ -229,6 +233,8 @@ class USimplexPP(AbstractStochasticSeededProtocol, OnLabelledCollectionProtocol)
:param sample_size: integer, the number of instances in each sample
:param repeats: the number of samples to generate. Default is 100.
:param random_state: allows replicating samples across runs (default None)
+ :param return_type: set to "sample_prev" (default) to get the pairs of (sample, prevalence) at each iteration, or
+ to "labelled_collection" to get instead instances of LabelledCollection
"""
def __init__(self, data: LabelledCollection, sample_size, repeats=100, random_state=None, return_type='sample_prev'):
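The two return types side by side (hypothetical snippet, not part of the patch; `data` is a LabelledCollection):

    from quapy.protocol import APP

    prot = APP(data, sample_size=100, n_prevalences=21, return_type='sample_prev')
    for instances, prev in prot():
        ...  # pairs of (sample instances, prevalence vector)

    prot = APP(data, sample_size=100, n_prevalences=21, return_type='labelled_collection')
    for sample in prot():
        ...  # instances of LabelledCollection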
From 8b0b9f522a4babf4d52ec82bd7a7058deaa43e45 Mon Sep 17 00:00:00 2001
From: Alejandro Moreo
Date: Mon, 16 Jan 2023 13:51:29 +0100
Subject: [PATCH 39/67] some bugfixes, unittest and minor changes
---
quapy/CHANGE_LOG.txt | 12 ++++++++++++
quapy/data/_lequa2022.py | 6 +++---
quapy/data/base.py | 37 +++++++++++++++++++++++++++++++++++--
quapy/data/reader.py | 4 ++--
quapy/functional.py | 2 +-
quapy/method/aggregative.py | 8 ++++++--
quapy/plot.py | 2 +-
quapy/util.py | 3 ++-
8 files changed, 62 insertions(+), 12 deletions(-)
diff --git a/quapy/CHANGE_LOG.txt b/quapy/CHANGE_LOG.txt
index 6bef8b0..06d7dc4 100644
--- a/quapy/CHANGE_LOG.txt
+++ b/quapy/CHANGE_LOG.txt
@@ -22,6 +22,18 @@
- examples directory created!
+- cross_val_predict (for quantification) added to model_selection: it would be nice to allow the user to specify a
+ test protocol, or None for bypassing it
+
+- I think Pablo added DyS, Topsoe distance and binary search.
+
+- I think Pablo added multi-thread reproducibility.
+
+- Bugfix: adding two labelled collections (with +) now checks for consistency in the classes
+
+- newer versions of numpy raise a warning when accessing types (e.g., np.float). I have replaced all such instances
+ with the plain python type (e.g., float).
+
Things to fix:
- clean functions like binary, aggregative, probabilistic, etc; those should be resolved via isinstance():
this is not working; I don't know how to make the isinstance work. Looks like there is some problem with the
diff --git a/quapy/data/_lequa2022.py b/quapy/data/_lequa2022.py
index 79ccccc..449eab6 100644
--- a/quapy/data/_lequa2022.py
+++ b/quapy/data/_lequa2022.py
@@ -26,15 +26,15 @@ def load_raw_documents(path):
documents = list(df["text"].values)
labels = None
if "label" in df.columns:
- labels = df["label"].values.astype(np.int)
+ labels = df["label"].values.astype(int)
return documents, labels
def load_vector_documents(path):
- D = pd.read_csv(path).to_numpy(dtype=np.float)
+ D = pd.read_csv(path).to_numpy(dtype=float)
labelled = D.shape[1] == 301
if labelled:
- X, y = D[:, 1:], D[:, 0].astype(np.int).flatten()
+ X, y = D[:, 1:], D[:, 0].astype(int).flatten()
else:
X, y = D, None
return X, y
diff --git a/quapy/data/base.py b/quapy/data/base.py
index 3c9bb67..62f871d 100644
--- a/quapy/data/base.py
+++ b/quapy/data/base.py
@@ -1,3 +1,5 @@
+from functools import cached_property
+
import numpy as np
from scipy.sparse import issparse
from scipy.sparse import vstack
@@ -223,13 +225,44 @@ class LabelledCollection:
test = LabelledCollection(te_docs, te_labels, classes_=self.classes_)
return training, test
+
+ def split_random(self, train_prop=0.6, random_state=None):
+ """
+ Returns two instances of :class:`LabelledCollection` split randomly from this collection, at desired
+ proportion.
+
+ :param train_prop: the proportion of elements to include in the left-most returned collection (typically used
+ as the training collection). The rest of elements are included in the right-most returned collection
+ (typically used as a test collection).
+ :param random_state: if specified, guarantees reproducibility of the split.
+ :return: two instances of :class:`LabelledCollection`, the first one with `train_prop` elements, and the
+ second one with `1-train_prop` elements
+ """
+ indexes = np.random.RandomState(seed=random_state).permutation(len(self))
+ if isinstance(train_prop, int):
+ assert train_prop < len(self), \
+ 'argument train_prop cannot be greater than the number of elements in the collection'
+ splitpoint = train_prop
+ elif isinstance(train_prop, float):
+ assert 0 < train_prop < 1, \
+ 'argument train_prop out of range (0,1)'
+ splitpoint = int(np.round(len(self)*train_prop))
+ left, right = indexes[:splitpoint], indexes[splitpoint:]
+ training = self.sampling_from_index(left)
+ test = self.sampling_from_index(right)
+ return training, test
+
def __add__(self, other):
"""
- Returns a new :class:`LabelledCollection` as the union of this collection with another collection
+ Returns a new :class:`LabelledCollection` as the union of this collection with another collection.
+ Both labelled collections must have the same classes.
:param other: another :class:`LabelledCollection`
:return: a :class:`LabelledCollection` representing the union of both collections
"""
+ if other is not None and not all(np.sort(self.classes_) == np.sort(other.classes_)):
+ raise NotImplementedError('unsupported operation for collections on different classes')
+
if other is None:
return self
elif issparse(self.instances) and issparse(other.instances):
@@ -241,7 +274,7 @@ class LabelledCollection:
else:
raise NotImplementedError('unsupported operation for collection types')
labels = np.concatenate([self.labels, other.labels])
- return LabelledCollection(join_instances, labels)
+ return LabelledCollection(join_instances, labels, classes_=self.classes_)
@property
def Xy(self):
diff --git a/quapy/data/reader.py b/quapy/data/reader.py
index 8f8bc79..88791e3 100644
--- a/quapy/data/reader.py
+++ b/quapy/data/reader.py
@@ -102,7 +102,7 @@ def reindex_labels(y):
y = np.asarray(y)
classnames = np.asarray(sorted(np.unique(y)))
label2index = {label: index for index, label in enumerate(classnames)}
- indexed = np.empty(y.shape, dtype=np.int)
+ indexed = np.empty(y.shape, dtype=int)
for label in classnames:
indexed[y==label] = label2index[label]
return indexed, classnames
@@ -121,7 +121,7 @@ def binarize(y, pos_class):
0 otherwise
"""
y = np.asarray(y)
- ybin = np.zeros(y.shape, dtype=np.int)
+ ybin = np.zeros(y.shape, dtype=int)
ybin[y == pos_class] = 1
return ybin
diff --git a/quapy/functional.py b/quapy/functional.py
index 8cf0312..3ee46ff 100644
--- a/quapy/functional.py
+++ b/quapy/functional.py
@@ -39,7 +39,7 @@ def prevalence_from_labels(labels, classes):
raise ValueError(f'param labels does not seem to be a ndarray of label predictions')
unique, counts = np.unique(labels, return_counts=True)
by_class = defaultdict(lambda:0, dict(zip(unique, counts)))
- prevalences = np.asarray([by_class[class_] for class_ in classes], dtype=np.float)
+ prevalences = np.asarray([by_class[class_] for class_ in classes], dtype=float)
prevalences /= prevalences.sum()
return prevalences
diff --git a/quapy/method/aggregative.py b/quapy/method/aggregative.py
index 4cec2cd..57c821d 100644
--- a/quapy/method/aggregative.py
+++ b/quapy/method/aggregative.py
@@ -132,7 +132,11 @@ class AggregativeProbabilisticQuantifier(AggregativeQuantifier):
def set_params(self, **parameters):
if isinstance(self.learner, CalibratedClassifierCV):
- parameters = {'base_estimator__' + k: v for k, v in parameters.items()}
+ if self.learner.get_params().get('base_estimator') == 'deprecated':
+ key_prefix = 'estimator__' # this has changed in the newer versions of sklearn
+ else:
+ key_prefix = 'base_estimator__'
+ parameters = {key_prefix + k: v for k, v in parameters.items()}
self.learner.set_params(**parameters)
@@ -369,7 +373,7 @@ class ACC(AggregativeQuantifier):
# estimate the matrix with entry (i,j) being the estimate of P(yi|yj), that is, the probability that a
# document that belongs to yj ends up being classified as belonging to yi
conf = confusion_matrix(y, y_, labels=classes).T
- conf = conf.astype(np.float)
+ conf = conf.astype(float)
class_counts = conf.sum(axis=0)
for i, _ in enumerate(classes):
if class_counts[i] == 0:
diff --git a/quapy/plot.py b/quapy/plot.py
index cdb9b1e..67ccd52 100644
--- a/quapy/plot.py
+++ b/quapy/plot.py
@@ -370,7 +370,7 @@ def brokenbar_supremacy_by_drift(method_names, true_prevs, estim_prevs, tr_prevs
bins[-1] += 0.001
# we use this to keep track of how many datapoints contribute to each bin
- inds_histogram_global = np.zeros(n_bins, dtype=np.float)
+ inds_histogram_global = np.zeros(n_bins, dtype=float)
n_methods = len(method_order)
buckets = np.zeros(shape=(n_methods, n_bins, 3))
for i, method in enumerate(method_order):
diff --git a/quapy/util.py b/quapy/util.py
index 94187e6..50a640d 100644
--- a/quapy/util.py
+++ b/quapy/util.py
@@ -23,7 +23,8 @@ def _get_parallel_slices(n_tasks, n_jobs):
def map_parallel(func, args, n_jobs):
"""
Applies func to n_jobs slices of args. E.g., if args is an array of 99 items and n_jobs=2, then
- func is applied in two parallel processes to args[0:50] and to args[50:99]
+ func is applied in two parallel processes to args[0:50] and to args[50:99]. func is a function
+ that already works with a list of arguments.
:param func: function to be parallelized
:param args: array-like of arguments to be passed to the function in different parallel calls
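Among the changes above, split_random is the most user-facing one; a quick sketch (hypothetical snippet, not part of the patch; `data` is a LabelledCollection):

    # a 60/40 random (non-stratified) split, reproducible via random_state
    training, test = data.split_random(train_prop=0.6, random_state=0)

    # an integer train_prop is also accepted, taken as an absolute number of instances
    training, test = data.split_random(train_prop=1000, random_state=0)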
From 948f63fade88bee27b545e12bd8ce667a192d9ee Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Pablo=20Gonz=C3=A1lez?=
Date: Mon, 16 Jan 2023 17:00:24 +0100
Subject: [PATCH 40/67] updating plot to center it better
---
quapy/plot.py | 8 +++++++-
1 file changed, 7 insertions(+), 1 deletion(-)
diff --git a/quapy/plot.py b/quapy/plot.py
index cdb9b1e..7d032f1 100644
--- a/quapy/plot.py
+++ b/quapy/plot.py
@@ -212,6 +212,7 @@ def binary_bias_bins(method_names, true_prevs, estim_prevs, pos_class=1, title=N
def error_by_drift(method_names, true_prevs, estim_prevs, tr_prevs,
n_bins=20, error_name='ae', show_std=False,
show_density=True,
+ show_legend=True,
logscale=False,
title=f'Quantification error as a function of distribution shift',
vlines=None,
@@ -234,6 +235,7 @@ def error_by_drift(method_names, true_prevs, estim_prevs, tr_prevs,
:param error_name: a string representing the name of an error function (as defined in `quapy.error`, default is "ae")
:param show_std: whether or not to show standard deviations as color bands (default is False)
:param show_density: whether or not to display the distribution of experiments for each bin (default is True)
+ :param show_legend: whether or not to display the legend of the chart (default is True)
:param logscale: whether or not to log-scale the y-error measure (default is False)
:param title: title of the plot (default is "Quantification error as a function of distribution shift")
:param vlines: array-like list of values (default is None). If indicated, highlights some regions of the space
@@ -306,7 +308,11 @@ def error_by_drift(method_names, true_prevs, estim_prevs, tr_prevs,
if vlines:
for vline in vlines:
ax.axvline(vline, 0, 1, linestyle='--', color='k')
- ax.set_xlim(0, max_x)
+
+ if not show_legend:
+ ax.get_legend().remove()
+
+ ax.set_xlim(min_x, max_x)
ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))
_save_or_show(savepath)
From 7bcf8b24e9e08b05c3390f3667cb3482dfe93c1e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Pablo=20Gonz=C3=A1lez?=
Date: Mon, 16 Jan 2023 17:17:02 +0100
Subject: [PATCH 41/67] fixing bug
---
quapy/plot.py | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/quapy/plot.py b/quapy/plot.py
index 7d032f1..794fd4c 100644
--- a/quapy/plot.py
+++ b/quapy/plot.py
@@ -309,11 +309,11 @@ def error_by_drift(method_names, true_prevs, estim_prevs, tr_prevs,
for vline in vlines:
ax.axvline(vline, 0, 1, linestyle='--', color='k')
- if not show_legend:
- ax.get_legend().remove()
ax.set_xlim(min_x, max_x)
- ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))
+
+ if not show_legend:
+ ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))
_save_or_show(savepath)
From c888346fcffd1f4f40843615148e6519a6e6419c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Pablo=20Gonz=C3=A1lez?=
Date: Tue, 17 Jan 2023 11:03:52 +0100
Subject: [PATCH 42/67] solving a bug in show_legend
---
quapy/plot.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/quapy/plot.py b/quapy/plot.py
index 794fd4c..c1a857e 100644
--- a/quapy/plot.py
+++ b/quapy/plot.py
@@ -312,7 +312,7 @@ def error_by_drift(method_names, true_prevs, estim_prevs, tr_prevs,
ax.set_xlim(min_x, max_x)
- if not show_legend:
+ if show_legend:
ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))
_save_or_show(savepath)
From 6e910075ab490ef7640ed949a51bef7ec65fd2ef Mon Sep 17 00:00:00 2001
From: Alejandro Moreo
Date: Tue, 17 Jan 2023 13:53:48 +0100
Subject: [PATCH 43/67] adding calibration methods from the abstention package
---
quapy/CHANGE_LOG.txt | 3 +++
quapy/method/aggregative.py | 2 ++
2 files changed, 5 insertions(+)
diff --git a/quapy/CHANGE_LOG.txt b/quapy/CHANGE_LOG.txt
index 06d7dc4..20e0759 100644
--- a/quapy/CHANGE_LOG.txt
+++ b/quapy/CHANGE_LOG.txt
@@ -34,7 +34,10 @@
- newer versions of numpy raise a warning when accessing types (e.g., np.float). I have replaced all such instances
with the plain python type (e.g., float).
+- new dependency "abstention" (to add to the project requirements and setup)
+
Things to fix:
+- calibration with recalibration methods has to be fixed for exact_train_prev in EMQ (conflicts with clone, deepcopy, etc.)
- clean functions like binary, aggregative, probabilistic, etc; those should be resolved via isinstance():
this is not working; I don't know how to make the isinstance work. Looks like there is some problem with the
path of the imported class wrt the path of the class that arrives from another module...
diff --git a/quapy/method/aggregative.py b/quapy/method/aggregative.py
index 57c821d..9e5338d 100644
--- a/quapy/method/aggregative.py
+++ b/quapy/method/aggregative.py
@@ -10,6 +10,7 @@ from sklearn.model_selection import StratifiedKFold, cross_val_predict
from tqdm import tqdm
import quapy as qp
import quapy.functional as F
+from classification.calibration import RecalibratedClassifier
from quapy.classification.svmperf import SVMperf
from quapy.data import LabelledCollection
from quapy.method.base import BaseQuantifier, BinaryQuantifier
@@ -137,6 +138,7 @@ class AggregativeProbabilisticQuantifier(AggregativeQuantifier):
else:
key_prefix = 'base_estimator__'
parameters = {key_prefix + k: v for k, v in parameters.items()}
+
self.learner.set_params(**parameters)
From 50d886bffe7f3def0ce49388b299b915a4f40b5f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Pablo=20Gonz=C3=A1lez?=
Date: Wed, 18 Jan 2023 13:06:38 +0100
Subject: [PATCH 44/67] testing log scale
---
quapy/plot.py | 12 ++++++++++--
1 file changed, 10 insertions(+), 2 deletions(-)
diff --git a/quapy/plot.py b/quapy/plot.py
index c1a857e..7b2145f 100644
--- a/quapy/plot.py
+++ b/quapy/plot.py
@@ -4,6 +4,7 @@ from matplotlib.cm import get_cmap
import numpy as np
from matplotlib import cm
from scipy.stats import ttest_ind_from_stats
+from matplotlib.ticker import StrMethodFormatter, NullFormatter
import quapy as qp
@@ -256,6 +257,9 @@ def error_by_drift(method_names, true_prevs, estim_prevs, tr_prevs,
# x_error function) and 'y' is the estim-test shift (computed as according to y_error)
data = _join_data_by_drift(method_names, true_prevs, estim_prevs, tr_prevs, x_error, y_error, method_order)
+ if method_order is None:
+ method_order = method_names
+
_set_colors(ax, n_methods=len(method_order))
bins = np.linspace(0, 1, n_bins+1)
@@ -266,7 +270,11 @@ def error_by_drift(method_names, true_prevs, estim_prevs, tr_prevs,
tr_test_drifts = data[method]['x']
method_drifts = data[method]['y']
if logscale:
- method_drifts=np.log(1+method_drifts)
+ #method_drifts=np.log(1+method_drifts)
+ plt.yscale("log")
+ ax.yaxis.set_major_formatter(StrMethodFormatter('{x:.2f}'))
+ ax.yaxis.set_minor_formatter(StrMethodFormatter('{x:.2f}'))
+
inds = np.digitize(tr_test_drifts, bins, right=True)
@@ -299,7 +307,7 @@ def error_by_drift(method_names, true_prevs, estim_prevs, tr_prevs,
if show_density:
ax.bar([ind * binwidth-binwidth/2 for ind in range(len(bins))],
max_y*npoints/np.max(npoints), alpha=0.15, color='g', width=binwidth, label='density')
-
+
ax.set(xlabel=f'Distribution shift between training set and test sample',
ylabel=f'{error_name.upper()} (true distribution, predicted distribution)',
title=title)
From f10a3139d9594b38a95234ac8513c5473baaded4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Pablo=20Gonz=C3=A1lez?=
Date: Wed, 18 Jan 2023 14:53:46 +0100
Subject: [PATCH 45/67] changes to plots again
---
quapy/plot.py | 24 +++++++++++++++---------
1 file changed, 15 insertions(+), 9 deletions(-)
diff --git a/quapy/plot.py b/quapy/plot.py
index 7b2145f..2e41413 100644
--- a/quapy/plot.py
+++ b/quapy/plot.py
@@ -4,7 +4,7 @@ from matplotlib.cm import get_cmap
import numpy as np
from matplotlib import cm
from scipy.stats import ttest_ind_from_stats
-from matplotlib.ticker import StrMethodFormatter, NullFormatter
+from matplotlib.ticker import ScalarFormatter
import quapy as qp
@@ -270,11 +270,11 @@ def error_by_drift(method_names, true_prevs, estim_prevs, tr_prevs,
tr_test_drifts = data[method]['x']
method_drifts = data[method]['y']
if logscale:
- #method_drifts=np.log(1+method_drifts)
- plt.yscale("log")
- ax.yaxis.set_major_formatter(StrMethodFormatter('{x:.2f}'))
- ax.yaxis.set_minor_formatter(StrMethodFormatter('{x:.2f}'))
-
+ ax.set_yscale("log")
+ ax.yaxis.set_major_formatter(ScalarFormatter())
+ ax.yaxis.set_minor_formatter(ScalarFormatter())
+ ax.yaxis.get_major_formatter().set_scientific(False)
+ ax.yaxis.get_minor_formatter().set_scientific(False)
inds = np.digitize(tr_test_drifts, bins, right=True)
@@ -305,8 +305,14 @@ def error_by_drift(method_names, true_prevs, estim_prevs, tr_prevs,
ax.fill_between(xs, ys-ystds, ys+ystds, alpha=0.25)
if show_density:
- ax.bar([ind * binwidth-binwidth/2 for ind in range(len(bins))],
+ ax2 = ax.twinx()
+ ax2.bar([ind * binwidth-binwidth/2 for ind in range(len(bins))],
max_y*npoints/np.max(npoints), alpha=0.15, color='g', width=binwidth, label='density')
+ #ax2.set_ylabel("bar data")
+ ax2.set_ylim(0,1)
+ ax2.spines['right'].set_color('g')
+ ax2.tick_params(axis='y', colors='g')
+ #ax2.yaxis.set_visible(False)
ax.set(xlabel=f'Distribution shift between training set and test sample',
ylabel=f'{error_name.upper()} (true distribution, predicted distribution)',
@@ -321,8 +327,8 @@ def error_by_drift(method_names, true_prevs, estim_prevs, tr_prevs,
ax.set_xlim(min_x, max_x)
if show_legend:
- ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))
-
+ fig.legend(loc='right')
+
_save_or_show(savepath)
From 7ed7c9b2e94a9da4d0d763c389bde9b612354b75 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Pablo=20Gonz=C3=A1lez?=
Date: Wed, 18 Jan 2023 16:05:40 +0100
Subject: [PATCH 46/67] changing the logarithmic scale
---
quapy/plot.py | 16 +++++++++-------
1 file changed, 9 insertions(+), 7 deletions(-)
diff --git a/quapy/plot.py b/quapy/plot.py
index 2e41413..15c7be5 100644
--- a/quapy/plot.py
+++ b/quapy/plot.py
@@ -5,6 +5,7 @@ import numpy as np
from matplotlib import cm
from scipy.stats import ttest_ind_from_stats
from matplotlib.ticker import ScalarFormatter
+import math
import quapy as qp
@@ -272,9 +273,8 @@ def error_by_drift(method_names, true_prevs, estim_prevs, tr_prevs,
if logscale:
ax.set_yscale("log")
ax.yaxis.set_major_formatter(ScalarFormatter())
- ax.yaxis.set_minor_formatter(ScalarFormatter())
ax.yaxis.get_major_formatter().set_scientific(False)
- ax.yaxis.get_minor_formatter().set_scientific(False)
+ ax.minorticks_off()
inds = np.digitize(tr_test_drifts, bins, right=True)
@@ -307,12 +307,10 @@ def error_by_drift(method_names, true_prevs, estim_prevs, tr_prevs,
if show_density:
ax2 = ax.twinx()
ax2.bar([ind * binwidth-binwidth/2 for ind in range(len(bins))],
- max_y*npoints/np.max(npoints), alpha=0.15, color='g', width=binwidth, label='density')
- #ax2.set_ylabel("bar data")
+ npoints/np.sum(npoints), alpha=0.15, color='g', width=binwidth, label='density')
ax2.set_ylim(0,1)
ax2.spines['right'].set_color('g')
ax2.tick_params(axis='y', colors='g')
- #ax2.yaxis.set_visible(False)
ax.set(xlabel=f'Distribution shift between training set and test sample',
ylabel=f'{error_name.upper()} (true distribution, predicted distribution)',
@@ -325,9 +323,13 @@ def error_by_drift(method_names, true_prevs, estim_prevs, tr_prevs,
ax.set_xlim(min_x, max_x)
+ if logscale:
+ # nice scale for the logarithmic axis
+ ax.set_ylim(0,10 ** math.ceil(math.log10(max_y)))
+
if show_legend:
- fig.legend(loc='right')
+ fig.legend(bbox_to_anchor=(1.05, 1), loc="upper right")
_save_or_show(savepath)
@@ -549,7 +551,7 @@ def _join_data_by_drift(method_names, true_prevs, estim_prevs, tr_prevs, x_error
method_order = []
for method, test_prevs_i, estim_prevs_i, tr_prev_i in zip(method_names, true_prevs, estim_prevs, tr_prevs):
- tr_prev_i = np.repeat(tr_prev_i.reshape(1, -1), repeats=test_prevs_i.shape[0], axis=0)
+ tr_prev_i = np.repeat(tr_prevs.reshape(1, -1), repeats=test_prevs_i.shape[0], axis=0)
tr_test_drifts = x_error(test_prevs_i, tr_prev_i)
data[method]['x'] = np.concatenate([data[method]['x'], tr_test_drifts])
From 8da4b4c5f399c764bf96bda36004ebb57f05c548 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Pablo=20Gonz=C3=A1lez?=
Date: Wed, 18 Jan 2023 16:12:38 +0100
Subject: [PATCH 47/67] placing the legend
---
quapy/plot.py | 9 ++++++---
1 file changed, 6 insertions(+), 3 deletions(-)
diff --git a/quapy/plot.py b/quapy/plot.py
index 15c7be5..358bf45 100644
--- a/quapy/plot.py
+++ b/quapy/plot.py
@@ -306,9 +306,10 @@ def error_by_drift(method_names, true_prevs, estim_prevs, tr_prevs,
if show_density:
ax2 = ax.twinx()
+ densities = npoints/np.sum(npoints)
ax2.bar([ind * binwidth-binwidth/2 for ind in range(len(bins))],
- npoints/np.sum(npoints), alpha=0.15, color='g', width=binwidth, label='density')
- ax2.set_ylim(0,1)
+ densities, alpha=0.15, color='g', width=binwidth, label='density')
+ ax2.set_ylim(0,max(densities))
ax2.spines['right'].set_color('g')
ax2.tick_params(axis='y', colors='g')
@@ -329,7 +330,9 @@ def error_by_drift(method_names, true_prevs, estim_prevs, tr_prevs,
if show_legend:
- fig.legend(bbox_to_anchor=(1.05, 1), loc="upper right")
+ fig.legend(loc='lower center',
+ bbox_to_anchor=(1, 0.5),
+ ncol=(len(method_names)+1)//2)
_save_or_show(savepath)
From 38aa42e4c52ef6434ce02918c8263b76372ffedb Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Pablo=20Gonz=C3=A1lez?=
Date: Wed, 18 Jan 2023 16:44:56 +0100
Subject: [PATCH 48/67] fixing a bug
---
quapy/plot.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/quapy/plot.py b/quapy/plot.py
index 358bf45..061ecdc 100644
--- a/quapy/plot.py
+++ b/quapy/plot.py
@@ -554,7 +554,7 @@ def _join_data_by_drift(method_names, true_prevs, estim_prevs, tr_prevs, x_error
method_order = []
for method, test_prevs_i, estim_prevs_i, tr_prev_i in zip(method_names, true_prevs, estim_prevs, tr_prevs):
- tr_prev_i = np.repeat(tr_prevs.reshape(1, -1), repeats=test_prevs_i.shape[0], axis=0)
+ tr_prev_i = np.repeat(tr_prev_i.reshape(1, -1), repeats=test_prevs_i.shape[0], axis=0)
tr_test_drifts = x_error(test_prevs_i, tr_prev_i)
data[method]['x'] = np.concatenate([data[method]['x'], tr_test_drifts])
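The essence of the fix: each method's own training-prevalence vector must be tiled once per test sample before computing the row-wise drift (self-contained sketch, not part of the patch):

    import numpy as np

    tr_prev_i = np.asarray([0.3, 0.7])   # training prevalence of one method
    n_test_samples = 4
    tiled = np.repeat(tr_prev_i.reshape(1, -1), repeats=n_test_samples, axis=0)
    print(tiled.shape)  # (4, 2): one copy of the vector per test sample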
From 09abcfc935c7a0efe180e8c5bc3e468e29228d56 Mon Sep 17 00:00:00 2001
From: Alejandro Moreo
Date: Wed, 18 Jan 2023 19:46:19 +0100
Subject: [PATCH 49/67] adding calibration methods from the abstention package
to quapy
---
quapy/CHANGE_LOG.txt | 3 +-
quapy/classification/calibration.py | 166 ++++++++++++++++++++++++++++
quapy/method/aggregative.py | 26 ++++-
quapy/plot.py | 1 -
4 files changed, 191 insertions(+), 5 deletions(-)
create mode 100644 quapy/classification/calibration.py
diff --git a/quapy/CHANGE_LOG.txt b/quapy/CHANGE_LOG.txt
index 20e0759..090afc8 100644
--- a/quapy/CHANGE_LOG.txt
+++ b/quapy/CHANGE_LOG.txt
@@ -34,7 +34,8 @@
- newer versions of numpy raise a warning when accessing types (e.g., np.float). I have replaced all such instances
with the plain python type (e.g., float).
-- new dependency "abstention" (to add to the project requirements and setup)
+- new dependency "abstention" (to add to the project requirements and setup). Calibration methods from
+ https://github.com/kundajelab/abstention added.
Things to fix:
- calibration with recalibration methods has to be fixed for exact_train_prev in EMQ (conflicts with clone, deepcopy, etc.)
diff --git a/quapy/classification/calibration.py b/quapy/classification/calibration.py
new file mode 100644
index 0000000..9ea5576
--- /dev/null
+++ b/quapy/classification/calibration.py
@@ -0,0 +1,166 @@
+from copy import deepcopy
+
+from abstention.calibration import NoBiasVectorScaling, TempScaling, VectorScaling
+from sklearn.base import BaseEstimator, clone
+from sklearn.model_selection import cross_val_predict, train_test_split
+import numpy as np
+
+
+# Wrappers of the calibration methods defined by Alexandari et al.
+# requires "pip install abstention"
+# see https://github.com/kundajelab/abstention
+
+
+class RecalibratedClassifier:
+ pass
+
+
+class RecalibratedClassifierBase(BaseEstimator, RecalibratedClassifier):
+ """
+ Applies a (re)calibration method from abstention.calibration, as defined in
+ `Alexandari et al. paper `_:
+
+ :param estimator: a scikit-learn probabilistic classifier
+ :param calibrator: the calibration object (an instance of abstention.calibration.CalibratorFactory)
+ :param val_split: indicate an integer k for performing kFCV to obtain the posterior prevalences, or a float p
+ in (0,1) to indicate that the posteriors are obtained in a stratified validation split containing p% of the
+ training instances (the rest is used for training). In any case, the classifier is retrained in the whole
+ training set afterwards.
+ :param n_jobs: indicate the number of parallel workers (only when val_split is an integer)
+ :param verbose: whether or not to display information in the standard output
+ """
+
+ def __init__(self, estimator, calibrator, val_split=5, n_jobs=1, verbose=False):
+ self.estimator = estimator
+ self.calibrator = calibrator
+ self.val_split = val_split
+ self.n_jobs = n_jobs
+ self.verbose = verbose
+
+ def fit(self, X, y):
+ k = self.val_split
+ if isinstance(k, int):
+ if k < 2:
+ raise ValueError('wrong value for val_split: the number of folds must be >= 2')
+ return self.fit_cv(X, y)
+ elif isinstance(k, float):
+ if not (0 < k < 1):
+ raise ValueError('wrong value for val_split: the proportion of validation documents must be in (0,1)')
+ return self.fit_tr_val(X, y)
+
+ def fit_cv(self, X, y):
+ posteriors = cross_val_predict(
+ self.estimator, X, y, cv=self.val_split, n_jobs=self.n_jobs, verbose=self.verbose, method="predict_proba"
+ )
+ self.estimator.fit(X, y)
+ nclasses = len(np.unique(y))
+ self.calibration_function = self.calibrator(posteriors, np.eye(nclasses)[y], posterior_supplied=True)
+ return self
+
+ def fit_tr_val(self, X, y):
+ Xtr, Xva, ytr, yva = train_test_split(X, y, test_size=self.val_split, stratify=y)
+ self.estimator.fit(Xtr, ytr)
+ posteriors = self.estimator.predict_proba(Xva)
+ nclasses = len(np.unique(yva))
+ self.calibration_function = self.calibrator(posteriors, np.eye(nclasses)[yva], posterior_supplied=True)
+ return self
+
+ def predict(self, X):
+ return self.estimator.predict(X)
+
+ def predict_proba(self, X):
+ posteriors = self.estimator.predict_proba(X)
+ return self.calibration_function(posteriors)
+
+ @property
+ def classes_(self):
+ return self.estimator.classes_
+
+
+class NBVSCalibration(RecalibratedClassifierBase):
+ """
+ Applies the No-Bias Vector Scaling (NBVS) calibration method from abstention.calibration, as defined in
+ `Alexandari et al. paper `_:
+
+ :param estimator: a scikit-learn probabilistic classifier
+ :param val_split: indicate an integer k for performing kFCV to obtain the posterior prevalences, or a float p
+ in (0,1) to indicate that the posteriors are obtained in a stratified validation split containing p% of the
+ training instances (the rest is used for training). In any case, the classifier is retrained in the whole
+ training set afterwards.
+ :param n_jobs: indicate the number of parallel workers (only when val_split is an integer)
+ :param verbose: whether or not to display information in the standard output
+ """
+
+ def __init__(self, estimator, val_split=5, n_jobs=1, verbose=False):
+ self.estimator = estimator
+ self.calibrator = NoBiasVectorScaling(verbose=verbose)
+ self.val_split = val_split
+ self.n_jobs = n_jobs
+ self.verbose = verbose
+
+
+class BCTSCalibration(RecalibratedClassifierBase):
+ """
+ Applies the Bias-Corrected Temperature Scaling (BCTS) calibration method from abstention.calibration, as defined in
+ `Alexandari et al. paper `_:
+
+ :param estimator: a scikit-learn probabilistic classifier
+ :param val_split: indicate an integer k for performing kFCV to obtain the posterior prevalences, or a float p
+ in (0,1) to indicate that the posteriors are obtained in a stratified validation split containing p% of the
+ training instances (the rest is used for training). In any case, the classifier is retrained in the whole
+ training set afterwards.
+ :param n_jobs: indicate the number of parallel workers (only when val_split is an integer)
+ :param verbose: whether or not to display information in the standard output
+ """
+
+ def __init__(self, estimator, val_split=5, n_jobs=1, verbose=False):
+ self.estimator = estimator
+ self.calibrator = TempScaling(verbose=verbose, bias_positions='all')
+ self.val_split = val_split
+ self.n_jobs = n_jobs
+ self.verbose = verbose
+
+
+class TSCalibration(RecalibratedClassifierBase):
+ """
+ Applies the Temperature Scaling (TS) calibration method from abstention.calibration, as defined in
+ `Alexandari et al. paper `_:
+
+ :param estimator: a scikit-learn probabilistic classifier
+ :param val_split: indicate an integer k for performing kFCV to obtain the posterior prevalences, or a float p
+ in (0,1) to indicate that the posteriors are obtained in a stratified validation split containing p% of the
+ training instances (the rest is used for training). In any case, the classifier is retrained in the whole
+ training set afterwards.
+ :param n_jobs: indicate the number of parallel workers (only when val_split is an integer)
+ :param verbose: whether or not to display information in the standard output
+ """
+
+ def __init__(self, estimator, val_split=5, n_jobs=1, verbose=False):
+ self.estimator = estimator
+ self.calibrator = TempScaling(verbose=verbose)
+ self.val_split = val_split
+ self.n_jobs = n_jobs
+ self.verbose = verbose
+
+
+class VSCalibration(RecalibratedClassifierBase):
+ """
+ Applies the Vector Scaling (VS) calibration method from abstention.calibration, as defined in
+ `Alexandari et al. paper `_:
+
+ :param estimator: a scikit-learn probabilistic classifier
+ :param val_split: indicate an integer k for performing kFCV to obtain the posterior prevalences, or a float p
+ in (0,1) to indicate that the posteriors are obtained in a stratified validation split containing p% of the
+ training instances (the rest is used for training). In any case, the classifier is retrained in the whole
+ training set afterwards.
+ :param n_jobs: indicate the number of parallel workers (only when val_split is an integer)
+ :param verbose: whether or not to display information in the standard output
+ """
+
+ def __init__(self, estimator, val_split=5, n_jobs=1, verbose=False):
+ self.estimator = estimator
+ self.calibrator = VectorScaling(verbose=verbose)
+ self.val_split = val_split
+ self.n_jobs = n_jobs
+ self.verbose = verbose
+
diff --git a/quapy/method/aggregative.py b/quapy/method/aggregative.py
index 9e5338d..d77f1ed 100644
--- a/quapy/method/aggregative.py
+++ b/quapy/method/aggregative.py
@@ -10,7 +10,8 @@ from sklearn.model_selection import StratifiedKFold, cross_val_predict
from tqdm import tqdm
import quapy as qp
import quapy.functional as F
-from classification.calibration import RecalibratedClassifier
+from classification.calibration import RecalibratedClassifier, NBVSCalibration, BCTSCalibration, TSCalibration, \
+ VSCalibration
from quapy.classification.svmperf import SVMperf
from quapy.data import LabelledCollection
from quapy.method.base import BaseQuantifier, BinaryQuantifier
@@ -138,8 +139,11 @@ class AggregativeProbabilisticQuantifier(AggregativeQuantifier):
else:
key_prefix = 'base_estimator__'
parameters = {key_prefix + k: v for k, v in parameters.items()}
+ elif isinstance(self.learner, RecalibratedClassifier):
+ parameters = {'estimator__' + k: v for k, v in parameters.items()}
self.learner.set_params(**parameters)
+ return self
# Helper
@@ -511,22 +515,38 @@ class EMQ(AggregativeProbabilisticQuantifier):
or set to False for computing the training prevalence as an estimate, akin to PCC, i.e., as the expected
value of the posterior probabilities of the training instances as suggested in
`Alexandari et al. paper `_:
+ :param recalib: a string indicating the method of recalibration. Available choices include "nbvs" (No-Bias Vector
+ Scaling), "bcts" (Bias-Corrected Temperature Scaling), "ts" (Temperature Scaling), and "vs" (Vector Scaling).
+ The default value is None, indicating no recalibration.
"""
MAX_ITER = 1000
EPSILON = 1e-4
- def __init__(self, learner: BaseEstimator, exact_train_prev=True):
+ def __init__(self, learner: BaseEstimator, exact_train_prev=True, recalib=None):
self.learner = learner
self.exact_train_prev = exact_train_prev
+ self.recalib = recalib
def fit(self, data: LabelledCollection, fit_learner=True):
+ if self.recalib is not None:
+ if self.recalib == 'nbvs':
+ self.learner = NBVSCalibration(self.learner)
+ elif self.recalib == 'bcts':
+ self.learner = BCTSCalibration(self.learner)
+ elif self.recalib == 'ts':
+ self.learner = TSCalibration(self.learner)
+ elif self.recalib == 'vs':
+ self.learner = VSCalibration(self.learner)
+ else:
+ raise ValueError('invalid param argument for recalibration method; available ones are '
+ '"nbvs", "bcts", "ts", and "vs".')
self.learner, _ = _training_helper(self.learner, data, fit_learner, ensure_probabilistic=True)
if self.exact_train_prev:
self.train_prevalence = F.prevalence_from_labels(data.labels, self.classes_)
else:
self.train_prevalence = qp.model_selection.cross_val_predict(
- quantifier=PCC(clone(self.learner)),
+ quantifier=PCC(deepcopy(self.learner)),
data=data,
nfolds=3,
random_state=0
diff --git a/quapy/plot.py b/quapy/plot.py
index 7d94012..b63eba6 100644
--- a/quapy/plot.py
+++ b/quapy/plot.py
@@ -323,7 +323,6 @@ def error_by_drift(method_names, true_prevs, estim_prevs, tr_prevs,
for vline in vlines:
ax.axvline(vline, 0, 1, linestyle='--', color='k')
-
ax.set_xlim(min_x, max_x)
if show_legend:
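The new wrappers behave as ordinary scikit-learn probabilistic classifiers; a minimal sketch (hypothetical snippet, not part of the patch; assumes the abstention package is installed):

    from sklearn.datasets import make_classification
    from sklearn.linear_model import LogisticRegression
    from quapy.classification.calibration import BCTSCalibration

    X, y = make_classification(n_samples=1000, random_state=0)
    clf = BCTSCalibration(LogisticRegression(), val_split=5)
    clf.fit(X, y)                      # kFCV posteriors -> calibration map, then refit
    posteriors = clf.predict_proba(X)  # calibrated posterior probabilities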
From adf799c8eca88952c4bdc4ca6a5cfbb4e05c8010 Mon Sep 17 00:00:00 2001
From: Alejandro Moreo
Date: Tue, 24 Jan 2023 09:48:21 +0100
Subject: [PATCH 50/67] recalibration
---
examples/lequa2022_experiments_recalib.py | 59 +++++++++++++++++++++++
1 file changed, 59 insertions(+)
create mode 100644 examples/lequa2022_experiments_recalib.py
diff --git a/examples/lequa2022_experiments_recalib.py b/examples/lequa2022_experiments_recalib.py
new file mode 100644
index 0000000..983c781
--- /dev/null
+++ b/examples/lequa2022_experiments_recalib.py
@@ -0,0 +1,59 @@
+import numpy as np
+from abstention.calibration import NoBiasVectorScaling, VectorScaling, TempScaling
+from sklearn.calibration import CalibratedClassifierCV
+from sklearn.linear_model import LogisticRegression
+import quapy as qp
+import quapy.functional as F
+from classification.calibration import RecalibratedClassifierBase, NBVSCalibration, \
+ BCTSCalibration
+from data.datasets import LEQUA2022_SAMPLE_SIZE, fetch_lequa2022
+from evaluation import evaluation_report
+from method.aggregative import EMQ
+from model_selection import GridSearchQ
+import pandas as pd
+
+for task in ['T1A', 'T1B']:
+ for calib in ['NoCal', 'TS', 'VS', 'NBVS', 'NBTS']:
+
+ # calibration = TempScaling(verbose=False, bias_positions='all')
+
+ qp.environ['SAMPLE_SIZE'] = LEQUA2022_SAMPLE_SIZE[task]
+ training, val_generator, test_generator = fetch_lequa2022(task=task)
+
+ # define the quantifier
+ # learner = BCTSCalibration(LogisticRegression(), n_jobs=-1)
+ # learner = CalibratedClassifierCV(LogisticRegression())
+ learner = LogisticRegression()
+ quantifier = EMQ(learner=learner, exact_train_prev=False, recalib=calib.lower() if calib != 'NoCal' else None)
+
+ # model selection
+ param_grid = {'C': np.logspace(-3, 3, 7), 'class_weight': ['balanced', None]}
+ model_selection = GridSearchQ(quantifier, param_grid, protocol=val_generator, error='mrae', n_jobs=-1, refit=False, verbose=True)
+ quantifier = model_selection.fit(training)
+
+ # evaluation
+ report = evaluation_report(quantifier, protocol=test_generator, error_metrics=['mae', 'mrae', 'mkld'], verbose=True)
+
+ import os
+ os.makedirs(f'./predictions/{task}', exist_ok=True)
+ with open(f'./predictions/{task}/{calib}-EMQ.csv', 'wt') as foo:
+ estim_prev = report['estim-prev'].values
+ nclasses = len(estim_prev[0])
+ foo.write(f'id,'+','.join([str(x) for x in range(nclasses)])+'\n')
+ for id, prev in enumerate(estim_prev):
+ foo.write(f'{id},'+','.join([f'{p:.5f}' for p in prev])+'\n')
+
+ os.makedirs(f'./errors/{task}', exist_ok=True)
+ with open(f'./errors/{task}/{calib}-EMQ.csv', 'wt') as foo:
+ maes, mraes = report['mae'].values, report['mrae'].values
+ foo.write(f'id,AE,RAE\n')
+ for id, (ae_i, rae_i) in enumerate(zip(maes, mraes)):
+ foo.write(f'{id},{ae_i:.5f},{rae_i:.5f}\n')
+
+ # printing results
+ pd.set_option('display.expand_frame_repr', False)
+ report['estim-prev'] = report['estim-prev'].map(F.strprev)
+ print(report)
+
+ print('Averaged values:')
+ print(report.mean())
From f9a199d85976ecc7be24ce00341c72b82ff4cb65 Mon Sep 17 00:00:00 2001
From: Alejandro Moreo
Date: Fri, 27 Jan 2023 18:13:23 +0100
Subject: [PATCH 51/67] fixing hyperparameters with prefixes, and replacing
learner with classifier in aggregative quantifiers
---
TODO.txt | 6 +
examples/lequa2022_experiments.py | 2 +-
examples/lequa2022_experiments_recalib.py | 42 +--
quapy/CHANGE_LOG.txt | 6 +
quapy/classification/calibration.py | 60 ++--
quapy/method/aggregative.py | 365 +++++++++++-----------
quapy/method/base.py | 51 +--
quapy/method/meta.py | 62 ++--
quapy/method/neural.py | 60 ++--
quapy/model_selection.py | 16 +-
10 files changed, 352 insertions(+), 318 deletions(-)
diff --git a/TODO.txt b/TODO.txt
index 6cef78c..36b7e95 100644
--- a/TODO.txt
+++ b/TODO.txt
@@ -6,6 +6,12 @@ merge with master, because I had to fix some problems with QuaNet due to an issu
added cross_val_predict in qp.model_selection (i.e., a cross_val_predict for quantification) --would be nice to have
it parallelized
+check the OneVsAll module(s)
+
+check the set_params of neural.py, because the separation of estimator__ is not implemented; see also
+ __check_params_colision
+
+HDy can be customized so that the number of bins is specified, instead of explored within the fit method
Packaging:
==========================================
diff --git a/examples/lequa2022_experiments.py b/examples/lequa2022_experiments.py
index 31ec651..41bc495 100644
--- a/examples/lequa2022_experiments.py
+++ b/examples/lequa2022_experiments.py
@@ -17,7 +17,7 @@ training, val_generator, test_generator = fetch_lequa2022(task=task)
# define the quantifier
learner = CalibratedClassifierCV(LogisticRegression())
-quantifier = EMQ(learner=learner)
+quantifier = EMQ(classifier=learner)
# model selection
param_grid = {'C': np.logspace(-3, 3, 7), 'class_weight': ['balanced', None]}
diff --git a/examples/lequa2022_experiments_recalib.py b/examples/lequa2022_experiments_recalib.py
index 983c781..a5a0e05 100644
--- a/examples/lequa2022_experiments_recalib.py
+++ b/examples/lequa2022_experiments_recalib.py
@@ -4,7 +4,7 @@ from sklearn.calibration import CalibratedClassifierCV
from sklearn.linear_model import LogisticRegression
import quapy as qp
import quapy.functional as F
-from classification.calibration import RecalibratedClassifierBase, NBVSCalibration, \
+from classification.calibration import RecalibratedProbabilisticClassifierBase, NBVSCalibration, \
BCTSCalibration
from data.datasets import LEQUA2022_SAMPLE_SIZE, fetch_lequa2022
from evaluation import evaluation_report
@@ -13,7 +13,6 @@ from model_selection import GridSearchQ
import pandas as pd
for task in ['T1A', 'T1B']:
- for calib in ['NoCal', 'TS', 'VS', 'NBVS', 'NBTS']:
# calibration = TempScaling(verbose=False, bias_positions='all')
@@ -24,31 +23,36 @@ for task in ['T1A', 'T1B']:
# learner = BCTSCalibration(LogisticRegression(), n_jobs=-1)
# learner = CalibratedClassifierCV(LogisticRegression())
learner = LogisticRegression()
- quantifier = EMQ(learner=learner, exact_train_prev=False, recalib=calib.lower() if calib != 'NoCal' else None)
+ quantifier = EMQ(classifier=learner)
# model selection
- param_grid = {'C': np.logspace(-3, 3, 7), 'class_weight': ['balanced', None]}
+ param_grid = {
+ 'classifier__C': np.logspace(-3, 3, 7),
+ 'classifier__class_weight': ['balanced', None],
+ 'recalib': ['platt', 'ts', 'vs', 'nbvs', 'bcts', None],
+ 'exact_train_prev': [False, True]
+ }
model_selection = GridSearchQ(quantifier, param_grid, protocol=val_generator, error='mrae', n_jobs=-1, refit=False, verbose=True)
quantifier = model_selection.fit(training)
# evaluation
report = evaluation_report(quantifier, protocol=test_generator, error_metrics=['mae', 'mrae', 'mkld'], verbose=True)
- import os
- os.makedirs(f'./predictions/{task}', exist_ok=True)
- with open(f'./predictions/{task}/{calib}-EMQ.csv', 'wt') as foo:
- estim_prev = report['estim-prev'].values
- nclasses = len(estim_prev[0])
- foo.write(f'id,'+','.join([str(x) for x in range(nclasses)])+'\n')
- for id, prev in enumerate(estim_prev):
- foo.write(f'{id},'+','.join([f'{p:.5f}' for p in prev])+'\n')
-
- os.makedirs(f'./errors/{task}', exist_ok=True)
- with open(f'./errors/{task}/{calib}-EMQ.csv', 'wt') as foo:
- maes, mraes = report['mae'].values, report['mrae'].values
- foo.write(f'id,AE,RAE\n')
- for id, (ae_i, rae_i) in enumerate(zip(maes, mraes)):
- foo.write(f'{id},{ae_i:.5f},{rae_i:.5f}\n')
+ # import os
+ # os.makedirs(f'./out', exist_ok=True)
+ # with open(f'./out/EMQ_{calib}_{task}.txt', 'wt') as foo:
+ # estim_prev = report['estim-prev'].values
+ # nclasses = len(estim_prev[0])
+ # foo.write(f'id,'+','.join([str(x) for x in range(nclasses)])+'\n')
+ # for id, prev in enumerate(estim_prev):
+ # foo.write(f'{id},'+','.join([f'{p:.5f}' for p in prev])+'\n')
+ #
+ # #os.makedirs(f'./errors/{task}', exist_ok=True)
+ # with open(f'./out/EMQ_{calib}_{task}_errors.txt', 'wt') as foo:
+ # maes, mraes = report['mae'].values, report['mrae'].values
+ # foo.write(f'id,AE,RAE\n')
+ # for id, (ae_i, rae_i) in enumerate(zip(maes, mraes)):
+ # foo.write(f'{id},{ae_i:.5f},{rae_i:.5f}\n')
# printing results
pd.set_option('display.expand_frame_repr', False)
diff --git a/quapy/CHANGE_LOG.txt b/quapy/CHANGE_LOG.txt
index 090afc8..c450b41 100644
--- a/quapy/CHANGE_LOG.txt
+++ b/quapy/CHANGE_LOG.txt
@@ -37,6 +37,12 @@
- new dependency "abstention" (to add to the project requirements and setup). Calibration methods from
https://github.com/kundajelab/abstention added.
+- the internal classifier of aggregative methods is now called "classifier" instead of "learner"
+
+- when optimizing the hyperparameters of an aggregative quantifier, the classifier's specific hyperparameters
+ should be marked with a "classifier__" prefix (just like in scikit-learn), while the quantifier's specific
+ hyperparameters are named directly. For example, in the quantifier PCC(LogisticRegression()), the parameter C of
+ the internal logistic regressor should be referred to as "classifier__C"
+
Things to fix:
- calibration with recalibration methods has to be fixed for exact_train_prev in EMQ (conflicts with clone, deepcopy, etc.)
- clean functions like binary, aggregative, probabilistic, etc; those should be resolved via isinstance():
diff --git a/quapy/classification/calibration.py b/quapy/classification/calibration.py
index 9ea5576..69a7e14 100644
--- a/quapy/classification/calibration.py
+++ b/quapy/classification/calibration.py
@@ -11,27 +11,27 @@ import numpy as np
# see https://github.com/kundajelab/abstention
-class RecalibratedClassifier:
+class RecalibratedProbabilisticClassifier:
pass
-class RecalibratedClassifierBase(BaseEstimator, RecalibratedClassifier):
+class RecalibratedProbabilisticClassifierBase(BaseEstimator, RecalibratedProbabilisticClassifier):
"""
Applies a (re)calibration method from abstention.calibration, as defined in
`Alexandari et al. paper `_:
- :param estimator: a scikit-learn probabilistic classifier
+ :param classifier: a scikit-learn probabilistic classifier
:param calibrator: the calibration object (an instance of abstention.calibration.CalibratorFactory)
- :param val_split: indicate an integer k for performing kFCV to obtain the posterior prevalences, or a float p
+ :param val_split: indicate an integer k for performing kFCV to obtain the posterior probabilities, or a float p
in (0,1) to indicate that the posteriors are obtained in a stratified validation split containing p% of the
training instances (the rest is used for training). In any case, the classifier is retrained in the whole
training set afterwards.
- :param n_jobs: indicate the number of parallel workers (only when val_split is an integer)
+ :param n_jobs: indicate the number of parallel workers (only when val_split is an integer); default=None
:param verbose: whether or not to display information in the standard output
"""
- def __init__(self, estimator, calibrator, val_split=5, n_jobs=1, verbose=False):
- self.estimator = estimator
+ def __init__(self, classifier, calibrator, val_split=5, n_jobs=None, verbose=False):
+ self.classifier = classifier
self.calibrator = calibrator
self.val_split = val_split
self.n_jobs = n_jobs
@@ -50,39 +50,39 @@ class RecalibratedClassifierBase(BaseEstimator, RecalibratedClassifier):
def fit_cv(self, X, y):
posteriors = cross_val_predict(
- self.estimator, X, y, cv=self.val_split, n_jobs=self.n_jobs, verbose=self.verbose, method="predict_proba"
+ self.classifier, X, y, cv=self.val_split, n_jobs=self.n_jobs, verbose=self.verbose, method='predict_proba'
)
- self.estimator.fit(X, y)
+ self.classifier.fit(X, y)
nclasses = len(np.unique(y))
self.calibration_function = self.calibrator(posteriors, np.eye(nclasses)[y], posterior_supplied=True)
return self
def fit_tr_val(self, X, y):
Xtr, Xva, ytr, yva = train_test_split(X, y, test_size=self.val_split, stratify=y)
- self.estimator.fit(Xtr, ytr)
- posteriors = self.estimator.predict_proba(Xva)
+ self.classifier.fit(Xtr, ytr)
+ posteriors = self.classifier.predict_proba(Xva)
nclasses = len(np.unique(yva))
self.calibration_function = self.calibrator(posteriors, np.eye(nclasses)[yva], posterior_supplied=True)
return self
def predict(self, X):
- return self.estimator.predict(X)
+ return self.classifier.predict(X)
def predict_proba(self, X):
- posteriors = self.estimator.predict_proba(X)
+ posteriors = self.classifier.predict_proba(X)
return self.calibration_function(posteriors)
@property
def classes_(self):
- return self.estimator.classes_
+ return self.classifier.classes_
-class NBVSCalibration(RecalibratedClassifierBase):
+class NBVSCalibration(RecalibratedProbabilisticClassifierBase):
"""
Applies the No-Bias Vector Scaling (NBVS) calibration method from abstention.calibration, as defined in
`Alexandari et al. paper `_:
- :param estimator: a scikit-learn probabilistic classifier
+ :param classifier: a scikit-learn probabilistic classifier
:param val_split: indicate an integer k for performing kFCV to obtain the posterior probabilities, or a float p
in (0,1) to indicate that the posteriors are obtained in a stratified validation split containing a fraction p of the
training instances (the rest is used for training). In any case, the classifier is retrained in the whole
@@ -91,20 +91,20 @@ class NBVSCalibration(RecalibratedClassifierBase):
:param verbose: whether or not to display information in the standard output
"""
- def __init__(self, estimator, val_split=5, n_jobs=1, verbose=False):
- self.estimator = estimator
+ def __init__(self, classifier, val_split=5, n_jobs=None, verbose=False):
+ self.classifier = classifier
self.calibrator = NoBiasVectorScaling(verbose=verbose)
self.val_split = val_split
self.n_jobs = n_jobs
self.verbose = verbose
-class BCTSCalibration(RecalibratedClassifierBase):
+class BCTSCalibration(RecalibratedProbabilisticClassifierBase):
"""
Applies the Bias-Corrected Temperature Scaling (BCTS) calibration method from abstention.calibration, as defined in
`Alexandari et al. paper `_:
- :param estimator: a scikit-learn probabilistic classifier
+ :param classifier: a scikit-learn probabilistic classifier
:param val_split: indicate an integer k for performing kFCV to obtain the posterior probabilities, or a float p
in (0,1) to indicate that the posteriors are obtained in a stratified validation split containing a fraction p of the
training instances (the rest is used for training). In any case, the classifier is retrained in the whole
@@ -113,20 +113,20 @@ class BCTSCalibration(RecalibratedClassifierBase):
:param verbose: whether or not to display information in the standard output
"""
- def __init__(self, estimator, val_split=5, n_jobs=1, verbose=False):
- self.estimator = estimator
+ def __init__(self, classifier, val_split=5, n_jobs=None, verbose=False):
+ self.classifier = classifier
self.calibrator = TempScaling(verbose=verbose, bias_positions='all')
self.val_split = val_split
self.n_jobs = n_jobs
self.verbose = verbose
-class TSCalibration(RecalibratedClassifierBase):
+class TSCalibration(RecalibratedProbabilisticClassifierBase):
"""
Applies the Temperature Scaling (TS) calibration method from abstention.calibration, as defined in
`Alexandari et al. paper `_:
- :param estimator: a scikit-learn probabilistic classifier
+ :param classifier: a scikit-learn probabilistic classifier
:param val_split: indicate an integer k for performing kFCV to obtain the posterior probabilities, or a float p
in (0,1) to indicate that the posteriors are obtained in a stratified validation split containing a fraction p of the
training instances (the rest is used for training). In any case, the classifier is retrained in the whole
@@ -135,20 +135,20 @@ class TSCalibration(RecalibratedClassifierBase):
:param verbose: whether or not to display information in the standard output
"""
- def __init__(self, estimator, val_split=5, n_jobs=1, verbose=False):
- self.estimator = estimator
+ def __init__(self, classifier, val_split=5, n_jobs=None, verbose=False):
+ self.classifier = classifier
self.calibrator = TempScaling(verbose=verbose)
self.val_split = val_split
self.n_jobs = n_jobs
self.verbose = verbose
-class VSCalibration(RecalibratedClassifierBase):
+class VSCalibration(RecalibratedProbabilisticClassifierBase):
"""
Applies the Vector Scaling (VS) calibration method from abstention.calibration, as defined in
`Alexandari et al. paper `_:
- :param estimator: a scikit-learn probabilistic classifier
+ :param classifier: a scikit-learn probabilistic classifier
:param val_split: indicate an integer k for performing kFCV to obtain the posterior probabilities, or a float p
in (0,1) to indicate that the posteriors are obtained in a stratified validation split containing a fraction p of the
training instances (the rest is used for training). In any case, the classifier is retrained in the whole
@@ -157,8 +157,8 @@ class VSCalibration(RecalibratedClassifierBase):
:param verbose: whether or not to display information in the standard output
"""
- def __init__(self, estimator, val_split=5, n_jobs=1, verbose=False):
- self.estimator = estimator
+ def __init__(self, classifier, val_split=5, n_jobs=None, verbose=False):
+ self.classifier = classifier
self.calibrator = VectorScaling(verbose=verbose)
self.val_split = val_split
self.n_jobs = n_jobs
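For reference, a minimal usage sketch of these wrappers (toy data only for illustration; it assumes the abstention package is installed and that the base class dispatches fit to fit_cv/fit_tr_val according to val_split):

from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from quapy.classification.calibration import BCTSCalibration

# hypothetical toy training data
X, y = make_classification(n_samples=500, random_state=0)

# recalibrate a logistic regressor via bias-corrected temperature scaling;
# the calibration map is learned from posteriors obtained by 5-fold kFCV,
# after which the classifier is retrained on the whole training set
recal = BCTSCalibration(LogisticRegression(), val_split=5)
recal.fit(X, y)
posteriors = recal.predict_proba(X[:10])   # recalibrated posteriors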
diff --git a/quapy/method/aggregative.py b/quapy/method/aggregative.py
index d77f1ed..3246b9f 100644
--- a/quapy/method/aggregative.py
+++ b/quapy/method/aggregative.py
@@ -10,7 +10,7 @@ from sklearn.model_selection import StratifiedKFold, cross_val_predict
from tqdm import tqdm
import quapy as qp
import quapy.functional as F
-from classification.calibration import RecalibratedClassifier, NBVSCalibration, BCTSCalibration, TSCalibration, \
+from quapy.classification.calibration import RecalibratedProbabilisticClassifier, NBVSCalibration, BCTSCalibration, TSCalibration, \
VSCalibration
from quapy.classification.svmperf import SVMperf
from quapy.data import LabelledCollection
@@ -23,41 +23,41 @@ from quapy.method.base import BaseQuantifier, BinaryQuantifier
class AggregativeQuantifier(BaseQuantifier):
"""
Abstract class for quantification methods that base their estimations on the aggregation of classification
- results. Aggregative Quantifiers thus implement a :meth:`classify` method and maintain a :attr:`learner` attribute.
- Subclasses of this abstract class must implement the method :meth:`aggregate` which computes the aggregation
- of label predictions. The method :meth:`quantify` comes with a default implementation based on
- :meth:`classify` and :meth:`aggregate`.
+ results. Aggregative Quantifiers thus implement a :meth:`classify` method and maintain a :attr:`classifier`
+ attribute. Subclasses of this abstract class must implement the method :meth:`aggregate` which computes the
+ aggregation of label predictions. The method :meth:`quantify` comes with a default implementation based on
+ :meth:`classify` and :meth:`aggregate`.
"""
@abstractmethod
- def fit(self, data: LabelledCollection, fit_learner=True):
+ def fit(self, data: LabelledCollection, fit_classifier=True):
"""
Trains the aggregative quantifier
:param data: a :class:`quapy.data.base.LabelledCollection` consisting of the training data
- :param fit_learner: whether or not to train the learner (default is True). Set to False if the
+ :param fit_classifier: whether or not to train the classifier (default is True). Set to False if the
classifier has been trained outside the quantifier.
:return: self
"""
...
@property
- def learner(self):
+ def classifier(self):
"""
Gives access to the classifier
:return: the classifier (typically a scikit-learn estimator)
"""
- return self.learner_
+ return self.classifier_
- @learner.setter
- def learner(self, classifier):
+ @classifier.setter
+ def classifier(self, classifier):
"""
Setter for the classifier
:param classifier: the classifier
"""
- self.learner_ = classifier
+ self.classifier_ = classifier
def classify(self, instances):
"""
@@ -68,7 +68,7 @@ class AggregativeQuantifier(BaseQuantifier):
:param instances: array-like
:return: np.ndarray of shape `(n_instances,)` with label predictions
"""
- return self.learner.predict(instances)
+ return self.classifier.predict(instances)
def quantify(self, instances):
"""
@@ -91,24 +91,24 @@ class AggregativeQuantifier(BaseQuantifier):
"""
...
- def get_params(self, deep=True):
- """
- Return the current parameters of the quantifier.
+ # def get_params(self, deep=True):
+ # """
+ # Return the current parameters of the quantifier.
+ #
+ # :param deep: for compatibility with sklearn
+ # :return: a dictionary of param-value pairs
+ # """
+ #
+ # return self.learner.get_params()
- :param deep: for compatibility with sklearn
- :return: a dictionary of param-value pairs
- """
-
- return self.learner.get_params()
-
- def set_params(self, **parameters):
- """
- Set the parameters of the quantifier.
-
- :param parameters: dictionary of param-value pairs
- """
-
- self.learner.set_params(**parameters)
+ # def set_params(self, **parameters):
+ # """
+ # Set the parameters of the quantifier.
+ #
+ # :param parameters: dictionary of param-value pairs
+ # """
+ #
+ # self.learner.set_params(**parameters)
@property
def classes_(self):
@@ -118,7 +118,7 @@ class AggregativeQuantifier(BaseQuantifier):
:return: array-like
"""
- return self.learner.classes_
+ return self.classifier.classes_
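In other words, the default quantify of any aggregative method composes the two abstract steps; a conceptual sketch (not part of the patch):

def quantify_via_aggregation(quantifier, instances):
    # classify() issues label predictions (or posteriors, for the
    # probabilistic subclass below); aggregate() turns them into prevalences
    classif_predictions = quantifier.classify(instances)
    return quantifier.aggregate(classif_predictions)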
class AggregativeProbabilisticQuantifier(AggregativeQuantifier):
@@ -130,43 +130,43 @@ class AggregativeProbabilisticQuantifier(AggregativeQuantifier):
"""
def classify(self, instances):
- return self.learner.predict_proba(instances)
+ return self.classifier.predict_proba(instances)
- def set_params(self, **parameters):
- if isinstance(self.learner, CalibratedClassifierCV):
- if self.learner.get_params().get('base_estimator') == 'deprecated':
- key_prefix = 'estimator__' # this has changed in the newer versions of sklearn
- else:
- key_prefix = 'base_estimator__'
- parameters = {key_prefix + k: v for k, v in parameters.items()}
- elif isinstance(self.learner, RecalibratedClassifier):
- parameters = {'estimator__' + k: v for k, v in parameters.items()}
-
- self.learner.set_params(**parameters)
- return self
+ # def set_params(self, **parameters):
+ # if isinstance(self.classifier, CalibratedClassifierCV):
+ # if self.classifier.get_params().get('base_estimator') == 'deprecated':
+ # key_prefix = 'estimator__' # this has changed in the newer versions of sklearn
+ # else:
+ # key_prefix = 'base_estimator__'
+ # parameters = {key_prefix + k: v for k, v in parameters.items()}
+ # elif isinstance(self.classifier, RecalibratedClassifier):
+ # parameters = {'estimator__' + k: v for k, v in parameters.items()}
+ #
+ # self.classifier.set_params(**parameters)
+ # return self
# Helper
# ------------------------------------
-def _ensure_probabilistic(learner):
- if not hasattr(learner, 'predict_proba'):
- print(f'The learner {learner.__class__.__name__} does not seem to be probabilistic. '
+def _ensure_probabilistic(classifier):
+ if not hasattr(classifier, 'predict_proba'):
+ print(f'The classifier {classifier.__class__.__name__} does not seem to be probabilistic. '
f'The classifier will be calibrated.')
- learner = CalibratedClassifierCV(learner, cv=5)
- return learner
+ classifier = CalibratedClassifierCV(classifier, cv=5)
+ return classifier
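The effect of this helper can be illustrated with a non-probabilistic learner (a sketch; LinearSVC is just an example):

from sklearn.svm import LinearSVC
from sklearn.calibration import CalibratedClassifierCV

svm = LinearSVC()                               # exposes no predict_proba
assert not hasattr(svm, 'predict_proba')
calibrated = CalibratedClassifierCV(svm, cv=5)  # predict_proba available once fit
assert hasattr(calibrated, 'predict_proba')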
-def _training_helper(learner,
+def _training_helper(classifier,
data: LabelledCollection,
- fit_learner: bool = True,
+ fit_classifier: bool = True,
ensure_probabilistic=False,
val_split: Union[LabelledCollection, float] = None):
"""
Training procedure common to all Aggregative Quantifiers.
- :param learner: the learner to be fit
+ :param classifier: the classifier to be fit
:param data: the data on which to fit the classifier. If requested, the data will be split before fitting the classifier.
- :param fit_learner: whether or not to fit the learner (if False, then bypasses any action)
+ :param fit_classifier: whether or not to fit the classifier (if False, then bypasses any action)
:param ensure_probabilistic: if True, guarantees that the resulting classifier implements predict_proba (if the
classifier is not probabilistic, then a CalibratedClassifierCV instance of it is trained)
:param val_split: if specified as a float, indicates the proportion of training instances that will define the
@@ -175,9 +175,9 @@ def _training_helper(learner,
:return: the classifier trained on the training set, and the unused data (a _LabelledCollection_ if train_val_split>0
or None otherwise) to be used as a validation set for any subsequent parameter fitting
"""
- if fit_learner:
+ if fit_classifier:
if ensure_probabilistic:
- learner = _ensure_probabilistic(learner)
+ classifier = _ensure_probabilistic(classifier)
if val_split is not None:
if isinstance(val_split, float):
if not (0 < val_split < 1):
@@ -193,72 +193,72 @@ def _training_helper(learner,
else:
train, unused = data, None
- if isinstance(learner, BaseQuantifier):
- learner.fit(train)
+ if isinstance(classifier, BaseQuantifier):
+ classifier.fit(train)
else:
- learner.fit(*train.Xy)
+ classifier.fit(*train.Xy)
else:
if ensure_probabilistic:
- if not hasattr(learner, 'predict_proba'):
- raise AssertionError('error: the learner cannot be calibrated since fit_learner is set to False')
+ if not hasattr(classifier, 'predict_proba'):
+ raise AssertionError('error: the classifier cannot be calibrated since fit_classifier is set to False')
unused = None
if isinstance(val_split, LabelledCollection):
unused = val_split
- return learner, unused
+ return classifier, unused
def cross_generate_predictions(
data,
- learner,
+ classifier,
val_split,
probabilistic,
- fit_learner,
+ fit_classifier,
n_jobs
):
n_jobs = qp.get_njobs(n_jobs)
if isinstance(val_split, int):
- assert fit_learner == True, \
- 'the parameters for the adjustment cannot be estimated with kFCV with fit_learner=False'
+ assert fit_classifier, \
+ 'the parameters for the adjustment cannot be estimated with kFCV with fit_classifier=False'
if probabilistic:
- learner = _ensure_probabilistic(learner)
+ classifier = _ensure_probabilistic(classifier)
predict = 'predict_proba'
else:
predict = 'predict'
- y_pred = cross_val_predict(learner, *data.Xy, cv=val_split, n_jobs=n_jobs, method=predict)
+ y_pred = cross_val_predict(classifier, *data.Xy, cv=val_split, n_jobs=n_jobs, method=predict)
class_count = data.counts()
# fit the classifier on all data
- learner.fit(*data.Xy)
+ classifier.fit(*data.Xy)
y = data.y
classes = data.classes_
else:
- learner, val_data = _training_helper(
- learner, data, fit_learner, ensure_probabilistic=probabilistic, val_split=val_split
+ classifier, val_data = _training_helper(
+ classifier, data, fit_classifier, ensure_probabilistic=probabilistic, val_split=val_split
)
- y_pred = learner.predict_proba(val_data.instances) if probabilistic else learner.predict(val_data.instances)
+ y_pred = classifier.predict_proba(val_data.instances) if probabilistic else classifier.predict(val_data.instances)
y = val_data.labels
classes = val_data.classes_
class_count = val_data.counts()
- return learner, y, y_pred, classes, class_count
+ return classifier, y, y_pred, classes, class_count
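The kFCV branch relies on sklearn's cross_val_predict; a toy illustration of what it returns (hypothetical data):

from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_predict

X, y = make_classification(n_samples=200, random_state=0)
# each instance receives an out-of-fold posterior, i.e., one predicted by a
# model that did not see that instance during training
posteriors = cross_val_predict(LogisticRegression(), X, y, cv=5, method='predict_proba')
print(posteriors.shape)   # (200, 2)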
def cross_generate_predictions_depr(
data,
- learner,
+ classifier,
val_split,
probabilistic,
- fit_learner,
+ fit_classifier,
method_name=''
):
- predict = learner.predict_proba if probabilistic else learner.predict
+ predict = classifier.predict_proba if probabilistic else classifier.predict
if isinstance(val_split, int):
- assert fit_learner == True, \
- 'the parameters for the adjustment cannot be estimated with kFCV with fit_learner=False'
+ assert fit_classifier, \
+ 'the parameters for the adjustment cannot be estimated with kFCV with fit_classifier=False'
# kFCV estimation of parameters
y, y_ = [], []
kfcv = StratifiedKFold(n_splits=val_split)
@@ -267,8 +267,8 @@ def cross_generate_predictions_depr(
pbar.set_description(f'{method_name}\tfitting fold {k}')
training = data.sampling_from_index(training_idx)
validation = data.sampling_from_index(validation_idx)
- learner, val_data = _training_helper(
- learner, training, fit_learner, ensure_probabilistic=probabilistic, val_split=validation
+ classifier, val_data = _training_helper(
+ classifier, training, fit_classifier, ensure_probabilistic=probabilistic, val_split=validation
)
y_.append(predict(val_data.instances))
y.append(val_data.labels)
@@ -278,21 +278,21 @@ def cross_generate_predictions_depr(
class_count = data.counts()
# fit the classifier on all data
- learner, _ = _training_helper(
- learner, data, fit_learner, ensure_probabilistic=probabilistic, val_split=None
+ classifier, _ = _training_helper(
+ classifier, data, fit_classifier, ensure_probabilistic=probabilistic, val_split=None
)
classes = data.classes_
else:
- learner, val_data = _training_helper(
- learner, data, fit_learner, ensure_probabilistic=probabilistic, val_split=val_split
+ classifier, val_data = _training_helper(
+ classifier, data, fit_classifier, ensure_probabilistic=probabilistic, val_split=val_split
)
y_ = predict(val_data.instances)
y = val_data.labels
classes = val_data.classes_
class_count = val_data.counts()
- return learner, y, y_, classes, class_count
+ return classifier, y, y_, classes, class_count
# Methods
# ------------------------------------
@@ -301,22 +301,22 @@ class CC(AggregativeQuantifier):
The most basic quantification method: it simply classifies all instances and counts how many have been
attributed to each of the classes in order to compute class prevalence estimates.
- :param learner: a sklearn's Estimator that generates a classifier
+ :param classifier: a scikit-learn estimator that generates a classifier
"""
- def __init__(self, learner: BaseEstimator):
- self.learner = learner
+ def __init__(self, classifier: BaseEstimator):
+ self.classifier = classifier
- def fit(self, data: LabelledCollection, fit_learner=True):
+ def fit(self, data: LabelledCollection, fit_classifier=True):
"""
- Trains the Classify & Count method unless `fit_learner` is False, in which case, the classifier is assumed to
+ Trains the Classify & Count method unless `fit_classifier` is False, in which case, the classifier is assumed to
be already fit and there is nothing else to do.
:param data: a :class:`quapy.data.base.LabelledCollection` consisting of the training data
- :param fit_learner: if False, the classifier is assumed to be fit
+ :param fit_classifier: if False, the classifier is assumed to be fit
:return: self
"""
- self.learner, _ = _training_helper(self.learner, data, fit_learner)
+ self.classifier, _ = _training_helper(self.classifier, data, fit_classifier)
return self
def aggregate(self, classif_predictions: np.ndarray):
@@ -335,7 +335,7 @@ class ACC(AggregativeQuantifier):
the "adjusted" variant of :class:`CC`, that corrects the predictions of CC
according to the `misclassification rates`.
- :param learner: a sklearn's Estimator that generates a classifier
+ :param classifier: a scikit-learn estimator that generates a classifier
:param val_split: indicates the proportion of data to be used as a stratified held-out validation set in which the
misclassification rates are to be estimated.
This parameter can be indicated as a real value (between 0 and 1, default 0.4), representing a proportion of
@@ -344,17 +344,17 @@ class ACC(AggregativeQuantifier):
:class:`quapy.data.base.LabelledCollection` (the split itself).
"""
- def __init__(self, learner: BaseEstimator, val_split=0.4, n_jobs=None):
- self.learner = learner
+ def __init__(self, classifier: BaseEstimator, val_split=0.4, n_jobs=None):
+ self.classifier = classifier
self.val_split = val_split
self.n_jobs = qp.get_njobs(n_jobs)
- def fit(self, data: LabelledCollection, fit_learner=True, val_split: Union[float, int, LabelledCollection] = None):
+ def fit(self, data: LabelledCollection, fit_classifier=True, val_split: Union[float, int, LabelledCollection] = None):
"""
Trains an ACC quantifier.
:param data: the training set
- :param fit_learner: set to False to bypass the training (the learner is assumed to be already fit)
+ :param fit_classifier: set to False to bypass the training (the classifier is assumed to be already fit)
:param val_split: either a float in (0,1) indicating the proportion of training instances to use for
validation (e.g., 0.3 for using 30% of the training set as validation data), or a LabelledCollection
indicating the validation set itself, or an int indicating the number `k` of folds to be used in `k`-fold
@@ -365,11 +365,11 @@ class ACC(AggregativeQuantifier):
if val_split is None:
val_split = self.val_split
- self.learner, y, y_, classes, class_count = cross_generate_predictions(
- data, self.learner, val_split, probabilistic=False, fit_learner=fit_learner, n_jobs=self.n_jobs
+ self.classifier, y, y_, classes, class_count = cross_generate_predictions(
+ data, self.classifier, val_split, probabilistic=False, fit_classifier=fit_classifier, n_jobs=self.n_jobs
)
- self.cc = CC(self.learner)
+ self.cc = CC(self.classifier)
self.Pte_cond_estim_ = self.getPteCondEstim(data.classes_, y, y_)
return self
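For intuition, the adjustment ACC applies on top of CC can be sketched with toy numbers (hypothetical values; M[i, j] stands for the estimated P(predicted=i | true=j)):

import numpy as np

M = np.array([[0.9, 0.2],      # toy misclassification-rate matrix
              [0.1, 0.8]])
p_cc = np.array([0.62, 0.38])  # toy classify-and-count estimate
p_acc = np.linalg.solve(M, p_cc)   # solves M @ p = p_cc -> approximately [0.6, 0.4]
# in practice the solution may require clipping to [0, 1] and re-normalization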
@@ -422,14 +422,14 @@ class PCC(AggregativeProbabilisticQuantifier):
`Probabilistic Classify & Count `_,
the probabilistic variant of CC that relies on the posterior probabilities returned by a probabilistic classifier.
- :param learner: a sklearn's Estimator that generates a classifier
+ :param classifier: a scikit-learn estimator that generates a classifier
"""
- def __init__(self, learner: BaseEstimator):
- self.learner = learner
+ def __init__(self, classifier: BaseEstimator):
+ self.classifier = classifier
- def fit(self, data: LabelledCollection, fit_learner=True):
- self.learner, _ = _training_helper(self.learner, data, fit_learner, ensure_probabilistic=True)
+ def fit(self, data: LabelledCollection, fit_classifier=True):
+ self.classifier, _ = _training_helper(self.classifier, data, fit_classifier, ensure_probabilistic=True)
return self
def aggregate(self, classif_posteriors):
@@ -441,7 +441,7 @@ class PACC(AggregativeProbabilisticQuantifier):
`Probabilistic Adjusted Classify & Count `_,
the probabilistic variant of ACC that relies on the posterior probabilities returned by a probabilistic classifier.
- :param learner: a sklearn's Estimator that generates a classifier
+ :param classifier: a scikit-learn estimator that generates a classifier
:param val_split: indicates the proportion of data to be used as a stratified held-out validation set in which the
misclassification rates are to be estimated.
This parameter can be indicated as a real value (between 0 and 1, default 0.4), representing a proportion of
@@ -451,17 +451,17 @@ class PACC(AggregativeProbabilisticQuantifier):
:param n_jobs: number of parallel workers
"""
- def __init__(self, learner: BaseEstimator, val_split=0.4, n_jobs=None):
- self.learner = learner
+ def __init__(self, classifier: BaseEstimator, val_split=0.4, n_jobs=None):
+ self.classifier = classifier
self.val_split = val_split
self.n_jobs = qp.get_njobs(n_jobs)
- def fit(self, data: LabelledCollection, fit_learner=True, val_split: Union[float, int, LabelledCollection] = None):
+ def fit(self, data: LabelledCollection, fit_classifier=True, val_split: Union[float, int, LabelledCollection] = None):
"""
Trains a PACC quantifier.
:param data: the training set
- :param fit_learner: set to False to bypass the training (the learner is assumed to be already fit)
+ :param fit_classifier: set to False to bypass the training (the classifier is assumed to be already fit)
:param val_split: either a float in (0,1) indicating the proportion of training instances to use for
validation (e.g., 0.3 for using 30% of the training set as validation data), or a LabelledCollection
indicating the validation set itself, or an int indicating the number k of folds to be used in kFCV
@@ -472,11 +472,11 @@ class PACC(AggregativeProbabilisticQuantifier):
if val_split is None:
val_split = self.val_split
- self.learner, y, y_, classes, class_count = cross_generate_predictions(
- data, self.learner, val_split, probabilistic=True, fit_learner=fit_learner, n_jobs=self.n_jobs
+ self.classifier, y, y_, classes, class_count = cross_generate_predictions(
+ data, self.classifier, val_split, probabilistic=True, fit_classifier=fit_classifier, n_jobs=self.n_jobs
)
- self.pcc = PCC(self.learner)
+ self.pcc = PCC(self.classifier)
self.Pte_cond_estim_ = self.getPteCondEstim(classes, y, y_)
return self
@@ -510,7 +510,7 @@ class EMQ(AggregativeProbabilisticQuantifier):
probabilities generated by a probabilistic classifier and the class prevalence estimates obtained via
maximum-likelihood estimation, in a mutually recursive way, until convergence.
- :param learner: a sklearn's Estimator that generates a classifier
+ :param classifier: a scikit-learn estimator that generates a classifier
:param exact_train_prev: set to True (default) for using, as the initial observation, the true training prevalence;
or set to False for computing the training prevalence as an estimate, akin to PCC, i.e., as the expected
value of the posterior probabilities of the training instances as suggested in
@@ -523,30 +523,32 @@ class EMQ(AggregativeProbabilisticQuantifier):
MAX_ITER = 1000
EPSILON = 1e-4
- def __init__(self, learner: BaseEstimator, exact_train_prev=True, recalib=None):
- self.learner = learner
+ def __init__(self, classifier: BaseEstimator, exact_train_prev=True, recalib=None):
+ self.classifier = classifier
self.exact_train_prev = exact_train_prev
self.recalib = recalib
- def fit(self, data: LabelledCollection, fit_learner=True):
+ def fit(self, data: LabelledCollection, fit_classifier=True):
if self.recalib is not None:
if self.recalib == 'nbvs':
- self.learner = NBVSCalibration(self.learner)
+ self.classifier = NBVSCalibration(self.classifier)
elif self.recalib == 'bcts':
- self.learner = BCTSCalibration(self.learner)
+ self.classifier = BCTSCalibration(self.classifier)
elif self.recalib == 'ts':
- self.learner = TSCalibration(self.learner)
+ self.classifier = TSCalibration(self.classifier)
elif self.recalib == 'vs':
- self.learner = VSCalibration(self.learner)
+ self.classifier = VSCalibration(self.classifier)
+ elif self.recalib == 'platt':
+ self.classifier = CalibratedClassifierCV(self.classifier, ensemble=False)
else:
raise ValueError('invalid param argument for recalibration method; available ones are '
'"nbvs", "bcts", "ts", and "vs".')
- self.learner, _ = _training_helper(self.learner, data, fit_learner, ensure_probabilistic=True)
+ self.classifier, _ = _training_helper(self.classifier, data, fit_classifier, ensure_probabilistic=True)
if self.exact_train_prev:
self.train_prevalence = F.prevalence_from_labels(data.labels, self.classes_)
else:
self.train_prevalence = qp.model_selection.cross_val_predict(
- quantifier=PCC(deepcopy(self.learner)),
+ quantifier=PCC(deepcopy(self.classifier)),
data=data,
nfolds=3,
random_state=0
@@ -558,7 +560,7 @@ class EMQ(AggregativeProbabilisticQuantifier):
return priors
def predict_proba(self, instances, epsilon=EPSILON):
- classif_posteriors = self.learner.predict_proba(instances)
+ classif_posteriors = self.classifier.predict_proba(instances)
priors, posteriors = self.EM(self.train_prevalence, classif_posteriors, epsilon)
return posteriors
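The EM routine invoked here (only partially visible in this patch) can be re-stated as a minimal sketch under assumed notation:

import numpy as np

def em_sketch(tr_prev, posteriors, epsilon=1e-4, max_iter=1000):
    # mutually recursive updates: rescale the posteriors by the ratio between
    # the current prevalence estimate and the training prevalence (E-step),
    # then re-estimate the prevalence as the mean posterior (M-step)
    prev = tr_prev
    for _ in range(max_iter):
        rescaled = (prev / tr_prev) * posteriors
        rescaled /= rescaled.sum(axis=1, keepdims=True)
        new_prev = rescaled.mean(axis=0)
        if np.abs(new_prev - prev).sum() < epsilon:
            return new_prev, rescaled
        prev = new_prev
    return prev, rescaled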
@@ -611,21 +613,21 @@ class HDy(AggregativeProbabilisticQuantifier, BinaryQuantifier):
class-conditional distributions of the posterior probabilities returned for the positive and negative validation
examples, respectively. The parameters of the mixture thus represent the estimates of the class prevalence values.
- :param learner: a sklearn's Estimator that generates a binary classifier
+ :param classifier: a scikit-learn estimator that generates a binary classifier
:param val_split: a float in range (0,1) indicating the proportion of data to be used as a stratified held-out
validation distribution, or a :class:`quapy.data.base.LabelledCollection` (the split itself).
"""
- def __init__(self, learner: BaseEstimator, val_split=0.4):
- self.learner = learner
+ def __init__(self, classifier: BaseEstimator, val_split=0.4):
+ self.classifier = classifier
self.val_split = val_split
- def fit(self, data: LabelledCollection, fit_learner=True, val_split: Union[float, LabelledCollection] = None):
+ def fit(self, data: LabelledCollection, fit_classifier=True, val_split: Union[float, LabelledCollection] = None):
"""
Trains an HDy quantifier.
:param data: the training set
- :param fit_learner: set to False to bypass the training (the learner is assumed to be already fit)
+ :param fit_classifier: set to False to bypass the training (the classifier is assumed to be already fit)
:param val_split: either a float in (0,1) indicating the proportion of training instances to use for
validation (e.g., 0.3 for using 30% of the training set as validation data), or a
:class:`quapy.data.base.LabelledCollection` indicating the validation set itself
@@ -635,11 +637,11 @@ class HDy(AggregativeProbabilisticQuantifier, BinaryQuantifier):
val_split = self.val_split
self._check_binary(data, self.__class__.__name__)
- self.learner, validation = _training_helper(
- self.learner, data, fit_learner, ensure_probabilistic=True, val_split=val_split)
+ self.classifier, validation = _training_helper(
+ self.classifier, data, fit_classifier, ensure_probabilistic=True, val_split=val_split)
Px = self.classify(validation.instances)[:, 1] # takes only the P(y=+1|x)
- self.Pxy1 = Px[validation.labels == self.learner.classes_[1]]
- self.Pxy0 = Px[validation.labels == self.learner.classes_[0]]
+ self.Pxy1 = Px[validation.labels == self.classifier.classes_[1]]
+ self.Pxy0 = Px[validation.labels == self.classifier.classes_[0]]
# pre-compute the histogram for positive and negative examples
self.bins = np.linspace(10, 110, 11, dtype=int) # [10, 20, 30, ..., 100, 110]
self.Pxy1_density = {bins: np.histogram(self.Pxy1, bins=bins, range=(0, 1), density=True)[0] for bins in
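The aggregation step that follows (not shown in this hunk) searches for the mixture weight that best reproduces the test distribution; a simplified sketch with a single binning (assumed notation):

import numpy as np

def hdy_sketch(Pxy1_density, Pxy0_density, Px_test_density):
    # the weight alpha minimizing the Hellinger distance between the mixture
    # of class-conditional densities and the test density is returned as the
    # positive-class prevalence estimate
    def hellinger(p, q):
        return np.sqrt(np.sum((np.sqrt(p) - np.sqrt(q)) ** 2))
    alphas = np.linspace(0, 1, 101)
    dists = [hellinger(a * Pxy1_density + (1 - a) * Pxy0_density, Px_test_density)
             for a in alphas]
    return alphas[int(np.argmin(dists))]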
@@ -684,7 +686,7 @@ class DyS(AggregativeProbabilisticQuantifier, BinaryQuantifier):
minimizes the distance between distributions.
Details of the ternary search have been taken from
- :param learner: a sklearn's Estimator that generates a binary classifier
+ :param classifier: a scikit-learn estimator that generates a binary classifier
:param val_split: a float in range (0,1) indicating the proportion of data to be used as a stratified held-out
validation distribution, or a :class:`quapy.data.base.LabelledCollection` (the split itself).
:param n_bins: an int with the number of bins to use to compute the histograms.
@@ -693,8 +695,8 @@ class DyS(AggregativeProbabilisticQuantifier, BinaryQuantifier):
:param tol: a float with the tolerance for the ternary search algorithm.
"""
- def __init__(self, learner: BaseEstimator, val_split=0.4, n_bins=8, distance: Union[str, Callable]='HD', tol=1e-05):
- self.learner = learner
+ def __init__(self, classifier: BaseEstimator, val_split=0.4, n_bins=8, distance: Union[str, Callable] = 'HD', tol=1e-05):
+ self.classifier = classifier
self.val_split = val_split
self.tol = tol
self.distance = distance
@@ -717,23 +719,23 @@ class DyS(AggregativeProbabilisticQuantifier, BinaryQuantifier):
return (left + right) / 2
def _compute_distance(self, Px_train, Px_test, distance: Union[str, Callable]='HD'):
- if distance=='HD':
+ if distance == 'HD':
return F.HellingerDistance(Px_train, Px_test)
- elif distance=='topsoe':
+ elif distance == 'topsoe':
return F.TopsoeDistance(Px_train, Px_test)
else:
return distance(Px_train, Px_test)
- def fit(self, data: LabelledCollection, fit_learner=True, val_split: Union[float, LabelledCollection] = None):
+ def fit(self, data: LabelledCollection, fit_classifier=True, val_split: Union[float, LabelledCollection] = None):
if val_split is None:
val_split = self.val_split
self._check_binary(data, self.__class__.__name__)
- self.learner, validation = _training_helper(
- self.learner, data, fit_learner, ensure_probabilistic=True, val_split=val_split)
+ self.classifier, validation = _training_helper(
+ self.classifier, data, fit_classifier, ensure_probabilistic=True, val_split=val_split)
Px = self.classify(validation.instances)[:, 1] # takes only the P(y=+1|x)
- self.Pxy1 = Px[validation.labels == self.learner.classes_[1]]
- self.Pxy0 = Px[validation.labels == self.learner.classes_[0]]
+ self.Pxy1 = Px[validation.labels == self.classifier.classes_[1]]
+ self.Pxy0 = Px[validation.labels == self.classifier.classes_[0]]
self.Pxy1_density = np.histogram(self.Pxy1, bins=self.n_bins, range=(0, 1), density=True)[0]
self.Pxy0_density = np.histogram(self.Pxy0, bins=self.n_bins, range=(0, 1), density=True)[0]
return self
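For context, the ternary search whose tail appears above (return (left + right) / 2) can be sketched in full; it assumes the distance is unimodal in the mixture weight, so the interval is shrunk by thirds until it is narrower than tol:

def ternary_search_sketch(f, left=0.0, right=1.0, tol=1e-5):
    while abs(right - left) >= tol:
        left_third = left + (right - left) / 3
        right_third = right - (right - left) / 3
        if f(left_third) > f(right_third):
            left = left_third    # the minimum cannot lie in [left, left_third)
        else:
            right = right_third
    return (left + right) / 2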
@@ -757,25 +759,25 @@ class SMM(AggregativeProbabilisticQuantifier, BinaryQuantifier):
SMM is a simplification of distribution-matching methods in which the representation of the examples
is created using the mean instead of a histogram.
- :param learner: a sklearn's Estimator that generates a binary classifier.
+ :param classifier: a scikit-learn estimator that generates a binary classifier.
:param val_split: a float in range (0,1) indicating the proportion of data to be used as a stratified held-out
validation distribution, or a :class:`quapy.data.base.LabelledCollection` (the split itself).
"""
- def __init__(self, learner: BaseEstimator, val_split=0.4):
- self.learner = learner
+ def __init__(self, classifier: BaseEstimator, val_split=0.4):
+ self.classifier = classifier
self.val_split = val_split
- def fit(self, data: LabelledCollection, fit_learner=True, val_split: Union[float, LabelledCollection] = None):
+ def fit(self, data: LabelledCollection, fit_classifier=True, val_split: Union[float, LabelledCollection] = None):
if val_split is None:
val_split = self.val_split
self._check_binary(data, self.__class__.__name__)
- self.learner, validation = _training_helper(
- self.learner, data, fit_learner, ensure_probabilistic=True, val_split=val_split)
+ self.classifier, validation = _training_helper(
+ self.classifier, data, fit_classifier, ensure_probabilistic=True, val_split=val_split)
Px = self.classify(validation.instances)[:, 1] # takes only the P(y=+1|x)
- self.Pxy1 = Px[validation.labels == self.learner.classes_[1]]
- self.Pxy0 = Px[validation.labels == self.learner.classes_[0]]
+ self.Pxy1 = Px[validation.labels == self.classifier.classes_[1]]
+ self.Pxy0 = Px[validation.labels == self.classifier.classes_[0]]
self.Pxy1_mean = np.mean(self.Pxy1)
self.Pxy0_mean = np.mean(self.Pxy0)
return self
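With one-dimensional representations, the aggregation admits a closed form; a sketch (assuming the test representation is the mean posterior):

import numpy as np

def smm_sketch(Pxy1_mean, Pxy0_mean, Px_test):
    # solve mean(Px_test) = alpha * Pxy1_mean + (1 - alpha) * Pxy0_mean
    alpha = (np.mean(Px_test) - Pxy0_mean) / (Pxy1_mean - Pxy0_mean)
    return float(np.clip(alpha, 0.0, 1.0))   # estimated positive prevalence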
@@ -809,19 +811,19 @@ class ELM(AggregativeQuantifier, BinaryQuantifier):
self.svmperf_base = svmperf_base if svmperf_base is not None else qp.environ['SVMPERF_HOME']
self.loss = loss
self.kwargs = kwargs
- self.learner = SVMperf(self.svmperf_base, loss=self.loss, **self.kwargs)
+ self.classifier = SVMperf(self.svmperf_base, loss=self.loss, **self.kwargs)
- def fit(self, data: LabelledCollection, fit_learner=True):
+ def fit(self, data: LabelledCollection, fit_classifier=True):
self._check_binary(data, self.__class__.__name__)
- assert fit_learner, 'the method requires that fit_learner=True'
- self.learner.fit(data.instances, data.labels)
+ assert fit_classifier, 'the method requires that fit_classifier=True'
+ self.classifier.fit(data.instances, data.labels)
return self
def aggregate(self, classif_predictions: np.ndarray):
return F.prevalence_from_labels(classif_predictions, self.classes_)
def classify(self, X, y=None):
- return self.learner.predict(X)
+ return self.classifier.predict(X)
class SVMQ(ELM):
@@ -916,7 +918,7 @@ class ThresholdOptimization(AggregativeQuantifier, BinaryQuantifier):
that would allow for more true positives and many more false positives, on the grounds that this
would deliver larger denominators.
- :param learner: a sklearn's Estimator that generates a classifier
+ :param classifier: a scikit-learn estimator that generates a classifier
:param val_split: indicates the proportion of data to be used as a stratified held-out validation set in which the
misclassification rates are to be estimated.
This parameter can be indicated as a real value (between 0 and 1, default 0.4), representing a proportion of
@@ -925,22 +927,22 @@ class ThresholdOptimization(AggregativeQuantifier, BinaryQuantifier):
:class:`quapy.data.base.LabelledCollection` (the split itself).
"""
- def __init__(self, learner: BaseEstimator, val_split=0.4, n_jobs=None):
- self.learner = learner
+ def __init__(self, classifier: BaseEstimator, val_split=0.4, n_jobs=None):
+ self.classifier = classifier
self.val_split = val_split
self.n_jobs = qp.get_njobs(n_jobs)
- def fit(self, data: LabelledCollection, fit_learner=True, val_split: Union[float, int, LabelledCollection] = None):
+ def fit(self, data: LabelledCollection, fit_classifier=True, val_split: Union[float, int, LabelledCollection] = None):
self._check_binary(data, "Threshold Optimization")
if val_split is None:
val_split = self.val_split
- self.learner, y, y_, classes, class_count = cross_generate_predictions(
- data, self.learner, val_split, probabilistic=True, fit_learner=fit_learner, n_jobs=self.n_jobs
+ self.classifier, y, y_, classes, class_count = cross_generate_predictions(
+ data, self.classifier, val_split, probabilistic=True, fit_classifier=fit_classifier, n_jobs=self.n_jobs
)
- self.cc = CC(self.learner)
+ self.cc = CC(self.classifier)
self.tpr, self.fpr = self._optimize_threshold(y, y_)
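All the variants below then share the same correction; a sketch of the assumed form prev = (observed - fpr) / (tpr - fpr), where observed is the fraction of positive predictions at the chosen threshold:

def adjusted_prevalence(observed_pos_rate, tpr, fpr):
    if tpr == fpr:                            # degenerate threshold: no adjustment possible
        return observed_pos_rate
    adjusted = (observed_pos_rate - fpr) / (tpr - fpr)
    return min(max(adjusted, 0.0), 1.0)       # clip to a valid prevalence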
@@ -1018,7 +1020,7 @@ class T50(ThresholdOptimization):
for the threshold that makes `tpr` closest to 0.5.
The goal is to bring improved stability to the denominator of the adjustment.
- :param learner: a sklearn's Estimator that generates a classifier
+ :param classifier: a scikit-learn estimator that generates a classifier
:param val_split: indicates the proportion of data to be used as a stratified held-out validation set in which the
misclassification rates are to be estimated.
This parameter can be indicated as a real value (between 0 and 1, default 0.4), representing a proportion of
@@ -1027,8 +1029,8 @@ class T50(ThresholdOptimization):
:class:`quapy.data.base.LabelledCollection` (the split itself).
"""
- def __init__(self, learner: BaseEstimator, val_split=0.4):
- super().__init__(learner, val_split)
+ def __init__(self, classifier: BaseEstimator, val_split=0.4):
+ super().__init__(classifier, val_split)
def _condition(self, tpr, fpr) -> float:
return abs(tpr - 0.5)
@@ -1042,7 +1044,7 @@ class MAX(ThresholdOptimization):
for the threshold that maximizes `tpr-fpr`.
The goal is to bring improved stability to the denominator of the adjustment.
- :param learner: a sklearn's Estimator that generates a classifier
+ :param classifier: a scikit-learn estimator that generates a classifier
:param val_split: indicates the proportion of data to be used as a stratified held-out validation set in which the
misclassification rates are to be estimated.
This parameter can be indicated as a real value (between 0 and 1, default 0.4), representing a proportion of
@@ -1051,8 +1053,8 @@ class MAX(ThresholdOptimization):
:class:`quapy.data.base.LabelledCollection` (the split itself).
"""
- def __init__(self, learner: BaseEstimator, val_split=0.4):
- super().__init__(learner, val_split)
+ def __init__(self, classifier: BaseEstimator, val_split=0.4):
+ super().__init__(classifier, val_split)
def _condition(self, tpr, fpr) -> float:
# MAX strives to maximize (tpr - fpr), which is equivalent to minimizing (fpr - tpr)
@@ -1067,7 +1069,7 @@ class X(ThresholdOptimization):
for the threshold that yields `tpr=1-fpr`.
The goal is to bring improved stability to the denominator of the adjustment.
- :param learner: a sklearn's Estimator that generates a classifier
+ :param classifier: a scikit-learn estimator that generates a classifier
:param val_split: indicates the proportion of data to be used as a stratified held-out validation set in which the
misclassification rates are to be estimated.
This parameter can be indicated as a real value (between 0 and 1, default 0.4), representing a proportion of
@@ -1076,8 +1078,8 @@ class X(ThresholdOptimization):
:class:`quapy.data.base.LabelledCollection` (the split itself).
"""
- def __init__(self, learner: BaseEstimator, val_split=0.4):
- super().__init__(learner, val_split)
+ def __init__(self, classifier: BaseEstimator, val_split=0.4):
+ super().__init__(classifier, val_split)
def _condition(self, tpr, fpr) -> float:
return abs(1 - (tpr + fpr))
@@ -1091,7 +1093,7 @@ class MS(ThresholdOptimization):
class prevalence estimates for all decision thresholds and returns the median of them all.
The goal is to bring improved stability to the denominator of the adjustment.
- :param learner: a sklearn's Estimator that generates a classifier
+ :param classifier: a scikit-learn estimator that generates a classifier
:param val_split: indicates the proportion of data to be used as a stratified held-out validation set in which the
misclassification rates are to be estimated.
This parameter can be indicated as a real value (between 0 and 1, default 0.4), representing a proportion of
@@ -1099,8 +1101,8 @@ class MS(ThresholdOptimization):
`k`-fold cross validation (this integer stands for the number of folds `k`), or as a
:class:`quapy.data.base.LabelledCollection` (the split itself).
"""
- def __init__(self, learner: BaseEstimator, val_split=0.4):
- super().__init__(learner, val_split)
+ def __init__(self, classifier: BaseEstimator, val_split=0.4):
+ super().__init__(classifier, val_split)
def _condition(self, tpr, fpr) -> float:
pass
@@ -1128,7 +1130,7 @@ class MS2(MS):
which `tpr-fpr>0.25`
The goal is to bring improved stability to the denominator of the adjustment.
- :param learner: a sklearn's Estimator that generates a classifier
+ :param classifier: a scikit-learn estimator that generates a classifier
:param val_split: indicates the proportion of data to be used as a stratified held-out validation set in which the
misclassification rates are to be estimated.
This parameter can be indicated as a real value (between 0 and 1, default 0.4), representing a proportion of
@@ -1136,8 +1138,8 @@ class MS2(MS):
`k`-fold cross validation (this integer stands for the number of folds `k`), or as a
:class:`quapy.data.base.LabelledCollection` (the split itself).
"""
- def __init__(self, learner: BaseEstimator, val_split=0.4):
- super().__init__(learner, val_split)
+ def __init__(self, classifier: BaseEstimator, val_split=0.4):
+ super().__init__(classifier, val_split)
def _optimize_threshold(self, y, probabilities):
tprs = [0, 1]
@@ -1174,7 +1176,8 @@ class OneVsAll(AggregativeQuantifier):
This variant was used, along with the :class:`EMQ` quantifier, in
`Gao and Sebastiani, 2016 `_.
- :param learner: a sklearn's Estimator that generates a binary classifier
+ :param binary_quantifier: a binary quantifier that will be employed to work on multiclass models in a
+ one-vs-all manner
:param n_jobs: number of parallel workers
"""
@@ -1186,11 +1189,11 @@ class OneVsAll(AggregativeQuantifier):
self.binary_quantifier = binary_quantifier
self.n_jobs = qp.get_njobs(n_jobs)
- def fit(self, data: LabelledCollection, fit_learner=True):
+ def fit(self, data: LabelledCollection, fit_classifier=True):
assert not data.binary, \
f'{self.__class__.__name__} expects non-binary data'
- assert fit_learner == True, \
- 'fit_learner must be True'
+ assert fit_classifier, \
+ 'fit_classifier must be True'
self.dict_binary_quantifiers = {c: deepcopy(self.binary_quantifier) for c in data.classes_}
self.__parallel(self._delayed_binary_fit, data)
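A conceptual sketch of the one-vs-all reduction (hypothetical helper; it assumes each binary quantifier returns a prevalence vector [negative, positive]):

import numpy as np

def ova_quantify_sketch(dict_binary_quantifiers, X):
    classes = sorted(dict_binary_quantifiers.keys())
    # one positive-class estimate per binary quantifier ...
    estims = np.asarray([dict_binary_quantifiers[c].quantify(X)[1] for c in classes])
    # ... l1-normalized so that the estimates sum up to 1
    return estims / estims.sum()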
diff --git a/quapy/method/base.py b/quapy/method/base.py
index c935735..459130c 100644
--- a/quapy/method/base.py
+++ b/quapy/method/base.py
@@ -1,12 +1,15 @@
from abc import ABCMeta, abstractmethod
from copy import deepcopy
+
+from sklearn.base import BaseEstimator
+
import quapy as qp
from quapy.data import LabelledCollection
# Base Quantifier abstract class
# ------------------------------------
-class BaseQuantifier(metaclass=ABCMeta):
+class BaseQuantifier(BaseEstimator):
"""
Abstract Quantifier. A quantifier is defined as an object of a class that implements the method :meth:`fit` on
:class:`quapy.data.base.LabelledCollection`, the method :meth:`quantify`, and the :meth:`set_params` and
@@ -33,24 +36,24 @@ class BaseQuantifier(metaclass=ABCMeta):
"""
...
- @abstractmethod
- def set_params(self, **parameters):
- """
- Set the parameters of the quantifier.
-
- :param parameters: dictionary of param-value pairs
- """
- ...
-
- @abstractmethod
- def get_params(self, deep=True):
- """
- Return the current parameters of the quantifier.
-
- :param deep: for compatibility with sklearn
- :return: a dictionary of param-value pairs
- """
- ...
+ # @abstractmethod
+ # def set_params(self, **parameters):
+ # """
+ # Set the parameters of the quantifier.
+ #
+ # :param parameters: dictionary of param-value pairs
+ # """
+ # ...
+ #
+ # @abstractmethod
+ # def get_params(self, deep=True):
+ # """
+ # Return the current parameters of the quantifier.
+ #
+ # :param deep: for compatibility with sklearn
+ # :return: a dictionary of param-value pairs
+ # """
+ # ...
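The rationale: sklearn's BaseEstimator already derives get_params/set_params from the __init__ signature, which makes the hand-written abstract versions redundant. A toy illustration (hypothetical class, not part of the patch):

from sklearn.base import BaseEstimator

class DummyQuantifier(BaseEstimator):
    def __init__(self, classifier=None, val_split=0.4):
        self.classifier = classifier
        self.val_split = val_split

dq = DummyQuantifier()
print(dq.get_params())        # {'classifier': None, 'val_split': 0.4}
dq.set_params(val_split=0.3)  # inherited, no manual implementation needed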
class BinaryQuantifier(BaseQuantifier):
@@ -67,7 +70,7 @@ class BinaryQuantifier(BaseQuantifier):
class OneVsAllGeneric:
"""
Allows any binary quantifier to perform quantification on single-label datasets. The method maintains one binary
- quantifier for each class, and then l1-normalizes the outputs so that the class prevelences sum up to 1.
+ quantifier for each class, and then l1-normalizes the outputs so that the class prevalence values sum up to 1.
"""
def __init__(self, binary_quantifier, n_jobs=None):
@@ -103,11 +106,11 @@ class OneVsAllGeneric:
def get_params(self, deep=True):
return self.binary_quantifier.get_params()
- def _delayed_binary_predict(self, c, learners, X):
- return learners[c].quantify(X)[:,1] # the mean is the estimation for the positive class prevalence
+ def _delayed_binary_predict(self, c, quantifiers, X):
+ return quantifiers[c].quantify(X)[1]  # the prevalence estimated for the positive class
- def _delayed_binary_fit(self, c, learners, data, **kwargs):
+ def _delayed_binary_fit(self, c, quantifiers, data, **kwargs):
bindata = LabelledCollection(data.instances, data.labels == c, n_classes=2)
- learners[c].fit(bindata, **kwargs)
+ quantifiers[c].fit(bindata, **kwargs)
diff --git a/quapy/method/meta.py b/quapy/method/meta.py
index 5e084e5..82d3a35 100644
--- a/quapy/method/meta.py
+++ b/quapy/method/meta.py
@@ -146,7 +146,7 @@ class Ensemble(BaseQuantifier):
This function should not be used within :class:`quapy.model_selection.GridSearchQ` (is here for compatibility
with the abstract class).
Instead, use `Ensemble(GridSearchQ(q),...)`, with `q` a Quantifier (recommended), or
- `Ensemble(Q(GridSearchCV(l)))` with `Q` a quantifier class that has a learner `l` optimized for
+ `Ensemble(Q(GridSearchCV(l)))` with `Q` a quantifier class that has a classifier `l` optimized for
classification (not recommended).
:param parameters: dictionary
@@ -154,7 +154,7 @@ class Ensemble(BaseQuantifier):
"""
raise NotImplementedError(f'{self.__class__.__name__} should not be used within GridSearchQ; '
f'instead, use Ensemble(GridSearchQ(q),...), with q a Quantifier (recommended), '
- f'or Ensemble(Q(GridSearchCV(l))) with Q a quantifier class that has a learner '
+ f'or Ensemble(Q(GridSearchCV(l))) with Q a quantifier class that has a classifier '
f'l optimized for classification (not recommended).')
def get_params(self, deep=True):
@@ -162,7 +162,7 @@ class Ensemble(BaseQuantifier):
This function should not be used within :class:`quapy.model_selection.GridSearchQ` (is here for compatibility
with the abstract class).
Instead, use `Ensemble(GridSearchQ(q),...)`, with `q` a Quantifier (recommended), or
- `Ensemble(Q(GridSearchCV(l)))` with `Q` a quantifier class that has a learner `l` optimized for
+ `Ensemble(Q(GridSearchCV(l)))` with `Q` a quantifier class that has a classifier `l` optimized for
classification (not recommended).
:return: raises an Exception
@@ -326,18 +326,18 @@ def _draw_simplex(ndim, min_val, max_trials=100):
f'>= {min_val} is unlikely (it failed after {max_trials} trials)')
-def _instantiate_ensemble(learner, base_quantifier_class, param_grid, optim, param_model_sel, **kwargs):
+def _instantiate_ensemble(classifier, base_quantifier_class, param_grid, optim, param_model_sel, **kwargs):
if optim is None:
- base_quantifier = base_quantifier_class(learner)
+ base_quantifier = base_quantifier_class(classifier)
elif optim in qp.error.CLASSIFICATION_ERROR:
if optim == qp.error.f1e:
scoring = make_scorer(f1_score)
elif optim == qp.error.acce:
scoring = make_scorer(accuracy_score)
- learner = GridSearchCV(learner, param_grid, scoring=scoring)
- base_quantifier = base_quantifier_class(learner)
+ classifier = GridSearchCV(classifier, param_grid, scoring=scoring)
+ base_quantifier = base_quantifier_class(classifier)
else:
- base_quantifier = GridSearchQ(base_quantifier_class(learner),
+ base_quantifier = GridSearchQ(base_quantifier_class(classifier),
param_grid=param_grid,
**param_model_sel,
error=optim)
@@ -357,7 +357,7 @@ def _check_error(error):
f'the name of an error function in {qp.error.ERROR_NAMES}')
-def ensembleFactory(learner, base_quantifier_class, param_grid=None, optim=None, param_model_sel: dict = None,
+def ensembleFactory(classifier, base_quantifier_class, param_grid=None, optim=None, param_model_sel: dict = None,
**kwargs):
"""
Ensemble factory. Provides a unified interface for instantiating ensembles that can be optimized (via model
@@ -390,7 +390,7 @@ def ensembleFactory(learner, base_quantifier_class, param_grid=None, optim=None,
>>>
>>> ensembleFactory(LogisticRegression(), PACC, optim='mae', policy='mae', **common)
- :param learner: sklearn's Estimator that generates a classifier
+ :param classifier: a scikit-learn estimator that generates a classifier
:param base_quantifier_class: a class of quantifiers
:param param_grid: a dictionary with the grid of parameters to optimize for
:param optim: a valid quantification or classification error, or a string name of it
@@ -405,21 +405,21 @@ def ensembleFactory(learner, base_quantifier_class, param_grid=None, optim=None,
if param_model_sel is None:
raise ValueError(f'param_model_sel is None but optim was requested.')
error = _check_error(optim)
- return _instantiate_ensemble(learner, base_quantifier_class, param_grid, error, param_model_sel, **kwargs)
+ return _instantiate_ensemble(classifier, base_quantifier_class, param_grid, error, param_model_sel, **kwargs)
-def ECC(learner, param_grid=None, optim=None, param_mod_sel=None, **kwargs):
+def ECC(classifier, param_grid=None, optim=None, param_mod_sel=None, **kwargs):
"""
Implements an ensemble of :class:`quapy.method.aggregative.CC` quantifiers, as used by
`Pérez-Gállego et al., 2019 `_.
Equivalent to:
- >>> ensembleFactory(learner, CC, param_grid, optim, param_mod_sel, **kwargs)
+ >>> ensembleFactory(classifier, CC, param_grid, optim, param_mod_sel, **kwargs)
See :meth:`ensembleFactory` for further details.
- :param learner: sklearn's Estimator that generates a classifier
+ :param classifier: a scikit-learn estimator that generates a classifier
:param param_grid: a dictionary with the grid of parameters to optimize for
:param optim: a valid quantification or classification error, or a string name of it
:param param_mod_sel: a dictionary containing any keyword arguments to pass to
@@ -428,21 +428,21 @@ def ECC(learner, param_grid=None, optim=None, param_mod_sel=None, **kwargs):
:return: an instance of :class:`Ensemble`
"""
- return ensembleFactory(learner, CC, param_grid, optim, param_mod_sel, **kwargs)
+ return ensembleFactory(classifier, CC, param_grid, optim, param_mod_sel, **kwargs)
-def EACC(learner, param_grid=None, optim=None, param_mod_sel=None, **kwargs):
+def EACC(classifier, param_grid=None, optim=None, param_mod_sel=None, **kwargs):
"""
Implements an ensemble of :class:`quapy.method.aggregative.ACC` quantifiers, as used by
`Pérez-Gállego et al., 2019 `_.
Equivalent to:
- >>> ensembleFactory(learner, ACC, param_grid, optim, param_mod_sel, **kwargs)
+ >>> ensembleFactory(classifier, ACC, param_grid, optim, param_mod_sel, **kwargs)
See :meth:`ensembleFactory` for further details.
- :param learner: sklearn's Estimator that generates a classifier
+ :param classifier: a scikit-learn estimator that generates a classifier
:param param_grid: a dictionary with the grid of parameters to optimize for
:param optim: a valid quantification or classification error, or a string name of it
:param param_mod_sel: a dictionary containing any keyword arguments to pass to
@@ -451,20 +451,20 @@ def EACC(learner, param_grid=None, optim=None, param_mod_sel=None, **kwargs):
:return: an instance of :class:`Ensemble`
"""
- return ensembleFactory(learner, ACC, param_grid, optim, param_mod_sel, **kwargs)
+ return ensembleFactory(classifier, ACC, param_grid, optim, param_mod_sel, **kwargs)
-def EPACC(learner, param_grid=None, optim=None, param_mod_sel=None, **kwargs):
+def EPACC(classifier, param_grid=None, optim=None, param_mod_sel=None, **kwargs):
"""
Implements an ensemble of :class:`quapy.method.aggregative.PACC` quantifiers.
Equivalent to:
- >>> ensembleFactory(learner, PACC, param_grid, optim, param_mod_sel, **kwargs)
+ >>> ensembleFactory(classifier, PACC, param_grid, optim, param_mod_sel, **kwargs)
See :meth:`ensembleFactory` for further details.
- :param learner: sklearn's Estimator that generates a classifier
+ :param classifier: a scikit-learn estimator that generates a classifier
:param param_grid: a dictionary with the grid of parameters to optimize for
:param optim: a valid quantification or classification error, or a string name of it
:param param_mod_sel: a dictionary containing any keyword arguments to pass to
@@ -473,21 +473,21 @@ def EPACC(learner, param_grid=None, optim=None, param_mod_sel=None, **kwargs):
:return: an instance of :class:`Ensemble`
"""
- return ensembleFactory(learner, PACC, param_grid, optim, param_mod_sel, **kwargs)
+ return ensembleFactory(classifier, PACC, param_grid, optim, param_mod_sel, **kwargs)
-def EHDy(learner, param_grid=None, optim=None, param_mod_sel=None, **kwargs):
+def EHDy(classifier, param_grid=None, optim=None, param_mod_sel=None, **kwargs):
"""
Implements an ensemble of :class:`quapy.method.aggregative.HDy` quantifiers, as used by
`Pérez-Gállego et al., 2019 `_.
Equivalent to:
- >>> ensembleFactory(learner, HDy, param_grid, optim, param_mod_sel, **kwargs)
+ >>> ensembleFactory(classifier, HDy, param_grid, optim, param_mod_sel, **kwargs)
See :meth:`ensembleFactory` for further details.
- :param learner: sklearn's Estimator that generates a classifier
+ :param classifier: a scikit-learn estimator that generates a classifier
:param param_grid: a dictionary with the grid of parameters to optimize for
:param optim: a valid quantification or classification error, or a string name of it
:param param_mod_sel: a dictionary containing any keyword arguments to pass to
@@ -496,20 +496,20 @@ def EHDy(learner, param_grid=None, optim=None, param_mod_sel=None, **kwargs):
:return: an instance of :class:`Ensemble`
"""
- return ensembleFactory(learner, HDy, param_grid, optim, param_mod_sel, **kwargs)
+ return ensembleFactory(classifier, HDy, param_grid, optim, param_mod_sel, **kwargs)
-def EEMQ(learner, param_grid=None, optim=None, param_mod_sel=None, **kwargs):
+def EEMQ(classifier, param_grid=None, optim=None, param_mod_sel=None, **kwargs):
"""
Implements an ensemble of :class:`quapy.method.aggregative.EMQ` quantifiers.
Equivalent to:
- >>> ensembleFactory(learner, EMQ, param_grid, optim, param_mod_sel, **kwargs)
+ >>> ensembleFactory(classifier, EMQ, param_grid, optim, param_mod_sel, **kwargs)
See :meth:`ensembleFactory` for further details.
- :param learner: sklearn's Estimator that generates a classifier
+ :param classifier: a scikit-learn estimator that generates a classifier
:param param_grid: a dictionary with the grid of parameters to optimize for
:param optim: a valid quantification or classification error, or a string name of it
:param param_mod_sel: a dictionary containing any keyword arguments to pass to
@@ -518,4 +518,4 @@ def EEMQ(learner, param_grid=None, optim=None, param_mod_sel=None, **kwargs):
:return: an instance of :class:`Ensemble`
"""
- return ensembleFactory(learner, EMQ, param_grid, optim, param_mod_sel, **kwargs)
+ return ensembleFactory(classifier, EMQ, param_grid, optim, param_mod_sel, **kwargs)
diff --git a/quapy/method/neural.py b/quapy/method/neural.py
index 0665634..1871ff0 100644
--- a/quapy/method/neural.py
+++ b/quapy/method/neural.py
@@ -31,14 +31,14 @@ class QuaNetTrainer(BaseQuantifier):
>>>
>>> # the text classifier is a CNN trained by NeuralClassifierTrainer
>>> cnn = CNNnet(dataset.vocabulary_size, dataset.n_classes)
- >>> learner = NeuralClassifierTrainer(cnn, device='cuda')
+ >>> classifier = NeuralClassifierTrainer(cnn, device='cuda')
>>>
>>> # train QuaNet (QuaNet is an alias to QuaNetTrainer)
- >>> model = QuaNet(learner, qp.environ['SAMPLE_SIZE'], device='cuda')
+ >>> model = QuaNet(classifier, qp.environ['SAMPLE_SIZE'], device='cuda')
>>> model.fit(dataset.training)
>>> estim_prevalence = model.quantify(dataset.test.instances)
- :param learner: an object implementing `fit` (i.e., that can be trained on labelled data),
+ :param classifier: an object implementing `fit` (i.e., that can be trained on labelled data),
`predict_proba` (i.e., that can generate posterior probabilities of unlabelled examples) and
`transform` (i.e., that can generate embedded representations of the unlabelled instances).
:param sample_size: integer, the sample size
@@ -60,7 +60,7 @@ class QuaNetTrainer(BaseQuantifier):
"""
def __init__(self,
- learner,
+ classifier,
sample_size,
n_epochs=100,
tr_iter_per_poch=500,
@@ -76,13 +76,13 @@ class QuaNetTrainer(BaseQuantifier):
checkpointname=None,
device='cuda'):
- assert hasattr(learner, 'transform'), \
- f'the learner {learner.__class__.__name__} does not seem to be able to produce document embeddings ' \
+ assert hasattr(classifier, 'transform'), \
+ f'the classifier {classifier.__class__.__name__} does not seem to be able to produce document embeddings ' \
f'since it does not implement the method "transform"'
- assert hasattr(learner, 'predict_proba'), \
- f'the learner {learner.__class__.__name__} does not seem to be able to produce posterior probabilities ' \
+ assert hasattr(classifier, 'predict_proba'), \
+ f'the classifier {classifier.__class__.__name__} does not seem to be able to produce posterior probabilities ' \
f'since it does not implement the method "predict_proba"'
- self.learner = learner
+ self.classifier = classifier
self.sample_size = sample_size
self.n_epochs = n_epochs
self.tr_iter = tr_iter_per_poch
@@ -105,26 +105,26 @@ class QuaNetTrainer(BaseQuantifier):
self.checkpoint = os.path.join(checkpointdir, checkpointname)
self.device = torch.device(device)
- self.__check_params_colision(self.quanet_params, self.learner.get_params())
+ self.__check_params_colision(self.quanet_params, self.classifier.get_params())
self._classes_ = None
- def fit(self, data: LabelledCollection, fit_learner=True):
+ def fit(self, data: LabelledCollection, fit_classifier=True):
"""
Trains QuaNet.
- :param data: the training data on which to train QuaNet. If `fit_learner=True`, the data will be split in
+ :param data: the training data on which to train QuaNet. If `fit_classifier=True`, the data will be split in
40/40/20 for training the classifier, training QuaNet, and validating QuaNet, respectively. If
- `fit_learner=False`, the data will be split in 66/34 for training QuaNet and validating it, respectively.
- :param fit_learner: if True, trains the classifier on a split containing 40% of the data
+ `fit_classifier=False`, the data will be split in 66/34 for training QuaNet and validating it, respectively.
+ :param fit_classifier: if True, trains the classifier on a split containing 40% of the data
:return: self
"""
self._classes_ = data.classes_
os.makedirs(self.checkpointdir, exist_ok=True)
- if fit_learner:
+ if fit_classifier:
classifier_data, unused_data = data.split_stratified(0.4)
train_data, valid_data = unused_data.split_stratified(0.66) # 0.66 split of 60% makes 40% and 20%
- self.learner.fit(*classifier_data.Xy)
+ self.classifier.fit(*classifier_data.Xy)
else:
classifier_data = None
train_data, valid_data = data.split_stratified(0.66)
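[Editor's note] A quick check of the 40/40/20 split arithmetic used above (illustrative):

total = 1.0
classifier_split = 0.4 * total           # 40% to train the classifier
remainder = total - classifier_split     # 60% left over
quanet_train = 0.66 * remainder          # ~40% to train QuaNet
quanet_valid = remainder - quanet_train  # ~20% to validate QuaNet
print(round(quanet_train, 3), round(quanet_valid, 3))  # 0.396 0.204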
@@ -133,21 +133,21 @@ class QuaNetTrainer(BaseQuantifier):
self.tr_prev = data.prevalence()
# compute the posterior probabilities of the instances
- valid_posteriors = self.learner.predict_proba(valid_data.instances)
- train_posteriors = self.learner.predict_proba(train_data.instances)
+ valid_posteriors = self.classifier.predict_proba(valid_data.instances)
+ train_posteriors = self.classifier.predict_proba(train_data.instances)
# turn instances' original representations into embeddings
- valid_data_embed = LabelledCollection(self.learner.transform(valid_data.instances), valid_data.labels, self._classes_)
- train_data_embed = LabelledCollection(self.learner.transform(train_data.instances), train_data.labels, self._classes_)
+ valid_data_embed = LabelledCollection(self.classifier.transform(valid_data.instances), valid_data.labels, self._classes_)
+ train_data_embed = LabelledCollection(self.classifier.transform(train_data.instances), train_data.labels, self._classes_)
self.quantifiers = {
- 'cc': CC(self.learner).fit(None, fit_learner=False),
- 'acc': ACC(self.learner).fit(None, fit_learner=False, val_split=valid_data),
- 'pcc': PCC(self.learner).fit(None, fit_learner=False),
- 'pacc': PACC(self.learner).fit(None, fit_learner=False, val_split=valid_data),
+ 'cc': CC(self.classifier).fit(None, fit_classifier=False),
+ 'acc': ACC(self.classifier).fit(None, fit_classifier=False, val_split=valid_data),
+ 'pcc': PCC(self.classifier).fit(None, fit_classifier=False),
+ 'pacc': PACC(self.classifier).fit(None, fit_classifier=False, val_split=valid_data),
}
if classifier_data is not None:
- self.quantifiers['emq'] = EMQ(self.learner).fit(classifier_data, fit_learner=False)
+ self.quantifiers['emq'] = EMQ(self.classifier).fit(classifier_data, fit_classifier=False)
self.status = {
'tr-loss': -1,
@@ -199,8 +199,8 @@ class QuaNetTrainer(BaseQuantifier):
return prevs_estim
def quantify(self, instances):
- posteriors = self.learner.predict_proba(instances)
- embeddings = self.learner.transform(instances)
+ posteriors = self.classifier.predict_proba(instances)
+ embeddings = self.classifier.transform(instances)
quant_estims = self._get_aggregative_estims(posteriors)
self.quanet.eval()
with torch.no_grad():
@@ -264,7 +264,7 @@ class QuaNetTrainer(BaseQuantifier):
f'patience={early_stop.patience}/{early_stop.PATIENCE_LIMIT}')
def get_params(self, deep=True):
- return {**self.learner.get_params(), **self.quanet_params}
+ return {**self.classifier.get_params(), **self.quanet_params}
def set_params(self, **parameters):
learner_params = {}
@@ -273,7 +273,7 @@ class QuaNetTrainer(BaseQuantifier):
self.quanet_params[key] = val
else:
learner_params[key] = val
- self.learner.set_params(**learner_params)
+ self.classifier.set_params(**learner_params)
def __check_params_colision(self, quanet_params, learner_params):
quanet_keys = set(quanet_params.keys())
@@ -281,7 +281,7 @@ class QuaNetTrainer(BaseQuantifier):
intersection = quanet_keys.intersection(learner_keys)
if len(intersection) > 0:
raise ValueError(f'the use of parameters {intersection} is ambiguous since those can refer to '
- f'the parameters of QuaNet or the learner {self.learner.__class__.__name__}')
+ f'the parameters of QuaNet or the classifier {self.classifier.__class__.__name__}')
def clean_checkpoint(self):
"""
diff --git a/quapy/model_selection.py b/quapy/model_selection.py
index f7c5b94..3cb22c7 100644
--- a/quapy/model_selection.py
+++ b/quapy/model_selection.py
@@ -88,7 +88,12 @@ class GridSearchQ(BaseQuantifier):
hyper = [dict({k: values[i] for i, k in enumerate(params_keys)}) for values in itertools.product(*params_values)]
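[Editor's note] A self-contained sketch of how the `hyper` grid above is built (mirrors the itertools.product expression; the grid contents are illustrative):

import itertools

param_grid = {'classifier__C': [1, 10], 'nbins': [4, 8]}
keys, values = list(param_grid.keys()), list(param_grid.values())
hyper = [dict(zip(keys, combo)) for combo in itertools.product(*values)]
# -> [{'classifier__C': 1, 'nbins': 4}, {'classifier__C': 1, 'nbins': 8},
#     {'classifier__C': 10, 'nbins': 4}, {'classifier__C': 10, 'nbins': 8}]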
# pass a seed to parallel so it is set in child processes
- scores = qp.util.parallel(self._delayed_eval, ((params, training) for params in hyper), seed=qp.environ.get('_R_SEED', None), n_jobs=self.n_jobs)
+ scores = qp.util.parallel(
+ self._delayed_eval,
+ ((params, training) for params in hyper),
+ seed=qp.environ.get('_R_SEED', None),
+ n_jobs=self.n_jobs
+ )
for params, score, model in scores:
if score is not None:
@@ -103,7 +108,7 @@ class GridSearchQ(BaseQuantifier):
tend = time()-tinit
if self.best_score_ is None:
- raise TimeoutError('all jobs took more than the timeout time to end')
+ raise TimeoutError('no combination of hyperparameters seems to work')
self._sout(f'optimization finished: best params {self.best_params_} (score={self.best_score_:.5f}) '
f'[took {tend:.4f}s]')
@@ -150,6 +155,13 @@ class GridSearchQ(BaseQuantifier):
except TimeoutError:
self._sout(f'timeout ({self.timeout}s) reached for config {params}')
score = None
+ except ValueError as e:
+ self._sout(f'the combination of hyperparameters {params} is invalid')
+ raise e
+ except Exception as e:
+ self._sout(f'something went wrong for config {params}; skipping:')
+ self._sout(f'\tException: {e}')
+ score = None
return params, score, model
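[Editor's note] The error-handling policy encoded above, in isolation: invalid hyperparameter combinations (ValueError) abort the search, while timeouts and any other failure are reported and skipped by scoring the configuration as None (sketch; names are illustrative):

def evaluate_config(params, evaluator):
    try:
        return evaluator(params)
    except TimeoutError:
        print(f'timeout reached for config {params}')
        return None
    except ValueError:
        raise  # inconsistent hyperparameters: stop the whole search
    except Exception as e:
        print(f'config {params} failed ({e}); skipping')
        return None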
From ceb88792c5669dea84de244783570c1e37f627d6 Mon Sep 17 00:00:00 2001
From: Alejandro Moreo
Date: Tue, 31 Jan 2023 15:08:58 +0100
Subject: [PATCH 52/67] added DistributionMatching method, a generic model for
distribution matching for multiclass quantification problems that takes the
divergence and number of bins as hyperparameters
---
quapy/CHANGE_LOG.txt | 16 +++-
quapy/method/aggregative.py | 158 ++++++++++++++++++++++++++++--------
2 files changed, 136 insertions(+), 38 deletions(-)
diff --git a/quapy/CHANGE_LOG.txt b/quapy/CHANGE_LOG.txt
index c450b41..f2deea0 100644
--- a/quapy/CHANGE_LOG.txt
+++ b/quapy/CHANGE_LOG.txt
@@ -25,9 +25,9 @@
- cross_val_predict (for quantification) added to model_selection: would be nice to allow the user to specify a
test protocol, or None for bypassing it?
-- I think Pablo added DyS, Topsoe distance and binary search.
+- DyS, Topsoe distance and binary search (thanks to Pablo González)
-- I think Pablo added multi-thread reproducibility.
+- Multi-thread reproducibility via seeding (thanks to Pablo González)
- Bugfix: adding two labelled collections (with +) now checks for consistency in the classes
@@ -40,8 +40,16 @@
- the internal classifier of aggregative methods is now called "classifier" instead of "learner"
- when optimizing the hyperparameters of an aggregative quantifier, the classifier's specific hyperparameters
- should be marked with a "classifier__" prefix (just like in scikit-learn), while the quantifier's specific
- hyperparameters are named directly. For example, PCC(LogisticRegression()) quantifier has
+ should be marked with a "classifier__" prefix (just like in scikit-learn with estimators), while the quantifier's
+ specific hyperparameters are named directly. For example, PCC(LogisticRegression()) quantifier has hyperparameters
+ "classifier__C", "classifier__class_weight", etc., instead of "C" and "class_weight" as in v0.1.6.
+
+- hyperparameters leading to inconsistent runs raise a ValueError exception, while hyperparameter combinations
+ leading to internal errors of surrogate functions are reported and skipped, without stopping the grid search.
+
+- DistributionMatching methods added. This is a general framework for distribution matching methods that caters for
+ multiclass quantification. That is to say, one could get a multiclass variant of the (originally binary) HDy
+ method aligned with Firat's formulation.
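[Editor's note] The "classifier__" naming convention referred to in the list above, illustrated with a hypothetical grid (`val_split` stands for any quantifier-level hyperparameter):

param_grid = {
    'classifier__C': [0.1, 1, 10],                   # hyperparameter of the classifier
    'classifier__class_weight': [None, 'balanced'],  # idem
    'val_split': [0.3, 0.4],                         # hyperparameter of the quantifier itself
}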
Things to fix:
- calibration with recalibration methods has to be fixed for exact_train_prev in EMQ (conflicts with clone, deepcopy, etc.)
diff --git a/quapy/method/aggregative.py b/quapy/method/aggregative.py
index 3246b9f..87b682e 100644
--- a/quapy/method/aggregative.py
+++ b/quapy/method/aggregative.py
@@ -3,6 +3,7 @@ from copy import deepcopy
from typing import Callable, Union
import numpy as np
from joblib import Parallel, delayed
+from scipy import optimize
from sklearn.base import BaseEstimator, clone
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import confusion_matrix
@@ -10,8 +11,7 @@ from sklearn.model_selection import StratifiedKFold, cross_val_predict
from tqdm import tqdm
import quapy as qp
import quapy.functional as F
-from classification.calibration import RecalibratedProbabilisticClassifier, NBVSCalibration, BCTSCalibration, TSCalibration, \
- VSCalibration
+from classification.calibration import NBVSCalibration, BCTSCalibration, TSCalibration, VSCalibration
from quapy.classification.svmperf import SVMperf
from quapy.data import LabelledCollection
from quapy.method.base import BaseQuantifier, BinaryQuantifier
@@ -91,25 +91,6 @@ class AggregativeQuantifier(BaseQuantifier):
"""
...
- # def get_params(self, deep=True):
- # """
- # Return the current parameters of the quantifier.
- #
- # :param deep: for compatibility with sklearn
- # :return: a dictionary of param-value pairs
- # """
- #
- # return self.learner.get_params()
-
- # def set_params(self, **parameters):
- # """
- # Set the parameters of the quantifier.
- #
- # :param parameters: dictionary of param-value pairs
- # """
- #
- # self.learner.set_params(**parameters)
-
@property
def classes_(self):
"""
@@ -690,16 +671,16 @@ class DyS(AggregativeProbabilisticQuantifier, BinaryQuantifier):
:param val_split: a float in range (0,1) indicating the proportion of data to be used as a stratified held-out
validation distribution, or a :class:`quapy.data.base.LabelledCollection` (the split itself).
:param n_bins: an int with the number of bins to use to compute the histograms.
- :param distance: an str with a distance already included in the librar (HD or topsoe), of a function
- that computes the distance between two distributions.
+ :param divergence: a str indicating the name of the divergence (currently supported ones are "HD" or "topsoe"), or a
+ callable function that computes the divergence between two distributions (two equally sized arrays).
:param tol: a float with the tolerance for the ternary search algorithm.
"""
- def __init__(self, classifier: BaseEstimator, val_split=0.4, n_bins=8, distance: Union[str, Callable]='HD', tol=1e-05):
+ def __init__(self, classifier: BaseEstimator, val_split=0.4, n_bins=8, divergence: Union[str, Callable]='HD', tol=1e-05):
self.classifier = classifier
self.val_split = val_split
self.tol = tol
- self.distance = distance
+ self.divergence = divergence
self.n_bins = n_bins
def _ternary_search(self, f, left, right, tol):
@@ -718,14 +699,6 @@ class DyS(AggregativeProbabilisticQuantifier, BinaryQuantifier):
# Left and right are the current bounds; the maximum is between them
return (left + right) / 2
- def _compute_distance(self, Px_train, Px_test, distance: Union[str, Callable]='HD'):
- if distance == 'HD':
- return F.HellingerDistance(Px_train, Px_test)
- elif distance == 'topsoe':
- return F.TopsoeDistance(Px_train, Px_test)
- else:
- return distance(Px_train, Px_test)
-
def fit(self, data: LabelledCollection, fit_classifier=True, val_split: Union[float, LabelledCollection] = None):
if val_split is None:
val_split = self.val_split
@@ -744,10 +717,11 @@ class DyS(AggregativeProbabilisticQuantifier, BinaryQuantifier):
Px = classif_posteriors[:, 1] # takes only the P(y=+1|x)
Px_test = np.histogram(Px, bins=self.n_bins, range=(0, 1), density=True)[0]
+ divergence = _get_divergence(self.divergence)
def distribution_distance(prev):
Px_train = prev * self.Pxy1_density + (1 - prev) * self.Pxy0_density
- return self._compute_distance(Px_train,Px_test,self.distance)
+ return divergence(Px_train, Px_test)
class1_prev = self._ternary_search(f=distribution_distance, left=0, right=1, tol=self.tol)
return np.asarray([1 - class1_prev, class1_prev])
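[Editor's note] A self-contained sketch of the ternary search DyS uses above to find the prevalence that minimizes the divergence (assumes the function is unimodal in [0, 1]):

def ternary_search(f, left=0.0, right=1.0, tol=1e-5):
    while abs(right - left) >= tol:
        left_third = left + (right - left) / 3
        right_third = right - (right - left) / 3
        if f(left_third) < f(right_third):
            right = right_third  # the minimum lies in [left, right_third]
        else:
            left = left_third    # the minimum lies in [left_third, right]
    return (left + right) / 2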
@@ -791,6 +765,122 @@ class SMM(AggregativeProbabilisticQuantifier, BinaryQuantifier):
return np.asarray([1 - class1_prev, class1_prev])
+def _get_divergence(divergence: Union[str, Callable]):
+ if isinstance(divergence, str):
+ if divergence == 'HD':
+ return F.HellingerDistance
+ elif divergence == 'topsoe':
+ return F.TopsoeDistance
+ else:
+ raise ValueError(f'unknown divergence {divergence}')
+ elif callable(divergence):
+ return divergence
+ else:
+ raise ValueError('argument "divergence" not understood; use a str or a callable function')
+
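[Editor's note] For reference, the Hellinger distance that F.HellingerDistance is assumed to compute above (sketch; the 1/sqrt(2) normalization is assumed to be omitted):

import numpy as np

def hellinger_distance(P, Q):
    # P and Q: equally sized arrays of non-negative values, each summing to 1
    return np.sqrt(np.sum((np.sqrt(P) - np.sqrt(Q)) ** 2))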
+class DistributionMatching(AggregativeProbabilisticQuantifier):
+ """
+ Generic Distribution Matching quantifier for binary or multiclass quantification.
+ This implementation takes the number of bins, the divergence, and whether to work on CDFs as hyperparameters.
+
+ :param classifier: a sklearn's Estimator that generates a probabilistic classifier
+ :param val_split: indicates the proportion of data to be used as a stratified held-out validation set to model the
+ validation distribution.
+ This parameter can be indicated as a real value (between 0 and 1, default 0.4), representing a proportion of
+ validation data, or as an integer, indicating that the validation distribution should be estimated via
+ `k`-fold cross validation (this integer stands for the number of folds `k`), or as a
+ :class:`quapy.data.base.LabelledCollection` (the split itself).
+ :param nbins: number of bins used to discretize the distributions (default 8)
+ :param divergence: a string representing a divergence measure (currently, "HD" and "topsoe" are implemented)
+ or a callable function taking two ndarrays of the same dimension as input (default "HD", meaning Hellinger
+ Distance)
+ :param cdf: whether or not to use CDF instead of PDF (default False)
+ :param n_jobs: number of parallel workers (default None)
+ """
+
+ def __init__(self, classifier, val_split=0.4, nbins=8, divergence: Union[str, Callable]='HD', cdf=False, n_jobs=None):
+
+ self.classifier = classifier
+ self.val_split = val_split
+ self.nbins = nbins
+ self.divergence = divergence
+ self.cdf = cdf
+ self.n_jobs = n_jobs
+
+ def __get_distributions(self, posteriors):
+ histograms = []
+ post_dims = posteriors.shape[1]
+ if post_dims == 2:
+ # in binary quantification we can use only one class, since the other one is its complement
+ post_dims = 1
+ for dim in range(post_dims):
+ hist = np.histogram(posteriors[:, dim], bins=self.nbins, range=(0, 1))[0]
+ histograms.append(hist)
+
+ counts = np.vstack(histograms)
+ distributions = counts/counts.sum(axis=1)[:,np.newaxis]
+ if self.cdf:
+ distributions = np.cumsum(distributions, axis=1)
+ return distributions
+
+ def fit(self, data: LabelledCollection, fit_classifier=True, val_split: Union[float, LabelledCollection] = None):
+ """
+ Trains the classifier (if requested) and generates the validation distributions out of the training data.
+ The validation distributions have shape `(n, ch, nbins)`, with `n` the number of classes, `ch` the number of
+ channels, and `nbins` the number of bins. In particular, let `V` be the validation distributions; `di=V[i]`
+ are the distributions obtained from training data labelled with class `i`; `dij = di[j]` is the discrete
+ distribution of posterior probabilities `P(Y=j|X=x)` for training data labelled with class `i`, and `dij[k]`
+ is the fraction of instances with a value in the `k`-th bin.
+
+ :param data: the training set
+ :param fit_classifier: set to False to bypass the training (the classifier is assumed to be already fit)
+ :param val_split: either a float in (0,1) indicating the proportion of training instances to use for
+ validation (e.g., 0.3 for using 30% of the training set as validation data), or a LabelledCollection
+ indicating the validation set itself, or an int indicating the number k of folds to be used in kFCV
+ to estimate the parameters
+ """
+ if val_split is None:
+ val_split = self.val_split
+
+ self.classifier, y, posteriors, classes, class_count = cross_generate_predictions(
+ data, self.classifier, val_split, probabilistic=True, fit_classifier=fit_classifier, n_jobs=self.n_jobs
+ )
+
+ self.validation_distribution = np.asarray(
+ [self.__get_distributions(posteriors[y==cat]) for cat in range(data.n_classes)]
+ )
+
+ return self
+
+ def aggregate(self, posteriors: np.ndarray):
+ """
+ Searches for the mixture model parameter (the sought prevalence values) that yields a validation distribution
+ (the mixture) that best matches the test distribution, in terms of the divergence measure of choice.
+ In the multiclass case, with `n` the number of classes, the test and mixture distributions contain
+ `n` channels (proper distributions of binned posterior probabilities), on which the divergence is computed
+ independently. The matching is computed as an average of the divergence across all channels.
+
+ :param posteriors: posterior probabilities of the instances in the sample, as returned by the classifier
+ :return: a vector of class prevalence estimates
+ """
+ test_distribution = self.__get_distributions(posteriors)
+ divergence = _get_divergence(self.divergence)
+ n_classes, n_channels, nbins = self.validation_distribution.shape
+ def match(prev):
+ prev = np.expand_dims(prev, axis=0)
+ mixture_distribution = (prev @ self.validation_distribution.reshape(n_classes,-1)).reshape(n_channels, -1)
+ divs = [divergence(test_distribution[ch], mixture_distribution[ch]) for ch in range(n_channels)]
+ return np.mean(divs)
+
+ # the initial point is set as the uniform distribution
+ uniform_distribution = np.full(fill_value=1 / n_classes, shape=(n_classes,))
+
+ # solutions are bounded to those contained in the unit-simplex
+ bounds = tuple((0, 1) for x in range(n_classes)) # values in [0,1]
+ constraints = ({'type': 'eq', 'fun': lambda x: 1 - sum(x)}) # values summing up to 1
+ r = optimize.minimize(match, x0=uniform_distribution, method='SLSQP', bounds=bounds, constraints=constraints)
+ return r.x
+
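[Editor's note] A hedged end-to-end sketch of the new quantifier (the dataset fetcher and its name are illustrative; any binary or multiclass dataset works):

import quapy as qp
from sklearn.linear_model import LogisticRegression
from quapy.method.aggregative import DistributionMatching

data = qp.datasets.fetch_UCIDataset('yeast')
dm = DistributionMatching(LogisticRegression(), nbins=8, divergence='HD', cdf=False)
dm.fit(data.training)
estim_prev = dm.quantify(data.test.instances)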
class ELM(AggregativeQuantifier, BinaryQuantifier):
"""
From 2485117f05d2bca08f187ab2cb0b46961d4b1f2c Mon Sep 17 00:00:00 2001
From: Alejandro Moreo
Date: Wed, 8 Feb 2023 19:06:53 +0100
Subject: [PATCH 53/67] adding documentation and adding one new example
---
docs/build/html/Datasets.html | 555 +-
docs/build/html/Evaluation.html | 57 +-
docs/build/html/Installation.html | 57 +-
docs/build/html/Methods.html | 99 +-
docs/build/html/Model-Selection.html | 65 +-
docs/build/html/Plotting.html | 71 +-
docs/build/html/_sources/index.rst.txt | 2 +-
.../_sources/quapy.classification.rst.txt | 23 +-
docs/build/html/_sources/quapy.data.rst.txt | 18 +-
docs/build/html/_sources/quapy.method.rst.txt | 22 +-
docs/build/html/_sources/quapy.rst.txt | 58 +-
docs/build/html/_sources/quapy.tests.rst.txt | 37 -
docs/build/html/_sources/readme.rst.txt | 7 -
docs/build/html/_sources/readme2.md.txt | 1 -
docs/build/html/_static/alabaster.css | 701 -
docs/build/html/_static/basic.css | 62 +-
docs/build/html/_static/bizstyle.css | 2 +
docs/build/html/_static/bizstyle.js | 43 +-
docs/build/html/_static/custom.css | 1 -
docs/build/html/_static/doctools.js | 373 +-
.../html/_static/documentation_options.js | 8 +-
docs/build/html/_static/jquery-3.5.1.js | 10872 ----------------
docs/build/html/_static/jquery.js | 4 +-
docs/build/html/_static/language_data.js | 102 +-
docs/build/html/_static/searchtools.js | 808 +-
docs/build/html/genindex.html | 319 +-
docs/build/html/index.html | 45 +-
docs/build/html/modules.html | 78 +-
docs/build/html/objects.inv | Bin 2591 -> 2873 bytes
docs/build/html/py-modindex.html | 24 +-
docs/build/html/quapy.classification.html | 448 +-
docs/build/html/quapy.data.html | 482 +-
docs/build/html/quapy.html | 1076 +-
docs/build/html/quapy.method.html | 1308 +-
docs/build/html/quapy.tests.html | 135 -
docs/build/html/readme.html | 129 -
docs/build/html/readme2.html | 92 -
docs/build/html/search.html | 12 +-
docs/build/html/searchindex.js | 2 +-
examples/custom_quantifier.py | 69 +
quapy/CHANGE_LOG.txt | 78 +-
quapy/__init__.py | 21 +-
quapy/classification/calibration.py | 77 +-
quapy/classification/svmperf.py | 1 +
quapy/data/datasets.py | 24 +
quapy/data/preprocessing.py | 9 +-
quapy/depr_evaluation.py | 439 -
quapy/error.py | 5 -
quapy/functional.py | 27 +-
quapy/method/aggregative.py | 58 +-
quapy/method/base.py | 23 +-
quapy/method/meta.py | 8 +-
quapy/method/non_aggregative.py | 27 -
quapy/model_selection.py | 2 +-
quapy/protocol.py | 69 +-
quapy/tests/test_methods.py | 17 +-
quapy/tests/test_modsel.py | 22 +-
quapy/tests/test_protocols.py | 6 +-
quapy/util.py | 18 +-
59 files changed, 3593 insertions(+), 15605 deletions(-)
delete mode 100644 docs/build/html/_sources/quapy.tests.rst.txt
delete mode 100644 docs/build/html/_sources/readme.rst.txt
delete mode 100644 docs/build/html/_sources/readme2.md.txt
delete mode 100644 docs/build/html/_static/alabaster.css
delete mode 100644 docs/build/html/_static/custom.css
delete mode 100644 docs/build/html/_static/jquery-3.5.1.js
delete mode 100644 docs/build/html/quapy.tests.html
delete mode 100644 docs/build/html/readme.html
delete mode 100644 docs/build/html/readme2.html
create mode 100644 examples/custom_quantifier.py
delete mode 100644 quapy/depr_evaluation.py
diff --git a/docs/build/html/Datasets.html b/docs/build/html/Datasets.html
index 6af836e..9c9eaa7 100644
--- a/docs/build/html/Datasets.html
+++ b/docs/build/html/Datasets.html
@@ -2,23 +2,26 @@
- Datasets — QuaPy 0.1.6 documentation
+ Datasets — QuaPy 0.1.7 documentation
[rebuilt Sphinx HTML <head> boilerplate omitted]
Fits the calibration in a cross-validation manner, i.e., it generates posterior probabilities for all
training instances via cross-validation, and then retrains the classifier on all training instances.
-The posterior probabilities thus generated are used for calibrating the outpus of the classifier.
+The posterior probabilities thus generated are used for calibrating the outputs of the classifier.
Parameters:
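[Editor's note] The cross-validated calibration scheme described above, as a minimal sklearn sketch (illustrative; the calibration classes in this patch are assumed to wrap the same idea around external recalibrators):

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_predict

def cv_posteriors_then_refit(X, y, k=5):
    clf = LogisticRegression()
    posteriors = cross_val_predict(clf, X, y, cv=k, method='predict_proba')
    clf.fit(X, y)           # retrain on all training instances
    return clf, posteriors  # the posteriors feed the calibration step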
diff --git a/docs/build/html/quapy.data.html b/docs/build/html/quapy.data.html
index 644725f..52a2d9c 100644
--- a/docs/build/html/quapy.data.html
+++ b/docs/build/html/quapy.data.html
@@ -222,10 +222,10 @@ the collection), prevs (the prevalence values for each class)
A LabelledCollection is a set of objects each with a label associated to it. This class implements many sampling
-routines.
+
A LabelledCollection is a set of objects, each with a label attached to it.
+This class implements several sampling routines and other utilities.
Parameters:
@@ -252,7 +252,7 @@ from the labels. The classes must be indicated in cases in which some of the lab
Return a random sample (an instance of LabelledCollection) of desired size and desired prevalence
values. For each class, the sampling is drawn without replacement if the requested prevalence is larger than
the actual prevalence of the class, or with replacement otherwise.
@@ -386,6 +403,7 @@ the actual prevalence of the class, or with replacement otherwise.
it is constrained. E.g., for binary collections, only the prevalence p for the first class (as listed in
self.classes_) can be specified, while the other class takes prevalence value 1-p
shuffle – if set to True (default), shuffles the index before returning it
Returns an index to be used to extract a random sample of desired size and desired prevalence values. If the
prevalence values are not specified, then returns the index of a uniform sampling.
For each class, the sampling is drawn without replacement if the requested prevalence is larger than
@@ -425,6 +443,7 @@ the actual prevalence of the class, or with replacement otherwise.
it is constrained. E.g., for binary collections, only the prevalence p for the first class (as listed in
self.classes_) can be specified, while the other class takes prevalence value 1-p
shuffle – if set to True (default), shuffles the index before returning it
+random_state – seed for reproducing sampling
Returns:
diff --git a/docs/build/html/quapy.html b/docs/build/html/quapy.html
index 682e83f..d72b33d 100644
--- a/docs/build/html/quapy.html
+++ b/docs/build/html/quapy.html
@@ -502,7 +502,7 @@ will be taken from the environment variable SAMPLE_SIZE (which has
Implementation of the artificial prevalence protocol (APP).
The APP consists of exploring a grid of prevalence values containing n_prevalences points (e.g.,
@@ -520,7 +520,8 @@ qp.environ[“SAMPLE_SIZE”]. If this is not set, a ValueError exception is rai
grid (default is 21)
repeats – number of copies for each valid prevalence vector (default is 10)
smooth_limits_epsilon – the quantity to add and subtract to the limits 0 and 1
-random_state – allows replicating samples across runs (default None)
+random_state – allows replicating samples across runs (default 0, meaning that the sequence of samples
+will be the same every time the protocol is called)
return_type – set to “sample_prev” (default) to get the pairs of (sample, prevalence) at each iteration, or
to “labelled_collection” to get instead instances of LabelledCollection
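[Editor's note] The APP grid referred to above, spelled out for the binary case (n_prevalences=21 gives steps of 0.05; each grid point is repeated `repeats` times):

import numpy as np

grid = np.linspace(0., 1., 21)    # [0.00, 0.05, ..., 1.00]
samples_to_draw = len(grid) * 10  # with repeats=10 -> 210 samples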
@@ -604,7 +605,7 @@ in the grid multiplied by repeat
An AbstractStochasticSeededProtocol is a protocol that generates, via any random procedure (e.g.,
via random sampling), sequences of quapy.data.base.LabelledCollection samples.
@@ -616,8 +617,8 @@ needed for extracting the samples, and
Generates mixtures of two domains (A and B) at controlled rates, but preserving the original class prevalence.
@@ -675,7 +676,8 @@ will be taken from the domain A (default).
mixture_points – an integer indicating the number of points to take from a linear scale (e.g., 21 will
generate the mixture points [1, 0.95, 0.9, …, 0]), or the array of mixture values itself.
-random_state –
+random_state – allows replicating samples across runs (default 0, meaning that the sequence of samples
+will be the same every time the protocol is called)
A generator of samples that implements the natural prevalence protocol (NPP). The NPP consists of drawing
samples uniformly at random, therefore approximately preserving the natural prevalence of the collection.
@@ -730,7 +732,8 @@ samples uniformly at random, therefore approximately preserving the natural prev
sample_size – integer, the number of instances in each sample; if None (default) then it is taken from
qp.environ[“SAMPLE_SIZE”]. If this is not set, a ValueError exception is raised.
repeats – the number of samples to generate. Default is 100.
-random_state – allows replicating samples across runs (default None)
+random_state – allows replicating samples across runs (default 0, meaning that the sequence of samples
+will be the same every time the protocol is called)
return_type – set to “sample_prev” (default) to get the pairs of (sample, prevalence) at each iteration, or
to “labelled_collection” to get instead instances of LabelledCollection
@@ -802,7 +805,7 @@ to “labelled_collection” to get instead instances of LabelledCollection<
A variant of APP that, instead of using a grid of equidistant prevalence values,
relies on the Kraemer algorithm for sampling the unit (k-1)-simplex uniformly at random, with
@@ -817,7 +820,8 @@ combinations of the grid values of APP makes this endeavour intractable.
sample_size – integer, the number of instances in each sample; if None (default) then it is taken from
qp.environ[“SAMPLE_SIZE”]. If this is not set, a ValueError exception is raised.
repeats – the number of samples to generate. Default is 100.
-random_state – allows replicating samples across runs (default None)
+random_state – allows replicating samples across runs (default 0, meaning that the sequence of samples
+will be the same every time the protocol is called)
return_type – set to “sample_prev” (default) to get the pairs of (sample, prevalence) at each iteration, or
to “labelled_collection” to get instead instances of LabelledCollection
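[Editor's note] A minimal sketch of the Kraemer method mentioned above for drawing prevalence vectors uniformly at random from the unit (k-1)-simplex:

import numpy as np

def kraemer_sample(k, rng=None):
    rng = rng if rng is not None else np.random.default_rng()
    cuts = np.sort(rng.uniform(size=k - 1))  # k-1 sorted cut points in [0, 1]
    bounds = np.concatenate(([0.0], cuts, [1.0]))
    return np.diff(bounds)                   # k non-negative values summing to 1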
diff --git a/docs/build/html/quapy.method.html b/docs/build/html/quapy.method.html
index a3ba728..19a1e0b 100644
--- a/docs/build/html/quapy.method.html
+++ b/docs/build/html/quapy.method.html
@@ -781,9 +781,9 @@ validation data, or as an integer, indicating that the misclassification rates s
Allows any binary quantifier to perform quantification on single-label datasets.
The method maintains one binary quantifier for each class, and then l1-normalizes the outputs so that the
class prevalences sum up to 1.
@@ -795,12 +795,15 @@ This variant was used, along with the
-Class labels, in the same order in which class prevalence values are to be computed.
-This default implementation actually returns the class labels of the learner.
If the base quantifier is not probabilistic, returns a matrix of shape (n,m,) with n the number of
instances and m the number of classes. The entry (i,j) is a binary value indicating whether instance
i belongs to class j. The binary classifications are independent of each other, meaning that an instance
@@ -845,63 +836,6 @@ probabilities are independent of each other, meaning that, in general, they do n
The method works on simple estimators as well as on nested objects
-(such as Pipeline). The latter have
-parameters of the form <component>__<parameter> so that it’s
-possible to update each component of a nested object.
-Parameters:
-**params (dict) – Estimator parameters.
-Returns:
-self – Estimator instance.
-Return type:
-estimator instance
@@ -1362,38 +1296,57 @@ validation data, or as an integer, indicating that the misclassification rates s
Allows any binary quantifier to perform quantification on single-label datasets. The method maintains one binary
quantifier for each class, and then l1-normalizes the outputs so that the class prevalence values sum up to 1.
[GIT binary patch data omitted]
diff --git a/docs/build/html/quapy.classification.html b/docs/build/html/quapy.classification.html
index 8e2a6b9..f5684c6 100644
--- a/docs/build/html/quapy.classification.html
+++ b/docs/build/html/quapy.classification.html
@@ -801,7 +801,7 @@ applied, meaning that if the longest document in the batch is shorter than
A wrapper for the SVM-perf package by Thorsten Joachims.
When using losses for quantification, the source code has to be patched. See
@@ -821,6 +821,8 @@ for further details.
C – trade-off between training error and margin (default 0.01)
verbose – set to True to print svm-perf std outputs
loss – the loss to optimize for. Available losses are “01”, “f1”, “kld”, “nkld”, “q”, “qacc”, “qf1”, “qgm”, “mae”, “mrae”.
+
host_folder – directory where to store the trained model; set to None (default) to use a tmp directory
+(temporary directories are automatically deleted)
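[Editor's note] A hedged usage sketch of the wrapper documented above (requires the patched svm_perf binaries; the path below is illustrative):

from quapy.classification.svmperf import SVMperf

svm = SVMperf(svmperf_base='./svm_perf_quantification', C=0.01, loss='kld')
# svm.fit(X, y); predictions = svm.predict(X_test)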
Indicates the total number of samples that the protocol generates.
+Returns the number of samples that will be generated
Returns:
-The number of samples to generate if known, or None otherwise.
+int
@@ -666,10 +666,11 @@ the sequence will be consistent every time the protocol is called.
Parameters:
-domainA –
-domainB –
-sample_size –
-repeats –
+domainA – one domain, an object of qp.data.LabelledCollection
+domainB – another domain, an object of qp.data.LabelledCollection
+sample_size – integer, the number of instances in each sample; if None (default) then it is taken from
+qp.environ[“SAMPLE_SIZE”]. If this is not set, a ValueError exception is raised.
+repeats – int, number of samples to draw for every mixture rate
prevalence – the prevalence to preserve along the mixtures. If specified, should be an array containing
one prevalence value (positive float) for each class and summing up to one. If not specified, the prevalence
will be taken from the domain A (default).
@@ -684,13 +685,13 @@ will be the same every time the protocol is called)
Returns a collator function, i.e., a function that prepares the yielded data
+Parameters:
+return_type – either ‘sample_prev’ (default) if the collator is requested to yield tuples of
+(sample, prevalence), or ‘labelled_collection’ when it is requested to yield instances of
+qp.data.LabelledCollection
+Returns:
+the collator function (a callable function that takes as input an instance of
+qp.data.LabelledCollection)
Returns a copy of this protocol that acts on a modified version of the original
+qp.data.LabelledCollection in which the original instances have been replaced
+with the outputs of a classifier for each instance. (This is convenient for speeding-up
+the evaluation procedures for many samples, by pre-classifying the instances in advance.)
+Parameters:
pre_classifications – the predictions issued by a classifier, typically an array-like
+with shape (n_instances,) when the classifier is a hard one, or with shape
+(n_instances, n_classes) when the classifier is a probabilistic one.
+in_place – whether or not to apply the modification in-place or in a new copy (default).
+Returns:
+a copy of this protocol
@@ -830,13 +867,13 @@ to “labelled_collection” to get instead instances of LabelledCollection<
Class of Explicit Loss Minimization (ELM) quantifiers.
-Quantifiers based on ELM represent a family of methods based on structured output learning;
-these quantifiers rely on classifiers that have been optimized using a quantification-oriented loss
-measure. This implementation relies on
-Joachims’ SVM perf structured output
-learning algorithm, which has to be installed and patched for the purpose (see this
-script).
-Parameters:
-svmperf_base – path to the folder containing the binary files of SVM perf
Provides the label predictions for the given instances. The predictions should respect the format expected by
-aggregate(), i.e., posterior probabilities for probabilistic quantifiers, or crisp predictions for
-non-probabilistic quantifiers
-Parameters:
-instances – array-like
-Returns:
-np.ndarray of shape (n_instances,) with label predictions
Allows any binary quantifier to perform quantification on single-label datasets.
The method maintains one binary quantifier for each class, and then l1-normalizes the outputs so that the
@@ -1029,108 +953,6 @@ learner has been trained outside the quantifier.
SVM(NKLD), which attempts to minimize a version of the Kullback-Leibler Divergence normalized
-via the logistic function, as proposed by
-Esuli et al. 2015.
-Equivalent to:
->>> ELM(svmperf_base,loss='nkld',**kwargs)
-Parameters:
-svmperf_base – path to the folder containing the binary files of SVM perf
SVM(Q), which attempts to minimize the Q loss combining a classification-oriented loss and a
-quantification-oriented loss, as proposed by
-Barranquero et al. 2015.
-Equivalent to:
->>> ELM(svmperf_base,loss='q',**kwargs)
-Parameters:
-svmperf_base – path to the folder containing the binary files of SVM perf
@@ -1247,6 +1069,162 @@ validation data, or as an integer, indicating that the misclassification rates s
quapy.method.aggregative.cross_generate_predictions_depr(data, classifier, val_split, probabilistic, fit_classifier, method_name='')¶
Explicit Loss Minimization (ELM) quantifiers.
+Quantifiers based on ELM represent a family of methods based on structured output learning;
+these quantifiers rely on classifiers that have been optimized using a quantification-oriented loss
+measure. This implementation relies on
+Joachims’ SVM perf structured output
+learning algorithm, which has to be installed and patched for the purpose (see this
+script).
+This function is equivalent to:
+>>> CC(SVMperf(svmperf_base,loss,C))
+Parameters:
+svmperf_base – path to the folder containing the binary files of SVM perf; if set to None (default)
+this path will be obtained from qp.environ[‘SVMPERF_HOME’]
SVM(AE) is an Explicit Loss Minimization (ELM) quantifier set to optimize for the Absolute Error as first used by
+Moreo and Sebastiani, 2021.
+Equivalent to:
+>>> CC(SVMperf(svmperf_base,loss='mae',C=C))
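[Editor's note] The deprecation pattern described in these entries, spelled out: the old ELM classes become thin factory functions returning CC over an SVMperf classifier (sketch; the function name is illustrative):

from quapy.classification.svmperf import SVMperf
from quapy.method.aggregative import CC

def new_svm_ae(svmperf_base=None, C=0.01):
    return CC(SVMperf(svmperf_base, C=C, loss='mae'))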
Quantifiers based on ELM represent a family of methods based on structured output learning;
+these quantifiers rely on classifiers that have been optimized using a quantification-oriented loss
+measure. This implementation relies on
+Joachims’ SVM perf structured output
+learning algorithm, which has to be installed and patched for the purpose (see this
+script).
+This function is a wrapper around CC(SVMperf(svmperf_base, loss, C))
+Parameters:
+svmperf_base – path to the folder containing the binary files of SVM perf; if set to None (default)
+this path will be obtained from qp.environ[‘SVMPERF_HOME’]
+C – trade-off between training error and margin (default 0.01)
+Returns:
returns an instance of CC set to work with SVMperf (with loss and C set properly) as the
+underlying classifier
SVM(NKLD) is an Explicit Loss Minimization (ELM) quantifier set to optimize for the Kullback-Leibler Divergence
+normalized via the logistic function, as proposed by
+Esuli et al. 2015.
+Equivalent to:
+>>> CC(SVMperf(svmperf_base,loss='nkld',C=C))
Quantifiers based on ELM represent a family of methods based on structured output learning;
+these quantifiers rely on classifiers that have been optimized using a quantification-oriented loss
+measure. This implementation relies on
+Joachims’ SVM perf structured output
+learning algorithm, which has to be installed and patched for the purpose (see this
+script).
+This function is a wrapper around CC(SVMperf(svmperf_base, loss, C))
+Parameters:
+svmperf_base – path to the folder containing the binary files of SVM perf; if set to None (default)
+this path will be obtained from qp.environ[‘SVMPERF_HOME’]
+C – trade-off between training error and margin (default 0.01)
+Returns:
returns an instance of CC set to work with SVMperf (with loss and C set properly) as the
+underlying classifier
SVM(Q) is an Explicit Loss Minimization (ELM) quantifier set to optimize for the Q loss combining a
+classification-oriented loss and a quantification-oriented loss, as proposed by
+Barranquero et al. 2015.
+Equivalent to:
+>>> CC(SVMperf(svmperf_base,loss='q',C=C))
Quantifiers based on ELM represent a family of methods based on structured output learning;
+these quantifiers rely on classifiers that have been optimized using a quantification-oriented loss
+measure. This implementation relies on
+Joachims’ SVM perf structured output
+learning algorithm, which has to be installed and patched for the purpose (see this
+script).
+This function is a wrapper around CC(SVMperf(svmperf_base, loss, C))
+Parameters:
+svmperf_base – path to the folder containing the binary files of SVM perf; if set to None (default)
+this path will be obtained from qp.environ[‘SVMPERF_HOME’]
+C – trade-off between training error and margin (default 0.01)
+Returns:
returns an instance of CC set to work with SVMperf (with loss and C set properly) as the
+underlying classifier
SVM(RAE) is an Explicit Loss Minimization (ELM) quantifier set to optimize for the Relative Absolute Error as first
+used by Moreo and Sebastiani, 2021.
+Equivalent to:
+>>> CC(SVMperf(svmperf_base,loss='mrae',C=C))
Quantifiers based on ELM represent a family of methods based on structured output learning;
+these quantifiers rely on classifiers that have been optimized using a quantification-oriented loss
+measure. This implementation relies on
+Joachims’ SVM perf structured output
+learning algorithm, which has to be installed and patched for the purpose (see this
+script).
+This function is a wrapper around CC(SVMperf(svmperf_base, loss, C))
+Parameters:
+svmperf_base – path to the folder containing the binary files of SVM perf; if set to None (default)
+this path will be obtained from qp.environ[‘SVMPERF_HOME’]
+C – trade-off between training error and margin (default 0.01)
+Returns:
returns an instance of CC set to work with SVMperf (with loss and C set properly) as the
+underlying classifier
Allows any binary quantifier to perform quantification on single-label datasets. The method maintains one binary
quantifier for each class, and then l1-normalizes the outputs so that the class prevalence values sum up to 1.
@@ -1343,8 +1321,8 @@ quantifier for each class, and then l1-normalizes the outputs so that the class