
many changes, see change log

This commit is contained in:
Alejandro Moreo Fernandez 2022-05-25 19:14:33 +02:00
parent 46e3632200
commit 4bc9d19635
14 changed files with 754 additions and 492 deletions

quapy/CHANGE_LOG.txt Normal file
View File

@ -0,0 +1,34 @@
# main changes in 0.1.7
- Protocols are now an abstraction, AbstractProtocol. There is a new class extending AbstractProtocol called
AbstractStochasticSeededProtocol, which implements a seeding policy that allows replicating the series of samplings.
There are some examples of protocols: APP, NPP, USimplexPP, CovariateShiftPP (experimental).
The idea is to start the sampling by simply calling the __call__ method (see the usage sketch right after this change log).
This change has a great impact on the framework, since many functions in qp.evaluation, qp.model_selection,
and sampling functions in LabelledCollection make use of the old functions.
- ACC, PACC, Forman's threshold variants have been parallelized.
Things to fix:
- eval budget policy?
- clean functions like binary, aggregative, probabilistic, etc; those should be resolved via isinstance()
- clean classes_ and n_classes from methods (maybe not from aggregative ones, but those have to be used only
internally and not imposed in any abstract class)
- optimize "qp.evaluation.prediction" for aggregative methods (pre-classification)
- update unit tests
- Policies should be able to set their output to "labelled_collection" or "instances_prevalence" or something similar.
- Policies should implement the "gen()" one, taking a reader function as an input, and a folder path maybe
- Review all documentation, redo the Sphinx doc, update Wikis...
- Resolve the OneVsAll thing (it is in base.py and in aggregative.py)
- Better handle the environment (e.g., with n_jobs)
- test cross_generate_predictions and cancel cross_generate_predictions_depr
- Add a proper log?
- test LoadSamplesFromDirectory (in protocols.py)
- improve plots?
- I have removed the distinction between "classify" and "posterior_probabilities" in the Aggregative quantifiers,
so that probabilistic quantifiers actually return posterior probabilities, while non-probabilistic quantifiers
return crisp decisions instead. The idea was to unify the quantification function (i.e., now it is always
classify & aggregate, irrespective of the class). However, this has caused a problem with OneVsAll. This has to
be checked, since it is now unnecessarily complicated (it also has old references to .probabilistic, and all this
stuff).
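A minimal sketch of the new protocol API described in the first item above (the constructor arguments and the iteration pattern mirror the tests added in this commit; the dataset is only illustrative):

import quapy as qp
from quapy.protocol import APP

data = qp.datasets.fetch_reviews('hp', tfidf=True, min_df=10)

# APP is an AbstractStochasticSeededProtocol: fixing random_seed makes the
# series of samplings replicable across calls of the protocol
prot = APP(data.test, sample_size=100, n_prevalences=11, repeats=1, random_seed=42)

# sampling starts by simply calling the protocol; each item is a LabelledCollection
for sample in prot():
    print(sample.prevalence())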

quapy/__init__.py
View File

@ -2,13 +2,13 @@ from . import error
from . import data
from quapy.data import datasets
from . import functional
from . import method
# from . import method
from . import evaluation
from . import protocol
from . import plot
from . import util
from . import model_selection
from . import classification
from quapy.method.base import isprobabilistic, isaggregative
__version__ = '0.1.7'
@ -21,5 +21,4 @@ environ = {
'SVMPERF_HOME': './svm_perf_quantification'
}
def isbinary(x):
return x.binary

quapy/data/base.py
View File

@ -210,10 +210,12 @@ class LabelledCollection:
:return: two instances of :class:`LabelledCollection`, the first one with `train_prop` elements, and the
second one with `1-train_prop` elements
"""
tr_docs, te_docs, tr_labels, te_labels = \
train_test_split(self.instances, self.labels, train_size=train_prop, stratify=self.labels,
random_state=random_state)
return LabelledCollection(tr_docs, tr_labels), LabelledCollection(te_docs, te_labels)
tr_docs, te_docs, tr_labels, te_labels = train_test_split(
self.instances, self.labels, train_size=train_prop, stratify=self.labels, random_state=random_state
)
training = LabelledCollection(tr_docs, tr_labels, classes_=self.classes_)
test = LabelledCollection(te_docs, te_labels, classes_=self.classes_)
return training, test
def __add__(self, other):
"""
@ -418,13 +420,3 @@ class Dataset:
yield Dataset(train, test, name=f'fold {(i % nfolds) + 1}/{nfolds} (round={(i // nfolds) + 1})')
def isbinary(data):
"""
Returns True if `data` is either a binary :class:`Dataset` or a binary :class:`LabelledCollection`
:param data: a :class:`Dataset` or a :class:`LabelledCollection` object
:return: True if labelled according to two classes
"""
if isinstance(data, Dataset) or isinstance(data, LabelledCollection):
return data.binary
return False
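The new split keeps the class space of the parent collection; a small sketch of the intended effect (dataset only illustrative):

import quapy as qp

data = qp.datasets.fetch_reviews('imdb', tfidf=True, min_df=10)

# both halves are now built with classes_=self.classes_, so the set of classes is
# preserved even if some class ends up with zero instances in one of the splits
training, validation = data.training.split_stratified(train_prop=0.7, random_state=1)
print(training.classes_, validation.classes_)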

quapy/evaluation.py Normal file
View File

@ -0,0 +1,102 @@
from typing import Union, Callable, Iterable
import numpy as np
from tqdm import tqdm
import inspect
import quapy as qp
from quapy.protocol import AbstractProtocol, OnLabelledCollectionProtocol
from quapy.data import LabelledCollection
from quapy.method.base import BaseQuantifier
from quapy.util import temp_seed
import quapy.functional as F
import pandas as pd
def prediction(model: BaseQuantifier, protocol: AbstractProtocol, verbose=False):
sout = lambda x: print(x) if verbose else None
from method.aggregative import AggregativeQuantifier
if isinstance(model, AggregativeQuantifier) and isinstance(protocol, OnLabelledCollectionProtocol):
sout('speeding up the prediction for the aggregative quantifier')
pre_classified = model.classify(protocol.get_labelled_collection().instances)
return __prediction_helper(model.aggregate, protocol.on_preclassified_instances(pre_classified), verbose)
else:
sout(f'the method is not aggregative, or the protocol is not an instance of '
f'{OnLabelledCollectionProtocol.__name__}, so no optimization can be carried out')
return __prediction_helper(model.quantify, protocol, verbose)
def __prediction_helper(quantification_fn, protocol: AbstractProtocol, verbose=False):
true_prevs, estim_prevs = [], []
for sample in tqdm(protocol(), total=protocol.total()) if verbose else protocol():
estim_prevs.append(quantification_fn(sample.instances))
true_prevs.append(sample.prevalence())
true_prevs = np.asarray(true_prevs)
estim_prevs = np.asarray(estim_prevs)
return true_prevs, estim_prevs
def evaluation_report(model: BaseQuantifier,
protocol: AbstractProtocol,
error_metrics:Iterable[Union[str,Callable]]='mae',
verbose=False):
true_prevs, estim_prevs = prediction(model, protocol, verbose)
return _prevalence_report(true_prevs, estim_prevs, error_metrics)
def _prevalence_report(true_prevs, estim_prevs, error_metrics: Iterable[Union[str, Callable]] = 'mae'):
if isinstance(error_metrics, str):
error_metrics = [error_metrics]
error_funcs = [qp.error.from_name(e) if isinstance(e, str) else e for e in error_metrics]
assert all(hasattr(e, '__call__') for e in error_funcs), 'invalid error functions'
error_names = [e.__name__ for e in error_funcs]
df = pd.DataFrame(columns=['true-prev', 'estim-prev'] + error_names)
for true_prev, estim_prev in zip(true_prevs, estim_prevs):
series = {'true-prev': true_prev, 'estim-prev': estim_prev}
for error_name, error_metric in zip(error_names, error_funcs):
score = error_metric(true_prev, estim_prev)
series[error_name] = score
df = df.append(series, ignore_index=True)
return df
def evaluate(model: BaseQuantifier, protocol: AbstractProtocol, error_metric:Union[str, Callable], verbose=False):
if isinstance(error_metric, str):
error_metric = qp.error.from_name(error_metric)
true_prevs, estim_prevs = prediction(model, protocol, verbose)
return error_metric(true_prevs, estim_prevs)
def _check_num_evals(n_classes, n_prevpoints=None, eval_budget=None, repeats=1, verbose=False):
if n_prevpoints is None and eval_budget is None:
raise ValueError('either n_prevpoints or eval_budget has to be specified')
elif n_prevpoints is None:
assert eval_budget > 0, 'eval_budget must be a positive integer'
n_prevpoints = F.get_nprevpoints_approximation(eval_budget, n_classes, repeats)
eval_computations = F.num_prevalence_combinations(n_prevpoints, n_classes, repeats)
if verbose:
print(f'setting n_prevpoints={n_prevpoints} so that the number of '
f'evaluations ({eval_computations}) does not exceed the evaluation '
f'budget ({eval_budget})')
elif eval_budget is None:
eval_computations = F.num_prevalence_combinations(n_prevpoints, n_classes, repeats)
if verbose:
print(f'{eval_computations} evaluations will be performed for each '
f'combination of hyper-parameters')
else:
eval_computations = F.num_prevalence_combinations(n_prevpoints, n_classes, repeats)
if eval_computations > eval_budget:
n_prevpoints = F.get_nprevpoints_approximation(eval_budget, n_classes, repeats)
new_eval_computations = F.num_prevalence_combinations(n_prevpoints, n_classes, repeats)
if verbose:
print(f'the budget of evaluations would be exceeded with '
f'n_prevpoints={n_prevpoints}. Changing to n_prevpoints={n_prevpoints}. This will produce '
f'{new_eval_computations} evaluation computations for each hyper-parameter combination.')
return n_prevpoints, eval_computations
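A sketch of the new protocol-based evaluation entry points (it mirrors the usage in the tests added by this commit; model and dataset are illustrative). Note that prediction() applies the pre-classification speed-up only when the model is aggregative and the protocol implements OnLabelledCollectionProtocol:

import quapy as qp
from quapy.method.aggregative import PACC
from quapy.protocol import APP
from sklearn.linear_model import LogisticRegression

data = qp.datasets.fetch_reviews('hp', tfidf=True, min_df=10)
model = PACC(LogisticRegression()).fit(data.training)

prot = APP(data.test, sample_size=100, n_prevalences=11, repeats=1, random_seed=42)

# single aggregated score
mae = qp.evaluation.evaluate(model, protocol=prot, error_metric='mae')

# per-sample report: a pandas DataFrame with true-prev, estim-prev and one column per error
report = qp.evaluation.evaluation_report(model, protocol=prot, error_metrics=['mae', 'mrae'])
print(mae)
print(report)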

quapy/method/aggregative.py
View File

@ -1,15 +1,13 @@
from abc import abstractmethod
from copy import deepcopy
from typing import Union
import numpy as np
from joblib import Parallel, delayed
from sklearn.base import BaseEstimator
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import StratifiedKFold, cross_val_predict
from tqdm import tqdm
import quapy as qp
import quapy.functional as F
from quapy.classification.svmperf import SVMperf
@ -61,7 +59,9 @@ class AggregativeQuantifier(BaseQuantifier):
def classify(self, instances):
"""
Provides the label predictions for the given instances.
Provides the label predictions for the given instances. The predictions should respect the format expected by
:meth:`aggregate`, i.e., posterior probabilities for probabilistic quantifiers, or crisp predictions for
non-probabilistic quantifiers
:param instances: array-like
:return: np.ndarray of shape `(n_instances,)` with label predictions
@ -118,16 +118,6 @@ class AggregativeQuantifier(BaseQuantifier):
"""
return self.learner.classes_
@property
def aggregative(self):
"""
Returns True, indicating the quantifier is of type aggregative.
:return: True
"""
return True
class AggregativeProbabilisticQuantifier(AggregativeQuantifier):
"""
@ -137,28 +127,25 @@ class AggregativeProbabilisticQuantifier(AggregativeQuantifier):
probabilities.
"""
def posterior_probabilities(self, instances):
def classify(self, instances):
return self.learner.predict_proba(instances)
def predict_proba(self, instances):
return self.posterior_probabilities(instances)
def quantify(self, instances):
classif_posteriors = self.posterior_probabilities(instances)
return self.aggregate(classif_posteriors)
def set_params(self, **parameters):
if isinstance(self.learner, CalibratedClassifierCV):
parameters = {'base_estimator__' + k: v for k, v in parameters.items()}
self.learner.set_params(**parameters)
@property
def probabilistic(self):
return True
# Helper
# ------------------------------------
def _ensure_probabilistic(learner):
if not hasattr(learner, 'predict_proba'):
print(f'The learner {learner.__class__.__name__} does not seem to be probabilistic. '
f'The learner will be calibrated.')
learner = CalibratedClassifierCV(learner, cv=5)
return learner
def _training_helper(learner,
data: LabelledCollection,
fit_learner: bool = True,
@ -180,10 +167,7 @@ def _training_helper(learner,
"""
if fit_learner:
if ensure_probabilistic:
if not hasattr(learner, 'predict_proba'):
print(f'The learner {learner.__class__.__name__} does not seem to be probabilistic. '
f'The learner will be calibrated.')
learner = CalibratedClassifierCV(learner, cv=5)
learner = _ensure_probabilistic(learner)
if val_split is not None:
if isinstance(val_split, float):
if not (0 < val_split < 1):
@ -214,6 +198,89 @@ def _training_helper(learner,
return learner, unused
def cross_generate_predictions(
data,
learner,
val_split,
probabilistic,
fit_learner,
n_jobs
):
if isinstance(val_split, int):
assert fit_learner == True, \
'the parameters for the adjustment cannot be estimated with kFCV with fit_learner=False'
if probabilistic:
learner = _ensure_probabilistic(learner)
predict = 'predict_proba'
else:
predict = 'predict'
y_pred = cross_val_predict(learner, *data.Xy, cv=val_split, n_jobs=n_jobs, method=predict)
class_count = data.counts()
# fit the learner on all data
learner.fit(*data.Xy)
classes = data.classes_
else:
learner, val_data = _training_helper(
learner, data, fit_learner, ensure_probabilistic=probabilistic, val_split=val_split
)
y_pred = learner.predict_proba(val_data.instances) if probabilistic else learner.predict(val_data.instances)
y = val_data.labels
classes = val_data.classes_
class_count = val_data.counts()
return learner, y, y_pred, classes, class_count
def cross_generate_predictions_depr(
data,
learner,
val_split,
probabilistic,
fit_learner,
method_name=''
):
predict = learner.predict_proba if probabilistic else learner.predict
if isinstance(val_split, int):
assert fit_learner == True, \
'the parameters for the adjustment cannot be estimated with kFCV with fit_learner=False'
# kFCV estimation of parameters
y, y_ = [], []
kfcv = StratifiedKFold(n_splits=val_split)
pbar = tqdm(kfcv.split(*data.Xy), total=val_split)
for k, (training_idx, validation_idx) in enumerate(pbar):
pbar.set_description(f'{method_name}\tfitting fold {k}')
training = data.sampling_from_index(training_idx)
validation = data.sampling_from_index(validation_idx)
learner, val_data = _training_helper(
learner, training, fit_learner, ensure_probabilistic=probabilistic, val_split=validation
)
y_.append(predict(val_data.instances))
y.append(val_data.labels)
y = np.concatenate(y)
y_ = np.concatenate(y_)
class_count = data.counts()
# fit the learner on all data
learner, _ = _training_helper(
learner, data, fit_learner, ensure_probabilistic=probabilistic, val_split=None
)
classes = data.classes_
else:
learner, val_data = _training_helper(
learner, data, fit_learner, ensure_probabilistic=probabilistic, val_split=val_split
)
y_ = predict(val_data.instances)
y = val_data.labels
classes = val_data.classes_
class_count = val_data.counts()
return learner, y, y_, classes, class_count
# Methods
# ------------------------------------
class CC(AggregativeQuantifier):
@ -264,9 +331,10 @@ class ACC(AggregativeQuantifier):
:class:`quapy.data.base.LabelledCollection` (the split itself).
"""
def __init__(self, learner: BaseEstimator, val_split=0.4):
def __init__(self, learner: BaseEstimator, val_split=0.4, n_jobs=1):
self.learner = learner
self.val_split = val_split
self.n_jobs = n_jobs
def fit(self, data: LabelledCollection, fit_learner=True, val_split: Union[float, int, LabelledCollection] = None):
"""
@ -280,44 +348,33 @@ class ACC(AggregativeQuantifier):
cross validation to estimate the parameters
:return: self
"""
if val_split is None:
val_split = self.val_split
if isinstance(val_split, int):
assert fit_learner == True, \
'the parameters for the adjustment cannot be estimated with kFCV with fit_learner=False'
# kFCV estimation of parameters
y, y_ = [], []
kfcv = StratifiedKFold(n_splits=val_split)
pbar = tqdm(kfcv.split(*data.Xy), total=val_split)
for k, (training_idx, validation_idx) in enumerate(pbar):
pbar.set_description(f'{self.__class__.__name__} fitting fold {k}')
training = data.sampling_from_index(training_idx)
validation = data.sampling_from_index(validation_idx)
learner, val_data = _training_helper(self.learner, training, fit_learner, val_split=validation)
y_.append(learner.predict(val_data.instances))
y.append(val_data.labels)
y = np.concatenate(y)
y_ = np.concatenate(y_)
class_count = data.counts()
# fit the learner on all data
self.learner, _ = _training_helper(self.learner, data, fit_learner, val_split=None)
else:
self.learner, val_data = _training_helper(self.learner, data, fit_learner, val_split=val_split)
y_ = self.learner.predict(val_data.instances)
y = val_data.labels
class_count = val_data.counts()
self.learner, y, y_, classes, class_count = cross_generate_predictions(
data, self.learner, val_split, probabilistic=False, fit_learner=fit_learner, n_jobs=self.n_jobs
)
self.cc = CC(self.learner)
# estimate the matrix with entry (i,j) being the estimate of P(yi|yj), that is, the probability that a
# document that belongs to yj ends up being classified as belonging to yi
self.Pte_cond_estim_ = confusion_matrix(y, y_).T / class_count
self.Pte_cond_estim_ = self.getPteCondEstim(data.classes_, y, y_)
return self
@classmethod
def getPteCondEstim(cls, classes, y, y_):
# estimate the matrix with entry (i,j) being the estimate of P(yi|yj), that is, the probability that a
# document that belongs to yj ends up being classified as belonging to yi
conf = confusion_matrix(y, y_, labels=classes).T
conf = conf.astype(np.float)
class_counts = conf.sum(axis=0)
for i, _ in enumerate(classes):
if class_counts[i] == 0:
conf[i, i] = 1
else:
conf[:, i] /= class_counts[i]
return conf
def classify(self, data):
return self.cc.classify(data)
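For reference, a tiny worked example of the matrix built by getPteCondEstim (simplified: it omits the guard for classes with zero validation instances); entry (i,j) estimates P(predicted=i|true=j):

import numpy as np
from sklearn.metrics import confusion_matrix

classes = [0, 1]
y  = np.array([0, 0, 0, 0, 1, 1])   # true labels of the validation split
y_ = np.array([0, 0, 1, 1, 1, 1])   # crisp predictions of the classifier

conf = confusion_matrix(y, y_, labels=classes).T.astype(float)
conf /= conf.sum(axis=0)   # divide each column j by the number of instances of true class j
print(conf)   # [[0.5, 0.0], [0.5, 1.0]]: fpr = conf[1,0] = 0.5, tpr = conf[1,1] = 1.0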
@ -380,9 +437,10 @@ class PACC(AggregativeProbabilisticQuantifier):
:class:`quapy.data.base.LabelledCollection` (the split itself).
"""
def __init__(self, learner: BaseEstimator, val_split=0.4):
def __init__(self, learner: BaseEstimator, val_split=0.4, n_jobs=1):
self.learner = learner
self.val_split = val_split
self.n_jobs = n_jobs
def fit(self, data: LabelledCollection, fit_learner=True, val_split: Union[float, int, LabelledCollection] = None):
"""
@ -396,52 +454,31 @@ class PACC(AggregativeProbabilisticQuantifier):
to estimate the parameters
:return: self
"""
if val_split is None:
val_split = self.val_split
if isinstance(val_split, int):
assert fit_learner == True, \
'the parameters for the adjustment cannot be estimated with kFCV with fit_learner=False'
# kFCV estimation of parameters
y, y_ = [], []
kfcv = StratifiedKFold(n_splits=val_split)
pbar = tqdm(kfcv.split(*data.Xy), total=val_split)
for k, (training_idx, validation_idx) in enumerate(pbar):
pbar.set_description(f'{self.__class__.__name__} fitting fold {k}')
training = data.sampling_from_index(training_idx)
validation = data.sampling_from_index(validation_idx)
learner, val_data = _training_helper(
self.learner, training, fit_learner, ensure_probabilistic=True, val_split=validation)
y_.append(learner.predict_proba(val_data.instances))
y.append(val_data.labels)
y = np.concatenate(y)
y_ = np.vstack(y_)
# fit the learner on all data
self.learner, _ = _training_helper(self.learner, data, fit_learner, ensure_probabilistic=True,
val_split=None)
classes = data.classes_
else:
self.learner, val_data = _training_helper(
self.learner, data, fit_learner, ensure_probabilistic=True, val_split=val_split)
y_ = self.learner.predict_proba(val_data.instances)
y = val_data.labels
classes = val_data.classes_
self.learner, y, y_, classes, class_count = cross_generate_predictions(
data, self.learner, val_split, probabilistic=True, fit_learner=fit_learner, n_jobs=self.n_jobs
)
self.pcc = PCC(self.learner)
self.Pte_cond_estim_ = self.getPteCondEstim(classes, y, y_)
return self
@classmethod
def getPteCondEstim(cls, classes, y, y_):
# estimate the matrix with entry (i,j) being the estimate of P(yi|yj), that is, the probability that a
# document that belongs to yj ends up being classified as belonging to yi
n_classes = len(classes)
confusion = np.empty(shape=(n_classes, n_classes))
confusion = np.eye(n_classes)
for i, class_ in enumerate(classes):
confusion[i] = y_[y == class_].mean(axis=0)
idx = y == class_
if idx.any():
confusion[i] = y_[idx].mean(axis=0)
self.Pte_cond_estim_ = confusion.T
return self
return confusion.T
def aggregate(self, classif_posteriors):
prevs_estim = self.pcc.aggregate(classif_posteriors)
@ -557,7 +594,7 @@ class HDy(AggregativeProbabilisticQuantifier, BinaryQuantifier):
self._check_binary(data, self.__class__.__name__)
self.learner, validation = _training_helper(
self.learner, data, fit_learner, ensure_probabilistic=True, val_split=val_split)
Px = self.posterior_probabilities(validation.instances)[:, 1] # takes only the P(y=+1|x)
Px = self.classify(validation.instances)[:, 1] # takes only the P(y=+1|x)
self.Pxy1 = Px[validation.labels == self.learner.classes_[1]]
self.Pxy0 = Px[validation.labels == self.learner.classes_[0]]
# pre-compute the histogram for positive and negative examples
@ -732,44 +769,24 @@ class ThresholdOptimization(AggregativeQuantifier, BinaryQuantifier):
:class:`quapy.data.base.LabelledCollection` (the split itself).
"""
def __init__(self, learner: BaseEstimator, val_split=0.4):
def __init__(self, learner: BaseEstimator, val_split=0.4, n_jobs=1):
self.learner = learner
self.val_split = val_split
self.n_jobs = n_jobs
def fit(self, data: LabelledCollection, fit_learner=True, val_split: Union[float, int, LabelledCollection] = None):
self._check_binary(data, "Threshold Optimization")
if val_split is None:
val_split = self.val_split
if isinstance(val_split, int):
assert fit_learner == True, \
'the parameters for the adjustment cannot be estimated with kFCV with fit_learner=False'
# kFCV estimation of parameters
y, probabilities = [], []
kfcv = StratifiedKFold(n_splits=val_split)
pbar = tqdm(kfcv.split(*data.Xy), total=val_split)
for k, (training_idx, validation_idx) in enumerate(pbar):
pbar.set_description(f'{self.__class__.__name__} fitting fold {k}')
training = data.sampling_from_index(training_idx)
validation = data.sampling_from_index(validation_idx)
learner, val_data = _training_helper(self.learner, training, fit_learner, val_split=validation)
probabilities.append(learner.predict_proba(val_data.instances))
y.append(val_data.labels)
y = np.concatenate(y)
probabilities = np.concatenate(probabilities)
# fit the learner on all data
self.learner, _ = _training_helper(self.learner, data, fit_learner, val_split=None)
else:
self.learner, val_data = _training_helper(self.learner, data, fit_learner, val_split=val_split)
probabilities = self.learner.predict_proba(val_data.instances)
y = val_data.labels
self.learner, y, y_, classes, class_count = cross_generate_predictions(
data, self.learner, val_split, probabilistic=True, fit_learner=fit_learner, n_jobs=self.n_jobs
)
self.cc = CC(self.learner)
self.tpr, self.fpr = self._optimize_threshold(y, probabilities)
self.tpr, self.fpr = self._optimize_threshold(y, y_)
return self
@ -828,7 +845,7 @@ class ThresholdOptimization(AggregativeQuantifier, BinaryQuantifier):
def _compute_tpr(self, TP, FP):
if TP + FP == 0:
return 0
return 1
return TP / (TP + FP)
def _compute_fpr(self, FP, TN):
@ -1022,54 +1039,59 @@ class OneVsAll(AggregativeQuantifier):
def classify(self, instances):
"""
Returns a matrix of shape `(n,m,)` with `n` the number of instances and `m` the number of classes. The entry
`(i,j)` is a binary value indicating whether instance `i `belongs to class `j`. The binary classifications are
independent of each other, meaning that an instance can end up be attributed to 0, 1, or more classes.
If the base quantifier is not probabilistic, returns a matrix of shape `(n,m,)` with `n` the number of
instances and `m` the number of classes. The entry `(i,j)` is a binary value indicating whether instance
`i` belongs to class `j`. The binary classifications are independent of each other, meaning that an instance
can end up being attributed to 0, 1, or more classes.
If the base quantifier is probabilistic, returns a matrix of shape `(n,m,2)` with `n` the number of instances
and `m` the number of classes. The entry `(i,j,1)` (resp. `(i,j,0)`) is a value in [0,1] indicating the
posterior probability that instance `i` belongs (resp. does not belong) to class `j`. The posterior
probabilities are independent of each other, meaning that, in general, they do not sum up to one.
:param instances: array-like
:return: `np.ndarray`
"""
classif_predictions_bin = self.__parallel(self._delayed_binary_classification, instances)
return classif_predictions_bin.T
def posterior_probabilities(self, instances):
"""
Returns a matrix of shape `(n,m,2)` with `n` the number of instances and `m` the number of classes. The entry
`(i,j,1)` (resp. `(i,j,0)`) is a value in [0,1] indicating the posterior probability that instance `i` belongs
(resp. does not belong) to class `j`.
The posterior probabilities are independent of each other, meaning that, in general, they do not sum
up to one.
:param instances: array-like
:return: `np.ndarray`
"""
if not self.binary_quantifier.probabilistic:
raise NotImplementedError(f'{self.__class__.__name__} does not implement posterior_probabilities because '
f'the base quantifier {self.binary_quantifier.__class__.__name__} is not '
f'probabilistic')
posterior_predictions_bin = self.__parallel(self._delayed_binary_posteriors, instances)
return np.swapaxes(posterior_predictions_bin, 0, 1)
def aggregate(self, classif_predictions_bin):
if self.probabilistic:
assert classif_predictions_bin.shape[1] == self.n_classes and classif_predictions_bin.shape[2] == 2, \
'param classif_predictions_bin does not seem to be a valid matrix (ndarray) of posterior ' \
'probabilities (2 dimensions) for each document (row) and class (columns)'
classif_predictions = self.__parallel(self._delayed_binary_classification, instances)
if isinstance(self.binary_quantifier, AggregativeProbabilisticQuantifier):
return np.swapaxes(classif_predictions, 0, 1)
else:
assert set(np.unique(classif_predictions_bin)).issubset({0, 1}), \
'param classif_predictions_bin does not seem to be a valid matrix (ndarray) of binary ' \
'predictions for each document (row) and class (columns)'
prevalences = self.__parallel(self._delayed_binary_aggregate, classif_predictions_bin)
return classif_predictions.T
#
# def posterior_probabilities(self, instances):
# """
# Returns a matrix of shape `(n,m,2)` with `n` the number of instances and `m` the number of classes. The entry
# `(i,j,1)` (resp. `(i,j,0)`) is a value in [0,1] indicating the posterior probability that instance `i` belongs
# (resp. does not belong) to class `j`.
# The posterior probabilities are independent of each other, meaning that, in general, they do not sum
# up to one.
#
# :param instances: array-like
# :return: `np.ndarray`
# """
#
# if not isinstance(self.binary_quantifier, AggregativeProbabilisticQuantifier):
# raise NotImplementedError(f'{self.__class__.__name__} does not implement posterior_probabilities because '
# f'the base quantifier {self.binary_quantifier.__class__.__name__} is not '
# f'probabilistic')
# posterior_predictions_bin = self.__parallel(self._delayed_binary_posteriors, instances)
# return np.swapaxes(posterior_predictions_bin, 0, 1)
def aggregate(self, classif_predictions):
# if self.probabilistic:
# assert classif_predictions.shape[1] == self.n_classes and classif_predictions.shape[2] == 2, \
# 'param classif_predictions_bin does not seem to be a valid matrix (ndarray) of posterior ' \
# 'probabilities (2 dimensions) for each document (row) and class (columns)'
# else:
# assert set(np.unique(classif_predictions)).issubset({0, 1}), \
# 'param classif_predictions_bin does not seem to be a valid matrix (ndarray) of binary ' \
# 'predictions for each document (row) and class (columns)'
prevalences = self.__parallel(self._delayed_binary_aggregate, classif_predictions)
return F.normalize_prevalence(prevalences)
def quantify(self, X):
if self.probabilistic:
predictions = self.posterior_probabilities(X)
else:
predictions = self.classify(X)
return self.aggregate(predictions)
# def quantify(self, X):
# predictions = self.classify(X)
# return self.aggregate(predictions)
def __parallel(self, func, *args, **kwargs):
return np.asarray(
@ -1093,9 +1115,6 @@ class OneVsAll(AggregativeQuantifier):
def _delayed_binary_classification(self, c, X):
return self.dict_binary_quantifiers[c].classify(X)
def _delayed_binary_posteriors(self, c, X):
return self.dict_binary_quantifiers[c].posterior_probabilities(X)
def _delayed_binary_aggregate(self, c, classif_predictions):
# the estimation for the positive class prevalence
return self.dict_binary_quantifiers[c].aggregate(classif_predictions[:, c])[1]
@ -1104,21 +1123,3 @@ class OneVsAll(AggregativeQuantifier):
bindata = LabelledCollection(data.instances, data.labels == c, classes_=[False, True])
self.dict_binary_quantifiers[c].fit(bindata)
@property
def binary(self):
"""
Informs that the classifier is not binary
:return: False
"""
return False
@property
def probabilistic(self):
"""
Indicates if the classifier is probabilistic or not (depending on the nature of the base classifier).
:return: boolean
"""
return self.binary_quantifier.probabilistic
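As a sketch of the unified classify-and-aggregate contract described in the change log (data and learners are illustrative; the OneVsAll case is still flagged above as pending):

import quapy as qp
from quapy.method.aggregative import CC, PACC
from sklearn.linear_model import LogisticRegression

data = qp.datasets.fetch_reviews('hp', tfidf=True, min_df=10)

cc = CC(LogisticRegression()).fit(data.training)
pacc = PACC(LogisticRegression()).fit(data.training)

crisp = cc.classify(data.test.instances)          # non-probabilistic: crisp label predictions
posteriors = pacc.classify(data.test.instances)   # probabilistic: posterior probabilities

prevs_cc = cc.aggregate(crisp)           # equivalent to cc.quantify(data.test.instances)
prevs_pacc = pacc.aggregate(posteriors)  # equivalent to pacc.quantify(data.test.instances)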

quapy/method/base.py
View File

@ -51,56 +51,6 @@ class BaseQuantifier(metaclass=ABCMeta):
"""
...
@property
@abstractmethod
def classes_(self):
"""
Class labels, in the same order in which class prevalence values are to be computed.
:return: array-like
"""
...
@property
def n_classes(self):
"""
Returns the number of classes
:return: integer
"""
return len(self.classes_)
# these methods allows meta-learners to reimplement the decision based on their constituents, and not
# based on class structure
@property
def binary(self):
"""
Indicates whether the quantifier is binary or not.
:return: False (to be overridden)
"""
return False
@property
def aggregative(self):
"""
Indicates whether the quantifier is of type aggregative or not
:return: False (to be overridden)
"""
return False
@property
def probabilistic(self):
"""
Indicates whether the quantifier is of type probabilistic or not
:return: False (to be overridden)
"""
return False
class BinaryQuantifier(BaseQuantifier):
"""
@ -112,46 +62,8 @@ class BinaryQuantifier(BaseQuantifier):
assert data.binary, f'{quantifier_name} works only on problems of binary classification. ' \
f'Use the class OneVsAll to enable {quantifier_name} work on single-label data.'
@property
def binary(self):
"""
Informs that the quantifier is binary
:return: True
"""
return True
def isbinary(model:BaseQuantifier):
"""
Alias for property `binary`
:param model: the model
:return: True if the model is binary, False otherwise
"""
return model.binary
def isaggregative(model:BaseQuantifier):
"""
Alias for property `aggregative`
:param model: the model
:return: True if the model is aggregative, False otherwise
"""
return model.aggregative
def isprobabilistic(model:BaseQuantifier):
"""
Alias for property `probabilistic`
:param model: the model
:return: True if the model is probabilistic, False otherwise
"""
return model.probabilistic
# class OneVsAll:

quapy/method/meta.py
View File

@ -234,19 +234,6 @@ class Ensemble(BaseQuantifier):
order = np.argsort(dist)
return _select_k(predictions, order, k=self.red_size)
@property
def classes_(self):
return self.base_quantifier.classes_
@property
def binary(self):
"""
Returns a boolean indicating whether the base quantifiers are binary or not
:return: boolean
"""
return self.base_quantifier.binary
@property
def aggregative(self):
"""

quapy/method/neural.py
View File

@ -191,7 +191,7 @@ class QuaNetTrainer(BaseQuantifier):
label_predictions = np.argmax(posteriors, axis=-1)
prevs_estim = []
for quantifier in self.quantifiers.values():
predictions = posteriors if quantifier.probabilistic else label_predictions
predictions = posteriors if isinstance(quantifier, AggregativeProbabilisticQuantifier) else label_predictions
prevs_estim.extend(quantifier.aggregate(predictions))
# there is no real need for adding static estims like the TPR or FPR from training since those are constant

quapy/model_selection.py
View File

@ -2,14 +2,12 @@ import itertools
import signal
from copy import deepcopy
from typing import Union, Callable
import numpy as np
import evaluation
import quapy as qp
from protocol import AbstractProtocol, OnLabelledCollectionProtocol
from quapy.data.base import LabelledCollection
from quapy.evaluation import artificial_prevalence_prediction, natural_prevalence_prediction, gen_prevalence_prediction
from quapy.method.aggregative import BaseQuantifier
import inspect
from time import time
class GridSearchQ(BaseQuantifier):
@ -21,33 +19,11 @@ class GridSearchQ(BaseQuantifier):
:param model: the quantifier to optimize
:type model: BaseQuantifier
:param param_grid: a dictionary with keys the parameter names and values the list of values to explore
:param sample_size: the size of the samples to extract from the validation set (ignored if protocol='gen')
:param protocol: either 'app' for the artificial prevalence protocol, 'npp' for the natural prevalence
protocol, or 'gen' for using a custom sampling generator function
:param n_prevpoints: if specified, indicates the number of equally distant points to extract from the interval
[0,1] in order to define the prevalences of the samples; e.g., if n_prevpoints=5, then the prevalences for
each class will be explored in [0.00, 0.25, 0.50, 0.75, 1.00]. If not specified, then eval_budget is requested.
Ignored if protocol!='app'.
:param n_repetitions: the number of repetitions for each combination of prevalences. This parameter is ignored
for the protocol='app' if eval_budget is set and is lower than the number of combinations that would be
generated using the value assigned to n_prevpoints (for the current number of classes and n_repetitions).
Ignored for protocol='npp' and protocol='gen' (use eval_budget for setting a maximum number of samples in
those cases).
:param eval_budget: if specified, sets a ceil on the number of evaluations to perform for each hyper-parameter
combination. For example, if protocol='app', there are 3 classes, n_repetitions=1 and eval_budget=20, then
n_prevpoints will be set to 5, since this will generate 15 different prevalences, i.e., [0, 0, 1],
[0, 0.25, 0.75], [0, 0.5, 0.5] ... [1, 0, 0], and since setting it to 6 would generate more than
20. When protocol='gen', indicates the maximum number of samples to generate, but less samples will be
generated if the generator yields less samples.
:param protocol: a sample-generation protocol, an instance of :class:`quapy.protocol.AbstractProtocol`
:param error: an error function (callable) or a string indicating the name of an error function (valid ones
are those in qp.error.QUANTIFICATION_ERROR)
:param refit: whether or not to refit the model on the whole labelled collection (training+validation) with
the best chosen hyperparameter combination. Ignored if protocol='gen'
:param val_split: either a LabelledCollection on which to test the performance of the different settings, or
a float in [0,1] indicating the proportion of labelled data to extract from the training set, or a callable
returning a generator function each time it is invoked (only for protocol='gen').
:param n_jobs: number of parallel jobs
:param random_seed: set the seed of the random generator to replicate experiments. Ignored if protocol='gen'.
:param timeout: establishes a timer (in seconds) for each of the hyperparameters configurations being tested.
Whenever a run takes longer than this timer, that configuration will be ignored. If all configurations end up
being ignored, a TimeoutError exception is raised. If -1 (default) then no time bound is set.
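A sketch of the new, protocol-driven model selection interface (it mirrors the test_modsel test added in this commit; dataset and learner are illustrative):

import numpy as np
import quapy as qp
from quapy.method.aggregative import PACC
from quapy.model_selection import GridSearchQ
from quapy.protocol import APP
from sklearn.linear_model import LogisticRegression

data = qp.datasets.fetch_reviews('imdb', tfidf=True, min_df=10)
training, validation = data.training.split_stratified(train_prop=0.7, random_state=1)

# the protocol now encapsulates the validation collection, the sample size, the sampling
# policy and the seed, replacing the old sample_size/n_prevpoints/eval_budget arguments
app = APP(validation, sample_size=100, random_seed=1)

q = GridSearchQ(
    model=PACC(LogisticRegression()),
    param_grid={'C': np.logspace(-3, 3, 7)},
    protocol=app,
    error='mae',
    refit=True,
    verbose=True
).fit(training)

print('best params', q.best_params_, 'best score', q.best_score_)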
@ -57,65 +33,27 @@ class GridSearchQ(BaseQuantifier):
def __init__(self,
model: BaseQuantifier,
param_grid: dict,
sample_size: Union[int, None],
protocol='app',
n_prevpoints: int = None,
n_repetitions: int = 1,
eval_budget: int = None,
protocol: AbstractProtocol,
error: Union[Callable, str] = qp.error.mae,
refit=True,
val_split=0.4,
n_jobs=1,
random_seed=42,
timeout=-1,
n_jobs=1,
verbose=False):
self.model = model
self.param_grid = param_grid
self.sample_size = sample_size
self.protocol = protocol.lower()
self.n_prevpoints = n_prevpoints
self.n_repetitions = n_repetitions
self.eval_budget = eval_budget
self.protocol = protocol
self.refit = refit
self.val_split = val_split
self.n_jobs = n_jobs
self.random_seed = random_seed
self.timeout = timeout
self.n_jobs = n_jobs
self.verbose = verbose
self.__check_error(error)
assert self.protocol in {'app', 'npp', 'gen'}, \
'unknown protocol: valid ones are "app" or "npp" for the "artificial" or the "natural" prevalence ' \
'protocols. Use protocol="gen" when passing a generator function thorough val_split that yields a ' \
'sample (instances) and their prevalence (ndarray) at each iteration.'
assert self.eval_budget is None or isinstance(self.eval_budget, int)
if self.protocol in ['npp', 'gen']:
if self.protocol=='npp' and (self.eval_budget is None or self.eval_budget <= 0):
raise ValueError(f'when protocol="npp" the parameter eval_budget should be '
f'indicated (and should be >0).')
if self.n_repetitions != 1:
print('[warning] n_repetitions has been set and will be ignored for the selected protocol')
assert isinstance(protocol, AbstractProtocol), 'unknown protocol'
def _sout(self, msg):
if self.verbose:
print(f'[{self.__class__.__name__}]: {msg}')
def __check_training_validation(self, training, validation):
if isinstance(validation, LabelledCollection):
return training, validation
elif isinstance(validation, float):
assert 0. < validation < 1., 'validation proportion should be in (0,1)'
training, validation = training.split_stratified(train_prop=1 - validation)
return training, validation
elif self.protocol=='gen' and inspect.isgenerator(validation()):
return training, validation
else:
raise ValueError(f'"validation" must either be a LabelledCollection or a float in (0,1) indicating the'
f'proportion of training documents to extract (type found: {type(validation)}). '
f'Optionally, "validation" can be a callable function returning a generator that yields '
f'the sample instances along with their true prevalence at each iteration by '
f'setting protocol="gen".')
def __check_error(self, error):
if error in qp.error.QUANTIFICATION_ERROR:
self.error = error
@ -127,96 +65,86 @@ class GridSearchQ(BaseQuantifier):
raise ValueError(f'unexpected error type; must either be a callable function or a str representing\n'
f'the name of an error function in {qp.error.QUANTIFICATION_ERROR_NAMES}')
def __generate_predictions(self, model, val_split):
commons = {
'n_repetitions': self.n_repetitions,
'n_jobs': self.n_jobs,
'random_seed': self.random_seed,
'verbose': False
}
if self.protocol == 'app':
return artificial_prevalence_prediction(
model, val_split, self.sample_size,
n_prevpoints=self.n_prevpoints,
eval_budget=self.eval_budget,
**commons
)
elif self.protocol == 'npp':
return natural_prevalence_prediction(
model, val_split, self.sample_size,
**commons)
elif self.protocol == 'gen':
return gen_prevalence_prediction(model, gen_fn=val_split, eval_budget=self.eval_budget)
else:
raise ValueError('unknown protocol')
def fit(self, training: LabelledCollection, val_split: Union[LabelledCollection, float, Callable] = None):
def fit(self, training: LabelledCollection):
""" Learning routine. Fits methods with all combinations of hyperparameters and selects the one minimizing
the error metric.
:param training: the training set on which to optimize the hyperparameters
:param val_split: either a LabelledCollection on which to test the performance of the different settings, or
a float in [0,1] indicating the proportion of labelled data to extract from the training set
:return: self
"""
if val_split is None:
val_split = self.val_split
training, val_split = self.__check_training_validation(training, val_split)
if self.protocol != 'gen':
assert isinstance(self.sample_size, int) and self.sample_size > 0, 'sample_size must be a positive integer'
params_keys = list(self.param_grid.keys())
params_values = list(self.param_grid.values())
model = self.model
if self.timeout > 0:
def handler(signum, frame):
self._sout('timeout reached')
raise TimeoutError()
signal.signal(signal.SIGALRM, handler)
protocol = self.protocol
n_jobs = self.n_jobs
self.param_scores_ = {}
self.best_score_ = None
some_timeouts = False
for values in itertools.product(*params_values):
params = dict({k: values[i] for i, k in enumerate(params_keys)})
if self.timeout > 0:
signal.alarm(self.timeout)
hyper = [dict({k: values[i] for i, k in enumerate(params_keys)}) for values in itertools.product(*params_values)]
scores = qp.util.parallel(self._delayed_eval, ((params, training) for params in hyper), n_jobs=n_jobs)
try:
# overrides default parameters with the parameters being explored at this iteration
model.set_params(**params)
model.fit(training)
true_prevalences, estim_prevalences = self.__generate_predictions(model, val_split)
score = self.error(true_prevalences, estim_prevalences)
self._sout(f'checking hyperparams={params} got {self.error.__name__} score {score:.5f}')
for params, score, model in scores:
if score is not None:
if self.best_score_ is None or score < self.best_score_:
self.best_score_ = score
self.best_params_ = params
self.best_model_ = deepcopy(model)
self.best_model_ = model
self.param_scores_[str(params)] = score
else:
self.param_scores_[str(params)] = 'timeout'
if self.timeout > 0:
signal.alarm(0)
except TimeoutError:
print(f'timeout reached for config {params}')
some_timeouts = True
if self.best_score_ is None and some_timeouts:
if self.best_score_ is None:
raise TimeoutError('all jobs took more than the timeout time to end')
self._sout(f'optimization finished: best params {self.best_params_} (score={self.best_score_:.5f})')
if self.refit:
self._sout(f'refitting on the whole development set')
self.best_model_.fit(training + val_split)
if isinstance(protocol, OnLabelledCollectionProtocol):
self._sout(f'refitting on the whole development set')
self.best_model_.fit(training + protocol.get_labelled_collection())
else:
raise RuntimeWarning(f'"refit" was requested, but the protocol does not '
f'implement the {OnLabelledCollectionProtocol.__name__} interface')
return self
def _delayed_eval(self, args):
params, training = args
protocol = self.protocol
error = self.error
if self.timeout > 0:
def handler(signum, frame):
raise TimeoutError()
signal.signal(signal.SIGALRM, handler)
tinit = time()
if self.timeout > 0:
signal.alarm(self.timeout)
try:
model = deepcopy(self.model)
# overrides default parameters with the parameters being explored at this iteration
model.set_params(**params)
model.fit(training)
score = evaluation.evaluate(model, protocol=protocol, error_metric=error)
ttime = time()-tinit
self._sout(f'hyperparams={params}\t got {error.__name__} score {score:.5f} [took {ttime:.4f}s]')
if self.timeout > 0:
signal.alarm(0)
except TimeoutError:
self._sout(f'timeout ({self.timeout}s) reached for config {params}')
score = None
return params, score, model
def quantify(self, instances):
"""Estimate class prevalence values using the best model found after calling the :meth:`fit` method.
@ -227,14 +155,6 @@ class GridSearchQ(BaseQuantifier):
assert hasattr(self, 'best_model_'), 'quantify called before fit'
return self.best_model().quantify(instances)
@property
def classes_(self):
"""
Classes on which the quantifier has been trained on.
:return: a ndarray of shape `(n_classes)` with the class identifiers
"""
return self.best_model().classes_
def set_params(self, **parameters):
"""Sets the hyper-parameters to explore.
@ -260,3 +180,5 @@ class GridSearchQ(BaseQuantifier):
if hasattr(self, 'best_model_'):
return self.best_model_
raise ValueError('best_model called before fit')

quapy/protocol.py
View File

@ -1,12 +1,16 @@
from copy import deepcopy
import quapy as qp
import numpy as np
import itertools
from collections.abc import Generator
from contextlib import ExitStack
from abc import ABCMeta, abstractmethod
from quapy.data import LabelledCollection
import quapy.functional as F
from tqdm import tqdm
from os.path import exists
from glob import glob
# 0.1.7
@ -61,6 +65,8 @@ class AbstractStochasticSeededProtocol(AbstractProtocol):
the sequence will be different every time the protocol is called.
"""
_random_seed = -1 # means "not set"
def __init__(self, seed=None):
self.random_seed = seed
@ -93,13 +99,47 @@ class AbstractStochasticSeededProtocol(AbstractProtocol):
def __call__(self):
with ExitStack() as stack:
if self.random_seed == -1:
raise ValueError('The random seed has never been initialized. '
'Set it to None if you do not want to impose replicability.')
if self.random_seed is not None:
stack.enter_context(qp.util.temp_seed(self.random_seed))
for params in self.samples_parameters():
yield self.sample(params)
class APP(AbstractStochasticSeededProtocol):
class OnLabelledCollectionProtocol:
def get_labelled_collection(self):
return self.data
def on_preclassified_instances(self, pre_classifications, in_place=False):
assert len(pre_classifications) == len(self.data), \
f'error: the pre-classified data has different shape ' \
f'(expected {len(self.data)}, found {len(pre_classifications)})'
if in_place:
self.data.instances = pre_classifications
return self
else:
new = deepcopy(self)
return new.on_preclassified_instances(pre_classifications, in_place=True)
class LoadSamplesFromDirectory(AbstractProtocol):
def __init__(self, folder_path, loader_fn, classes=None, **loader_kwargs):
assert exists(folder_path), f'folder {folder_path} does not exist'
assert callable(loader_fn), f'the passed load_fn does not seem to be callable'
self.folder_path = folder_path
self.loader_fn = loader_fn
self.classes = classes
self.loader_kwargs = loader_kwargs
def __call__(self):
for file in sorted(glob(self.folder_path, '*')):
yield LabelledCollection.load(file, loader_func=self.loader_fn, classes=self.classes, **self.loader_kwargs)
class APP(AbstractStochasticSeededProtocol, OnLabelledCollectionProtocol):
"""
Implementation of the artificial prevalence protocol (APP).
The APP consists of exploring a grid of prevalence values containing `n_prevalences` points (e.g.,
@ -123,7 +163,7 @@ class APP(AbstractStochasticSeededProtocol):
self.n_prevalences = n_prevalences
self.repeats = repeats
def prevalence_grid(self, dimensions):
def prevalence_grid(self):
"""
Generates vectors of prevalence values from an exhaustive grid of prevalence values. The
number of prevalence values explored for each dimension depends on `n_prevalences`, so that, if, for example,
@ -134,14 +174,14 @@ class APP(AbstractStochasticSeededProtocol):
to 1 - sum of the rest) is not returned (note that, quite obviously, in this case the vector does not sum up to
1). Note that this method is deterministic, i.e., there is no random sampling anywhere.
:param dimensions: the number of classes
:return: a `np.ndarray` of shape `(n, dimensions)` if `return_constrained_dim=True` or of shape
`(n, dimensions-1)` if `return_constrained_dim=False`, where `n` is the number of valid combinations found
in the grid multiplied by `repeat`
"""
dimensions = self.data.n_classes
s = np.linspace(0., 1., self.n_prevalences, endpoint=True)
s = [s] * (dimensions - 1)
prevs = [p for p in itertools.product(*s, repeat=1) if sum(p) <= 1]
prevs = [p for p in itertools.product(*s, repeat=1) if (sum(p) <= 1.0)]
prevs = np.asarray(prevs).reshape(len(prevs), -1)
if self.repeats > 1:
prevs = np.repeat(prevs, self.repeats, axis=0)
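A worked example of the grid produced for n_prevalences=3 and 3 classes (the last dimension is implicit, i.e., 1 minus the sum of the rest):

import itertools
import numpy as np

s = np.linspace(0., 1., 3, endpoint=True)   # [0.0, 0.5, 1.0]
grid = [p for p in itertools.product(s, s) if sum(p) <= 1.0]
print(len(grid), grid)
# 6 valid points: (0,0), (0,0.5), (0,1), (0.5,0), (0.5,0.5), (1,0); this count should
# coincide with F.num_prevalence_combinations(3, 3, 1)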
@ -149,8 +189,8 @@ class APP(AbstractStochasticSeededProtocol):
def samples_parameters(self):
indexes = []
for prevs in self.prevalence_grid(dimensions=self.data.n_classes):
index = data.sampling_index(self.sample_size, *prevs)
for prevs in self.prevalence_grid():
index = self.data.sampling_index(self.sample_size, *prevs)
indexes.append(index)
return indexes
@ -161,7 +201,7 @@ class APP(AbstractStochasticSeededProtocol):
return F.num_prevalence_combinations(self.n_prevalences, self.data.n_classes, self.repeats)
class NPP(AbstractStochasticSeededProtocol):
class NPP(AbstractStochasticSeededProtocol, OnLabelledCollectionProtocol):
"""
A generator of samples that implements the natural prevalence protocol (NPP). The NPP consists of drawing
samples uniformly at random, therefore approximately preserving the natural prevalence of the collection.
@ -182,7 +222,7 @@ class NPP(AbstractStochasticSeededProtocol):
def samples_parameters(self):
indexes = []
for _ in range(self.repeats):
index = data.uniform_sampling_index(self.sample_size)
index = self.data.uniform_sampling_index(self.sample_size)
indexes.append(index)
return indexes
@ -193,8 +233,7 @@ class NPP(AbstractStochasticSeededProtocol):
return self.repeats
class USimplexPP(AbstractStochasticSeededProtocol):
class USimplexPP(AbstractStochasticSeededProtocol, OnLabelledCollectionProtocol):
"""
A variant of :class:`APP` that, instead of using a grid of equidistant prevalence values,
relies on the Kraemer algorithm for sampling unit (k-1)-simplex uniformly at random, with
@ -218,8 +257,8 @@ class USimplexPP(AbstractStochasticSeededProtocol):
def samples_parameters(self):
indexes = []
for prevs in F.uniform_simplex_sampling(n_classes=data.n_classes, size=self.repeats):
index = data.sampling_index(self.sample_size, *prevs)
for prevs in F.uniform_simplex_sampling(n_classes=self.data.n_classes, size=self.repeats):
index = self.data.sampling_index(self.sample_size, *prevs)
indexes.append(index)
return indexes
@ -230,7 +269,6 @@ class USimplexPP(AbstractStochasticSeededProtocol):
return self.repeats
class CovariateShiftPP(AbstractStochasticSeededProtocol):
"""
Generates mixtures of two domains (A and B) at controlled rates, but preserving the original class prevalence.
@ -300,33 +338,3 @@ class CovariateShiftPP(AbstractStochasticSeededProtocol):
return self.repeats * len(self.mixture_points)
if __name__=='__main__':
import numpy as np
import quapy as qp
# domainA
y = [0]*25 + [1]*25 + [2]*25 + [3]*25
X = ['A:'+str(i)+'-'+str(yi) for i, yi in enumerate(y)]
data = LabelledCollection(X, y, classes_=sorted(np.unique(y)))
# domain B
y = [0]*25 + [1]*25 + [2]*25 + [3]*25
X = ['B:'+str(i)+'-'+str(yi) for i, yi in enumerate(y)]
dataB = LabelledCollection(X, y, classes_=sorted(np.unique(y)))
# p = APP(data, sample_size=10, n_prevalences=11, random_seed=42)
# p = NPP(data, sample_size=10, repeats=10, random_seed=42)
# p = NPP(data, sample_size=10, repeats=10)
# p = USimplexPP(data, sample_size=10, repeats=10)
p = CovariateShiftPP(data, dataB, sample_size=10, mixture_points=11, random_seed=1)
for _ in range(2):
print('init generator', p.__class__.__name__)
for i in tqdm(p(), total=p.total()):
# print(i)
print(i.instances, i.labels, i.prevalence())
print('done')

quapy/tests/test_evaluation.py Normal file
View File

@ -0,0 +1,57 @@
import unittest
import quapy as qp
from sklearn.linear_model import LogisticRegression
from time import time
from method.aggregative import EMQ
from method.base import BaseQuantifier
class EvalTestCase(unittest.TestCase):
def test_eval_speedup(self):
data = qp.datasets.fetch_reviews('hp', tfidf=True, min_df=10, pickle=True)
train, test = data.training, data.test
protocol = qp.protocol.APP(test, sample_size=1000, n_prevalences=21, repeats=1, random_seed=1)
class SlowLR(LogisticRegression):
def predict_proba(self, X):
import time
time.sleep(1)
return super().predict_proba(X)
emq = EMQ(SlowLR()).fit(train)
tinit = time()
score = qp.evaluation.evaluate(emq, protocol, error_metric='mae', verbose=True)
tend_optim = time()-tinit
print(f'evaluation (with optimization) took {tend_optim}s [MAE={score:.4f}]')
class NonAggregativeEMQ(BaseQuantifier):
def __init__(self, cls):
self.emq = EMQ(cls)
def quantify(self, instances):
return self.emq.quantify(instances)
def fit(self, data):
self.emq.fit(data)
return self
def set_params(self, **parameters): pass
def get_params(self, deep=True): pass
emq = NonAggregativeEMQ(SlowLR()).fit(train)
tinit = time()
score = qp.evaluation.evaluate(emq, protocol, error_metric='mae', verbose=True)
tend_no_optim = time() - tinit
print(f'evaluation (w/o optimization) took {tend_no_optim}s [MAE={score:.4f}]')
self.assertEqual(tend_no_optim>tend_optim, True)
if __name__ == '__main__':
unittest.main()

quapy/tests/test_hierarchy.py Normal file
View File

@ -0,0 +1,32 @@
import unittest
from sklearn.linear_model import LogisticRegression
import quapy as qp
from quapy.method.aggregative import *
class HierarchyTestCase(unittest.TestCase):
def test_aggregative(self):
lr = LogisticRegression()
for m in [CC(lr), PCC(lr), ACC(lr), PACC(lr)]:
self.assertEqual(isinstance(m, AggregativeQuantifier), True)
def test_binary(self):
lr = LogisticRegression()
for m in [HDy(lr)]:
self.assertEqual(isinstance(m, BinaryQuantifier), True)
def test_probabilistic(self):
lr = LogisticRegression()
for m in [CC(lr), ACC(lr)]:
self.assertEqual(isinstance(m, AggregativeProbabilisticQuantifier), False)
for m in [PCC(lr), PACC(lr)]:
self.assertEqual(isinstance(m, AggregativeProbabilisticQuantifier), True)
if __name__ == '__main__':
unittest.main()

quapy/tests/test_modsel.py Normal file
View File

@ -0,0 +1,77 @@
import unittest
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
import quapy as qp
from method.aggregative import PACC
from model_selection import GridSearchQ
from protocol import APP
class ModselTestCase(unittest.TestCase):
def test_modsel(self):
q = PACC(LogisticRegression(random_state=1, max_iter=5000))
data = qp.datasets.fetch_reviews('imdb', tfidf=True, min_df=10)
training, validation = data.training.split_stratified(0.7, random_state=1)
# test = data.test
param_grid = {'C': np.logspace(-3,3,7)}
app = APP(validation, sample_size=100, random_seed=1)
q = GridSearchQ(
q, param_grid, protocol=app, error='mae', refit=True, timeout=-1, verbose=True
).fit(training)
print('best params', q.best_params_)
print('best score', q.best_score_)
self.assertEqual(q.best_params_['C'], 10.0)
self.assertEqual(q.best_model().get_params()['C'], 10.0)
def test_modsel_parallel(self):
q = PACC(LogisticRegression(random_state=1, max_iter=5000))
data = qp.datasets.fetch_reviews('imdb', tfidf=True, min_df=10)
training, validation = data.training.split_stratified(0.7, random_state=1)
# test = data.test
param_grid = {'C': np.logspace(-3,3,7)}
app = APP(validation, sample_size=100, random_seed=1)
q = GridSearchQ(
q, param_grid, protocol=app, error='mae', refit=True, timeout=-1, n_jobs=-1, verbose=True
).fit(training)
print('best params', q.best_params_)
print('best score', q.best_score_)
self.assertEqual(q.best_params_['C'], 10.0)
self.assertEqual(q.best_model().get_params()['C'], 10.0)
def test_modsel_timeout(self):
class SlowLR(LogisticRegression):
def fit(self, X, y, sample_weight=None):
import time
time.sleep(10)
super(SlowLR, self).fit(X, y, sample_weight)
q = PACC(SlowLR())
data = qp.datasets.fetch_reviews('imdb', tfidf=True, min_df=10)
training, validation = data.training.split_stratified(0.7, random_state=1)
# test = data.test
param_grid = {'C': np.logspace(-3,3,7)}
app = APP(validation, sample_size=100, random_seed=1)
q = GridSearchQ(
q, param_grid, protocol=app, error='mae', refit=True, timeout=3, n_jobs=-1, verbose=True
)
with self.assertRaises(TimeoutError):
q.fit(training)
if __name__ == '__main__':
unittest.main()

quapy/tests/test_protocols.py Normal file
View File

@ -0,0 +1,139 @@
import unittest
import numpy as np
from data import LabelledCollection
from protocol import APP, NPP, USimplexPP, CovariateShiftPP, AbstractStochasticSeededProtocol
def mock_labelled_collection(prefix=''):
y = [0] * 250 + [1] * 250 + [2] * 250 + [3] * 250
X = [prefix + str(i) + '-' + str(yi) for i, yi in enumerate(y)]
return LabelledCollection(X, y, classes_=sorted(np.unique(y)))
def samples_to_str(protocol):
samples_str = ""
for sample in protocol():
samples_str += f'{sample.instances}\t{sample.labels}\t{sample.prevalence()}\n'
return samples_str
class TestProtocols(unittest.TestCase):
def test_app_replicate(self):
data = mock_labelled_collection()
p = APP(data, sample_size=5, n_prevalences=11, random_seed=42)
samples1 = samples_to_str(p)
samples2 = samples_to_str(p)
self.assertEqual(samples1, samples2)
def test_app_not_replicate(self):
data = mock_labelled_collection()
p = APP(data, sample_size=5, n_prevalences=11)
samples1 = samples_to_str(p)
samples2 = samples_to_str(p)
self.assertNotEqual(samples1, samples2)
def test_app_number(self):
data = mock_labelled_collection()
p = APP(data, sample_size=100, n_prevalences=10, repeats=1)
# surprisingly enough, for some n_prevalences the test fails, even though
# everything is correct. The problem is that in function APP.prevalence_grid()
# there is sometimes one rounding error that gets accumulated and
# surpasses 1.0 (by a very small float value, 0.0000000000002 or the like),
# so these tuples are mistakenly removed... I have tried with np.isclose and
# other workarounds, but eventually it happens that there is some negative probability
# in the sampling function...
count = 0
for _ in p():
count+=1
self.assertEqual(count, p.total())
def test_npp_replicate(self):
data = mock_labelled_collection()
p = NPP(data, sample_size=5, repeats=5, random_seed=42)
samples1 = samples_to_str(p)
samples2 = samples_to_str(p)
self.assertEqual(samples1, samples2)
def test_npp_not_replicate(self):
data = mock_labelled_collection()
p = NPP(data, sample_size=5, repeats=5)
samples1 = samples_to_str(p)
samples2 = samples_to_str(p)
self.assertNotEqual(samples1, samples2)
def test_kraemer_replicate(self):
data = mock_labelled_collection()
p = USimplexPP(data, sample_size=5, repeats=10, random_seed=42)
samples1 = samples_to_str(p)
samples2 = samples_to_str(p)
self.assertEqual(samples1, samples2)
def test_kraemer_not_replicate(self):
data = mock_labelled_collection()
p = USimplexPP(data, sample_size=5, repeats=10)
samples1 = samples_to_str(p)
samples2 = samples_to_str(p)
self.assertNotEqual(samples1, samples2)
def test_covariate_shift_replicate(self):
dataA = mock_labelled_collection('domA')
dataB = mock_labelled_collection('domB')
p = CovariateShiftPP(dataA, dataB, sample_size=10, mixture_points=11, random_seed=1)
samples1 = samples_to_str(p)
samples2 = samples_to_str(p)
self.assertEqual(samples1, samples2)
def test_covariate_shift_not_replicate(self):
dataA = mock_labelled_collection('domA')
dataB = mock_labelled_collection('domB')
p = CovariateShiftPP(dataA, dataB, sample_size=10, mixture_points=11)
samples1 = samples_to_str(p)
samples2 = samples_to_str(p)
self.assertNotEqual(samples1, samples2)
def test_no_seed_init(self):
class NoSeedInit(AbstractStochasticSeededProtocol):
def __init__(self):
self.data = mock_labelled_collection()
def samples_parameters(self):
# return a matrix containing sampling indexes in the rows
return np.random.randint(0, len(self.data), 10*10).reshape(10, 10)
def sample(self, params):
index = np.unique(params)
return self.data.sampling_from_index(index)
p = NoSeedInit()
# this should raise a ValueError, since the class is said to be AbstractStochasticSeededProtocol but the
# random_seed has never been passed to super(NoSeedInit, self).__init__(random_seed)
with self.assertRaises(ValueError):
for sample in p():
pass
print('done')
if __name__ == '__main__':
unittest.main()