forked from moreo/QuaPy
many changes, see change log
This commit is contained in:
parent
46e3632200
commit
4bc9d19635
|
@ -0,0 +1,34 @@
|
|||
# main changes in 0.1.7
|
||||
|
||||
- Protocols is now an abstraction, AbstractProtocol. There is a new class extending AbstractProtocol called
|
||||
AbstractStochasticSeededProtocol, which implements a seeding policy to allow replicate the series of samplings.
|
||||
There are some examples of protocols, APP, NPP, USimplexPP, CovariateShiftPP (experimental).
|
||||
The idea is to start the sampling by simpli calling the __call__ method.
|
||||
This change has a great impact in the framework, since many functions in qp.evaluation, qp.model_selection,
|
||||
and sampling functions in LabelledCollection make use of the old functions.
|
||||
|
||||
- ACC, PACC, Forman's threshold variants have been parallelized.
|
||||
|
||||
|
||||
Things to fix:
|
||||
- eval budget policy?
|
||||
- clean functions like binary, aggregative, probabilistic, etc; those should be resolved via isinstance()
|
||||
- clean classes_ and n_classes from methods (maybe not from aggregative ones, but those have to be used only
|
||||
internally and not imposed in any abstract class)
|
||||
- optimize "qp.evaluation.prediction" for aggregative methods (pre-classification)
|
||||
- update unit tests
|
||||
- Policies should be able to set their output to "labelled_collection" or "instances_prevalence" or something similar.
|
||||
- Policies should implement the "gen()" one, taking a reader function as an input, and a folder path maybe
|
||||
- Review all documentation, redo the Sphinx doc, update Wikis...
|
||||
- Resolve the OneVsAll thing (it is in base.py and in aggregative.py
|
||||
- Better handle the environment (e.g., with n_jobs)
|
||||
- test cross_generate_predictions and cancel cross_generate_predictions_depr
|
||||
- Add a proper log?
|
||||
- test LoadSamplesFromDirectory (in protocols.py)
|
||||
- improve plots?
|
||||
- I have removed the distinction between "classify" and "posterior_probabilities" in the Aggregative quantifiers,
|
||||
so that probabilistic classifiers actually return posterior probabilities, while non-probabilistic quantifiers
|
||||
return instead crisp decisions. The idea was to unify the quantification function (i.e., now it is always
|
||||
classify & aggregate, irrespective of the class). However, this has caused a problem with OneVsAll. This has to
|
||||
be checked, since it is now innecessarily complicated (it also has old references to .probabilistic, and all this
|
||||
stuff).
|
|
@ -2,13 +2,13 @@ from . import error
|
|||
from . import data
|
||||
from quapy.data import datasets
|
||||
from . import functional
|
||||
from . import method
|
||||
# from . import method
|
||||
from . import evaluation
|
||||
from . import protocol
|
||||
from . import plot
|
||||
from . import util
|
||||
from . import model_selection
|
||||
from . import classification
|
||||
from quapy.method.base import isprobabilistic, isaggregative
|
||||
|
||||
__version__ = '0.1.7'
|
||||
|
||||
|
@ -21,5 +21,4 @@ environ = {
|
|||
'SVMPERF_HOME': './svm_perf_quantification'
|
||||
}
|
||||
|
||||
def isbinary(x):
|
||||
return x.binary
|
||||
|
||||
|
|
|
@ -210,10 +210,12 @@ class LabelledCollection:
|
|||
:return: two instances of :class:`LabelledCollection`, the first one with `train_prop` elements, and the
|
||||
second one with `1-train_prop` elements
|
||||
"""
|
||||
tr_docs, te_docs, tr_labels, te_labels = \
|
||||
train_test_split(self.instances, self.labels, train_size=train_prop, stratify=self.labels,
|
||||
random_state=random_state)
|
||||
return LabelledCollection(tr_docs, tr_labels), LabelledCollection(te_docs, te_labels)
|
||||
tr_docs, te_docs, tr_labels, te_labels = train_test_split(
|
||||
self.instances, self.labels, train_size=train_prop, stratify=self.labels, random_state=random_state
|
||||
)
|
||||
training = LabelledCollection(tr_docs, tr_labels, classes_=self.classes_)
|
||||
test = LabelledCollection(te_docs, te_labels, classes_=self.classes_)
|
||||
return training, test
|
||||
|
||||
def __add__(self, other):
|
||||
"""
|
||||
|
@ -418,13 +420,3 @@ class Dataset:
|
|||
yield Dataset(train, test, name=f'fold {(i % nfolds) + 1}/{nfolds} (round={(i // nfolds) + 1})')
|
||||
|
||||
|
||||
def isbinary(data):
|
||||
"""
|
||||
Returns True if `data` is either a binary :class:`Dataset` or a binary :class:`LabelledCollection`
|
||||
|
||||
:param data: a :class:`Dataset` or a :class:`LabelledCollection` object
|
||||
:return: True if labelled according to two classes
|
||||
"""
|
||||
if isinstance(data, Dataset) or isinstance(data, LabelledCollection):
|
||||
return data.binary
|
||||
return False
|
||||
|
|
|
@ -0,0 +1,102 @@
|
|||
from typing import Union, Callable, Iterable
|
||||
import numpy as np
|
||||
from tqdm import tqdm
|
||||
import inspect
|
||||
import quapy as qp
|
||||
from quapy.protocol import AbstractProtocol, OnLabelledCollectionProtocol
|
||||
from quapy.data import LabelledCollection
|
||||
from quapy.method.base import BaseQuantifier
|
||||
from quapy.util import temp_seed
|
||||
import quapy.functional as F
|
||||
import pandas as pd
|
||||
|
||||
|
||||
def prediction(model: BaseQuantifier, protocol: AbstractProtocol, verbose=False):
|
||||
sout = lambda x: print(x) if verbose else None
|
||||
from method.aggregative import AggregativeQuantifier
|
||||
if isinstance(model, AggregativeQuantifier) and isinstance(protocol, OnLabelledCollectionProtocol):
|
||||
sout('speeding up the prediction for the aggregative quantifier')
|
||||
pre_classified = model.classify(protocol.get_labelled_collection().instances)
|
||||
return __prediction_helper(model.aggregate, protocol.on_preclassified_instances(pre_classified), verbose)
|
||||
else:
|
||||
sout(f'the method is not aggregative, or the protocol is not an instance of '
|
||||
f'{OnLabelledCollectionProtocol.__name__}, so no optimization can be carried out')
|
||||
return __prediction_helper(model.quantify, protocol, verbose)
|
||||
|
||||
|
||||
def __prediction_helper(quantification_fn, protocol: AbstractProtocol, verbose=False):
|
||||
true_prevs, estim_prevs = [], []
|
||||
for sample in tqdm(protocol(), total=protocol.total()) if verbose else protocol():
|
||||
estim_prevs.append(quantification_fn(sample.instances))
|
||||
true_prevs.append(sample.prevalence())
|
||||
|
||||
true_prevs = np.asarray(true_prevs)
|
||||
estim_prevs = np.asarray(estim_prevs)
|
||||
|
||||
return true_prevs, estim_prevs
|
||||
|
||||
|
||||
def evaluation_report(model: BaseQuantifier,
|
||||
protocol: AbstractProtocol,
|
||||
error_metrics:Iterable[Union[str,Callable]]='mae',
|
||||
verbose=False):
|
||||
|
||||
true_prevs, estim_prevs = prediction(model, protocol, verbose)
|
||||
return _prevalence_report(true_prevs, estim_prevs, error_metrics)
|
||||
|
||||
|
||||
def _prevalence_report(true_prevs, estim_prevs, error_metrics: Iterable[Union[str, Callable]] = 'mae'):
|
||||
|
||||
if isinstance(error_metrics, str):
|
||||
error_metrics = [error_metrics]
|
||||
|
||||
error_funcs = [qp.error.from_name(e) if isinstance(e, str) else e for e in error_metrics]
|
||||
assert all(hasattr(e, '__call__') for e in error_funcs), 'invalid error functions'
|
||||
error_names = [e.__name__ for e in error_funcs]
|
||||
|
||||
df = pd.DataFrame(columns=['true-prev', 'estim-prev'] + error_names)
|
||||
for true_prev, estim_prev in zip(true_prevs, estim_prevs):
|
||||
series = {'true-prev': true_prev, 'estim-prev': estim_prev}
|
||||
for error_name, error_metric in zip(error_names, error_funcs):
|
||||
score = error_metric(true_prev, estim_prev)
|
||||
series[error_name] = score
|
||||
df = df.append(series, ignore_index=True)
|
||||
|
||||
return df
|
||||
|
||||
|
||||
def evaluate(model: BaseQuantifier, protocol: AbstractProtocol, error_metric:Union[str, Callable], verbose=False):
|
||||
if isinstance(error_metric, str):
|
||||
error_metric = qp.error.from_name(error_metric)
|
||||
true_prevs, estim_prevs = prediction(model, protocol, verbose)
|
||||
return error_metric(true_prevs, estim_prevs)
|
||||
|
||||
|
||||
|
||||
def _check_num_evals(n_classes, n_prevpoints=None, eval_budget=None, repeats=1, verbose=False):
|
||||
if n_prevpoints is None and eval_budget is None:
|
||||
raise ValueError('either n_prevpoints or eval_budget has to be specified')
|
||||
elif n_prevpoints is None:
|
||||
assert eval_budget > 0, 'eval_budget must be a positive integer'
|
||||
n_prevpoints = F.get_nprevpoints_approximation(eval_budget, n_classes, repeats)
|
||||
eval_computations = F.num_prevalence_combinations(n_prevpoints, n_classes, repeats)
|
||||
if verbose:
|
||||
print(f'setting n_prevpoints={n_prevpoints} so that the number of '
|
||||
f'evaluations ({eval_computations}) does not exceed the evaluation '
|
||||
f'budget ({eval_budget})')
|
||||
elif eval_budget is None:
|
||||
eval_computations = F.num_prevalence_combinations(n_prevpoints, n_classes, repeats)
|
||||
if verbose:
|
||||
print(f'{eval_computations} evaluations will be performed for each '
|
||||
f'combination of hyper-parameters')
|
||||
else:
|
||||
eval_computations = F.num_prevalence_combinations(n_prevpoints, n_classes, repeats)
|
||||
if eval_computations > eval_budget:
|
||||
n_prevpoints = F.get_nprevpoints_approximation(eval_budget, n_classes, repeats)
|
||||
new_eval_computations = F.num_prevalence_combinations(n_prevpoints, n_classes, repeats)
|
||||
if verbose:
|
||||
print(f'the budget of evaluations would be exceeded with '
|
||||
f'n_prevpoints={n_prevpoints}. Chaning to n_prevpoints={n_prevpoints}. This will produce '
|
||||
f'{new_eval_computations} evaluation computations for each hyper-parameter combination.')
|
||||
return n_prevpoints, eval_computations
|
||||
|
|
@ -1,15 +1,13 @@
|
|||
from abc import abstractmethod
|
||||
from copy import deepcopy
|
||||
from typing import Union
|
||||
|
||||
import numpy as np
|
||||
from joblib import Parallel, delayed
|
||||
from sklearn.base import BaseEstimator
|
||||
from sklearn.calibration import CalibratedClassifierCV
|
||||
from sklearn.metrics import confusion_matrix
|
||||
from sklearn.model_selection import StratifiedKFold
|
||||
from sklearn.model_selection import StratifiedKFold, cross_val_predict
|
||||
from tqdm import tqdm
|
||||
|
||||
import quapy as qp
|
||||
import quapy.functional as F
|
||||
from quapy.classification.svmperf import SVMperf
|
||||
|
@ -61,7 +59,9 @@ class AggregativeQuantifier(BaseQuantifier):
|
|||
|
||||
def classify(self, instances):
|
||||
"""
|
||||
Provides the label predictions for the given instances.
|
||||
Provides the label predictions for the given instances. The predictions should respect the format expected by
|
||||
:meth:`aggregate`, i.e., posterior probabilities for probabilistic quantifiers, or crisp predictions for
|
||||
non-probabilistic quantifiers
|
||||
|
||||
:param instances: array-like
|
||||
:return: np.ndarray of shape `(n_instances,)` with label predictions
|
||||
|
@ -118,16 +118,6 @@ class AggregativeQuantifier(BaseQuantifier):
|
|||
"""
|
||||
return self.learner.classes_
|
||||
|
||||
@property
|
||||
def aggregative(self):
|
||||
"""
|
||||
Returns True, indicating the quantifier is of type aggregative.
|
||||
|
||||
:return: True
|
||||
"""
|
||||
|
||||
return True
|
||||
|
||||
|
||||
class AggregativeProbabilisticQuantifier(AggregativeQuantifier):
|
||||
"""
|
||||
|
@ -137,28 +127,25 @@ class AggregativeProbabilisticQuantifier(AggregativeQuantifier):
|
|||
probabilities.
|
||||
"""
|
||||
|
||||
def posterior_probabilities(self, instances):
|
||||
def classify(self, instances):
|
||||
return self.learner.predict_proba(instances)
|
||||
|
||||
def predict_proba(self, instances):
|
||||
return self.posterior_probabilities(instances)
|
||||
|
||||
def quantify(self, instances):
|
||||
classif_posteriors = self.posterior_probabilities(instances)
|
||||
return self.aggregate(classif_posteriors)
|
||||
|
||||
def set_params(self, **parameters):
|
||||
if isinstance(self.learner, CalibratedClassifierCV):
|
||||
parameters = {'base_estimator__' + k: v for k, v in parameters.items()}
|
||||
self.learner.set_params(**parameters)
|
||||
|
||||
@property
|
||||
def probabilistic(self):
|
||||
return True
|
||||
|
||||
|
||||
# Helper
|
||||
# ------------------------------------
|
||||
def _ensure_probabilistic(learner):
|
||||
if not hasattr(learner, 'predict_proba'):
|
||||
print(f'The learner {learner.__class__.__name__} does not seem to be probabilistic. '
|
||||
f'The learner will be calibrated.')
|
||||
learner = CalibratedClassifierCV(learner, cv=5)
|
||||
return learner
|
||||
|
||||
|
||||
def _training_helper(learner,
|
||||
data: LabelledCollection,
|
||||
fit_learner: bool = True,
|
||||
|
@ -180,10 +167,7 @@ def _training_helper(learner,
|
|||
"""
|
||||
if fit_learner:
|
||||
if ensure_probabilistic:
|
||||
if not hasattr(learner, 'predict_proba'):
|
||||
print(f'The learner {learner.__class__.__name__} does not seem to be probabilistic. '
|
||||
f'The learner will be calibrated.')
|
||||
learner = CalibratedClassifierCV(learner, cv=5)
|
||||
learner = _ensure_probabilistic(learner)
|
||||
if val_split is not None:
|
||||
if isinstance(val_split, float):
|
||||
if not (0 < val_split < 1):
|
||||
|
@ -214,6 +198,89 @@ def _training_helper(learner,
|
|||
return learner, unused
|
||||
|
||||
|
||||
def cross_generate_predictions(
|
||||
data,
|
||||
learner,
|
||||
val_split,
|
||||
probabilistic,
|
||||
fit_learner,
|
||||
n_jobs
|
||||
):
|
||||
|
||||
if isinstance(val_split, int):
|
||||
assert fit_learner == True, \
|
||||
'the parameters for the adjustment cannot be estimated with kFCV with fit_learner=False'
|
||||
|
||||
if probabilistic:
|
||||
learner = _ensure_probabilistic(learner)
|
||||
predict = 'predict_proba'
|
||||
else:
|
||||
predict = 'predict'
|
||||
y_pred = cross_val_predict(learner, *data.Xy, cv=val_split, n_jobs=n_jobs, method=predict)
|
||||
class_count = data.counts()
|
||||
|
||||
# fit the learner on all data
|
||||
learner.fit(*data.Xy)
|
||||
classes = data.classes_
|
||||
else:
|
||||
learner, val_data = _training_helper(
|
||||
learner, data, fit_learner, ensure_probabilistic=probabilistic, val_split=val_split
|
||||
)
|
||||
y_pred = learner.predict_proba(val_data.instances) if probabilistic else learner.predict(val_data.instances)
|
||||
y = val_data.labels
|
||||
classes = val_data.classes_
|
||||
class_count = val_data.counts()
|
||||
|
||||
return learner, y, y_pred, classes, class_count
|
||||
|
||||
|
||||
def cross_generate_predictions_depr(
|
||||
data,
|
||||
learner,
|
||||
val_split,
|
||||
probabilistic,
|
||||
fit_learner,
|
||||
method_name=''
|
||||
):
|
||||
predict = learner.predict_proba if probabilistic else learner.predict
|
||||
if isinstance(val_split, int):
|
||||
assert fit_learner == True, \
|
||||
'the parameters for the adjustment cannot be estimated with kFCV with fit_learner=False'
|
||||
# kFCV estimation of parameters
|
||||
y, y_ = [], []
|
||||
kfcv = StratifiedKFold(n_splits=val_split)
|
||||
pbar = tqdm(kfcv.split(*data.Xy), total=val_split)
|
||||
for k, (training_idx, validation_idx) in enumerate(pbar):
|
||||
pbar.set_description(f'{method_name}\tfitting fold {k}')
|
||||
training = data.sampling_from_index(training_idx)
|
||||
validation = data.sampling_from_index(validation_idx)
|
||||
learner, val_data = _training_helper(
|
||||
learner, training, fit_learner, ensure_probabilistic=probabilistic, val_split=validation
|
||||
)
|
||||
y_.append(predict(val_data.instances))
|
||||
y.append(val_data.labels)
|
||||
|
||||
y = np.concatenate(y)
|
||||
y_ = np.concatenate(y_)
|
||||
class_count = data.counts()
|
||||
|
||||
# fit the learner on all data
|
||||
learner, _ = _training_helper(
|
||||
learner, data, fit_learner, ensure_probabilistic=probabilistic, val_split=None
|
||||
)
|
||||
classes = data.classes_
|
||||
|
||||
else:
|
||||
learner, val_data = _training_helper(
|
||||
learner, data, fit_learner, ensure_probabilistic=probabilistic, val_split=val_split
|
||||
)
|
||||
y_ = predict(val_data.instances)
|
||||
y = val_data.labels
|
||||
classes = val_data.classes_
|
||||
class_count = val_data.counts()
|
||||
|
||||
return learner, y, y_, classes, class_count
|
||||
|
||||
# Methods
|
||||
# ------------------------------------
|
||||
class CC(AggregativeQuantifier):
|
||||
|
@ -264,9 +331,10 @@ class ACC(AggregativeQuantifier):
|
|||
:class:`quapy.data.base.LabelledCollection` (the split itself).
|
||||
"""
|
||||
|
||||
def __init__(self, learner: BaseEstimator, val_split=0.4):
|
||||
def __init__(self, learner: BaseEstimator, val_split=0.4, n_jobs=1):
|
||||
self.learner = learner
|
||||
self.val_split = val_split
|
||||
self.n_jobs = n_jobs
|
||||
|
||||
def fit(self, data: LabelledCollection, fit_learner=True, val_split: Union[float, int, LabelledCollection] = None):
|
||||
"""
|
||||
|
@ -280,44 +348,33 @@ class ACC(AggregativeQuantifier):
|
|||
cross validation to estimate the parameters
|
||||
:return: self
|
||||
"""
|
||||
|
||||
if val_split is None:
|
||||
val_split = self.val_split
|
||||
if isinstance(val_split, int):
|
||||
assert fit_learner == True, \
|
||||
'the parameters for the adjustment cannot be estimated with kFCV with fit_learner=False'
|
||||
# kFCV estimation of parameters
|
||||
y, y_ = [], []
|
||||
kfcv = StratifiedKFold(n_splits=val_split)
|
||||
pbar = tqdm(kfcv.split(*data.Xy), total=val_split)
|
||||
for k, (training_idx, validation_idx) in enumerate(pbar):
|
||||
pbar.set_description(f'{self.__class__.__name__} fitting fold {k}')
|
||||
training = data.sampling_from_index(training_idx)
|
||||
validation = data.sampling_from_index(validation_idx)
|
||||
learner, val_data = _training_helper(self.learner, training, fit_learner, val_split=validation)
|
||||
y_.append(learner.predict(val_data.instances))
|
||||
y.append(val_data.labels)
|
||||
|
||||
y = np.concatenate(y)
|
||||
y_ = np.concatenate(y_)
|
||||
class_count = data.counts()
|
||||
|
||||
# fit the learner on all data
|
||||
self.learner, _ = _training_helper(self.learner, data, fit_learner, val_split=None)
|
||||
|
||||
else:
|
||||
self.learner, val_data = _training_helper(self.learner, data, fit_learner, val_split=val_split)
|
||||
y_ = self.learner.predict(val_data.instances)
|
||||
y = val_data.labels
|
||||
class_count = val_data.counts()
|
||||
self.learner, y, y_, classes, class_count = cross_generate_predictions(
|
||||
data, self.learner, val_split, probabilistic=False, fit_learner=fit_learner, n_jobs=self.n_jobs
|
||||
)
|
||||
|
||||
self.cc = CC(self.learner)
|
||||
|
||||
# estimate the matrix with entry (i,j) being the estimate of P(yi|yj), that is, the probability that a
|
||||
# document that belongs to yj ends up being classified as belonging to yi
|
||||
self.Pte_cond_estim_ = confusion_matrix(y, y_).T / class_count
|
||||
self.Pte_cond_estim_ = self.getPteCondEstim(data.classes_, y, y_)
|
||||
|
||||
return self
|
||||
|
||||
@classmethod
|
||||
def getPteCondEstim(cls, classes, y, y_):
|
||||
# estimate the matrix with entry (i,j) being the estimate of P(yi|yj), that is, the probability that a
|
||||
# document that belongs to yj ends up being classified as belonging to yi
|
||||
conf = confusion_matrix(y, y_, labels=classes).T
|
||||
conf = conf.astype(np.float)
|
||||
class_counts = conf.sum(axis=0)
|
||||
for i, _ in enumerate(classes):
|
||||
if class_counts[i] == 0:
|
||||
conf[i, i] = 1
|
||||
else:
|
||||
conf[:, i] /= class_counts[i]
|
||||
return conf
|
||||
|
||||
def classify(self, data):
|
||||
return self.cc.classify(data)
|
||||
|
||||
|
@ -380,9 +437,10 @@ class PACC(AggregativeProbabilisticQuantifier):
|
|||
:class:`quapy.data.base.LabelledCollection` (the split itself).
|
||||
"""
|
||||
|
||||
def __init__(self, learner: BaseEstimator, val_split=0.4):
|
||||
def __init__(self, learner: BaseEstimator, val_split=0.4, n_jobs=1):
|
||||
self.learner = learner
|
||||
self.val_split = val_split
|
||||
self.n_jobs = n_jobs
|
||||
|
||||
def fit(self, data: LabelledCollection, fit_learner=True, val_split: Union[float, int, LabelledCollection] = None):
|
||||
"""
|
||||
|
@ -396,52 +454,31 @@ class PACC(AggregativeProbabilisticQuantifier):
|
|||
to estimate the parameters
|
||||
:return: self
|
||||
"""
|
||||
|
||||
if val_split is None:
|
||||
val_split = self.val_split
|
||||
|
||||
if isinstance(val_split, int):
|
||||
assert fit_learner == True, \
|
||||
'the parameters for the adjustment cannot be estimated with kFCV with fit_learner=False'
|
||||
# kFCV estimation of parameters
|
||||
y, y_ = [], []
|
||||
kfcv = StratifiedKFold(n_splits=val_split)
|
||||
pbar = tqdm(kfcv.split(*data.Xy), total=val_split)
|
||||
for k, (training_idx, validation_idx) in enumerate(pbar):
|
||||
pbar.set_description(f'{self.__class__.__name__} fitting fold {k}')
|
||||
training = data.sampling_from_index(training_idx)
|
||||
validation = data.sampling_from_index(validation_idx)
|
||||
learner, val_data = _training_helper(
|
||||
self.learner, training, fit_learner, ensure_probabilistic=True, val_split=validation)
|
||||
y_.append(learner.predict_proba(val_data.instances))
|
||||
y.append(val_data.labels)
|
||||
|
||||
y = np.concatenate(y)
|
||||
y_ = np.vstack(y_)
|
||||
|
||||
# fit the learner on all data
|
||||
self.learner, _ = _training_helper(self.learner, data, fit_learner, ensure_probabilistic=True,
|
||||
val_split=None)
|
||||
classes = data.classes_
|
||||
|
||||
else:
|
||||
self.learner, val_data = _training_helper(
|
||||
self.learner, data, fit_learner, ensure_probabilistic=True, val_split=val_split)
|
||||
y_ = self.learner.predict_proba(val_data.instances)
|
||||
y = val_data.labels
|
||||
classes = val_data.classes_
|
||||
self.learner, y, y_, classes, class_count = cross_generate_predictions(
|
||||
data, self.learner, val_split, probabilistic=True, fit_learner=fit_learner, n_jobs=self.n_jobs
|
||||
)
|
||||
|
||||
self.pcc = PCC(self.learner)
|
||||
self.Pte_cond_estim_ = self.getPteCondEstim(classes, y, y_)
|
||||
|
||||
return self
|
||||
|
||||
@classmethod
|
||||
def getPteCondEstim(cls, classes, y, y_):
|
||||
# estimate the matrix with entry (i,j) being the estimate of P(yi|yj), that is, the probability that a
|
||||
# document that belongs to yj ends up being classified as belonging to yi
|
||||
n_classes = len(classes)
|
||||
confusion = np.empty(shape=(n_classes, n_classes))
|
||||
confusion = np.eye(n_classes)
|
||||
for i, class_ in enumerate(classes):
|
||||
confusion[i] = y_[y == class_].mean(axis=0)
|
||||
idx = y == class_
|
||||
if idx.any():
|
||||
confusion[i] = y_[idx].mean(axis=0)
|
||||
|
||||
self.Pte_cond_estim_ = confusion.T
|
||||
|
||||
return self
|
||||
return confusion.T
|
||||
|
||||
def aggregate(self, classif_posteriors):
|
||||
prevs_estim = self.pcc.aggregate(classif_posteriors)
|
||||
|
@ -557,7 +594,7 @@ class HDy(AggregativeProbabilisticQuantifier, BinaryQuantifier):
|
|||
self._check_binary(data, self.__class__.__name__)
|
||||
self.learner, validation = _training_helper(
|
||||
self.learner, data, fit_learner, ensure_probabilistic=True, val_split=val_split)
|
||||
Px = self.posterior_probabilities(validation.instances)[:, 1] # takes only the P(y=+1|x)
|
||||
Px = self.classify(validation.instances)[:, 1] # takes only the P(y=+1|x)
|
||||
self.Pxy1 = Px[validation.labels == self.learner.classes_[1]]
|
||||
self.Pxy0 = Px[validation.labels == self.learner.classes_[0]]
|
||||
# pre-compute the histogram for positive and negative examples
|
||||
|
@ -732,44 +769,24 @@ class ThresholdOptimization(AggregativeQuantifier, BinaryQuantifier):
|
|||
:class:`quapy.data.base.LabelledCollection` (the split itself).
|
||||
"""
|
||||
|
||||
def __init__(self, learner: BaseEstimator, val_split=0.4):
|
||||
def __init__(self, learner: BaseEstimator, val_split=0.4, n_jobs=1):
|
||||
self.learner = learner
|
||||
self.val_split = val_split
|
||||
self.n_jobs = n_jobs
|
||||
|
||||
def fit(self, data: LabelledCollection, fit_learner=True, val_split: Union[float, int, LabelledCollection] = None):
|
||||
self._check_binary(data, "Threshold Optimization")
|
||||
|
||||
if val_split is None:
|
||||
val_split = self.val_split
|
||||
if isinstance(val_split, int):
|
||||
assert fit_learner == True, \
|
||||
'the parameters for the adjustment cannot be estimated with kFCV with fit_learner=False'
|
||||
# kFCV estimation of parameters
|
||||
y, probabilities = [], []
|
||||
kfcv = StratifiedKFold(n_splits=val_split)
|
||||
pbar = tqdm(kfcv.split(*data.Xy), total=val_split)
|
||||
for k, (training_idx, validation_idx) in enumerate(pbar):
|
||||
pbar.set_description(f'{self.__class__.__name__} fitting fold {k}')
|
||||
training = data.sampling_from_index(training_idx)
|
||||
validation = data.sampling_from_index(validation_idx)
|
||||
learner, val_data = _training_helper(self.learner, training, fit_learner, val_split=validation)
|
||||
probabilities.append(learner.predict_proba(val_data.instances))
|
||||
y.append(val_data.labels)
|
||||
|
||||
y = np.concatenate(y)
|
||||
probabilities = np.concatenate(probabilities)
|
||||
|
||||
# fit the learner on all data
|
||||
self.learner, _ = _training_helper(self.learner, data, fit_learner, val_split=None)
|
||||
|
||||
else:
|
||||
self.learner, val_data = _training_helper(self.learner, data, fit_learner, val_split=val_split)
|
||||
probabilities = self.learner.predict_proba(val_data.instances)
|
||||
y = val_data.labels
|
||||
self.learner, y, y_, classes, class_count = cross_generate_predictions(
|
||||
data, self.learner, val_split, probabilistic=True, fit_learner=fit_learner, n_jobs=self.n_jobs
|
||||
)
|
||||
|
||||
self.cc = CC(self.learner)
|
||||
|
||||
self.tpr, self.fpr = self._optimize_threshold(y, probabilities)
|
||||
self.tpr, self.fpr = self._optimize_threshold(y, y_)
|
||||
|
||||
return self
|
||||
|
||||
|
@ -828,7 +845,7 @@ class ThresholdOptimization(AggregativeQuantifier, BinaryQuantifier):
|
|||
|
||||
def _compute_tpr(self, TP, FP):
|
||||
if TP + FP == 0:
|
||||
return 0
|
||||
return 1
|
||||
return TP / (TP + FP)
|
||||
|
||||
def _compute_fpr(self, FP, TN):
|
||||
|
@ -1022,54 +1039,59 @@ class OneVsAll(AggregativeQuantifier):
|
|||
|
||||
def classify(self, instances):
|
||||
"""
|
||||
Returns a matrix of shape `(n,m,)` with `n` the number of instances and `m` the number of classes. The entry
|
||||
`(i,j)` is a binary value indicating whether instance `i `belongs to class `j`. The binary classifications are
|
||||
independent of each other, meaning that an instance can end up be attributed to 0, 1, or more classes.
|
||||
If the base quantifier is not probabilistic, returns a matrix of shape `(n,m,)` with `n` the number of
|
||||
instances and `m` the number of classes. The entry `(i,j)` is a binary value indicating whether instance
|
||||
`i `belongs to class `j`. The binary classifications are independent of each other, meaning that an instance
|
||||
can end up be attributed to 0, 1, or more classes.
|
||||
If the base quantifier is probabilistic, returns a matrix of shape `(n,m,2)` with `n` the number of instances
|
||||
and `m` the number of classes. The entry `(i,j,1)` (resp. `(i,j,0)`) is a value in [0,1] indicating the
|
||||
posterior probability that instance `i` belongs (resp. does not belong) to class `j`. The posterior
|
||||
probabilities are independent of each other, meaning that, in general, they do not sum up to one.
|
||||
|
||||
:param instances: array-like
|
||||
:return: `np.ndarray`
|
||||
"""
|
||||
|
||||
classif_predictions_bin = self.__parallel(self._delayed_binary_classification, instances)
|
||||
return classif_predictions_bin.T
|
||||
|
||||
def posterior_probabilities(self, instances):
|
||||
"""
|
||||
Returns a matrix of shape `(n,m,2)` with `n` the number of instances and `m` the number of classes. The entry
|
||||
`(i,j,1)` (resp. `(i,j,0)`) is a value in [0,1] indicating the posterior probability that instance `i` belongs
|
||||
(resp. does not belong) to class `j`.
|
||||
The posterior probabilities are independent of each other, meaning that, in general, they do not sum
|
||||
up to one.
|
||||
|
||||
:param instances: array-like
|
||||
:return: `np.ndarray`
|
||||
"""
|
||||
|
||||
if not self.binary_quantifier.probabilistic:
|
||||
raise NotImplementedError(f'{self.__class__.__name__} does not implement posterior_probabilities because '
|
||||
f'the base quantifier {self.binary_quantifier.__class__.__name__} is not '
|
||||
f'probabilistic')
|
||||
posterior_predictions_bin = self.__parallel(self._delayed_binary_posteriors, instances)
|
||||
return np.swapaxes(posterior_predictions_bin, 0, 1)
|
||||
|
||||
def aggregate(self, classif_predictions_bin):
|
||||
if self.probabilistic:
|
||||
assert classif_predictions_bin.shape[1] == self.n_classes and classif_predictions_bin.shape[2] == 2, \
|
||||
'param classif_predictions_bin does not seem to be a valid matrix (ndarray) of posterior ' \
|
||||
'probabilities (2 dimensions) for each document (row) and class (columns)'
|
||||
classif_predictions = self.__parallel(self._delayed_binary_classification, instances)
|
||||
if isinstance(self.binary_quantifier, AggregativeProbabilisticQuantifier):
|
||||
return np.swapaxes(classif_predictions, 0, 1)
|
||||
else:
|
||||
assert set(np.unique(classif_predictions_bin)).issubset({0, 1}), \
|
||||
'param classif_predictions_bin does not seem to be a valid matrix (ndarray) of binary ' \
|
||||
'predictions for each document (row) and class (columns)'
|
||||
prevalences = self.__parallel(self._delayed_binary_aggregate, classif_predictions_bin)
|
||||
return classif_predictions.T
|
||||
#
|
||||
# def posterior_probabilities(self, instances):
|
||||
# """
|
||||
# Returns a matrix of shape `(n,m,2)` with `n` the number of instances and `m` the number of classes. The entry
|
||||
# `(i,j,1)` (resp. `(i,j,0)`) is a value in [0,1] indicating the posterior probability that instance `i` belongs
|
||||
# (resp. does not belong) to class `j`.
|
||||
# The posterior probabilities are independent of each other, meaning that, in general, they do not sum
|
||||
# up to one.
|
||||
#
|
||||
# :param instances: array-like
|
||||
# :return: `np.ndarray`
|
||||
# """
|
||||
#
|
||||
# if not isinstance(self.binary_quantifier, AggregativeProbabilisticQuantifier):
|
||||
# raise NotImplementedError(f'{self.__class__.__name__} does not implement posterior_probabilities because '
|
||||
# f'the base quantifier {self.binary_quantifier.__class__.__name__} is not '
|
||||
# f'probabilistic')
|
||||
# posterior_predictions_bin = self.__parallel(self._delayed_binary_posteriors, instances)
|
||||
# return np.swapaxes(posterior_predictions_bin, 0, 1)
|
||||
|
||||
def aggregate(self, classif_predictions):
|
||||
# if self.probabilistic:
|
||||
# assert classif_predictions.shape[1] == self.n_classes and classif_predictions.shape[2] == 2, \
|
||||
# 'param classif_predictions_bin does not seem to be a valid matrix (ndarray) of posterior ' \
|
||||
# 'probabilities (2 dimensions) for each document (row) and class (columns)'
|
||||
# else:
|
||||
# assert set(np.unique(classif_predictions)).issubset({0, 1}), \
|
||||
# 'param classif_predictions_bin does not seem to be a valid matrix (ndarray) of binary ' \
|
||||
# 'predictions for each document (row) and class (columns)'
|
||||
prevalences = self.__parallel(self._delayed_binary_aggregate, classif_predictions)
|
||||
return F.normalize_prevalence(prevalences)
|
||||
|
||||
def quantify(self, X):
|
||||
if self.probabilistic:
|
||||
predictions = self.posterior_probabilities(X)
|
||||
else:
|
||||
predictions = self.classify(X)
|
||||
return self.aggregate(predictions)
|
||||
# def quantify(self, X):
|
||||
# predictions = self.classify(X)
|
||||
# return self.aggregate(predictions)
|
||||
|
||||
def __parallel(self, func, *args, **kwargs):
|
||||
return np.asarray(
|
||||
|
@ -1093,9 +1115,6 @@ class OneVsAll(AggregativeQuantifier):
|
|||
def _delayed_binary_classification(self, c, X):
|
||||
return self.dict_binary_quantifiers[c].classify(X)
|
||||
|
||||
def _delayed_binary_posteriors(self, c, X):
|
||||
return self.dict_binary_quantifiers[c].posterior_probabilities(X)
|
||||
|
||||
def _delayed_binary_aggregate(self, c, classif_predictions):
|
||||
# the estimation for the positive class prevalence
|
||||
return self.dict_binary_quantifiers[c].aggregate(classif_predictions[:, c])[1]
|
||||
|
@ -1104,21 +1123,3 @@ class OneVsAll(AggregativeQuantifier):
|
|||
bindata = LabelledCollection(data.instances, data.labels == c, classes_=[False, True])
|
||||
self.dict_binary_quantifiers[c].fit(bindata)
|
||||
|
||||
@property
|
||||
def binary(self):
|
||||
"""
|
||||
Informs that the classifier is not binary
|
||||
|
||||
:return: False
|
||||
"""
|
||||
return False
|
||||
|
||||
@property
|
||||
def probabilistic(self):
|
||||
"""
|
||||
Indicates if the classifier is probabilistic or not (depending on the nature of the base classifier).
|
||||
|
||||
:return: boolean
|
||||
"""
|
||||
|
||||
return self.binary_quantifier.probabilistic
|
||||
|
|
|
@ -51,56 +51,6 @@ class BaseQuantifier(metaclass=ABCMeta):
|
|||
"""
|
||||
...
|
||||
|
||||
@property
|
||||
@abstractmethod
|
||||
def classes_(self):
|
||||
"""
|
||||
Class labels, in the same order in which class prevalence values are to be computed.
|
||||
|
||||
:return: array-like
|
||||
"""
|
||||
...
|
||||
|
||||
@property
|
||||
def n_classes(self):
|
||||
"""
|
||||
Returns the number of classes
|
||||
|
||||
:return: integer
|
||||
"""
|
||||
return len(self.classes_)
|
||||
|
||||
# these methods allows meta-learners to reimplement the decision based on their constituents, and not
|
||||
# based on class structure
|
||||
@property
|
||||
def binary(self):
|
||||
"""
|
||||
Indicates whether the quantifier is binary or not.
|
||||
|
||||
:return: False (to be overridden)
|
||||
"""
|
||||
return False
|
||||
|
||||
@property
|
||||
def aggregative(self):
|
||||
"""
|
||||
Indicates whether the quantifier is of type aggregative or not
|
||||
|
||||
:return: False (to be overridden)
|
||||
"""
|
||||
|
||||
return False
|
||||
|
||||
@property
|
||||
def probabilistic(self):
|
||||
"""
|
||||
Indicates whether the quantifier is of type probabilistic or not
|
||||
|
||||
:return: False (to be overridden)
|
||||
"""
|
||||
|
||||
return False
|
||||
|
||||
|
||||
class BinaryQuantifier(BaseQuantifier):
|
||||
"""
|
||||
|
@ -112,46 +62,8 @@ class BinaryQuantifier(BaseQuantifier):
|
|||
assert data.binary, f'{quantifier_name} works only on problems of binary classification. ' \
|
||||
f'Use the class OneVsAll to enable {quantifier_name} work on single-label data.'
|
||||
|
||||
@property
|
||||
def binary(self):
|
||||
"""
|
||||
Informs that the quantifier is binary
|
||||
|
||||
:return: True
|
||||
"""
|
||||
return True
|
||||
|
||||
|
||||
def isbinary(model:BaseQuantifier):
|
||||
"""
|
||||
Alias for property `binary`
|
||||
|
||||
:param model: the model
|
||||
:return: True if the model is binary, False otherwise
|
||||
"""
|
||||
return model.binary
|
||||
|
||||
|
||||
def isaggregative(model:BaseQuantifier):
|
||||
"""
|
||||
Alias for property `aggregative`
|
||||
|
||||
:param model: the model
|
||||
:return: True if the model is aggregative, False otherwise
|
||||
"""
|
||||
|
||||
return model.aggregative
|
||||
|
||||
|
||||
def isprobabilistic(model:BaseQuantifier):
|
||||
"""
|
||||
Alias for property `probabilistic`
|
||||
|
||||
:param model: the model
|
||||
:return: True if the model is probabilistic, False otherwise
|
||||
"""
|
||||
|
||||
return model.probabilistic
|
||||
|
||||
|
||||
# class OneVsAll:
|
||||
|
|
|
@ -234,19 +234,6 @@ class Ensemble(BaseQuantifier):
|
|||
order = np.argsort(dist)
|
||||
return _select_k(predictions, order, k=self.red_size)
|
||||
|
||||
@property
|
||||
def classes_(self):
|
||||
return self.base_quantifier.classes_
|
||||
|
||||
@property
|
||||
def binary(self):
|
||||
"""
|
||||
Returns a boolean indicating whether the base quantifiers are binary or not
|
||||
|
||||
:return: boolean
|
||||
"""
|
||||
return self.base_quantifier.binary
|
||||
|
||||
@property
|
||||
def aggregative(self):
|
||||
"""
|
||||
|
|
|
@ -191,7 +191,7 @@ class QuaNetTrainer(BaseQuantifier):
|
|||
label_predictions = np.argmax(posteriors, axis=-1)
|
||||
prevs_estim = []
|
||||
for quantifier in self.quantifiers.values():
|
||||
predictions = posteriors if quantifier.probabilistic else label_predictions
|
||||
predictions = posteriors if isinstance(quantifier, AggregativeProbabilisticQuantifier) else label_predictions
|
||||
prevs_estim.extend(quantifier.aggregate(predictions))
|
||||
|
||||
# there is no real need for adding static estims like the TPR or FPR from training since those are constant
|
||||
|
|
|
@ -2,14 +2,12 @@ import itertools
|
|||
import signal
|
||||
from copy import deepcopy
|
||||
from typing import Union, Callable
|
||||
|
||||
import numpy as np
|
||||
|
||||
import evaluation
|
||||
import quapy as qp
|
||||
from protocol import AbstractProtocol, OnLabelledCollectionProtocol
|
||||
from quapy.data.base import LabelledCollection
|
||||
from quapy.evaluation import artificial_prevalence_prediction, natural_prevalence_prediction, gen_prevalence_prediction
|
||||
from quapy.method.aggregative import BaseQuantifier
|
||||
import inspect
|
||||
from time import time
|
||||
|
||||
|
||||
class GridSearchQ(BaseQuantifier):
|
||||
|
@ -21,33 +19,11 @@ class GridSearchQ(BaseQuantifier):
|
|||
:param model: the quantifier to optimize
|
||||
:type model: BaseQuantifier
|
||||
:param param_grid: a dictionary with keys the parameter names and values the list of values to explore
|
||||
:param sample_size: the size of the samples to extract from the validation set (ignored if protocol='gen')
|
||||
:param protocol: either 'app' for the artificial prevalence protocol, 'npp' for the natural prevalence
|
||||
protocol, or 'gen' for using a custom sampling generator function
|
||||
:param n_prevpoints: if specified, indicates the number of equally distant points to extract from the interval
|
||||
[0,1] in order to define the prevalences of the samples; e.g., if n_prevpoints=5, then the prevalences for
|
||||
each class will be explored in [0.00, 0.25, 0.50, 0.75, 1.00]. If not specified, then eval_budget is requested.
|
||||
Ignored if protocol!='app'.
|
||||
:param n_repetitions: the number of repetitions for each combination of prevalences. This parameter is ignored
|
||||
for the protocol='app' if eval_budget is set and is lower than the number of combinations that would be
|
||||
generated using the value assigned to n_prevpoints (for the current number of classes and n_repetitions).
|
||||
Ignored for protocol='npp' and protocol='gen' (use eval_budget for setting a maximum number of samples in
|
||||
those cases).
|
||||
:param eval_budget: if specified, sets a ceil on the number of evaluations to perform for each hyper-parameter
|
||||
combination. For example, if protocol='app', there are 3 classes, n_repetitions=1 and eval_budget=20, then
|
||||
n_prevpoints will be set to 5, since this will generate 15 different prevalences, i.e., [0, 0, 1],
|
||||
[0, 0.25, 0.75], [0, 0.5, 0.5] ... [1, 0, 0], and since setting it to 6 would generate more than
|
||||
20. When protocol='gen', indicates the maximum number of samples to generate, but less samples will be
|
||||
generated if the generator yields less samples.
|
||||
:param protocol:
|
||||
:param error: an error function (callable) or a string indicating the name of an error function (valid ones
|
||||
are those in qp.error.QUANTIFICATION_ERROR
|
||||
:param refit: whether or not to refit the model on the whole labelled collection (training+validation) with
|
||||
the best chosen hyperparameter combination. Ignored if protocol='gen'
|
||||
:param val_split: either a LabelledCollection on which to test the performance of the different settings, or
|
||||
a float in [0,1] indicating the proportion of labelled data to extract from the training set, or a callable
|
||||
returning a generator function each time it is invoked (only for protocol='gen').
|
||||
:param n_jobs: number of parallel jobs
|
||||
:param random_seed: set the seed of the random generator to replicate experiments. Ignored if protocol='gen'.
|
||||
:param timeout: establishes a timer (in seconds) for each of the hyperparameters configurations being tested.
|
||||
Whenever a run takes longer than this timer, that configuration will be ignored. If all configurations end up
|
||||
being ignored, a TimeoutError exception is raised. If -1 (default) then no time bound is set.
|
||||
|
@ -57,65 +33,27 @@ class GridSearchQ(BaseQuantifier):
|
|||
def __init__(self,
|
||||
model: BaseQuantifier,
|
||||
param_grid: dict,
|
||||
sample_size: Union[int, None],
|
||||
protocol='app',
|
||||
n_prevpoints: int = None,
|
||||
n_repetitions: int = 1,
|
||||
eval_budget: int = None,
|
||||
protocol: AbstractProtocol,
|
||||
error: Union[Callable, str] = qp.error.mae,
|
||||
refit=True,
|
||||
val_split=0.4,
|
||||
n_jobs=1,
|
||||
random_seed=42,
|
||||
timeout=-1,
|
||||
n_jobs=1,
|
||||
verbose=False):
|
||||
|
||||
self.model = model
|
||||
self.param_grid = param_grid
|
||||
self.sample_size = sample_size
|
||||
self.protocol = protocol.lower()
|
||||
self.n_prevpoints = n_prevpoints
|
||||
self.n_repetitions = n_repetitions
|
||||
self.eval_budget = eval_budget
|
||||
self.protocol = protocol
|
||||
self.refit = refit
|
||||
self.val_split = val_split
|
||||
self.n_jobs = n_jobs
|
||||
self.random_seed = random_seed
|
||||
self.timeout = timeout
|
||||
self.n_jobs = n_jobs
|
||||
self.verbose = verbose
|
||||
self.__check_error(error)
|
||||
assert self.protocol in {'app', 'npp', 'gen'}, \
|
||||
'unknown protocol: valid ones are "app" or "npp" for the "artificial" or the "natural" prevalence ' \
|
||||
'protocols. Use protocol="gen" when passing a generator function thorough val_split that yields a ' \
|
||||
'sample (instances) and their prevalence (ndarray) at each iteration.'
|
||||
assert self.eval_budget is None or isinstance(self.eval_budget, int)
|
||||
if self.protocol in ['npp', 'gen']:
|
||||
if self.protocol=='npp' and (self.eval_budget is None or self.eval_budget <= 0):
|
||||
raise ValueError(f'when protocol="npp" the parameter eval_budget should be '
|
||||
f'indicated (and should be >0).')
|
||||
if self.n_repetitions != 1:
|
||||
print('[warning] n_repetitions has been set and will be ignored for the selected protocol')
|
||||
assert isinstance(protocol, AbstractProtocol), 'unknown protocol'
|
||||
|
||||
def _sout(self, msg):
|
||||
if self.verbose:
|
||||
print(f'[{self.__class__.__name__}]: {msg}')
|
||||
|
||||
def __check_training_validation(self, training, validation):
|
||||
if isinstance(validation, LabelledCollection):
|
||||
return training, validation
|
||||
elif isinstance(validation, float):
|
||||
assert 0. < validation < 1., 'validation proportion should be in (0,1)'
|
||||
training, validation = training.split_stratified(train_prop=1 - validation)
|
||||
return training, validation
|
||||
elif self.protocol=='gen' and inspect.isgenerator(validation()):
|
||||
return training, validation
|
||||
else:
|
||||
raise ValueError(f'"validation" must either be a LabelledCollection or a float in (0,1) indicating the'
|
||||
f'proportion of training documents to extract (type found: {type(validation)}). '
|
||||
f'Optionally, "validation" can be a callable function returning a generator that yields '
|
||||
f'the sample instances along with their true prevalence at each iteration by '
|
||||
f'setting protocol="gen".')
|
||||
|
||||
def __check_error(self, error):
|
||||
if error in qp.error.QUANTIFICATION_ERROR:
|
||||
self.error = error
|
||||
|
@ -127,96 +65,86 @@ class GridSearchQ(BaseQuantifier):
|
|||
raise ValueError(f'unexpected error type; must either be a callable function or a str representing\n'
|
||||
f'the name of an error function in {qp.error.QUANTIFICATION_ERROR_NAMES}')
|
||||
|
||||
def __generate_predictions(self, model, val_split):
|
||||
commons = {
|
||||
'n_repetitions': self.n_repetitions,
|
||||
'n_jobs': self.n_jobs,
|
||||
'random_seed': self.random_seed,
|
||||
'verbose': False
|
||||
}
|
||||
if self.protocol == 'app':
|
||||
return artificial_prevalence_prediction(
|
||||
model, val_split, self.sample_size,
|
||||
n_prevpoints=self.n_prevpoints,
|
||||
eval_budget=self.eval_budget,
|
||||
**commons
|
||||
)
|
||||
elif self.protocol == 'npp':
|
||||
return natural_prevalence_prediction(
|
||||
model, val_split, self.sample_size,
|
||||
**commons)
|
||||
elif self.protocol == 'gen':
|
||||
return gen_prevalence_prediction(model, gen_fn=val_split, eval_budget=self.eval_budget)
|
||||
else:
|
||||
raise ValueError('unknown protocol')
|
||||
|
||||
def fit(self, training: LabelledCollection, val_split: Union[LabelledCollection, float, Callable] = None):
|
||||
def fit(self, training: LabelledCollection):
|
||||
""" Learning routine. Fits methods with all combinations of hyperparameters and selects the one minimizing
|
||||
the error metric.
|
||||
|
||||
:param training: the training set on which to optimize the hyperparameters
|
||||
:param val_split: either a LabelledCollection on which to test the performance of the different settings, or
|
||||
a float in [0,1] indicating the proportion of labelled data to extract from the training set
|
||||
:return: self
|
||||
"""
|
||||
if val_split is None:
|
||||
val_split = self.val_split
|
||||
training, val_split = self.__check_training_validation(training, val_split)
|
||||
if self.protocol != 'gen':
|
||||
assert isinstance(self.sample_size, int) and self.sample_size > 0, 'sample_size must be a positive integer'
|
||||
|
||||
params_keys = list(self.param_grid.keys())
|
||||
params_values = list(self.param_grid.values())
|
||||
|
||||
model = self.model
|
||||
|
||||
if self.timeout > 0:
|
||||
def handler(signum, frame):
|
||||
self._sout('timeout reached')
|
||||
raise TimeoutError()
|
||||
|
||||
signal.signal(signal.SIGALRM, handler)
|
||||
protocol = self.protocol
|
||||
n_jobs = self.n_jobs
|
||||
|
||||
self.param_scores_ = {}
|
||||
self.best_score_ = None
|
||||
some_timeouts = False
|
||||
for values in itertools.product(*params_values):
|
||||
params = dict({k: values[i] for i, k in enumerate(params_keys)})
|
||||
|
||||
if self.timeout > 0:
|
||||
signal.alarm(self.timeout)
|
||||
hyper = [dict({k: values[i] for i, k in enumerate(params_keys)}) for values in itertools.product(*params_values)]
|
||||
scores = qp.util.parallel(self._delayed_eval, ((params, training) for params in hyper), n_jobs=n_jobs)
|
||||
|
||||
try:
|
||||
# overrides default parameters with the parameters being explored at this iteration
|
||||
model.set_params(**params)
|
||||
model.fit(training)
|
||||
true_prevalences, estim_prevalences = self.__generate_predictions(model, val_split)
|
||||
score = self.error(true_prevalences, estim_prevalences)
|
||||
|
||||
self._sout(f'checking hyperparams={params} got {self.error.__name__} score {score:.5f}')
|
||||
for params, score, model in scores:
|
||||
if score is not None:
|
||||
if self.best_score_ is None or score < self.best_score_:
|
||||
self.best_score_ = score
|
||||
self.best_params_ = params
|
||||
self.best_model_ = deepcopy(model)
|
||||
self.best_model_ = model
|
||||
self.param_scores_[str(params)] = score
|
||||
else:
|
||||
self.param_scores_[str(params)] = 'timeout'
|
||||
|
||||
if self.timeout > 0:
|
||||
signal.alarm(0)
|
||||
except TimeoutError:
|
||||
print(f'timeout reached for config {params}')
|
||||
some_timeouts = True
|
||||
|
||||
if self.best_score_ is None and some_timeouts:
|
||||
if self.best_score_ is None:
|
||||
raise TimeoutError('all jobs took more than the timeout time to end')
|
||||
|
||||
self._sout(f'optimization finished: best params {self.best_params_} (score={self.best_score_:.5f})')
|
||||
|
||||
if self.refit:
|
||||
self._sout(f'refitting on the whole development set')
|
||||
self.best_model_.fit(training + val_split)
|
||||
if isinstance(protocol, OnLabelledCollectionProtocol):
|
||||
self._sout(f'refitting on the whole development set')
|
||||
self.best_model_.fit(training + protocol.get_labelled_collection())
|
||||
else:
|
||||
raise RuntimeWarning(f'"refit" was requested, but the protocol does not '
|
||||
f'implement the {OnLabelledCollectionProtocol.__name__} interface')
|
||||
|
||||
return self
|
||||
|
||||
def _delayed_eval(self, args):
|
||||
params, training = args
|
||||
|
||||
protocol = self.protocol
|
||||
error = self.error
|
||||
|
||||
if self.timeout > 0:
|
||||
def handler(signum, frame):
|
||||
raise TimeoutError()
|
||||
|
||||
signal.signal(signal.SIGALRM, handler)
|
||||
|
||||
tinit = time()
|
||||
|
||||
if self.timeout > 0:
|
||||
signal.alarm(self.timeout)
|
||||
|
||||
try:
|
||||
model = deepcopy(self.model)
|
||||
# overrides default parameters with the parameters being explored at this iteration
|
||||
model.set_params(**params)
|
||||
model.fit(training)
|
||||
score = evaluation.evaluate(model, protocol=protocol, error_metric=error)
|
||||
|
||||
ttime = time()-tinit
|
||||
self._sout(f'hyperparams={params}\t got {error.__name__} score {score:.5f} [took {ttime:.4f}s]')
|
||||
|
||||
if self.timeout > 0:
|
||||
signal.alarm(0)
|
||||
except TimeoutError:
|
||||
self._sout(f'timeout ({self.timeout}s) reached for config {params}')
|
||||
score = None
|
||||
|
||||
return params, score, model
|
||||
|
||||
|
||||
def quantify(self, instances):
|
||||
"""Estimate class prevalence values using the best model found after calling the :meth:`fit` method.
|
||||
|
||||
|
@ -227,14 +155,6 @@ class GridSearchQ(BaseQuantifier):
|
|||
assert hasattr(self, 'best_model_'), 'quantify called before fit'
|
||||
return self.best_model().quantify(instances)
|
||||
|
||||
@property
|
||||
def classes_(self):
|
||||
"""
|
||||
Classes on which the quantifier has been trained on.
|
||||
:return: a ndarray of shape `(n_classes)` with the class identifiers
|
||||
"""
|
||||
return self.best_model().classes_
|
||||
|
||||
def set_params(self, **parameters):
|
||||
"""Sets the hyper-parameters to explore.
|
||||
|
||||
|
@ -260,3 +180,5 @@ class GridSearchQ(BaseQuantifier):
|
|||
if hasattr(self, 'best_model_'):
|
||||
return self.best_model_
|
||||
raise ValueError('best_model called before fit')
|
||||
|
||||
|
||||
|
|
|
@ -1,12 +1,16 @@
|
|||
from copy import deepcopy
|
||||
|
||||
import quapy as qp
|
||||
import numpy as np
|
||||
import itertools
|
||||
from collections.abc import Generator
|
||||
from contextlib import ExitStack
|
||||
from abc import ABCMeta, abstractmethod
|
||||
|
||||
from quapy.data import LabelledCollection
|
||||
import quapy.functional as F
|
||||
from tqdm import tqdm
|
||||
from os.path import exists
|
||||
from glob import glob
|
||||
|
||||
|
||||
# 0.1.7
|
||||
|
@ -61,6 +65,8 @@ class AbstractStochasticSeededProtocol(AbstractProtocol):
|
|||
the sequence will be different every time the protocol is called.
|
||||
"""
|
||||
|
||||
_random_seed = -1 # means "not set"
|
||||
|
||||
def __init__(self, seed=None):
|
||||
self.random_seed = seed
|
||||
|
||||
|
@ -93,13 +99,47 @@ class AbstractStochasticSeededProtocol(AbstractProtocol):
|
|||
|
||||
def __call__(self):
|
||||
with ExitStack() as stack:
|
||||
if self.random_seed == -1:
|
||||
raise ValueError('The random seed has never been initialized. '
|
||||
'Set it to None not to impose replicability.')
|
||||
if self.random_seed is not None:
|
||||
stack.enter_context(qp.util.temp_seed(self.random_seed))
|
||||
for params in self.samples_parameters():
|
||||
yield self.sample(params)
|
||||
|
||||
|
||||
class APP(AbstractStochasticSeededProtocol):
|
||||
class OnLabelledCollectionProtocol:
|
||||
def get_labelled_collection(self):
|
||||
return self.data
|
||||
|
||||
def on_preclassified_instances(self, pre_classifications, in_place=False):
|
||||
assert len(pre_classifications) == len(self.data), \
|
||||
f'error: the pre-classified data has different shape ' \
|
||||
f'(expected {len(self.data)}, found {len(pre_classifications)})'
|
||||
if in_place:
|
||||
self.data.instances = pre_classifications
|
||||
return self
|
||||
else:
|
||||
new = deepcopy(self)
|
||||
return new.on_preclassified_instances(pre_classifications, in_place=True)
|
||||
|
||||
|
||||
class LoadSamplesFromDirectory(AbstractProtocol):
|
||||
|
||||
def __init__(self, folder_path, loader_fn, classes=None, **loader_kwargs):
|
||||
assert exists(folder_path), f'folder {folder_path} does not exist'
|
||||
assert callable(loader_fn), f'the passed load_fn does not seem to be callable'
|
||||
self.folder_path = folder_path
|
||||
self.loader_fn = loader_fn
|
||||
self.classes = classes
|
||||
self.loader_kwargs = loader_kwargs
|
||||
|
||||
def __call__(self):
|
||||
for file in sorted(glob(self.folder_path, '*')):
|
||||
yield LabelledCollection.load(file, loader_func=self.loader_fn, classes=self.classes, **self.loader_kwargs)
|
||||
|
||||
|
||||
class APP(AbstractStochasticSeededProtocol, OnLabelledCollectionProtocol):
|
||||
"""
|
||||
    Implementation of the artificial prevalence protocol (APP).

    The APP consists of exploring a grid of prevalence values containing `n_prevalences` points (e.g.,

@@ -123,7 +163,7 @@ class APP(AbstractStochasticSeededProtocol):
        self.n_prevalences = n_prevalences
        self.repeats = repeats

    def prevalence_grid(self, dimensions):
    def prevalence_grid(self):
        """
        Generates vectors of prevalence values from an exhaustive grid of prevalence values. The
        number of prevalence values explored for each dimension depends on `n_prevalences`, so that, if, for example,

@@ -134,14 +174,14 @@ class APP(AbstractStochasticSeededProtocol):
        to 1 - sum of the rest) is not returned (note that, quite obviously, in this case the vector does not sum up to
        1). Note that this method is deterministic, i.e., there is no random sampling anywhere.

        :param dimensions: the number of classes
        :return: a `np.ndarray` of shape `(n, dimensions)` if `return_constrained_dim=True` or of shape
        `(n, dimensions-1)` if `return_constrained_dim=False`, where `n` is the number of valid combinations found
        in the grid multiplied by `repeat`
        """
        dimensions = self.data.n_classes
        s = np.linspace(0., 1., self.n_prevalences, endpoint=True)
        s = [s] * (dimensions - 1)
        prevs = [p for p in itertools.product(*s, repeat=1) if sum(p) <= 1]
        prevs = [p for p in itertools.product(*s, repeat=1) if (sum(p) <= 1.0)]
        prevs = np.asarray(prevs).reshape(len(prevs), -1)
        if self.repeats > 1:
            prevs = np.repeat(prevs, self.repeats, axis=0)

@@ -149,8 +189,8 @@ class APP(AbstractStochasticSeededProtocol):

    def samples_parameters(self):
        indexes = []
        for prevs in self.prevalence_grid(dimensions=self.data.n_classes):
            index = data.sampling_index(self.sample_size, *prevs)
        for prevs in self.prevalence_grid():
            index = self.data.sampling_index(self.sample_size, *prevs)
            indexes.append(index)
        return indexes

@@ -161,7 +201,7 @@ class APP(AbstractStochasticSeededProtocol):
        return F.num_prevalence_combinations(self.n_prevalences, self.data.n_classes, self.repeats)


class NPP(AbstractStochasticSeededProtocol):
class NPP(AbstractStochasticSeededProtocol, OnLabelledCollectionProtocol):
    """
    A generator of samples that implements the natural prevalence protocol (NPP). The NPP consists of drawing
    samples uniformly at random, therefore approximately preserving the natural prevalence of the collection.

@@ -182,7 +222,7 @@ class NPP(AbstractStochasticSeededProtocol):
    def samples_parameters(self):
        indexes = []
        for _ in range(self.repeats):
            index = data.uniform_sampling_index(self.sample_size)
            index = self.data.uniform_sampling_index(self.sample_size)
            indexes.append(index)
        return indexes

@@ -193,8 +233,7 @@ class NPP(AbstractStochasticSeededProtocol):
        return self.repeats


class USimplexPP(AbstractStochasticSeededProtocol):
class USimplexPP(AbstractStochasticSeededProtocol, OnLabelledCollectionProtocol):
    """
    A variant of :class:`APP` that, instead of using a grid of equidistant prevalence values,
    relies on the Kraemer algorithm for sampling unit (k-1)-simplex uniformly at random, with

@@ -218,8 +257,8 @@ class USimplexPP(AbstractStochasticSeededProtocol):

    def samples_parameters(self):
        indexes = []
        for prevs in F.uniform_simplex_sampling(n_classes=data.n_classes, size=self.repeats):
            index = data.sampling_index(self.sample_size, *prevs)
        for prevs in F.uniform_simplex_sampling(n_classes=self.data.n_classes, size=self.repeats):
            index = self.data.sampling_index(self.sample_size, *prevs)
            indexes.append(index)
        return indexes

@@ -230,7 +269,6 @@ class USimplexPP(AbstractStochasticSeededProtocol):
        return self.repeats


class CovariateShiftPP(AbstractStochasticSeededProtocol):
    """
    Generates mixtures of two domains (A and B) at controlled rates, but preserving the original class prevalence.

@@ -300,33 +338,3 @@ class CovariateShiftPP(AbstractStochasticSeededProtocol):
        return self.repeats * len(self.mixture_points)


if __name__=='__main__':
    import numpy as np
    import quapy as qp

    # domainA
    y = [0]*25 + [1]*25 + [2]*25 + [3]*25
    X = ['A:'+str(i)+'-'+str(yi) for i, yi in enumerate(y)]
    data = LabelledCollection(X, y, classes_=sorted(np.unique(y)))

    # domain B
    y = [0]*25 + [1]*25 + [2]*25 + [3]*25
    X = ['B:'+str(i)+'-'+str(yi) for i, yi in enumerate(y)]
    dataB = LabelledCollection(X, y, classes_=sorted(np.unique(y)))

    # p = APP(data, sample_size=10, n_prevalences=11, random_seed=42)
    # p = NPP(data, sample_size=10, repeats=10, random_seed=42)
    # p = NPP(data, sample_size=10, repeats=10)
    # p = USimplexPP(data, sample_size=10, repeats=10)
    p = CovariateShiftPP(data, dataB, sample_size=10, mixture_points=11, random_seed=1)

    for _ in range(2):
        print('init generator', p.__class__.__name__)
        for i in tqdm(p(), total=p.total()):
            # print(i)
            print(i.instances, i.labels, i.prevalence())

    print('done')

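Note: the exhaustive grid enumerated by APP.prevalence_grid() above can be reproduced outside the protocol with a few lines of NumPy. The following standalone sketch mirrors the logic shown in the diff; the small parameter values and variable names are chosen only for illustration and are not part of the QuaPy API.

# standalone sketch of the grid construction performed by APP.prevalence_grid()
import itertools
import numpy as np

n_prevalences = 3   # grid points per dimension: 0.0, 0.5, 1.0
n_classes = 3       # the last class is implicit (1 - sum of the rest)
repeats = 2         # each prevalence vector is repeated this many times

s = np.linspace(0., 1., n_prevalences, endpoint=True)
axes = [s] * (n_classes - 1)
# keep only the combinations whose free dimensions sum to at most 1
prevs = [p for p in itertools.product(*axes) if sum(p) <= 1.0]
prevs = np.asarray(prevs).reshape(len(prevs), -1)
if repeats > 1:
    prevs = np.repeat(prevs, repeats, axis=0)

print(prevs)
# rows such as [0., 0.], [0., 0.5], [0.5, 0.5], [1., 0.], each repeated twice;
# the implicit third prevalence of every row is 1 - row.sum()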
@@ -0,0 +1,57 @@
import unittest
import quapy as qp
from sklearn.linear_model import LogisticRegression
from time import time
from method.aggregative import EMQ
from method.base import BaseQuantifier


class EvalTestCase(unittest.TestCase):
    def test_eval_speedup(self):

        data = qp.datasets.fetch_reviews('hp', tfidf=True, min_df=10, pickle=True)
        train, test = data.training, data.test

        protocol = qp.protocol.APP(test, sample_size=1000, n_prevalences=21, repeats=1, random_seed=1)

        class SlowLR(LogisticRegression):
            def predict_proba(self, X):
                import time
                time.sleep(1)
                return super().predict_proba(X)

        emq = EMQ(SlowLR()).fit(train)

        tinit = time()
        score = qp.evaluation.evaluate(emq, protocol, error_metric='mae', verbose=True)
        tend_optim = time()-tinit
        print(f'evaluation (with optimization) took {tend_optim}s [MAE={score:.4f}]')

        class NonAggregativeEMQ(BaseQuantifier):

            def __init__(self, cls):
                self.emq = EMQ(cls)

            def quantify(self, instances):
                return self.emq.quantify(instances)

            def fit(self, data):
                self.emq.fit(data)
                return self

            def set_params(self, **parameters): pass
            def get_params(self, deep=True): pass

        emq = NonAggregativeEMQ(SlowLR()).fit(train)

        tinit = time()
        score = qp.evaluation.evaluate(emq, protocol, error_metric='mae', verbose=True)
        tend_no_optim = time() - tinit
        print(f'evaluation (w/o optimization) took {tend_no_optim}s [MAE={score:.4f}]')

        self.assertEqual(tend_no_optim>tend_optim, True)


if __name__ == '__main__':
    unittest.main()

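The test above checks that evaluating an aggregative quantifier (EMQ) over an APP protocol is faster than evaluating the same model wrapped as a non-aggregative quantifier. The intended speed-up comes from pre-classification: since every sample drawn by the protocol is a subset of the same test collection, the (slow) classifier can be run once over the whole collection, and each sample is then quantified by aggregating the cached outputs. The sketch below only illustrates this idea with made-up helpers (slow_classify, aggregate); it is not the qp.evaluation implementation.

# toy illustration of pre-classification; not the qp.evaluation code
import numpy as np

def slow_classify(X):
    # stands in for an expensive classifier (like SlowLR above)
    return np.random.rand(len(X), 2)

def aggregate(posteriors):
    # stands in for the cheap aggregation step of an aggregative quantifier
    return posteriors.mean(axis=0)

X_test = np.random.rand(1000, 5)
sample_indexes = [np.random.choice(len(X_test), 100) for _ in range(21)]

# without pre-classification: the classifier runs once per sample (21 times)
estimates_slow = [aggregate(slow_classify(X_test[idx])) for idx in sample_indexes]

# with pre-classification: classify the whole collection once, then only aggregate per sample
posteriors = slow_classify(X_test)
estimates_fast = [aggregate(posteriors[idx]) for idx in sample_indexes]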
@@ -0,0 +1,32 @@
import unittest

from sklearn.linear_model import LogisticRegression

import quapy as qp
from quapy.method.aggregative import *


class HierarchyTestCase(unittest.TestCase):

    def test_aggregative(self):
        lr = LogisticRegression()
        for m in [CC(lr), PCC(lr), ACC(lr), PACC(lr)]:
            self.assertEqual(isinstance(m, AggregativeQuantifier), True)

    def test_binary(self):
        lr = LogisticRegression()
        for m in [HDy(lr)]:
            self.assertEqual(isinstance(m, BinaryQuantifier), True)

    def test_probabilistic(self):
        lr = LogisticRegression()
        for m in [CC(lr), ACC(lr)]:
            self.assertEqual(isinstance(m, AggregativeProbabilisticQuantifier), False)
        for m in [PCC(lr), PACC(lr)]:
            self.assertEqual(isinstance(m, AggregativeProbabilisticQuantifier), True)


if __name__ == '__main__':
    unittest.main()

@@ -0,0 +1,77 @@
import unittest

import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

import quapy as qp
from method.aggregative import PACC
from model_selection import GridSearchQ
from protocol import APP


class ModselTestCase(unittest.TestCase):

    def test_modsel(self):

        q = PACC(LogisticRegression(random_state=1, max_iter=5000))

        data = qp.datasets.fetch_reviews('imdb', tfidf=True, min_df=10)
        training, validation = data.training.split_stratified(0.7, random_state=1)
        # test = data.test

        param_grid = {'C': np.logspace(-3,3,7)}
        app = APP(validation, sample_size=100, random_seed=1)
        q = GridSearchQ(
            q, param_grid, protocol=app, error='mae', refit=True, timeout=-1, verbose=True
        ).fit(training)
        print('best params', q.best_params_)
        print('best score', q.best_score_)

        self.assertEqual(q.best_params_['C'], 10.0)
        self.assertEqual(q.best_model().get_params()['C'], 10.0)

    def test_modsel_parallel(self):

        q = PACC(LogisticRegression(random_state=1, max_iter=5000))

        data = qp.datasets.fetch_reviews('imdb', tfidf=True, min_df=10)
        training, validation = data.training.split_stratified(0.7, random_state=1)
        # test = data.test

        param_grid = {'C': np.logspace(-3,3,7)}
        app = APP(validation, sample_size=100, random_seed=1)
        q = GridSearchQ(
            q, param_grid, protocol=app, error='mae', refit=True, timeout=-1, n_jobs=-1, verbose=True
        ).fit(training)
        print('best params', q.best_params_)
        print('best score', q.best_score_)

        self.assertEqual(q.best_params_['C'], 10.0)
        self.assertEqual(q.best_model().get_params()['C'], 10.0)

    def test_modsel_timeout(self):

        class SlowLR(LogisticRegression):
            def fit(self, X, y, sample_weight=None):
                import time
                time.sleep(10)
                super(SlowLR, self).fit(X, y, sample_weight)

        q = PACC(SlowLR())

        data = qp.datasets.fetch_reviews('imdb', tfidf=True, min_df=10)
        training, validation = data.training.split_stratified(0.7, random_state=1)
        # test = data.test

        param_grid = {'C': np.logspace(-3,3,7)}
        app = APP(validation, sample_size=100, random_seed=1)
        q = GridSearchQ(
            q, param_grid, protocol=app, error='mae', refit=True, timeout=3, n_jobs=-1, verbose=True
        )
        with self.assertRaises(TimeoutError):
            q.fit(training)


if __name__ == '__main__':
    unittest.main()

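The model-selection tests above combine GridSearchQ with an APP protocol over a held-out validation split. Conceptually, protocol-based model selection fits each candidate configuration on the training split and scores it by averaging the quantification error over all validation samples generated by the protocol, keeping the configuration with the lowest mean error. The sketch below captures that loop only; it is not the GridSearchQ implementation (quantifier_factory and error are placeholders for the example, and it ignores refit, n_jobs and timeout).

# conceptual sketch of protocol-based model selection; not the GridSearchQ code
from itertools import product
import numpy as np

def grid_search(quantifier_factory, param_grid, train, protocol, error):
    best_params, best_score = None, None
    keys = list(param_grid)
    for values in product(*(param_grid[k] for k in keys)):
        params = dict(zip(keys, values))
        q = quantifier_factory(**params).fit(train)
        # score the candidate on every validation sample generated by the protocol
        scores = [error(sample.prevalence(), q.quantify(sample.instances)) for sample in protocol()]
        mean_score = np.mean(scores)
        if best_score is None or mean_score < best_score:
            best_params, best_score = params, mean_score
    return best_params, best_score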
@@ -0,0 +1,139 @@
import unittest
import numpy as np
from data import LabelledCollection
from protocol import APP, NPP, USimplexPP, CovariateShiftPP, AbstractStochasticSeededProtocol


def mock_labelled_collection(prefix=''):
    y = [0] * 250 + [1] * 250 + [2] * 250 + [3] * 250
    X = [prefix + str(i) + '-' + str(yi) for i, yi in enumerate(y)]
    return LabelledCollection(X, y, classes_=sorted(np.unique(y)))


def samples_to_str(protocol):
    samples_str = ""
    for sample in protocol():
        samples_str += f'{sample.instances}\t{sample.labels}\t{sample.prevalence()}\n'
    return samples_str


class TestProtocols(unittest.TestCase):

    def test_app_replicate(self):
        data = mock_labelled_collection()
        p = APP(data, sample_size=5, n_prevalences=11, random_seed=42)

        samples1 = samples_to_str(p)
        samples2 = samples_to_str(p)

        self.assertEqual(samples1, samples2)

    def test_app_not_replicate(self):
        data = mock_labelled_collection()
        p = APP(data, sample_size=5, n_prevalences=11)

        samples1 = samples_to_str(p)
        samples2 = samples_to_str(p)

        self.assertNotEqual(samples1, samples2)

    def test_app_number(self):
        data = mock_labelled_collection()
        p = APP(data, sample_size=100, n_prevalences=10, repeats=1)

        # surprisingly, for some values of n_prevalences this test fails even though everything is
        # correct. The problem is that in APP.prevalence_grid() a rounding error sometimes
        # accumulates and the sum surpasses 1.0 (by a very small float value, 0.0000000000002 or the
        # like), so those tuples are mistakenly filtered out. I have tried np.isclose and other
        # workarounds, but then some slightly negative probability eventually reaches the sampling
        # function... (a tolerance-based workaround is sketched after this test file)

        count = 0
        for _ in p():
            count+=1

        self.assertEqual(count, p.total())

    def test_npp_replicate(self):
        data = mock_labelled_collection()
        p = NPP(data, sample_size=5, repeats=5, random_seed=42)

        samples1 = samples_to_str(p)
        samples2 = samples_to_str(p)

        self.assertEqual(samples1, samples2)

    def test_npp_not_replicate(self):
        data = mock_labelled_collection()
        p = NPP(data, sample_size=5, repeats=5)

        samples1 = samples_to_str(p)
        samples2 = samples_to_str(p)

        self.assertNotEqual(samples1, samples2)

    def test_kraemer_replicate(self):
        data = mock_labelled_collection()
        p = USimplexPP(data, sample_size=5, repeats=10, random_seed=42)

        samples1 = samples_to_str(p)
        samples2 = samples_to_str(p)

        self.assertEqual(samples1, samples2)

    def test_kraemer_not_replicate(self):
        data = mock_labelled_collection()
        p = USimplexPP(data, sample_size=5, repeats=10)

        samples1 = samples_to_str(p)
        samples2 = samples_to_str(p)

        self.assertNotEqual(samples1, samples2)

    def test_covariate_shift_replicate(self):
        dataA = mock_labelled_collection('domA')
        dataB = mock_labelled_collection('domB')
        p = CovariateShiftPP(dataA, dataB, sample_size=10, mixture_points=11, random_seed=1)

        samples1 = samples_to_str(p)
        samples2 = samples_to_str(p)

        self.assertEqual(samples1, samples2)

    def test_covariate_shift_not_replicate(self):
        dataA = mock_labelled_collection('domA')
        dataB = mock_labelled_collection('domB')
        p = CovariateShiftPP(dataA, dataB, sample_size=10, mixture_points=11)

        samples1 = samples_to_str(p)
        samples2 = samples_to_str(p)

        self.assertNotEqual(samples1, samples2)

    def test_no_seed_init(self):
        class NoSeedInit(AbstractStochasticSeededProtocol):
            def __init__(self):
                self.data = mock_labelled_collection()

            def samples_parameters(self):
                # return a matrix containing sampling indexes in the rows
                return np.random.randint(0, len(self.data), 10*10).reshape(10, 10)

            def sample(self, params):
                index = np.unique(params)
                return self.data.sampling_from_index(index)

        p = NoSeedInit()

        # this should raise a ValueError, since the class extends AbstractStochasticSeededProtocol but
        # random_seed has never been passed up via super(NoSeedInit, self).__init__(random_seed)
        with self.assertRaises(ValueError):
            for sample in p():
                pass
        print('done')


if __name__ == '__main__':
    unittest.main()
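As a follow-up to the comment in test_app_number: the failures described there stem from cumulative floating-point error in the exhaustive grid, which can push a combination just above 1.0 (so it gets filtered out) or leave a slightly negative implicit prevalence for the last class. One possible tolerance-plus-clipping workaround is sketched below; this is only an illustration of the idea, not how APP.prevalence_grid() is currently implemented.

# possible workaround for the rounding issue; not the current APP.prevalence_grid() code
import itertools
import numpy as np

def prevalence_grid_tolerant(n_prevalences, n_classes, atol=1e-8):
    s = np.linspace(0., 1., n_prevalences, endpoint=True)
    axes = [s] * (n_classes - 1)
    # accept combinations whose sum exceeds 1 only by floating-point noise
    prevs = [p for p in itertools.product(*axes) if sum(p) <= 1.0 + atol]
    prevs = np.asarray(prevs)
    # clip the implicit last-class prevalence at 0 so no negative value reaches the sampler
    last = np.clip(1.0 - prevs.sum(axis=1), 0.0, 1.0)
    return np.hstack([prevs, last[:, None]])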