refactoring codebase
This commit is contained in:
parent
48defb4261
commit
e76e1de6a9
|
|
@ -14,7 +14,7 @@ from . import model_selection
|
||||||
from . import classification
|
from . import classification
|
||||||
import os
|
import os
|
||||||
|
|
||||||
__version__ = '0.1.10r'
|
__version__ = '0.1.10'
|
||||||
|
|
||||||
environ = {
|
environ = {
|
||||||
'SAMPLE_SIZE': None,
|
'SAMPLE_SIZE': None,
|
||||||
|
|
|
||||||
|
|
@ -9,6 +9,7 @@ from sklearn.model_selection import train_test_split, RepeatedStratifiedKFold
|
||||||
from numpy.random import RandomState
|
from numpy.random import RandomState
|
||||||
from quapy.functional import strprev
|
from quapy.functional import strprev
|
||||||
from quapy.util import temp_seed
|
from quapy.util import temp_seed
|
||||||
|
import functional as F
|
||||||
|
|
||||||
|
|
||||||
class LabelledCollection:
|
class LabelledCollection:
|
||||||
|
|
@ -34,8 +35,7 @@ class LabelledCollection:
|
||||||
self.labels = np.asarray(labels)
|
self.labels = np.asarray(labels)
|
||||||
n_docs = len(self)
|
n_docs = len(self)
|
||||||
if classes is None:
|
if classes is None:
|
||||||
self.classes_ = np.unique(self.labels)
|
self.classes_ = F.classes_from_labels(self.labels)
|
||||||
self.classes_.sort()
|
|
||||||
else:
|
else:
|
||||||
self.classes_ = np.unique(np.asarray(classes))
|
self.classes_ = np.unique(np.asarray(classes))
|
||||||
self.classes_.sort()
|
self.classes_.sort()
|
||||||
|
|
|
||||||
|
|
@ -7,6 +7,20 @@ import scipy
|
||||||
import numpy as np
|
import numpy as np
|
||||||
|
|
||||||
|
|
||||||
|
# ------------------------------------------------------------------------------------------
|
||||||
|
# General utils
|
||||||
|
# ------------------------------------------------------------------------------------------
|
||||||
|
|
||||||
|
def classes_from_labels(labels):
    """
    Obtains a np.ndarray with the (sorted) classes found in `labels`.

    :param labels: array-like with the class labels
    :return: `np.ndarray` with the unique values of `labels`, sorted in ascending order
    """
    # np.unique already returns the unique values in sorted order, so an
    # additional in-place sort of the result is unnecessary
    return np.unique(labels)
|
||||||
|
|
||||||
# ------------------------------------------------------------------------------------------
|
# ------------------------------------------------------------------------------------------
|
||||||
# Counter utils
|
# Counter utils
|
||||||
# ------------------------------------------------------------------------------------------
|
# ------------------------------------------------------------------------------------------
|
||||||
|
|
|
||||||
|
|
@ -149,13 +149,13 @@ class QuaNetTrainer(BaseQuantifier):
|
||||||
train_data_embed = LabelledCollection(self.classifier.transform(train_data.instances), train_data.labels, self._classes_)
|
train_data_embed = LabelledCollection(self.classifier.transform(train_data.instances), train_data.labels, self._classes_)
|
||||||
|
|
||||||
self.quantifiers = {
|
self.quantifiers = {
|
||||||
'cc': CC(self.classifier).fit(None, fit_classifier=False),
|
'cc': CC(self.classifier, fit_classifier=False).fit(*valid_data.Xy),
|
||||||
'acc': ACC(self.classifier).fit(None, fit_classifier=False, val_split=valid_data),
|
'acc': ACC(self.classifier, fit_classifier=False).fit(*valid_data.Xy),
|
||||||
'pcc': PCC(self.classifier).fit(None, fit_classifier=False),
|
'pcc': PCC(self.classifier, fit_classifier=False).fit(*valid_data.Xy),
|
||||||
'pacc': PACC(self.classifier).fit(None, fit_classifier=False, val_split=valid_data),
|
'pacc': PACC(self.classifier, fit_classifier=False).fit(*valid_data.Xy),
|
||||||
}
|
}
|
||||||
if classifier_data is not None:
|
if classifier_data is not None:
|
||||||
self.quantifiers['emq'] = EMQ(self.classifier).fit(classifier_data, fit_classifier=False)
|
self.quantifiers['emq'] = EMQ(self.classifier, fit_classifier=False).fit(*valid_data.Xy)
|
||||||
|
|
||||||
self.status = {
|
self.status = {
|
||||||
'tr-loss': -1,
|
'tr-loss': -1,
|
||||||
|
|
|
||||||
|
|
@ -34,7 +34,7 @@ class ThresholdOptimization(BinaryAggregativeQuantifier):
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, classifier: BaseEstimator=None, fit_classifier=True, val_split=None, n_jobs=None):
|
def __init__(self, classifier: BaseEstimator=None, fit_classifier=True, val_split=None, n_jobs=None):
|
||||||
super.__init__(classifier, fit_classifier, val_split)
|
super().__init__(classifier, fit_classifier, val_split)
|
||||||
self.n_jobs = qp._get_njobs(n_jobs)
|
self.n_jobs = qp._get_njobs(n_jobs)
|
||||||
|
|
||||||
@abstractmethod
|
@abstractmethod
|
||||||
|
|
|
||||||
|
|
@ -100,7 +100,7 @@ class AggregativeQuantifier(BaseQuantifier, ABC):
|
||||||
# consistency checks: fit_classifier?
|
# consistency checks: fit_classifier?
|
||||||
if self.fit_classifier:
|
if self.fit_classifier:
|
||||||
if fitted:
|
if fitted:
|
||||||
raise RuntimeWarning(f'the classifier is already fitted, by {fit_classifier=} was requested')
|
raise RuntimeWarning(f'the classifier is already fitted, but {fit_classifier=} was requested')
|
||||||
else:
|
else:
|
||||||
assert fitted, (f'{fit_classifier=} requires the classifier to be already trained, '
|
assert fitted, (f'{fit_classifier=} requires the classifier to be already trained, '
|
||||||
f'but this does not seem to be')
|
f'but this does not seem to be')
|
||||||
|
|
@ -158,7 +158,7 @@ class AggregativeQuantifier(BaseQuantifier, ABC):
|
||||||
|
|
||||||
predictions, labels = None, None
|
predictions, labels = None, None
|
||||||
if isinstance(self.val_split, int):
|
if isinstance(self.val_split, int):
|
||||||
assert self.fit_classifier, f'unexpected value for {self.fit_classifier=}'
|
assert self.fit_classifier, f'{self.__class__}: unexpected value for {self.fit_classifier=}'
|
||||||
num_folds = self.val_split
|
num_folds = self.val_split
|
||||||
n_jobs = self.n_jobs if hasattr(self, 'n_jobs') else qp._get_njobs(None)
|
n_jobs = self.n_jobs if hasattr(self, 'n_jobs') else qp._get_njobs(None)
|
||||||
predictions = cross_val_predict(self.classifier, X, y, cv=num_folds, n_jobs=n_jobs, method=self._classifier_method())
|
predictions = cross_val_predict(self.classifier, X, y, cv=num_folds, n_jobs=n_jobs, method=self._classifier_method())
|
||||||
|
|
@ -717,7 +717,7 @@ class EMQ(AggregativeSoftQuantifier):
|
||||||
super().__init__(classifier, fit_classifier, val_split)
|
super().__init__(classifier, fit_classifier, val_split)
|
||||||
self.exact_train_prev = exact_train_prev
|
self.exact_train_prev = exact_train_prev
|
||||||
self.calib = calib
|
self.calib = calib
|
||||||
self.on_calib_errors = on_calib_error
|
self.on_calib_error = on_calib_error
|
||||||
self.n_jobs = n_jobs
|
self.n_jobs = n_jobs
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
|
|
@ -790,9 +790,9 @@ class EMQ(AggregativeSoftQuantifier):
|
||||||
try:
|
try:
|
||||||
self.calibration_function = calibrator(P, np.eye(n_classes)[y], posterior_supplied=True)
|
self.calibration_function = calibrator(P, np.eye(n_classes)[y], posterior_supplied=True)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
if self.on_calib_errors == 'raise':
|
if self.on_calib_error == 'raise':
|
||||||
raise RuntimeError(f'calibration {self.calib} failed at fit time: {e}')
|
raise RuntimeError(f'calibration {self.calib} failed at fit time: {e}')
|
||||||
elif self.on_calib_errors == 'backup':
|
elif self.on_calib_error == 'backup':
|
||||||
self.calibration_function = lambda P: P
|
self.calibration_function = lambda P: P
|
||||||
|
|
||||||
def _calibrate_if_requested(self, uncalib_posteriors):
|
def _calibrate_if_requested(self, uncalib_posteriors):
|
||||||
|
|
@ -800,12 +800,12 @@ class EMQ(AggregativeSoftQuantifier):
|
||||||
try:
|
try:
|
||||||
calib_posteriors = self.calibration_function(uncalib_posteriors)
|
calib_posteriors = self.calibration_function(uncalib_posteriors)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
if self.on_calib_errors == 'raise':
|
if self.on_calib_error == 'raise':
|
||||||
raise RuntimeError(f'calibration {self.calib} failed at predict time: {e}')
|
raise RuntimeError(f'calibration {self.calib} failed at predict time: {e}')
|
||||||
elif self.on_calib_errors == 'backup':
|
elif self.on_calib_error == 'backup':
|
||||||
calib_posteriors = uncalib_posteriors
|
calib_posteriors = uncalib_posteriors
|
||||||
else:
|
else:
|
||||||
raise ValueError(f'unexpected {self.on_calib_errors=}; '
|
raise ValueError(f'unexpected {self.on_calib_error=}; '
|
||||||
f'valid options are {EMQ.ON_CALIB_ERROR_VALUES}')
|
f'valid options are {EMQ.ON_CALIB_ERROR_VALUES}')
|
||||||
return calib_posteriors
|
return calib_posteriors
|
||||||
return uncalib_posteriors
|
return uncalib_posteriors
|
||||||
|
|
|
||||||
|
|
@ -46,7 +46,7 @@ class BaseQuantifier(BaseEstimator):
|
||||||
:param X: array-like
|
:param X: array-like
|
||||||
:return: `np.ndarray` of shape `(n_classes,)` with class prevalence estimates.
|
:return: `np.ndarray` of shape `(n_classes,)` with class prevalence estimates.
|
||||||
"""
|
"""
|
||||||
...
|
return self.predict(X)
|
||||||
|
|
||||||
|
|
||||||
class BinaryQuantifier(BaseQuantifier):
|
class BinaryQuantifier(BaseQuantifier):
|
||||||
|
|
|
||||||
|
|
@ -450,8 +450,13 @@ class BayesianCC(AggregativeCrispQuantifier, WithConfidenceABC):
|
||||||
|
|
||||||
:param classifier: a scikit-learn's BaseEstimator, or None, in which case the classifier is taken to be
|
:param classifier: a scikit-learn's BaseEstimator, or None, in which case the classifier is taken to be
|
||||||
the one indicated in `qp.environ['DEFAULT_CLS']`
|
the one indicated in `qp.environ['DEFAULT_CLS']`
|
||||||
:param val_split: a float in (0, 1) indicating the proportion of the training data to be used,
|
:param val_split: specifies the data used for generating classifier predictions. This specification
|
||||||
as a stratified held-out validation set, for generating classifier predictions.
|
can be made as float in (0, 1) indicating the proportion of stratified held-out validation set to
|
||||||
|
be extracted from the training set; or as an integer (default 5), indicating that the predictions
|
||||||
|
are to be generated in a `k`-fold cross-validation manner (with this integer indicating the value
|
||||||
|
for `k`); or as a tuple `(X,y)` defining the specific set of data to use for validation. Set to
|
||||||
|
None when the method does not require any validation data, in order to avoid that some portion of
|
||||||
|
the training data be wasted.
|
||||||
:param num_warmup: number of warmup iterations for the MCMC sampler (default 500)
|
:param num_warmup: number of warmup iterations for the MCMC sampler (default 500)
|
||||||
:param num_samples: number of samples to draw from the posterior (default 1000)
|
:param num_samples: number of samples to draw from the posterior (default 1000)
|
||||||
:param mcmc_seed: random seed for the MCMC sampler (default 0)
|
:param mcmc_seed: random seed for the MCMC sampler (default 0)
|
||||||
|
|
@ -462,6 +467,7 @@ class BayesianCC(AggregativeCrispQuantifier, WithConfidenceABC):
|
||||||
"""
|
"""
|
||||||
def __init__(self,
|
def __init__(self,
|
||||||
classifier: BaseEstimator=None,
|
classifier: BaseEstimator=None,
|
||||||
|
fit_classifier=True,
|
||||||
val_split: int = 5,
|
val_split: int = 5,
|
||||||
num_warmup: int = 500,
|
num_warmup: int = 500,
|
||||||
num_samples: int = 1_000,
|
num_samples: int = 1_000,
|
||||||
|
|
@ -474,14 +480,11 @@ class BayesianCC(AggregativeCrispQuantifier, WithConfidenceABC):
|
||||||
if num_samples <= 0:
|
if num_samples <= 0:
|
||||||
raise ValueError(f'parameter {num_samples=} must be a positive integer')
|
raise ValueError(f'parameter {num_samples=} must be a positive integer')
|
||||||
|
|
||||||
# if (not isinstance(val_split, float)) or val_split <= 0 or val_split >= 1:
|
|
||||||
# raise ValueError(f'val_split must be a float in (0, 1), got {val_split}')
|
|
||||||
|
|
||||||
if _bayesian.DEPENDENCIES_INSTALLED is False:
|
if _bayesian.DEPENDENCIES_INSTALLED is False:
|
||||||
raise ImportError("Auxiliary dependencies are required. Run `$ pip install quapy[bayes]` to install them.")
|
raise ImportError("Auxiliary dependencies are required. "
|
||||||
|
"Run `$ pip install quapy[bayes]` to install them.")
|
||||||
|
|
||||||
self.classifier = qp._get_classifier(classifier)
|
super().__init__(classifier, fit_classifier, val_split)
|
||||||
self.val_split = val_split
|
|
||||||
self.num_warmup = num_warmup
|
self.num_warmup = num_warmup
|
||||||
self.num_samples = num_samples
|
self.num_samples = num_samples
|
||||||
self.mcmc_seed = mcmc_seed
|
self.mcmc_seed = mcmc_seed
|
||||||
|
|
@ -505,8 +508,11 @@ class BayesianCC(AggregativeCrispQuantifier, WithConfidenceABC):
|
||||||
"""
|
"""
|
||||||
pred_labels = classif_predictions
|
pred_labels = classif_predictions
|
||||||
true_labels = labels
|
true_labels = labels
|
||||||
self._n_and_c_labeled = confusion_matrix(y_true=true_labels, y_pred=pred_labels,
|
self._n_and_c_labeled = confusion_matrix(
|
||||||
labels=self.classifier.classes_)
|
y_true=true_labels,
|
||||||
|
y_pred=pred_labels,
|
||||||
|
labels=self.classifier.classes_
|
||||||
|
).astype(float)
|
||||||
|
|
||||||
def sample_from_posterior(self, classif_predictions):
|
def sample_from_posterior(self, classif_predictions):
|
||||||
if self._n_and_c_labeled is None:
|
if self._n_and_c_labeled is None:
|
||||||
|
|
|
||||||
|
|
@ -414,15 +414,15 @@ def _delayed_new_instance(args):
|
||||||
sample = data.sampling_from_index(sample_index)
|
sample = data.sampling_from_index(sample_index)
|
||||||
|
|
||||||
if val_split is not None:
|
if val_split is not None:
|
||||||
model.fit(sample, val_split=val_split)
|
model.fit(*sample.Xy, val_split=val_split)
|
||||||
else:
|
else:
|
||||||
model.fit(sample)
|
model.fit(*sample.Xy)
|
||||||
|
|
||||||
tr_prevalence = sample.prevalence()
|
tr_prevalence = sample.prevalence()
|
||||||
tr_distribution = get_probability_distribution(posteriors[sample_index]) if (posteriors is not None) else None
|
tr_distribution = get_probability_distribution(posteriors[sample_index]) if (posteriors is not None) else None
|
||||||
|
|
||||||
if verbose:
|
if verbose:
|
||||||
print(f'\t\--fit-ended for prev {F.strprev(prev)}')
|
print(f'\t--fit-ended for prev {F.strprev(prev)}')
|
||||||
|
|
||||||
return (model, tr_prevalence, tr_distribution, sample if keep_samples else None)
|
return (model, tr_prevalence, tr_distribution, sample if keep_samples else None)
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -20,14 +20,16 @@ class MaximumLikelihoodPrevalenceEstimation(BaseQuantifier):
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
self._classes_ = None
|
self._classes_ = None
|
||||||
|
|
||||||
def fit(self, data: LabelledCollection):
|
def fit(self, X, y):
|
||||||
"""
|
"""
|
||||||
Computes the training prevalence and stores it.
|
Computes the training prevalence and stores it.
|
||||||
|
|
||||||
:param data: the training sample
|
:param X: array-like of shape `(n_samples, n_features)`, the training instances
|
||||||
|
:param y: array-like of shape `(n_samples,)`, the labels
|
||||||
:return: self
|
:return: self
|
||||||
"""
|
"""
|
||||||
self.estimated_prevalence = data.prevalence()
|
self._classes_ = F.classes_from_labels(labels=y)
|
||||||
|
self.estimated_prevalence = F.prevalence_from_labels(y, classes=self._classes_)
|
||||||
return self
|
return self
|
||||||
|
|
||||||
def predict(self, X):
|
def predict(self, X):
|
||||||
|
|
@ -114,9 +116,10 @@ class DMx(BaseQuantifier):
|
||||||
"""
|
"""
|
||||||
self.nfeats = X.shape[1]
|
self.nfeats = X.shape[1]
|
||||||
self.feat_ranges = _get_features_range(X)
|
self.feat_ranges = _get_features_range(X)
|
||||||
|
n_classes = len(np.unique(y))
|
||||||
|
|
||||||
self.validation_distribution = np.asarray(
|
self.validation_distribution = np.asarray(
|
||||||
[self.__get_distributions(X[y==cat]) for cat in range(data.n_classes)]
|
[self.__get_distributions(X[y==cat]) for cat in range(n_classes)]
|
||||||
)
|
)
|
||||||
|
|
||||||
return self
|
return self
|
||||||
|
|
|
||||||
|
|
@ -64,7 +64,7 @@ class TestMethods(unittest.TestCase):
|
||||||
|
|
||||||
q = model()
|
q = model()
|
||||||
print(f'testing {q} on dataset {dataset.name}')
|
print(f'testing {q} on dataset {dataset.name}')
|
||||||
q.fit(dataset.training)
|
q.fit(*dataset.training.Xy)
|
||||||
estim_prevalences = q.predict(dataset.test.X)
|
estim_prevalences = q.predict(dataset.test.X)
|
||||||
self.assertTrue(check_prevalence_vector(estim_prevalences))
|
self.assertTrue(check_prevalence_vector(estim_prevalences))
|
||||||
|
|
||||||
|
|
@ -80,7 +80,7 @@ class TestMethods(unittest.TestCase):
|
||||||
|
|
||||||
print(f'testing {base_quantifier} on dataset {dataset.name} with {policy=}')
|
print(f'testing {base_quantifier} on dataset {dataset.name} with {policy=}')
|
||||||
ensemble = Ensemble(quantifier=base_quantifier, size=3, policy=policy, n_jobs=-1)
|
ensemble = Ensemble(quantifier=base_quantifier, size=3, policy=policy, n_jobs=-1)
|
||||||
ensemble.fit(dataset.training)
|
ensemble.fit(*dataset.training.Xy)
|
||||||
estim_prevalences = ensemble.predict(dataset.test.instances)
|
estim_prevalences = ensemble.predict(dataset.test.instances)
|
||||||
self.assertTrue(check_prevalence_vector(estim_prevalences))
|
self.assertTrue(check_prevalence_vector(estim_prevalences))
|
||||||
|
|
||||||
|
|
@ -116,6 +116,7 @@ class TestMethods(unittest.TestCase):
|
||||||
print('testing', q)
|
print('testing', q)
|
||||||
q.fit(*dataset.training.Xy)
|
q.fit(*dataset.training.Xy)
|
||||||
estim_prevalences = q.predict(dataset.test.X)
|
estim_prevalences = q.predict(dataset.test.X)
|
||||||
|
print(estim_prevalences)
|
||||||
self.assertTrue(check_prevalence_vector(estim_prevalences))
|
self.assertTrue(check_prevalence_vector(estim_prevalences))
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue