refactoring codebase

Alejandro Moreo Fernandez 2025-05-23 12:27:49 +02:00
parent 48defb4261
commit e76e1de6a9
11 changed files with 61 additions and 37 deletions

View File

@@ -14,7 +14,7 @@ from . import model_selection
from . import classification
import os
__version__ = '0.1.10r'
__version__ = '0.1.10'
environ = {
'SAMPLE_SIZE': None,

View File

@@ -9,6 +9,7 @@ from sklearn.model_selection import train_test_split, RepeatedStratifiedKFold
from numpy.random import RandomState
from quapy.functional import strprev
from quapy.util import temp_seed
import functional as F
class LabelledCollection:
@@ -34,8 +35,7 @@ class LabelledCollection:
self.labels = np.asarray(labels)
n_docs = len(self)
if classes is None:
self.classes_ = np.unique(self.labels)
self.classes_.sort()
self.classes_ = F.classes_from_labels(self.labels)
else:
self.classes_ = np.unique(np.asarray(classes))
self.classes_.sort()

View File

@@ -7,6 +7,20 @@ import scipy
import numpy as np
# ------------------------------------------------------------------------------------------
# General utils
# ------------------------------------------------------------------------------------------
def classes_from_labels(labels):
"""
Obtains a np.ndarray with the (sorted) classes
:param labels:
:return:
"""
classes = np.unique(labels)
classes.sort()
return classes
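
A quick usage sketch of the new helper (illustrative values):

import numpy as np
from quapy.functional import classes_from_labels

labels = np.asarray(['neg', 'pos', 'neg', 'neu'])
classes_from_labels(labels)   # -> array(['neg', 'neu', 'pos'], dtype='<U3')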
# ------------------------------------------------------------------------------------------
# Counter utils
# ------------------------------------------------------------------------------------------

View File

@@ -149,13 +149,13 @@ class QuaNetTrainer(BaseQuantifier):
train_data_embed = LabelledCollection(self.classifier.transform(train_data.instances), train_data.labels, self._classes_)
self.quantifiers = {
'cc': CC(self.classifier).fit(None, fit_classifier=False),
'acc': ACC(self.classifier).fit(None, fit_classifier=False, val_split=valid_data),
'pcc': PCC(self.classifier).fit(None, fit_classifier=False),
'pacc': PACC(self.classifier).fit(None, fit_classifier=False, val_split=valid_data),
'cc': CC(self.classifier, fit_classifier=False).fit(*valid_data.Xy),
'acc': ACC(self.classifier, fit_classifier=False).fit(*valid_data.Xy),
'pcc': PCC(self.classifier, fit_classifier=False).fit(*valid_data.Xy),
'pacc': PACC(self.classifier, fit_classifier=False).fit(*valid_data.Xy),
}
if classifier_data is not None:
self.quantifiers['emq'] = EMQ(self.classifier).fit(classifier_data, fit_classifier=False)
self.quantifiers['emq'] = EMQ(self.classifier, fit_classifier=False).fit(*valid_data.Xy)
self.status = {
'tr-loss': -1,
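
The hunk above moves `fit_classifier` to the constructor and feeds the validation data to `fit` as plain `(X, y)` arrays. A hedged sketch of the same pattern outside QuaNet (toy data):

import numpy as np
from sklearn.linear_model import LogisticRegression
from quapy.method.aggregative import CC, PACC

X_train, y_train = np.random.rand(100, 4), np.random.randint(0, 2, 100)   # toy data
X_val, y_val = np.random.rand(50, 4), np.random.randint(0, 2, 50)

clf = LogisticRegression().fit(X_train, y_train)         # classifier trained outside the quantifier
cc = CC(clf, fit_classifier=False).fit(X_val, y_val)     # only the aggregation step is learned
pacc = PACC(clf, fit_classifier=False).fit(X_val, y_val)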

View File

@@ -34,7 +34,7 @@ class ThresholdOptimization(BinaryAggregativeQuantifier):
"""
def __init__(self, classifier: BaseEstimator=None, fit_classifier=True, val_split=None, n_jobs=None):
super.__init__(classifier, fit_classifier, val_split)
super().__init__(classifier, fit_classifier, val_split)
self.n_jobs = qp._get_njobs(n_jobs)
@abstractmethod

View File

@@ -100,7 +100,7 @@ class AggregativeQuantifier(BaseQuantifier, ABC):
# consistency checks: fit_classifier?
if self.fit_classifier:
if fitted:
raise RuntimeWarning(f'the classifier is already fitted, by {fit_classifier=} was requested')
raise RuntimeWarning(f'the classifier is already fitted, but {fit_classifier=} was requested')
else:
assert fitted, (f'{fit_classifier=} requires the classifier to be already trained, '
f'but it does not appear to be')
@@ -158,7 +158,7 @@ class AggregativeQuantifier(BaseQuantifier, ABC):
predictions, labels = None, None
if isinstance(self.val_split, int):
assert self.fit_classifier, f'unexpected value for {self.fit_classifier=}'
assert self.fit_classifier, f'{self.__class__}: unexpected value for {self.fit_classifier=}'
num_folds = self.val_split
n_jobs = self.n_jobs if hasattr(self, 'n_jobs') else qp._get_njobs(None)
predictions = cross_val_predict(self.classifier, X, y, cv=num_folds, n_jobs=n_jobs, method=self._classifier_method())
@@ -717,7 +717,7 @@ class EMQ(AggregativeSoftQuantifier):
super().__init__(classifier, fit_classifier, val_split)
self.exact_train_prev = exact_train_prev
self.calib = calib
self.on_calib_errors = on_calib_error
self.on_calib_error = on_calib_error
self.n_jobs = n_jobs
@classmethod
@@ -790,9 +790,9 @@ class EMQ(AggregativeSoftQuantifier):
try:
self.calibration_function = calibrator(P, np.eye(n_classes)[y], posterior_supplied=True)
except Exception as e:
if self.on_calib_errors == 'raise':
if self.on_calib_error == 'raise':
raise RuntimeError(f'calibration {self.calib} failed at fit time: {e}')
elif self.on_calib_errors == 'backup':
elif self.on_calib_error == 'backup':
self.calibration_function = lambda P: P
def _calibrate_if_requested(self, uncalib_posteriors):
@@ -800,12 +800,12 @@ class EMQ(AggregativeSoftQuantifier):
try:
calib_posteriors = self.calibration_function(uncalib_posteriors)
except Exception as e:
if self.on_calib_errors == 'raise':
if self.on_calib_error == 'raise':
raise RuntimeError(f'calibration {self.calib} failed at predict time: {e}')
elif self.on_calib_errors == 'backup':
elif self.on_calib_error == 'backup':
calib_posteriors = uncalib_posteriors
else:
raise ValueError(f'unexpected {self.on_calib_errors=}; '
raise ValueError(f'unexpected {self.on_calib_error=}; '
f'valid options are {EMQ.ON_CALIB_ERROR_VALUES}')
return calib_posteriors
return uncalib_posteriors
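
The renamed `on_calib_error` attribute controls how EMQ reacts when calibration fails. A brief, hedged sketch (toy data; a calibration method would normally also be chosen via `calib`, whose valid values are not shown in this hunk):

import numpy as np
from sklearn.linear_model import LogisticRegression
from quapy.method.aggregative import EMQ

X, y = np.random.rand(100, 4), np.random.randint(0, 2, 100)   # toy data

emq = EMQ(LogisticRegression(), on_calib_error='backup').fit(X, y)   # fall back to the uncalibrated posteriors
emq_strict = EMQ(LogisticRegression(), on_calib_error='raise')       # surface calibration failures instead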

View File

@@ -46,7 +46,7 @@ class BaseQuantifier(BaseEstimator):
:param X: array-like of shape `(n_samples, n_features)` with the instances to quantify
:return: `np.ndarray` of shape `(n_classes,)` with class prevalence estimates.
"""
...
return self.predict(X)
class BinaryQuantifier(BaseQuantifier):
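
With this change `quantify` becomes a thin alias of `predict`; the two calls below are equivalent (toy data, refactored API assumed):

import numpy as np
from sklearn.linear_model import LogisticRegression
from quapy.method.aggregative import PACC

X, y = np.random.rand(100, 5), np.random.randint(0, 2, 100)   # toy data
q = PACC(LogisticRegression()).fit(X, y)
prev_a = q.predict(X)    # canonical entry point
prev_b = q.quantify(X)   # now simply delegates to predict(X)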

View File

@@ -450,8 +450,13 @@ class BayesianCC(AggregativeCrispQuantifier, WithConfidenceABC):
:param classifier: a scikit-learn BaseEstimator, or None, in which case the classifier is taken to be
the one indicated in `qp.environ['DEFAULT_CLS']`
:param val_split: a float in (0, 1) indicating the proportion of the training data to be used,
as a stratified held-out validation set, for generating classifier predictions.
:param val_split: specifies the data used for generating the classifier predictions. This specification
can be made as a float in (0, 1), indicating the proportion of the training set to be extracted as a
stratified held-out validation set; as an integer (default 5), indicating that the predictions are to
be generated in a `k`-fold cross-validation manner (with this integer indicating the value of `k`);
or as a tuple `(X, y)` defining the specific set of data to use for validation. Set to None when the
method does not require any validation data, so that no portion of the training data is wasted.
:param num_warmup: number of warmup iterations for the MCMC sampler (default 500)
:param num_samples: number of samples to draw from the posterior (default 1000)
:param mcmc_seed: random seed for the MCMC sampler (default 0)
@@ -462,6 +467,7 @@ class BayesianCC(AggregativeCrispQuantifier, WithConfidenceABC):
"""
def __init__(self,
classifier: BaseEstimator=None,
fit_classifier=True,
val_split: int = 5,
num_warmup: int = 500,
num_samples: int = 1_000,
@@ -474,14 +480,11 @@ class BayesianCC(AggregativeCrispQuantifier, WithConfidenceABC):
if num_samples <= 0:
raise ValueError(f'parameter {num_samples=} must be a positive integer')
# if (not isinstance(val_split, float)) or val_split <= 0 or val_split >= 1:
# raise ValueError(f'val_split must be a float in (0, 1), got {val_split}')
if _bayesian.DEPENDENCIES_INSTALLED is False:
raise ImportError("Auxiliary dependencies are required. Run `$ pip install quapy[bayes]` to install them.")
raise ImportError("Auxiliary dependencies are required. "
"Run `$ pip install quapy[bayes]` to install them.")
self.classifier = qp._get_classifier(classifier)
self.val_split = val_split
super().__init__(classifier, fit_classifier, val_split)
self.num_warmup = num_warmup
self.num_samples = num_samples
self.mcmc_seed = mcmc_seed
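
The `val_split` options documented above combine with the new `fit_classifier` argument as follows (a hedged sketch; the module path is assumed, the data are toy arrays, and the `quapy[bayes]` extras must be installed):

import numpy as np
from sklearn.linear_model import LogisticRegression
from quapy.method.aggregative import BayesianCC   # module path assumed

X, y = np.random.rand(200, 4), np.random.randint(0, 2, 200)         # toy training data
X_val, y_val = np.random.rand(80, 4), np.random.randint(0, 2, 80)   # toy validation data

q = BayesianCC(LogisticRegression(), val_split=0.3).fit(X, y)             # 30% stratified held-out split
q = BayesianCC(LogisticRegression(), val_split=5).fit(X, y)               # 5-fold cross-validation (default)
q = BayesianCC(LogisticRegression(), val_split=(X_val, y_val)).fit(X, y)  # explicit validation set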
@@ -505,8 +508,11 @@ class BayesianCC(AggregativeCrispQuantifier, WithConfidenceABC):
"""
pred_labels = classif_predictions
true_labels = labels
self._n_and_c_labeled = confusion_matrix(y_true=true_labels, y_pred=pred_labels,
labels=self.classifier.classes_)
self._n_and_c_labeled = confusion_matrix(
y_true=true_labels,
y_pred=pred_labels,
labels=self.classifier.classes_
).astype(float)
def sample_from_posterior(self, classif_predictions):
if self._n_and_c_labeled is None:

View File

@@ -414,15 +414,15 @@ def _delayed_new_instance(args):
sample = data.sampling_from_index(sample_index)
if val_split is not None:
model.fit(sample, val_split=val_split)
model.fit(*sample.Xy, val_split=val_split)
else:
model.fit(sample)
model.fit(*sample.Xy)
tr_prevalence = sample.prevalence()
tr_distribution = get_probability_distribution(posteriors[sample_index]) if (posteriors is not None) else None
if verbose:
print(f'\t\--fit-ended for prev {F.strprev(prev)}')
print(f'\t--fit-ended for prev {F.strprev(prev)}')
return (model, tr_prevalence, tr_distribution, sample if keep_samples else None)

View File

@@ -20,14 +20,16 @@ class MaximumLikelihoodPrevalenceEstimation(BaseQuantifier):
def __init__(self):
self._classes_ = None
def fit(self, data: LabelledCollection):
def fit(self, X, y):
"""
Computes the training prevalence and stores it.
:param data: the training sample
:param X: array-like of shape `(n_samples, n_features)`, the training instances
:param y: array-like of shape `(n_samples,)`, the labels
:return: self
"""
self.estimated_prevalence = data.prevalence()
self._classes_ = F.classes_from_labels(labels=y)
self.estimated_prevalence = F.prevalence_from_labels(y, classes=self._classes_)
return self
def predict(self, X):
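
A minimal sketch of the refactored `fit(X, y)` interface for this class (module path as in QuaPy, toy data):

import numpy as np
from quapy.method.non_aggregative import MaximumLikelihoodPrevalenceEstimation

X = np.zeros((6, 2))                 # MLPE ignores the covariates
y = np.array([0, 0, 0, 1, 1, 2])
mlpe = MaximumLikelihoodPrevalenceEstimation().fit(X, y)
mlpe.predict(X)                      # -> approx. array([0.5, 0.333, 0.167]), i.e., the training prevalence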
@@ -114,9 +116,10 @@
"""
self.nfeats = X.shape[1]
self.feat_ranges = _get_features_range(X)
n_classes = len(np.unique(y))
self.validation_distribution = np.asarray(
[self.__get_distributions(X[y==cat]) for cat in range(data.n_classes)]
[self.__get_distributions(X[y==cat]) for cat in range(n_classes)]
)
return self
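
The fix above infers the number of classes from `y` instead of the no-longer-available `data` object; the per-class masking it relies on works as follows (toy data):

import numpy as np

X = np.arange(12, dtype=float).reshape(6, 2)
y = np.array([0, 1, 0, 2, 1, 0])
n_classes = len(np.unique(y))
per_class = [X[y == cat] for cat in range(n_classes)]   # one block of instances per class
[block.shape for block in per_class]                    # -> [(3, 2), (2, 2), (1, 2)]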

View File

@@ -64,7 +64,7 @@ class TestMethods(unittest.TestCase):
q = model()
print(f'testing {q} on dataset {dataset.name}')
q.fit(dataset.training)
q.fit(*dataset.training.Xy)
estim_prevalences = q.predict(dataset.test.X)
self.assertTrue(check_prevalence_vector(estim_prevalences))
@@ -80,7 +80,7 @@ class TestMethods(unittest.TestCase):
print(f'testing {base_quantifier} on dataset {dataset.name} with {policy=}')
ensemble = Ensemble(quantifier=base_quantifier, size=3, policy=policy, n_jobs=-1)
ensemble.fit(dataset.training)
ensemble.fit(*dataset.training.Xy)
estim_prevalences = ensemble.predict(dataset.test.instances)
self.assertTrue(check_prevalence_vector(estim_prevalences))
@@ -116,6 +116,7 @@ class TestMethods(unittest.TestCase):
print('testing', q)
q.fit(*dataset.training.Xy)
estim_prevalences = q.predict(dataset.test.X)
print(estim_prevalences)
self.assertTrue(check_prevalence_vector(estim_prevalences))