refactoring codebase

Alejandro Moreo Fernandez 2025-05-23 12:27:49 +02:00
parent 48defb4261
commit e76e1de6a9
11 changed files with 61 additions and 37 deletions

View File

@@ -14,7 +14,7 @@ from . import model_selection
 from . import classification
 import os

-__version__ = '0.1.10r'
+__version__ = '0.1.10'

 environ = {
     'SAMPLE_SIZE': None,

View File

@@ -9,6 +9,7 @@ from sklearn.model_selection import train_test_split, RepeatedStratifiedKFold
 from numpy.random import RandomState
 from quapy.functional import strprev
 from quapy.util import temp_seed
+import functional as F


 class LabelledCollection:
@@ -34,8 +35,7 @@ class LabelledCollection:
         self.labels = np.asarray(labels)
         n_docs = len(self)
         if classes is None:
-            self.classes_ = np.unique(self.labels)
-            self.classes_.sort()
+            self.classes_ = F.classes_from_labels(self.labels)
         else:
             self.classes_ = np.unique(np.asarray(classes))
             self.classes_.sort()

View File

@@ -7,6 +7,20 @@ import scipy
 import numpy as np


+# ------------------------------------------------------------------------------------------
+# General utils
+# ------------------------------------------------------------------------------------------
+
+def classes_from_labels(labels):
+    """
+    Obtains a np.ndarray with the (sorted) classes present in `labels`
+    :param labels: array-like with the labels from which to extract the classes
+    :return: a sorted np.ndarray with the unique classes
+    """
+    classes = np.unique(labels)
+    classes.sort()
+    return classes
+
 # ------------------------------------------------------------------------------------------
 # Counter utils
 # ------------------------------------------------------------------------------------------
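For reference, the new helper (which `LabelledCollection` now delegates to, per the previous hunk) simply wraps `np.unique`; note that `np.unique` already returns its result sorted, so the explicit `sort()` is a defensive no-op. A minimal sketch of the behavior:

    import numpy as np

    # hypothetical labels; np.unique returns the unique values already sorted
    labels = ['spam', 'ham', 'spam', 'ham', 'ham']
    classes = np.unique(labels)
    print(classes)  # -> ['ham' 'spam']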

View File

@@ -149,13 +149,13 @@ class QuaNetTrainer(BaseQuantifier):
         train_data_embed = LabelledCollection(self.classifier.transform(train_data.instances), train_data.labels, self._classes_)

         self.quantifiers = {
-            'cc': CC(self.classifier).fit(None, fit_classifier=False),
-            'acc': ACC(self.classifier).fit(None, fit_classifier=False, val_split=valid_data),
-            'pcc': PCC(self.classifier).fit(None, fit_classifier=False),
-            'pacc': PACC(self.classifier).fit(None, fit_classifier=False, val_split=valid_data),
+            'cc': CC(self.classifier, fit_classifier=False).fit(*valid_data.Xy),
+            'acc': ACC(self.classifier, fit_classifier=False).fit(*valid_data.Xy),
+            'pcc': PCC(self.classifier, fit_classifier=False).fit(*valid_data.Xy),
+            'pacc': PACC(self.classifier, fit_classifier=False).fit(*valid_data.Xy),
         }
         if classifier_data is not None:
-            self.quantifiers['emq'] = EMQ(self.classifier).fit(classifier_data, fit_classifier=False)
+            self.quantifiers['emq'] = EMQ(self.classifier, fit_classifier=False).fit(*valid_data.Xy)

         self.status = {
             'tr-loss': -1,
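This hunk shows the core of the refactor: `fit_classifier` moves from `fit()` into the constructor, and `fit()` now receives raw `(X, y)` arrays (`LabelledCollection.Xy` unpacks to that pair, hence the `*valid_data.Xy` splat; the same pattern reappears in the ensemble and test files below). A hedged sketch of the new call pattern on synthetic data, assuming the post-refactor API shown in this diff:

    from sklearn.datasets import make_classification
    from sklearn.linear_model import LogisticRegression
    from quapy.method.aggregative import CC

    X, y = make_classification(n_samples=500, random_state=0)
    X_train, y_train, X_val, y_val = X[:300], y[:300], X[300:], y[300:]

    clf = LogisticRegression().fit(X_train, y_train)

    # old style: CC(clf).fit(None, fit_classifier=False)
    # new style: the flag is a constructor argument; fit() takes (X, y)
    cc = CC(clf, fit_classifier=False).fit(X_val, y_val)
    print(cc.predict(X_val))  # estimated class prevalence vector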

View File

@@ -34,7 +34,7 @@ class ThresholdOptimization(BinaryAggregativeQuantifier):
     """

     def __init__(self, classifier: BaseEstimator=None, fit_classifier=True, val_split=None, n_jobs=None):
-        super.__init__(classifier, fit_classifier, val_split)
+        super().__init__(classifier, fit_classifier, val_split)
         self.n_jobs = qp._get_njobs(n_jobs)

     @abstractmethod
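A one-character fix for a real bug: `super` without parentheses is the built-in type object, so calling `super.__init__(classifier, ...)` raises a TypeError instead of initializing the parent class. A minimal illustration:

    class Base:
        def __init__(self, x):
            self.x = x

    class Broken(Base):
        def __init__(self, x):
            # TypeError: descriptor '__init__' requires a 'super' object
            super.__init__(x)

    class Fixed(Base):
        def __init__(self, x):
            super().__init__(x)  # correctly delegates to Base.__init__

    Fixed(42)  # works; Broken(42) raises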

View File

@@ -100,7 +100,7 @@ class AggregativeQuantifier(BaseQuantifier, ABC):
         # consistency checks: fit_classifier?
         if self.fit_classifier:
             if fitted:
-                raise RuntimeWarning(f'the classifier is already fitted, by {fit_classifier=} was requested')
+                raise RuntimeWarning(f'the classifier is already fitted, but {fit_classifier=} was requested')
         else:
             assert fitted, (f'{fit_classifier=} requires the classifier to be already trained, '
                             f'but this does not seem to be')
@@ -158,7 +158,7 @@ class AggregativeQuantifier(BaseQuantifier, ABC):
         predictions, labels = None, None
         if isinstance(self.val_split, int):
-            assert self.fit_classifier, f'unexpected value for {self.fit_classifier=}'
+            assert self.fit_classifier, f'{self.__class__}: unexpected value for {self.fit_classifier=}'
             num_folds = self.val_split
             n_jobs = self.n_jobs if hasattr(self, 'n_jobs') else qp._get_njobs(None)
             predictions = cross_val_predict(self.classifier, X, y, cv=num_folds, n_jobs=n_jobs, method=self._classifier_method())
@@ -717,7 +717,7 @@ class EMQ(AggregativeSoftQuantifier):
         super().__init__(classifier, fit_classifier, val_split)
         self.exact_train_prev = exact_train_prev
         self.calib = calib
-        self.on_calib_errors = on_calib_error
+        self.on_calib_error = on_calib_error
         self.n_jobs = n_jobs

     @classmethod
@@ -790,9 +790,9 @@ class EMQ(AggregativeSoftQuantifier):
             try:
                 self.calibration_function = calibrator(P, np.eye(n_classes)[y], posterior_supplied=True)
             except Exception as e:
-                if self.on_calib_errors == 'raise':
+                if self.on_calib_error == 'raise':
                     raise RuntimeError(f'calibration {self.calib} failed at fit time: {e}')
-                elif self.on_calib_errors == 'backup':
+                elif self.on_calib_error == 'backup':
                     self.calibration_function = lambda P: P

     def _calibrate_if_requested(self, uncalib_posteriors):
@@ -800,12 +800,12 @@ class EMQ(AggregativeSoftQuantifier):
             try:
                 calib_posteriors = self.calibration_function(uncalib_posteriors)
             except Exception as e:
-                if self.on_calib_errors == 'raise':
+                if self.on_calib_error == 'raise':
                     raise RuntimeError(f'calibration {self.calib} failed at predict time: {e}')
-                elif self.on_calib_errors == 'backup':
+                elif self.on_calib_error == 'backup':
                     calib_posteriors = uncalib_posteriors
                 else:
-                    raise ValueError(f'unexpected {self.on_calib_errors=}; '
+                    raise ValueError(f'unexpected {self.on_calib_error=}; '
                                      f'valid options are {EMQ.ON_CALIB_ERROR_VALUES}')
             return calib_posteriors
         return uncalib_posteriors
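The EMQ hunks align the stored attribute name with the constructor parameter (`on_calib_error`). Since `BaseQuantifier` extends scikit-learn's `BaseEstimator` (see the next file), the mismatch would have broken `get_params`/`set_params` round-trips, and the error-handling policy would never have matched. A hedged construction sketch, assuming the two policies shown in this diff ('raise' and 'backup'):

    from sklearn.linear_model import LogisticRegression
    from quapy.method.aggregative import EMQ

    # 'backup' falls back to the uncalibrated posteriors if the calibrator
    # fails; 'raise' re-raises the failure as a RuntimeError.
    # (the 'bcts' calibration method is an assumption on our part)
    emq = EMQ(LogisticRegression(), calib='bcts', on_calib_error='backup')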

View File

@@ -46,7 +46,7 @@ class BaseQuantifier(BaseEstimator):
         :param X: array-like
         :return: `np.ndarray` of shape `(n_classes,)` with class prevalence estimates.
         """
-        ...
+        return self.predict(X)


 class BinaryQuantifier(BaseQuantifier):
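With this change, `quantify()` goes from an empty placeholder to a thin alias for `predict()`, so legacy callers keep working after the renaming. A small sketch, assuming the post-refactor `fit(X, y)` API:

    import numpy as np
    from sklearn.datasets import make_classification
    from sklearn.linear_model import LogisticRegression
    from quapy.method.aggregative import PCC

    X, y = make_classification(n_samples=200, random_state=0)
    q = PCC(LogisticRegression()).fit(X, y)

    # quantify() now simply returns self.predict(X)
    assert np.allclose(q.predict(X), q.quantify(X))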

View File

@@ -450,8 +450,13 @@ class BayesianCC(AggregativeCrispQuantifier, WithConfidenceABC):
     :param classifier: a scikit-learn's BaseEstimator, or None, in which case the classifier is taken to be
         the one indicated in `qp.environ['DEFAULT_CLS']`
-    :param val_split: a float in (0, 1) indicating the proportion of the training data to be used,
-        as a stratified held-out validation set, for generating classifier predictions.
+    :param val_split: specifies the data used for generating classifier predictions. This specification
+        can be made as a float in (0, 1), indicating the proportion of stratified held-out validation
+        data to be extracted from the training set; as an integer (default 5), indicating that the
+        predictions are to be generated in a `k`-fold cross-validation manner (with this integer
+        indicating the value of `k`); or as a tuple `(X, y)` defining the specific set of data to
+        use for validation. Set it to None when the method requires no validation data, so that no
+        portion of the training data is wasted.
     :param num_warmup: number of warmup iterations for the MCMC sampler (default 500)
     :param num_samples: number of samples to draw from the posterior (default 1000)
     :param mcmc_seed: random seed for the MCMC sampler (default 0)
@@ -462,6 +467,7 @@ class BayesianCC(AggregativeCrispQuantifier, WithConfidenceABC):
     """

     def __init__(self,
                  classifier: BaseEstimator=None,
+                 fit_classifier=True,
                  val_split: int = 5,
                  num_warmup: int = 500,
                  num_samples: int = 1_000,
@@ -474,14 +480,11 @@ class BayesianCC(AggregativeCrispQuantifier, WithConfidenceABC):
         if num_samples <= 0:
             raise ValueError(f'parameter {num_samples=} must be a positive integer')

-        # if (not isinstance(val_split, float)) or val_split <= 0 or val_split >= 1:
-        #     raise ValueError(f'val_split must be a float in (0, 1), got {val_split}')
-
         if _bayesian.DEPENDENCIES_INSTALLED is False:
-            raise ImportError("Auxiliary dependencies are required. Run `$ pip install quapy[bayes]` to install them.")
+            raise ImportError("Auxiliary dependencies are required. "
+                              "Run `$ pip install quapy[bayes]` to install them.")

-        self.classifier = qp._get_classifier(classifier)
-        self.val_split = val_split
+        super().__init__(classifier, fit_classifier, val_split)
         self.num_warmup = num_warmup
         self.num_samples = num_samples
         self.mcmc_seed = mcmc_seed
@@ -505,8 +508,11 @@ class BayesianCC(AggregativeCrispQuantifier, WithConfidenceABC):
         """
         pred_labels = classif_predictions
         true_labels = labels
-        self._n_and_c_labeled = confusion_matrix(
-            y_true=true_labels, y_pred=pred_labels, labels=self.classifier.classes_)
+        self._n_and_c_labeled = confusion_matrix(
+            y_true=true_labels,
+            y_pred=pred_labels,
+            labels=self.classifier.classes_
+        ).astype(float)

     def sample_from_posterior(self, classif_predictions):
         if self._n_and_c_labeled is None:
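BayesianCC now funnels its `classifier`, `fit_classifier`, and `val_split` arguments through the common aggregative base class instead of handling them ad hoc. A hedged construction sketch covering the `val_split` variants described in the docstring above:

    from sklearn.linear_model import LogisticRegression
    from quapy.method.aggregative import BayesianCC  # requires `pip install quapy[bayes]`

    clf = LogisticRegression()

    # predictions generated via 5-fold cross-validation (the default)
    q_kfcv = BayesianCC(clf, fit_classifier=True, val_split=5)

    # predictions generated on a stratified 30% held-out split
    q_heldout = BayesianCC(clf, fit_classifier=True, val_split=0.3)

    # predictions generated on an explicit validation set
    # (X_val, y_val are assumed to exist)
    # q_explicit = BayesianCC(clf, fit_classifier=False, val_split=(X_val, y_val))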

View File

@@ -414,15 +414,15 @@ def _delayed_new_instance(args):
         sample = data.sampling_from_index(sample_index)

         if val_split is not None:
-            model.fit(sample, val_split=val_split)
+            model.fit(*sample.Xy, val_split=val_split)
         else:
-            model.fit(sample)
+            model.fit(*sample.Xy)

         tr_prevalence = sample.prevalence()
         tr_distribution = get_probability_distribution(posteriors[sample_index]) if (posteriors is not None) else None

         if verbose:
-            print(f'\t\--fit-ended for prev {F.strprev(prev)}')
+            print(f'\t--fit-ended for prev {F.strprev(prev)}')

         return (model, tr_prevalence, tr_distribution, sample if keep_samples else None)

View File

@@ -20,14 +20,16 @@ class MaximumLikelihoodPrevalenceEstimation(BaseQuantifier):
     def __init__(self):
         self._classes_ = None

-    def fit(self, data: LabelledCollection):
+    def fit(self, X, y):
         """
         Computes the training prevalence and stores it.

-        :param data: the training sample
+        :param X: array-like of shape `(n_samples, n_features)`, the training instances
+        :param y: array-like of shape `(n_samples,)`, the labels
         :return: self
         """
-        self.estimated_prevalence = data.prevalence()
+        self._classes_ = F.classes_from_labels(labels=y)
+        self.estimated_prevalence = F.prevalence_from_labels(y, classes=self._classes_)
         return self

     def predict(self, X):
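MaximumLikelihoodPrevalenceEstimation now follows the same (X, y) protocol; it ignores the covariates entirely and just memorizes the training prevalence. A small sketch:

    import numpy as np
    from quapy.method.non_aggregative import MaximumLikelihoodPrevalenceEstimation

    X = np.random.rand(6, 2)           # covariates are ignored by this method
    y = np.asarray([0, 0, 0, 1, 1, 2])

    mlpe = MaximumLikelihoodPrevalenceEstimation().fit(X, y)
    print(mlpe.predict(X))             # -> [0.5 0.333... 0.166...], the training prevalence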
@@ -114,9 +116,10 @@ class DMx(BaseQuantifier):
         """
         self.nfeats = X.shape[1]
         self.feat_ranges = _get_features_range(X)
+        n_classes = len(np.unique(y))

         self.validation_distribution = np.asarray(
-            [self.__get_distributions(X[y==cat]) for cat in range(data.n_classes)]
+            [self.__get_distributions(X[y==cat]) for cat in range(n_classes)]
         )

         return self
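The DMx hunk also removes a latent NameError: after the signature change there is no `data` inside `fit()`, so the class count is now derived from `y` directly. Note that `range(n_classes)` is then used to index the categories, which assumes labels are coded 0..n_classes-1:

    import numpy as np

    y = np.asarray([0, 2, 1, 2, 0])
    n_classes = len(np.unique(y))   # 3; replaces the stale reference to data.n_classes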

View File

@@ -64,7 +64,7 @@ class TestMethods(unittest.TestCase):
             q = model()
             print(f'testing {q} on dataset {dataset.name}')
-            q.fit(dataset.training)
+            q.fit(*dataset.training.Xy)
             estim_prevalences = q.predict(dataset.test.X)
             self.assertTrue(check_prevalence_vector(estim_prevalences))
@@ -80,7 +80,7 @@ class TestMethods(unittest.TestCase):
             print(f'testing {base_quantifier} on dataset {dataset.name} with {policy=}')
             ensemble = Ensemble(quantifier=base_quantifier, size=3, policy=policy, n_jobs=-1)
-            ensemble.fit(dataset.training)
+            ensemble.fit(*dataset.training.Xy)
             estim_prevalences = ensemble.predict(dataset.test.instances)
             self.assertTrue(check_prevalence_vector(estim_prevalences))
@@ -116,6 +116,7 @@ class TestMethods(unittest.TestCase):
             print('testing', q)
             q.fit(*dataset.training.Xy)
             estim_prevalences = q.predict(dataset.test.X)
+            print(estim_prevalences)
             self.assertTrue(check_prevalence_vector(estim_prevalences))