diff --git a/quapy/__init__.py b/quapy/__init__.py
index 300e7d3..55e8f12 100644
--- a/quapy/__init__.py
+++ b/quapy/__init__.py
@@ -14,7 +14,7 @@ from . import model_selection
 from . import classification
 import os
 
-__version__ = '0.1.10r'
+__version__ = '0.1.10'
 
 environ = {
     'SAMPLE_SIZE': None,
diff --git a/quapy/data/base.py b/quapy/data/base.py
index 8ba0aec..72561e4 100644
--- a/quapy/data/base.py
+++ b/quapy/data/base.py
@@ -9,6 +9,7 @@ from sklearn.model_selection import train_test_split, RepeatedStratifiedKFold
 from numpy.random import RandomState
 from quapy.functional import strprev
 from quapy.util import temp_seed
+import quapy.functional as F
 
 
 class LabelledCollection:
@@ -34,8 +35,7 @@ class LabelledCollection:
         self.labels = np.asarray(labels)
         n_docs = len(self)
         if classes is None:
-            self.classes_ = np.unique(self.labels)
-            self.classes_.sort()
+            self.classes_ = F.classes_from_labels(self.labels)
         else:
             self.classes_ = np.unique(np.asarray(classes))
             self.classes_.sort()
diff --git a/quapy/functional.py b/quapy/functional.py
index fd7a88f..b508d76 100644
--- a/quapy/functional.py
+++ b/quapy/functional.py
@@ -7,6 +7,20 @@ import scipy
 import numpy as np
 
 
+# ------------------------------------------------------------------------------------------
+# General utils
+# ------------------------------------------------------------------------------------------
+
+def classes_from_labels(labels):
+    """
+    Obtains a np.ndarray with the (sorted) classes occurring in `labels`
+    :param labels: array-like with the instances' labels
+    :return: np.ndarray with the sorted unique classes
+    """
+    classes = np.unique(labels)
+    classes.sort()
+    return classes
+
 # ------------------------------------------------------------------------------------------
 # Counter utils
 # ------------------------------------------------------------------------------------------
diff --git a/quapy/method/_neural.py b/quapy/method/_neural.py
index c2b4de6..404090f 100644
--- a/quapy/method/_neural.py
+++ b/quapy/method/_neural.py
@@ -149,13 +149,13 @@ class QuaNetTrainer(BaseQuantifier):
         train_data_embed = LabelledCollection(self.classifier.transform(train_data.instances), train_data.labels, self._classes_)
 
         self.quantifiers = {
-            'cc': CC(self.classifier).fit(None, fit_classifier=False),
-            'acc': ACC(self.classifier).fit(None, fit_classifier=False, val_split=valid_data),
-            'pcc': PCC(self.classifier).fit(None, fit_classifier=False),
-            'pacc': PACC(self.classifier).fit(None, fit_classifier=False, val_split=valid_data),
+            'cc': CC(self.classifier, fit_classifier=False).fit(*valid_data.Xy),
+            'acc': ACC(self.classifier, fit_classifier=False).fit(*valid_data.Xy),
+            'pcc': PCC(self.classifier, fit_classifier=False).fit(*valid_data.Xy),
+            'pacc': PACC(self.classifier, fit_classifier=False).fit(*valid_data.Xy),
         }
         if classifier_data is not None:
-            self.quantifiers['emq'] = EMQ(self.classifier).fit(classifier_data, fit_classifier=False)
+            self.quantifiers['emq'] = EMQ(self.classifier, fit_classifier=False).fit(*valid_data.Xy)
 
         self.status = {
             'tr-loss': -1,
diff --git a/quapy/method/_threshold_optim.py b/quapy/method/_threshold_optim.py
index 2c3a68c..628f01a 100644
--- a/quapy/method/_threshold_optim.py
+++ b/quapy/method/_threshold_optim.py
@@ -34,7 +34,7 @@ class ThresholdOptimization(BinaryAggregativeQuantifier):
     """
 
     def __init__(self, classifier: BaseEstimator=None, fit_classifier=True, val_split=None, n_jobs=None):
-        super.__init__(classifier, fit_classifier, val_split)
+        super().__init__(classifier, fit_classifier, val_split)
         self.n_jobs = qp._get_njobs(n_jobs)
 
     @abstractmethod
diff --git a/quapy/method/aggregative.py b/quapy/method/aggregative.py
index cda6294..e890be9 100644
--- a/quapy/method/aggregative.py
+++ b/quapy/method/aggregative.py
@@ -100,7 +100,7 @@ class AggregativeQuantifier(BaseQuantifier, ABC):
         # consistency checks: fit_classifier?
         if self.fit_classifier:
             if fitted:
-                raise RuntimeWarning(f'the classifier is already fitted, by {fit_classifier=} was requested')
+                raise RuntimeWarning(f'the classifier is already fitted, but {fit_classifier=} was requested')
         else:
             assert fitted, (f'{fit_classifier=} requires the classifier to be already trained, '
                             f'but this does not seem to be')
@@ -158,7 +158,7 @@ class AggregativeQuantifier(BaseQuantifier, ABC):
         predictions, labels = None, None
 
         if isinstance(self.val_split, int):
-            assert self.fit_classifier, f'unexpected value for {self.fit_classifier=}'
+            assert self.fit_classifier, f'{self.__class__}: unexpected value for {self.fit_classifier=}'
             num_folds = self.val_split
             n_jobs = self.n_jobs if hasattr(self, 'n_jobs') else qp._get_njobs(None)
             predictions = cross_val_predict(self.classifier, X, y, cv=num_folds, n_jobs=n_jobs, method=self._classifier_method())
@@ -717,7 +717,7 @@ class EMQ(AggregativeSoftQuantifier):
         super().__init__(classifier, fit_classifier, val_split)
         self.exact_train_prev = exact_train_prev
         self.calib = calib
-        self.on_calib_errors = on_calib_error
+        self.on_calib_error = on_calib_error
         self.n_jobs = n_jobs
 
     @classmethod
@@ -790,9 +790,9 @@
             try:
                 self.calibration_function = calibrator(P, np.eye(n_classes)[y], posterior_supplied=True)
             except Exception as e:
-                if self.on_calib_errors == 'raise':
+                if self.on_calib_error == 'raise':
                     raise RuntimeError(f'calibration {self.calib} failed at fit time: {e}')
-                elif self.on_calib_errors == 'backup':
+                elif self.on_calib_error == 'backup':
                     self.calibration_function = lambda P: P
 
     def _calibrate_if_requested(self, uncalib_posteriors):
@@ -800,12 +800,12 @@
             try:
                 calib_posteriors = self.calibration_function(uncalib_posteriors)
             except Exception as e:
-                if self.on_calib_errors == 'raise':
+                if self.on_calib_error == 'raise':
                     raise RuntimeError(f'calibration {self.calib} failed at predict time: {e}')
-                elif self.on_calib_errors == 'backup':
+                elif self.on_calib_error == 'backup':
                     calib_posteriors = uncalib_posteriors
                 else:
-                    raise ValueError(f'unexpected {self.on_calib_errors=}; '
+                    raise ValueError(f'unexpected {self.on_calib_error=}; '
                                      f'valid options are {EMQ.ON_CALIB_ERROR_VALUES}')
             return calib_posteriors
         return uncalib_posteriors
diff --git a/quapy/method/base.py b/quapy/method/base.py
index a2dcfe0..1d7ad34 100644
--- a/quapy/method/base.py
+++ b/quapy/method/base.py
@@ -46,7 +46,7 @@ class BaseQuantifier(BaseEstimator):
         :param X: array-like
         :return: `np.ndarray` of shape `(n_classes,)` with class prevalence estimates.
         """
-        ...
+        return self.predict(X)
 
 
 class BinaryQuantifier(BaseQuantifier):
diff --git a/quapy/method/confidence.py b/quapy/method/confidence.py
index 77660f1..f54768c 100644
--- a/quapy/method/confidence.py
+++ b/quapy/method/confidence.py
@@ -450,8 +450,13 @@ class BayesianCC(AggregativeCrispQuantifier, WithConfidenceABC):
     :param classifier: a scikit-learn's BaseEstimator, or None, in which case the classifier is taken to be
        the one indicated in `qp.environ['DEFAULT_CLS']`
-    :param val_split: a float in (0, 1) indicating the proportion of the training data to be used,
-        as a stratified held-out validation set, for generating classifier predictions.
+    :param val_split: specifies the data used for generating classifier predictions. This specification
+        can be made as a float in (0, 1), indicating the proportion of the training set to be extracted
+        as a stratified held-out validation set; or as an integer (default 5), indicating that the
+        predictions are to be generated in a `k`-fold cross-validation manner (with this integer
+        indicating the value of `k`); or as a tuple `(X, y)` defining the specific set of data to use
+        for validation. Set to None when the method does not require any validation data, so that no
+        portion of the training data is wasted.
     :param num_warmup: number of warmup iterations for the MCMC sampler (default 500)
     :param num_samples: number of samples to draw from the posterior (default 1000)
     :param mcmc_seed: random seed for the MCMC sampler (default 0)
@@ -462,6 +467,7 @@ class BayesianCC(AggregativeCrispQuantifier, WithConfidenceABC):
     """
 
     def __init__(self,
                  classifier: BaseEstimator=None,
+                 fit_classifier=True,
                  val_split: int = 5,
                  num_warmup: int = 500,
                  num_samples: int = 1_000,
                  mcmc_seed: int = 0,
@@ -474,14 +480,11 @@
         if num_samples <= 0:
             raise ValueError(f'parameter {num_samples=} must be a positive integer')
 
-        # if (not isinstance(val_split, float)) or val_split <= 0 or val_split >= 1:
-        #     raise ValueError(f'val_split must be a float in (0, 1), got {val_split}')
-
         if _bayesian.DEPENDENCIES_INSTALLED is False:
-            raise ImportError("Auxiliary dependencies are required. Run `$ pip install quapy[bayes]` to install them.")
+            raise ImportError("Auxiliary dependencies are required. "
+                              "Run `$ pip install quapy[bayes]` to install them.")
 
-        self.classifier = qp._get_classifier(classifier)
-        self.val_split = val_split
+        super().__init__(classifier, fit_classifier, val_split)
         self.num_warmup = num_warmup
         self.num_samples = num_samples
         self.mcmc_seed = mcmc_seed
@@ -505,8 +508,11 @@
         """
         pred_labels = classif_predictions
         true_labels = labels
-        self._n_and_c_labeled = confusion_matrix(y_true=true_labels, y_pred=pred_labels,
-                                                 labels=self.classifier.classes_)
+        self._n_and_c_labeled = confusion_matrix(
+            y_true=true_labels,
+            y_pred=pred_labels,
+            labels=self.classifier.classes_
+        ).astype(float)
 
     def sample_from_posterior(self, classif_predictions):
         if self._n_and_c_labeled is None:
diff --git a/quapy/method/meta.py b/quapy/method/meta.py
index 3e9ce4c..17c9903 100644
--- a/quapy/method/meta.py
+++ b/quapy/method/meta.py
@@ -414,15 +414,15 @@ def _delayed_new_instance(args):
         sample = data.sampling_from_index(sample_index)
 
         if val_split is not None:
-            model.fit(sample, val_split=val_split)
+            model.fit(*sample.Xy, val_split=val_split)
         else:
-            model.fit(sample)
+            model.fit(*sample.Xy)
 
         tr_prevalence = sample.prevalence()
         tr_distribution = get_probability_distribution(posteriors[sample_index]) if (posteriors is not None) else None
 
         if verbose:
-            print(f'\t\--fit-ended for prev {F.strprev(prev)}')
+            print(f'\t--fit-ended for prev {F.strprev(prev)}')
 
     return (model, tr_prevalence, tr_distribution, sample if keep_samples else None)
diff --git a/quapy/method/non_aggregative.py b/quapy/method/non_aggregative.py
index 00bdbed..eff2283 100644
--- a/quapy/method/non_aggregative.py
+++ b/quapy/method/non_aggregative.py
@@ -20,14 +20,16 @@ class MaximumLikelihoodPrevalenceEstimation(BaseQuantifier):
     def __init__(self):
         self._classes_ = None
 
-    def fit(self, data: LabelledCollection):
+    def fit(self, X, y):
         """
         Computes the training prevalence and stores it.
 
-        :param data: the training sample
+        :param X: array-like of shape `(n_samples, n_features)`, the training instances
+        :param y: array-like of shape `(n_samples,)`, the labels
         :return: self
         """
-        self.estimated_prevalence = data.prevalence()
+        self._classes_ = F.classes_from_labels(labels=y)
+        self.estimated_prevalence = F.prevalence_from_labels(y, classes=self._classes_)
         return self
 
     def predict(self, X):
@@ -114,9 +116,10 @@ class DMx(BaseQuantifier):
         """
         self.nfeats = X.shape[1]
         self.feat_ranges = _get_features_range(X)
+        n_classes = len(np.unique(y))
 
         self.validation_distribution = np.asarray(
-            [self.__get_distributions(X[y==cat]) for cat in range(data.n_classes)]
+            [self.__get_distributions(X[y==cat]) for cat in range(n_classes)]
         )
 
         return self
diff --git a/quapy/tests/test_methods.py b/quapy/tests/test_methods.py
index aa609bc..f2d9cdd 100644
--- a/quapy/tests/test_methods.py
+++ b/quapy/tests/test_methods.py
@@ -64,7 +64,7 @@ class TestMethods(unittest.TestCase):
 
                 q = model()
                 print(f'testing {q} on dataset {dataset.name}')
-                q.fit(dataset.training)
+                q.fit(*dataset.training.Xy)
                 estim_prevalences = q.predict(dataset.test.X)
                 self.assertTrue(check_prevalence_vector(estim_prevalences))
 
@@ -80,7 +80,7 @@ class TestMethods(unittest.TestCase):
 
                 print(f'testing {base_quantifier} on dataset {dataset.name} with {policy=}')
                 ensemble = Ensemble(quantifier=base_quantifier, size=3, policy=policy, n_jobs=-1)
-                ensemble.fit(dataset.training)
+                ensemble.fit(*dataset.training.Xy)
                 estim_prevalences = ensemble.predict(dataset.test.instances)
                 self.assertTrue(check_prevalence_vector(estim_prevalences))
 
@@ -116,6 +116,7 @@ class TestMethods(unittest.TestCase):
             print('testing', q)
             q.fit(*dataset.training.Xy)
             estim_prevalences = q.predict(dataset.test.X)
+            print(estim_prevalences)
             self.assertTrue(check_prevalence_vector(estim_prevalences))
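
The change that recurs throughout the hunks above is the migration from `fit(data: LabelledCollection)` to a scikit-learn-style `fit(X, y)`, with `fit_classifier` and `val_split` becoming constructor arguments. A minimal usage sketch of the new interface, pieced together from the test hunks; the `fetch_reviews` dataset and the `LogisticRegression` classifier are illustrative choices, not part of the patch:

    import quapy as qp
    from quapy.method.aggregative import PACC
    from sklearn.linear_model import LogisticRegression

    dataset = qp.datasets.fetch_reviews('kindle', tfidf=True)

    # fit_classifier and val_split are now passed at construction time...
    quantifier = PACC(LogisticRegression(), fit_classifier=True, val_split=5)

    # ...and fit takes (X, y) instead of a LabelledCollection; the Xy property
    # of a LabelledCollection unpacks into exactly that pair
    quantifier.fit(*dataset.training.Xy)

    estim_prevalence = quantifier.predict(dataset.test.X)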
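
For reference, a quick sketch of the two `quapy.functional` helpers that `MaximumLikelihoodPrevalenceEstimation.fit` now relies on; `classes_from_labels` is added by this patch, while `prevalence_from_labels` is assumed to be the pre-existing helper of that name in `quapy.functional`:

    import numpy as np
    import quapy.functional as F

    y = np.asarray(['neg', 'pos', 'pos', 'neg', 'pos'])

    classes = F.classes_from_labels(y)           # -> array(['neg', 'pos'])
    prev = F.prevalence_from_labels(y, classes)  # -> array([0.4, 0.6])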
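
Likewise, the renamed `on_calib_error` attribute is set from EMQ's constructor (see the `__init__` hunk) and, per the error-handling code above, accepts 'raise' and 'backup' among `EMQ.ON_CALIB_ERROR_VALUES`. A sketch of how a caller would select the policy; the 'bcts' calibration name is an assumption standing in for whichever `calib` values quapy supports:

    from quapy.method.aggregative import EMQ
    from sklearn.linear_model import LogisticRegression

    # 'raise' turns calibration failures into a RuntimeError at fit/predict time;
    # 'backup' silently falls back to the uncalibrated posteriors
    emq = EMQ(LogisticRegression(), calib='bcts', on_calib_error='backup')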