From 4cfb97c165d1da56e4d11fe6f4f385b59a5e7d5c Mon Sep 17 00:00:00 2001
From: Alejandro Moreo
Date: Sun, 15 Jun 2025 11:59:32 +0200
Subject: [PATCH] merging with office branch

---
 CHANGE_LOG.txt                   |  8 ++++----
 quapy/__init__.py                |  2 +-
 quapy/data/datasets.py           |  5 -----
 quapy/method/_threshold_optim.py |  2 +-
 quapy/method/aggregative.py      | 12 ++++++------
 quapy/method/confidence.py       | 17 +++++++++++++----
 quapy/tests/test_datasets.py     |  1 -
 quapy/tests/test_methods.py      |  2 +-
 8 files changed, 26 insertions(+), 23 deletions(-)

diff --git a/CHANGE_LOG.txt b/CHANGE_LOG.txt
index 3c43e5d..c3880e7 100644
--- a/CHANGE_LOG.txt
+++ b/CHANGE_LOG.txt
@@ -4,7 +4,7 @@ Change Log 0.1.10
 CLEAN TODO-FILE
 
 - Base code Refactor:
-    - Removing coupling between LabelledCollection and quantification methods. E.g.:
+    - Removing coupling between LabelledCollection and quantification methods; the fit interface changes:
         def fit(data:LabelledCollection): -> def fit(X, y):
     - Adding function "predict" (function "quantify" is still present as an alias)
     - Aggregative methods's behavior in terms of fit_classifier and how to treat the val_split is now
@@ -14,13 +14,13 @@ CLEAN TODO-FILE
         in which case the first argument is unused, and this was ambiguous with
         my_acc.fit(the_data, fit_classifier=False)
         in which case the_data is to be used for validation purposes. However, the val_split could be set as a fraction
-        indicating only part of the_data must be used for validation, and the rest wasted... it was confusing.
+        indicating only part of the_data must be used for validation, and the rest wasted... it was certainly confusing.
     - EMQ has been modified, so that the representation function "classify" now only provides posterior probabilities
         and, if required, these are recalibrated (e.g., by "bcts") during the aggregation function.
         A new parameter "on_calib_error" is passed to the constructor, which informs of the policy to follow
-        in case the calibration functions failed. Options include:
+        in case the calibration functions (from the abstention package) fail, which happens sometimes. Options include:
         - 'raise': raises a RuntimeException (default)
-        - 'backup': avoids calibration
+        - 'backup': falls back to using the uncalibrated posteriors
     - Parameter "recalib" has been renamed "calib"
 - Added aggregative bootstrap for deriving confidence regions (confidence intervals, ellipses in the simplex, or
     ellipses in the CLR space). This method is efficient as it leverages the two-phases of the aggregative quantifiers.
diff --git a/quapy/__init__.py b/quapy/__init__.py
index 300e7d3..90f7a70 100644
--- a/quapy/__init__.py
+++ b/quapy/__init__.py
@@ -14,7 +14,7 @@ from . import model_selection
 from . import classification
 import os
 
-__version__ = '0.1.10r'
+__version__ = '0.2.0'
 
 environ = {
     'SAMPLE_SIZE': None,
diff --git a/quapy/data/datasets.py b/quapy/data/datasets.py
index 5582a58..b7fd81a 100644
--- a/quapy/data/datasets.py
+++ b/quapy/data/datasets.py
@@ -548,25 +548,20 @@ def fetch_UCIBinaryLabelledCollection(dataset_name, data_home=None, standardize=
     """
     if name == "acute.a":
         X, y = data["X"], data["y"][:, 0]
-        # X, y = Xy[:, :-2], Xy[:, -2]
     elif name == "acute.b":
         X, y = data["X"], data["y"][:, 1]
-        # X, y = Xy[:, :-2], Xy[:, -1]
     elif name == "wine-q-red":
         X, y, color = data["X"], data["y"], data["color"]
-        # X, y, color = Xy[:, :-2], Xy[:, -2], Xy[:, -1]
         red_idx = color == "red"
         X, y = X[red_idx, :], y[red_idx]
         y = (y > 5).astype(int)
     elif name == "wine-q-white":
         X, y, color = data["X"], data["y"], data["color"]
-        # X, y, color = Xy[:, :-2], Xy[:, -2], Xy[:, -1]
         white_idx = color == "white"
         X, y = X[white_idx, :], y[white_idx]
         y = (y > 5).astype(int)
     else:
         X, y = data["X"], data["y"]
-        # X, y = Xy[:, :-1], Xy[:, -1]
 
     y = binarize(y, pos_class=pos_class[name])
 
diff --git a/quapy/method/_threshold_optim.py b/quapy/method/_threshold_optim.py
index 2c3a68c..628f01a 100644
--- a/quapy/method/_threshold_optim.py
+++ b/quapy/method/_threshold_optim.py
@@ -34,7 +34,7 @@ class ThresholdOptimization(BinaryAggregativeQuantifier):
     """
 
     def __init__(self, classifier: BaseEstimator=None, fit_classifier=True, val_split=None, n_jobs=None):
-        super.__init__(classifier, fit_classifier, val_split)
+        super().__init__(classifier, fit_classifier, val_split)
         self.n_jobs = qp._get_njobs(n_jobs)
 
     @abstractmethod
diff --git a/quapy/method/aggregative.py b/quapy/method/aggregative.py
index cda6294..0be9fb1 100644
--- a/quapy/method/aggregative.py
+++ b/quapy/method/aggregative.py
@@ -717,7 +717,7 @@ class EMQ(AggregativeSoftQuantifier):
         super().__init__(classifier, fit_classifier, val_split)
         self.exact_train_prev = exact_train_prev
         self.calib = calib
-        self.on_calib_errors = on_calib_error
+        self.on_calib_error = on_calib_error
         self.n_jobs = n_jobs
 
     @classmethod
@@ -790,9 +790,9 @@ class EMQ(AggregativeSoftQuantifier):
             try:
                 self.calibration_function = calibrator(P, np.eye(n_classes)[y], posterior_supplied=True)
             except Exception as e:
-                if self.on_calib_errors == 'raise':
+                if self.on_calib_error == 'raise':
                     raise RuntimeError(f'calibration {self.calib} failed at fit time: {e}')
-                elif self.on_calib_errors == 'backup':
+                elif self.on_calib_error == 'backup':
                     self.calibration_function = lambda P: P
 
     def _calibrate_if_requested(self, uncalib_posteriors):
@@ -800,12 +800,12 @@
             try:
                 calib_posteriors = self.calibration_function(uncalib_posteriors)
             except Exception as e:
-                if self.on_calib_errors == 'raise':
+                if self.on_calib_error == 'raise':
                     raise RuntimeError(f'calibration {self.calib} failed at predict time: {e}')
-                elif self.on_calib_errors == 'backup':
+                elif self.on_calib_error == 'backup':
                     calib_posteriors = uncalib_posteriors
                 else:
-                    raise ValueError(f'unexpected {self.on_calib_errors=}; '
+                    raise ValueError(f'unexpected {self.on_calib_error=}; '
                                      f'valid options are {EMQ.ON_CALIB_ERROR_VALUES}')
             return calib_posteriors
         return uncalib_posteriors
diff --git a/quapy/method/confidence.py b/quapy/method/confidence.py
index 77660f1..f68f956 100644
--- a/quapy/method/confidence.py
+++ b/quapy/method/confidence.py
@@ -450,8 +450,17 @@ class BayesianCC(AggregativeCrispQuantifier, WithConfidenceABC):
 
     :param classifier: a scikit-learn's BaseEstimator, or None, in which case the classifier is taken
         to be the one indicated in `qp.environ['DEFAULT_CLS']`
-    :param val_split: a float in (0, 1) indicating the proportion of the training data to be used,
-        as a stratified held-out validation set, for generating classifier predictions.
+    :param fit_classifier: whether to train the learner (default is True). Set to False if the
+        learner has been trained outside the quantifier.
+    :param val_split: specifies the data used for generating classifier predictions. This specification
+        can be made as float in (0, 1) indicating the proportion of stratified held-out validation set to
+        be extracted from the training set; or as an integer (default 5), indicating that the predictions
+        are to be generated in a `k`-fold cross-validation manner (with this integer indicating the value
+        for `k`); or as a tuple (X,y) defining the specific set of data to use for validation.
+        The default value is 5, meaning that the classifier predictions are generated via 5-fold
+        cross-validation on the training data. Note that this quantifier applies no calibration, so
+        val_split is used exclusively for generating the classifier predictions on which the Bayesian
+        aggregation is computed.
     :param num_warmup: number of warmup iterations for the MCMC sampler (default 500)
     :param num_samples: number of samples to draw from the posterior (default 1000)
     :param mcmc_seed: random seed for the MCMC sampler (default 0)
@@ -462,6 +471,7 @@ class BayesianCC(AggregativeCrispQuantifier, WithConfidenceABC):
     """
     def __init__(self,
                  classifier: BaseEstimator=None,
+                 fit_classifier=True,
                  val_split: int = 5,
                  num_warmup: int = 500,
                  num_samples: int = 1_000,
                  mcmc_seed: int = 0,
@@ -480,8 +490,7 @@ class BayesianCC(AggregativeCrispQuantifier, WithConfidenceABC):
 
         if _bayesian.DEPENDENCIES_INSTALLED is False:
             raise ImportError("Auxiliary dependencies are required. Run `$ pip install quapy[bayes]` to install them.")
-        self.classifier = qp._get_classifier(classifier)
-        self.val_split = val_split
+        super().__init__(classifier, fit_classifier, val_split)
         self.num_warmup = num_warmup
         self.num_samples = num_samples
         self.mcmc_seed = mcmc_seed
diff --git a/quapy/tests/test_datasets.py b/quapy/tests/test_datasets.py
index 63c6ef8..a1910d5 100644
--- a/quapy/tests/test_datasets.py
+++ b/quapy/tests/test_datasets.py
@@ -106,7 +106,6 @@ class TestDatasets(unittest.TestCase):
         self._check_samples(gen_val, q, max_samples_test=5, vectorizer=tfidf)
         self._check_samples(gen_test, q, max_samples_test=5, vectorizer=tfidf)
 
-
     def test_IFCB(self):
         if os.environ.get('QUAPY_TESTS_OMIT_LARGE_DATASETS'):
            print("omitting test_IFCB because QUAPY_TESTS_OMIT_LARGE_DATASETS is set")
diff --git a/quapy/tests/test_methods.py b/quapy/tests/test_methods.py
index aa609bc..c2931b9 100644
--- a/quapy/tests/test_methods.py
+++ b/quapy/tests/test_methods.py
@@ -64,7 +64,7 @@ class TestMethods(unittest.TestCase):
 
             q = model()
             print(f'testing {q} on dataset {dataset.name}')
-            q.fit(dataset.training)
+            q.fit(*dataset.training.Xy)
             estim_prevalences = q.predict(dataset.test.X)
 
             self.assertTrue(check_prevalence_vector(estim_prevalences))
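
Usage sketch (not part of the patch): the snippet below illustrates the fit(X, y) / predict(X) interface that the CHANGE_LOG entries and the updated test_methods.py above describe. It is a minimal sketch, not the project's documented example: the synthetic data, the LogisticRegression classifier, and the commented keyword values are illustrative assumptions; only the parameter names (classifier, calib, on_calib_error) and the fit/predict calls are taken from the diff itself.

# Minimal sketch of the refactored API (assumes quapy 0.2.0 and scikit-learn are installed).
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

from quapy.method.aggregative import EMQ

# synthetic binary data standing in for a real quantification dataset (illustrative only)
X, y = make_classification(n_samples=2000, n_features=20, random_state=0)
Xtr, Xte, ytr, yte = train_test_split(X, y, test_size=0.5, stratify=y, random_state=0)

# EMQ per this patch: "recalib" is now "calib", and "on_calib_error" controls what happens if the
# (optional) calibration fails ('raise' re-raises, 'backup' falls back to uncalibrated posteriors);
# e.g., EMQ(classifier=..., calib='bcts', on_calib_error='backup')
quantifier = EMQ(classifier=LogisticRegression())
quantifier.fit(Xtr, ytr)              # new interface: fit(X, y) replaces fit(LabelledCollection)
prevalence = quantifier.predict(Xte)  # "predict" replaces "quantify" (still available as an alias)
print(prevalence)                     # estimated class-prevalence vector for the test sample

Per the CHANGE_LOG, the same fit(X, y) / predict(X) pattern should apply to the other quantifiers touched by this patch (e.g., the ThresholdOptimization subclasses and BayesianCC).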