merging with office branch

Alejandro Moreo Fernandez 2025-06-15 11:59:32 +02:00
parent 48defb4261
commit 4cfb97c165
8 changed files with 26 additions and 23 deletions

View File

@@ -4,7 +4,7 @@ Change Log 0.1.10
 CLEAN TODO-FILE
 - Base code Refactor:
-  - Removing coupling between LabelledCollection and quantification methods. E.g.:
+  - Removing coupling between LabelledCollection and quantification methods; the fit interface changes:
      def fit(data:LabelledCollection): -> def fit(X, y):
   - Adding function "predict" (function "quantify" is still present as an alias)
   - Aggregative methods' behavior in terms of fit_classifier and how to treat the val_split is now
@@ -14,13 +14,13 @@ CLEAN TODO-FILE
     in which case the first argument is unused, and this was ambiguous with
      my_acc.fit(the_data, fit_classifier=False)
     in which case the_data is to be used for validation purposes. However, the val_split could be set as a fraction
-    indicating only part of the_data must be used for validation, and the rest wasted... it was confusing.
+    indicating only part of the_data must be used for validation, and the rest wasted... it was certainly confusing.
 - EMQ has been modified, so that the representation function "classify" now only provides posterior
   probabilities and, if required, these are recalibrated (e.g., by "bcts") during the aggregation function.
   - A new parameter "on_calib_error" is passed to the constructor, which informs of the policy to follow
-    in case the calibration functions failed. Options include:
+    in case the abstention package's calibration functions fail (which happens sometimes). Options include:
     - 'raise': raises a RuntimeError (default)
-    - 'backup': avoids calibration
+    - 'backup': reruns the step avoiding calibration
 - Parameter "recalib" has been renamed "calib"
 - Added aggregative bootstrap for deriving confidence regions (confidence intervals, ellipses in the simplex, or
   ellipses in the CLR space). This method is efficient as it leverages the two phases of the aggregative quantifiers.
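
To make the refactor concrete, here is a minimal sketch of the new interface (dataset and method names are illustrative; it assumes QuaPy 0.2.0 with PACC and the UCI fetch helpers available):

    import quapy as qp
    from quapy.method.aggregative import PACC

    # an illustrative dataset; any pair of labelled arrays works
    train, test = qp.datasets.fetch_UCIBinaryDataset('yeast').train_test

    model = PACC()
    model.fit(*train.Xy)                # new style: fit(X, y) replaces fit(LabelledCollection)
    prevalence = model.predict(test.X)  # "predict" replaces "quantify" (kept as an alias)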

View File

@@ -14,7 +14,7 @@ from . import model_selection
 from . import classification
 import os

-__version__ = '0.1.10r'
+__version__ = '0.2.0'

 environ = {
     'SAMPLE_SIZE': None,

View File

@@ -548,25 +548,20 @@ def fetch_UCIBinaryLabelledCollection(dataset_name, data_home=None, standardize=
     """
     if name == "acute.a":
         X, y = data["X"], data["y"][:, 0]
-        # X, y = Xy[:, :-2], Xy[:, -2]
     elif name == "acute.b":
         X, y = data["X"], data["y"][:, 1]
-        # X, y = Xy[:, :-2], Xy[:, -1]
     elif name == "wine-q-red":
         X, y, color = data["X"], data["y"], data["color"]
-        # X, y, color = Xy[:, :-2], Xy[:, -2], Xy[:, -1]
         red_idx = color == "red"
         X, y = X[red_idx, :], y[red_idx]
         y = (y > 5).astype(int)
     elif name == "wine-q-white":
         X, y, color = data["X"], data["y"], data["color"]
-        # X, y, color = Xy[:, :-2], Xy[:, -2], Xy[:, -1]
         white_idx = color == "white"
         X, y = X[white_idx, :], y[white_idx]
         y = (y > 5).astype(int)
     else:
         X, y = data["X"], data["y"]
-        # X, y = Xy[:, :-1], Xy[:, -1]

     y = binarize(y, pos_class=pos_class[name])
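
For context, the function edited above is the loader behind the UCI binary datasets; a usage sketch (the dataset name is one of those handled in the snippet, and the printed attributes are assumed from LabelledCollection's public API):

    import quapy as qp

    # loads the red-wine subset and binarizes quality as (y > 5), per the code above
    collection = qp.datasets.fetch_UCIBinaryLabelledCollection('wine-q-red')
    print(collection.n_classes, collection.prevalence())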

View File

@@ -34,7 +34,7 @@ class ThresholdOptimization(BinaryAggregativeQuantifier):
     """
     def __init__(self, classifier: BaseEstimator=None, fit_classifier=True, val_split=None, n_jobs=None):
-        super.__init__(classifier, fit_classifier, val_split)
+        super().__init__(classifier, fit_classifier, val_split)
         self.n_jobs = qp._get_njobs(n_jobs)

     @abstractmethod
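
The one-character fix above is consequential: `super` without parentheses is the built-in type itself, so `super.__init__(...)` tries to initialize a `super` object (a TypeError at runtime) instead of calling the parent constructor. A standalone illustration:

    class Base:
        def __init__(self, x):
            self.x = x

    class Broken(Base):
        def __init__(self, x):
            super.__init__(x)    # TypeError: calls super's own __init__, not Base's

    class Fixed(Base):
        def __init__(self, x):
            super().__init__(x)  # correct: Base.__init__ runs and sets self.x

    Fixed(1)    # ok
    Broken(1)   # raises TypeError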

View File

@@ -717,7 +717,7 @@ class EMQ(AggregativeSoftQuantifier):
         super().__init__(classifier, fit_classifier, val_split)
         self.exact_train_prev = exact_train_prev
         self.calib = calib
-        self.on_calib_errors = on_calib_error
+        self.on_calib_error = on_calib_error
         self.n_jobs = n_jobs

     @classmethod
@@ -790,9 +790,9 @@ class EMQ(AggregativeSoftQuantifier):
         try:
             self.calibration_function = calibrator(P, np.eye(n_classes)[y], posterior_supplied=True)
         except Exception as e:
-            if self.on_calib_errors == 'raise':
+            if self.on_calib_error == 'raise':
                 raise RuntimeError(f'calibration {self.calib} failed at fit time: {e}')
-            elif self.on_calib_errors == 'backup':
+            elif self.on_calib_error == 'backup':
                 self.calibration_function = lambda P: P

     def _calibrate_if_requested(self, uncalib_posteriors):
@@ -800,12 +800,12 @@ class EMQ(AggregativeSoftQuantifier):
         try:
             calib_posteriors = self.calibration_function(uncalib_posteriors)
         except Exception as e:
-            if self.on_calib_errors == 'raise':
+            if self.on_calib_error == 'raise':
                 raise RuntimeError(f'calibration {self.calib} failed at predict time: {e}')
-            elif self.on_calib_errors == 'backup':
+            elif self.on_calib_error == 'backup':
                 calib_posteriors = uncalib_posteriors
             else:
-                raise ValueError(f'unexpected {self.on_calib_errors=}; '
+                raise ValueError(f'unexpected {self.on_calib_error=}; '
                                  f'valid options are {EMQ.ON_CALIB_ERROR_VALUES}')
             return calib_posteriors
         return uncalib_posteriors
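
Downstream, the renamed parameter is what users pass at construction time. A sketch, assuming the constructor accepts these keywords as documented in the changelog above ('bcts' calibration, 'backup' fallback):

    from sklearn.datasets import make_classification
    from sklearn.linear_model import LogisticRegression
    from quapy.method.aggregative import EMQ

    X, y = make_classification(n_samples=1000, random_state=0)
    X_train, y_train, X_test = X[:800], y[:800], X[800:]

    # recalibrate posteriors with BCTS; if calibration fails, fall back to the
    # uncalibrated posteriors ('backup') instead of raising ('raise', the default)
    emq = EMQ(LogisticRegression(), calib='bcts', on_calib_error='backup')
    emq.fit(X_train, y_train)
    print(emq.predict(X_test))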

View File

@@ -450,8 +450,17 @@ class BayesianCC(AggregativeCrispQuantifier, WithConfidenceABC):
     :param classifier: a scikit-learn BaseEstimator, or None, in which case the classifier is taken to be
         the one indicated in `qp.environ['DEFAULT_CLS']`
-    :param val_split: a float in (0, 1) indicating the proportion of the training data to be used,
-        as a stratified held-out validation set, for generating classifier predictions.
+    :param fit_classifier: whether to train the learner (default is True). Set to False if the
+        learner has been trained outside the quantifier.
+    :param val_split: specifies the data used for generating classifier predictions. This specification
+        can be made as a float in (0, 1) indicating the proportion of a stratified held-out validation set
+        to be extracted from the training set; or as an integer (default 5), indicating that the predictions
+        are to be generated in a `k`-fold cross-validation manner (with this integer indicating the value
+        for `k`); or as a tuple (X, y) defining the specific set of data to use for validation.
+        This hyperparameter is only meant to be used when the heuristics are to be applied, i.e., if a
+        calibration is required. The default value is None (meaning the calibration is not required). In
+        case this hyperparameter is set to a value other than None, but the calibration is not required
+        (calib=None), a warning message will be raised.
     :param num_warmup: number of warmup iterations for the MCMC sampler (default 500)
     :param num_samples: number of samples to draw from the posterior (default 1000)
     :param mcmc_seed: random seed for the MCMC sampler (default 0)
@@ -462,6 +471,7 @@ class BayesianCC(AggregativeCrispQuantifier, WithConfidenceABC):
     """
     def __init__(self,
                  classifier: BaseEstimator=None,
+                 fit_classifier=True,
                  val_split: int = 5,
                  num_warmup: int = 500,
                  num_samples: int = 1_000,
@@ -480,8 +490,7 @@ class BayesianCC(AggregativeCrispQuantifier, WithConfidenceABC):
         if _bayesian.DEPENDENCIES_INSTALLED is False:
             raise ImportError("Auxiliary dependencies are required. Run `$ pip install quapy[bayes]` to install them.")
-        self.classifier = qp._get_classifier(classifier)
-        self.val_split = val_split
+        super().__init__(classifier, fit_classifier, val_split)
         self.num_warmup = num_warmup
         self.num_samples = num_samples
         self.mcmc_seed = mcmc_seed
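
A construction sketch reflecting the new signature (import path assumed; requires the optional dependencies, `pip install quapy[bayes]`; argument names as in the diff):

    from sklearn.linear_model import LogisticRegression
    from quapy.method.aggregative import BayesianCC

    bcc = BayesianCC(
        classifier=LogisticRegression(),
        fit_classifier=True,   # False if the classifier was trained outside the quantifier
        val_split=5,           # k-fold predictions (int), a fraction in (0, 1), or an (X, y) tuple
        num_warmup=500,
        num_samples=1_000,
    )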

View File

@@ -106,7 +106,6 @@ class TestDatasets(unittest.TestCase):
         self._check_samples(gen_val, q, max_samples_test=5, vectorizer=tfidf)
         self._check_samples(gen_test, q, max_samples_test=5, vectorizer=tfidf)
-
     def test_IFCB(self):
         if os.environ.get('QUAPY_TESTS_OMIT_LARGE_DATASETS'):
             print("omitting test_IFCB because QUAPY_TESTS_OMIT_LARGE_DATASETS is set")

View File

@@ -64,7 +64,7 @@ class TestMethods(unittest.TestCase):
             q = model()
             print(f'testing {q} on dataset {dataset.name}')
-            q.fit(dataset.training)
+            q.fit(*dataset.training.Xy)
             estim_prevalences = q.predict(dataset.test.X)
             self.assertTrue(check_prevalence_vector(estim_prevalences))
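
The `*dataset.training.Xy` idiom in the updated test is plain tuple unpacking of a LabelledCollection's (instances, labels) pair; equivalently:

    # given a QuaPy Dataset `dataset` and a quantifier `q`, as in the test above
    X, y = dataset.training.Xy
    q.fit(X, y)              # same call as q.fit(*dataset.training.Xy)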