merging and solving pytests

Alejandro Moreo Fernandez 2025-10-06 12:03:31 +02:00
parent dbda25b09a
commit 3847db3838
8 changed files with 131 additions and 89 deletions

View File

@@ -22,6 +22,7 @@ data = qp.data.preprocessing.text2tfidf(
     min_df = 5,
 )
 training, testing = data.train_test
+Xtr, ytr = training.Xy

 # We start by recovering PACC from its building blocks, a LeastSquaresLoss and
 # a probabilistic ClassRepresentation. A 5-fold cross-validation is implemented
@@ -46,7 +47,7 @@ pacc = ComposableQuantifier(
 # Let's evaluate this quantifier.
 print(f"Evaluating PACC: {pacc}")
-pacc.fit(training)
+pacc.fit(Xtr, ytr)
 app = qp.protocol.APP(testing, sample_size=100, n_prevalences=21, repeats=1)
 absolute_errors = qp.evaluation.evaluate(
     model = pacc,
@@ -70,7 +71,7 @@ model = ComposableQuantifier(
 )
 print(f"Evaluating {model}")
-model.fit(training)
+model.fit(Xtr, ytr)
 absolute_errors = qp.evaluation.evaluate(
     model = model,
     protocol = app, # use the same protocol for evaluation
@@ -125,7 +126,7 @@ grid_search = qp.model_selection.GridSearchQ(
     error = "mae",
     refit = False,
     verbose = True,
-).fit(training)
+).fit(Xtr, ytr)
 print(
     f"Best hyper-parameters = {grid_search.best_params_}",
     f"Best MAE = {grid_search.best_score_}",
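For reference, a minimal sketch of the calling convention this commit adopts in the example: fit now takes the covariates and labels separately, and predict takes the raw test instances. The dataset name ('hp' reviews) and the PACC instance below are illustrative assumptions, not part of the diff:

    import quapy as qp
    from sklearn.linear_model import LogisticRegression
    from quapy.method.aggregative import PACC

    # illustrative data: a reviews dataset with tfidf features (assumption)
    data = qp.datasets.fetch_reviews('hp', tfidf=True, min_df=5)
    training, testing = data.train_test

    # new interface: pass covariates and labels explicitly
    Xtr, ytr = training.Xy
    pacc = PACC(LogisticRegression()).fit(Xtr, ytr)

    # class prevalence estimates for the unlabelled test instances
    estim_prevalences = pacc.predict(testing.instances)
    print(estim_prevalences)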

View File

@@ -27,7 +27,8 @@ AGGREGATIVE_METHODS = {
     aggregative.KDEyML,
     aggregative.KDEyCS,
     aggregative.KDEyHD,
-    confidence.BayesianCC
+    # aggregative.OneVsAllAggregative,
+    confidence.BayesianCC,
 }

 BINARY_METHODS = {

View File

@@ -1406,18 +1406,20 @@ class OneVsAllAggregative(OneVsAllGeneric, AggregativeQuantifier):
     `Gao and Sebastiani, 2016 <https://link.springer.com/content/pdf/10.1007/s13278-016-0327-z.pdf>`_.

     :param binary_quantifier: a quantifier (binary) that will be employed to work on multiclass model in a
-        one-vs-all manner
+        one-vs-all manner (default PACC(LogisticRegression()))
     :param n_jobs: number of parallel workers
     :param parallel_backend: the parallel backend for joblib (default "loky"); this is helpful for some quantifiers
         (e.g., ELM-based ones) that cannot be run with multiprocessing, since the temp dir they create during fit
         is removed and no longer available at predict time.
     """

-    def __init__(self, binary_quantifier, n_jobs=None, parallel_backend='multiprocessing'):
+    def __init__(self, binary_quantifier=None, n_jobs=None, parallel_backend='multiprocessing'):
+        if binary_quantifier is None:
+            binary_quantifier = PACC()
         assert isinstance(binary_quantifier, BaseQuantifier), \
-            f'{self.binary_quantifier} does not seem to be a Quantifier'
+            f'{binary_quantifier} does not seem to be a Quantifier'
         assert isinstance(binary_quantifier, AggregativeQuantifier), \
-            f'{self.binary_quantifier} does not seem to be of type Aggregative'
+            f'{binary_quantifier} does not seem to be of type Aggregative'
         self.binary_quantifier = binary_quantifier
         self.n_jobs = qp._get_njobs(n_jobs)
         self.parallel_backend = parallel_backend
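A brief usage sketch of the relaxed constructor (the classifier choice is an illustrative assumption): with no argument, the wrapper now falls back to a default PACC binary quantifier.

    from sklearn.linear_model import LogisticRegression
    from quapy.method.aggregative import OneVsAllAggregative, PACC

    # the binary quantifier can now be omitted...
    ova_default = OneVsAllAggregative()

    # ...or passed explicitly, as before
    ova_custom = OneVsAllAggregative(binary_quantifier=PACC(LogisticRegression()), n_jobs=-1)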

View File

@@ -1,27 +1,28 @@
 """This module allows the composition of quantification methods from loss functions and feature transformations. This functionality is realized through an integration of the qunfold package: https://github.com/mirkobunse/qunfold."""

-__install_istructions = """
+from dataclasses import dataclass
+from packaging.version import Version
+
+from .base import BaseQuantifier
+
+# what to display when an ImportError is thrown
+_IMPORT_ERROR_MESSAGE = """qunfold, the back-end of quapy.method.composable, is not properly installed.
+
 To fix this error, call:

     pip install --upgrade pip setuptools wheel
     pip install "jax[cpu]"
     pip install "qunfold @ git+https://github.com/mirkobunse/qunfold@v0.1.5"
 """
-__import_error_message = (
-    "qunfold, the back-end of quapy.method.composable, is not properly installed." + __install_istructions
-)
-__old_version_message = (
-    "The version of qunfold you have installed is not compatible with current quapy's version, "
-    "which requires qunfold>=0.1.5. " + __install_istructions
-)
-
-from packaging.version import Version

+# try to import members of qunfold as members of this module
 try:
     import qunfold
-    from qunfold.quapy import QuaPyWrapper
+    from qunfold.base import BaseMixin
+    from qunfold.methods import AbstractMethod
     from qunfold.sklearn import CVClassifier
     from qunfold import (
+        LinearMethod, # methods
         LeastSquaresLoss, # losses
         BlobelLoss,
         EnergyLoss,
@@ -29,37 +30,38 @@ try:
         CombinedLoss,
         TikhonovRegularization,
         TikhonovRegularized,
-        ClassTransformer, # transformers
-        HistogramTransformer,
-        DistanceTransformer,
-        KernelTransformer,
-        EnergyKernelTransformer,
-        LaplacianKernelTransformer,
-        GaussianKernelTransformer,
-        GaussianRFFKernelTransformer,
+        ClassRepresentation, # representations
+        HistogramRepresentation,
+        DistanceRepresentation,
+        KernelRepresentation,
+        EnergyKernelRepresentation,
+        LaplacianKernelRepresentation,
+        GaussianKernelRepresentation,
+        GaussianRFFKernelRepresentation,
     )
-    __all__ = [ # control public members, e.g., for auto-documentation in sphinx; omit QuaPyWrapper
-        "ComposableQuantifier",
-        "CVClassifier",
-        "LeastSquaresLoss",
-        "BlobelLoss",
-        "EnergyLoss",
-        "HellingerSurrogateLoss",
-        "CombinedLoss",
-        "TikhonovRegularization",
-        "TikhonovRegularized",
-        "ClassTransformer",
-        "HistogramTransformer",
-        "DistanceTransformer",
-        "KernelTransformer",
-        "EnergyKernelTransformer",
-        "LaplacianKernelTransformer",
-        "GaussianKernelTransformer",
-        "GaussianRFFKernelTransformer",
-    ]
 except ImportError as e:
-    raise ImportError(__import_error_message) from e
+    raise ImportError(_IMPORT_ERROR_MESSAGE) from e
+
+__all__ = [ # control public members, e.g., for auto-documentation in sphinx
+    "QUnfoldWrapper",
+    "ComposableQuantifier",
+    "CVClassifier",
+    "LeastSquaresLoss",
+    "BlobelLoss",
+    "EnergyLoss",
+    "HellingerSurrogateLoss",
+    "CombinedLoss",
+    "TikhonovRegularization",
+    "TikhonovRegularized",
+    "ClassRepresentation",
+    "HistogramRepresentation",
+    "DistanceRepresentation",
+    "KernelRepresentation",
+    "EnergyKernelRepresentation",
+    "LaplacianKernelRepresentation",
+    "GaussianKernelRepresentation",
+    "GaussianRFFKernelRepresentation",
+]


 def check_compatible_qunfold_version():
@@ -69,18 +71,54 @@ def check_compatible_qunfold_version():
         # versions of qunfold <= 0.1.4 did not declare __version__ in the __init__.py but only in the setup.py
         version_str = "0.1.4"
-    compatible = Version(version_str) >= Version("0.1.5")
+    installed_ver = Version(version_str)
+    required_ver = Version("0.1.5")
+    compatible = installed_ver.base_version == required_ver.base_version or installed_ver>=required_ver
     return compatible


-def ComposableQuantifier(loss, transformer, **kwargs):
+@dataclass
+class QUnfoldWrapper(BaseQuantifier,BaseMixin):
+    """A thin wrapper for using qunfold methods in QuaPy.
+
+    Args:
+        _method: An instance of `qunfold.methods.AbstractMethod` to wrap.
+
+    Examples:
+        Here, we wrap an instance of ACC to perform a grid search with QuaPy.
+
+        >>> from qunfold import ACC
+        >>> qunfold_method = QUnfoldWrapper(ACC(RandomForestClassifier(oob_score=True)))
+        >>> quapy.model_selection.GridSearchQ(
+        >>>     model = qunfold_method,
+        >>>     param_grid = { # try both splitting criteria
+        >>>         "representation__classifier__estimator__criterion": ["gini", "entropy"],
+        >>>     },
+        >>>     # ...
+        >>> )
+    """
+    _method: AbstractMethod
+
+    def fit(self, X, y):
+        self._method.fit(X, y)
+        return self
+
+    def predict(self, X):
+        return self._method.predict(X)
+
+    def set_params(self, **params):
+        self._method.set_params(**params)
+        return self
+
+    def get_params(self, deep=True):
+        return self._method.get_params(deep)
+
+    def __str__(self):
+        return self._method.__str__()
+
+
+def ComposableQuantifier(loss, representation, **kwargs):
     """A generic quantification / unfolding method that solves a linear system of equations.

     This class represents any quantifier that can be described in terms of a loss function, a feature transformation, and a regularization term. In this implementation, the loss is minimized through unconstrained second-order minimization. Valid probability estimates are ensured through a soft-max trick by Bunse (2022).

     Args:
         loss: An instance of a loss class from `quapy.methods.composable`.
-        transformer: An instance of a transformer class from `quapy.methods.composable`.
+        representation: An instance of a representation class from `quapy.methods.composable`.
         solver (optional): The `method` argument in `scipy.optimize.minimize`. Defaults to `"trust-ncg"`.
         solver_options (optional): The `options` argument in `scipy.optimize.minimize`. Defaults to `{"gtol": 1e-8, "maxiter": 1000}`.
         seed (optional): A random number generator seed from which a numpy RandomState is created. Defaults to `None`.
@@ -92,12 +130,12 @@ def ComposableQuantifier(loss, transformer, **kwargs):
     >>>   ComposableQuantifier,
     >>>   TikhonovRegularized,
     >>>   LeastSquaresLoss,
-    >>>   ClassTransformer,
+    >>>   ClassRepresentation,
     >>> )
     >>> from sklearn.ensemble import RandomForestClassifier
     >>> o_acc = ComposableQuantifier(
     >>>   TikhonovRegularized(LeastSquaresLoss(), 0.01),
-    >>>   ClassTransformer(RandomForestClassifier(oob_score=True))
+    >>>   ClassRepresentation(RandomForestClassifier(oob_score=True))
     >>> )

     Here, we perform hyper-parameter optimization with the ordinal ACC.
@@ -105,21 +143,18 @@ def ComposableQuantifier(loss, transformer, **kwargs):
     >>> quapy.model_selection.GridSearchQ(
     >>>     model = o_acc,
     >>>     param_grid = { # try both splitting criteria
-    >>>         "transformer__classifier__estimator__criterion": ["gini", "entropy"],
+    >>>         "representation__classifier__estimator__criterion": ["gini", "entropy"],
     >>>     },
     >>>     # ...
     >>> )

     To use a classifier that does not provide the `oob_score` argument, such as logistic regression, you have to configure a cross validation of this classifier. Here, we employ 10 cross validation folds. 5 folds are the default.

     >>> from quapy.method.composable import CVClassifier
     >>> from sklearn.linear_model import LogisticRegression
     >>> acc_lr = ComposableQuantifier(
     >>>     LeastSquaresLoss(),
-    >>>     ClassTransformer(CVClassifier(LogisticRegression(), 10))
+    >>>     ClassRepresentation(CVClassifier(LogisticRegression(), 10))
     >>> )
     """
-    if not check_compatible_qunfold_version():
-        raise ImportError(__old_version_message)
-
-    return QuaPyWrapper(qunfold.GenericMethod(loss, transformer, **kwargs))
+    return QUnfoldWrapper(LinearMethod(loss, representation, **kwargs))
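To see the renamed building blocks end to end, here is a minimal sketch that mirrors the ACC-like example from the docstring above; the dataset and classifier choices are illustrative assumptions, not part of the diff:

    import quapy as qp
    from sklearn.ensemble import RandomForestClassifier
    from quapy.method.composable import (
        ComposableQuantifier,
        LeastSquaresLoss,
        ClassRepresentation,
    )

    # illustrative data: a reviews dataset with tfidf features (assumption)
    data = qp.datasets.fetch_reviews('hp', tfidf=True, min_df=5)
    training, testing = data.train_test
    Xtr, ytr = training.Xy

    # ACC-like quantifier: least-squares loss on a classifier-based representation
    acc_like = ComposableQuantifier(
        LeastSquaresLoss(),
        ClassRepresentation(RandomForestClassifier(oob_score=True)),
    )
    acc_like.fit(Xtr, ytr)
    estim_prevalences = acc_like.predict(testing.instances)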

View File

@@ -15,8 +15,11 @@ class TestDatasets(unittest.TestCase):
         return PCC(LogisticRegression(C=0.001, max_iter=100))

     def _check_dataset(self, dataset):
+        train, test = dataset.reduce().train_test
         q = self.new_quantifier()
         print(f'testing method {q} in {dataset.name}...', end='')
+        if len(train)>500:
+            train = train.sampling(500)
         q.fit(*dataset.training.Xy)
         estim_prevalences = q.predict(dataset.test.instances)
         self.assertTrue(F.check_prevalence_vector(estim_prevalences))
@@ -42,7 +45,9 @@ class TestDatasets(unittest.TestCase):
             self._check_dataset(dataset)

     def test_twitter(self):
-        for dataset_name in TWITTER_SENTIMENT_DATASETS_TEST:
+        # all the datasets are contained in the same resource; if the first one
+        # works, there is no need to test for the rest
+        for dataset_name in TWITTER_SENTIMENT_DATASETS_TEST[:1]:
             print(f'loading dataset {dataset_name}...', end='')
             dataset = fetch_twitter(dataset_name, min_df=10)
             dataset.stats()
@@ -129,7 +134,7 @@ class TestDatasets(unittest.TestCase):
         n_classes = train.n_classes
         train = train.sampling(100, *F.uniform_prevalence(n_classes))
         q = self.new_quantifier()
-        q.fit(train)
+        q.fit(*train.Xy)
         self._check_samples(gen, q, max_samples_test=5)
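A condensed sketch of the reduced-data pattern these tests now follow (the dataset name below is an illustrative assumption):

    import quapy as qp
    import quapy.functional as F
    from sklearn.linear_model import LogisticRegression
    from quapy.method.aggregative import PCC

    dataset = qp.datasets.fetch_reviews('kindle', tfidf=True, min_df=10)
    train, test = dataset.reduce().train_test   # cap the dataset size to keep the test fast
    if len(train) > 500:
        train = train.sampling(500)             # further subsample the training set

    q = PCC(LogisticRegression(C=0.001, max_iter=100))
    q.fit(*train.Xy)                            # fit on covariates and labels
    estim_prevalences = q.predict(test.instances)
    assert F.check_prevalence_vector(estim_prevalences)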

View File

@@ -9,9 +9,8 @@ import inspect
 class HierarchyTestCase(unittest.TestCase):

     def test_aggregative(self):
-        lr = LogisticRegression()
         for m in AGGREGATIVE_METHODS:
-            self.assertEqual(isinstance(m(lr), AggregativeQuantifier), True)
+            self.assertEqual(isinstance(m(), AggregativeQuantifier), True)

     def test_inspect_aggregative(self):
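A standalone version of what the updated check asserts: every registered aggregative method should now be constructible with its default classifier. This mirrors the test above and needs no data:

    from quapy.method import AGGREGATIVE_METHODS
    from quapy.method.aggregative import AggregativeQuantifier

    # each method is expected to ship a sensible default classifier, so m() takes no arguments
    for m in AGGREGATIVE_METHODS:
        assert isinstance(m(), AggregativeQuantifier)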

View File

@@ -4,6 +4,7 @@ import unittest
 from sklearn.linear_model import LogisticRegression

 import quapy as qp
+from method.aggregative import OneVsAllAggregative
 from quapy.method.aggregative import ACC
 from quapy.method.meta import Ensemble
 from quapy.method import AGGREGATIVE_METHODS, BINARY_METHODS, NON_AGGREGATIVE_METHODS
@@ -16,21 +17,21 @@ from quapy.method.composable import (
     ComposableQuantifier,
     LeastSquaresLoss,
     HellingerSurrogateLoss,
-    ClassTransformer,
-    HistogramTransformer,
+    ClassRepresentation,
+    HistogramRepresentation,
     CVClassifier
 )

 COMPOSABLE_METHODS = [
     ComposableQuantifier( # ACC
         LeastSquaresLoss(),
-        ClassTransformer(CVClassifier(LogisticRegression()))
+        ClassRepresentation(CVClassifier(LogisticRegression()))
     ),
     ComposableQuantifier( # HDy
         HellingerSurrogateLoss(),
-        HistogramTransformer(
+        HistogramRepresentation(
             3, # 3 bins per class
-            preprocessor = ClassTransformer(CVClassifier(LogisticRegression()))
+            preprocessor = ClassRepresentation(CVClassifier(LogisticRegression()))
         )
     ),
 ]
@@ -113,7 +114,6 @@ class TestMethods(unittest.TestCase):
         self.assertTrue(check_prevalence_vector(estim_prevalences))

     def test_composable(self):
-        from packaging.version import Version
         if check_compatible_qunfold_version():
             for dataset in TestMethods.datasets:
                 for q in COMPOSABLE_METHODS:

View File

@@ -39,31 +39,30 @@ class ModselTestCase(unittest.TestCase):
         obtains the same optimal parameters
         """
-        q = PACC(LogisticRegression(random_state=1, max_iter=500))
-        data = qp.datasets.fetch_reviews('imdb', tfidf=True, min_df=50).reduce(n_train=500, random_state=1)
+        q = PACC(LogisticRegression(random_state=1, max_iter=3000))
+        data = qp.datasets.fetch_reviews('imdb', tfidf=True, min_df=50)
         training, validation = data.training.split_stratified(0.7, random_state=1)
-        param_grid = {'classifier__C': np.logspace(-3,3,7)}
+        param_grid = {'classifier__C': np.logspace(-3,3,7), 'classifier__class_weight': ['balanced', None]}
         app = APP(validation, sample_size=100, random_state=1)

-        print('starting model selection in sequential exploration')
-        tinit = time.time()
-        modsel = GridSearchQ(
-            q, param_grid, protocol=app, error='mae', refit=False, timeout=-1, n_jobs=1, verbose=True
-        ).fit(*training.Xy)
-        tend_seq = time.time()-tinit
-        best_c_seq = modsel.best_params_['classifier__C']
-        print(f'[done] took {tend_seq:.2f}s best C = {best_c_seq}')
+        def do_gridsearch(n_jobs):
+            print(f'starting model selection with n_jobs={n_jobs}')
+            t_init = time.time()
+            modsel = GridSearchQ(
+                q, param_grid, protocol=app, error='mae', refit=False, timeout=-1, n_jobs=n_jobs, verbose=True
+            ).fit(*training.Xy)
+            t_end = time.time()-t_init
+            best_c = modsel.best_params_['classifier__C']
+            print(f'[done] took {t_end:.2f}s best C = {best_c}')
+            return t_end, best_c

-        print('starting model selection in parallel exploration')
-        tinit = time.time()
-        modsel = GridSearchQ(
-            q, param_grid, protocol=app, error='mae', refit=False, timeout=-1, n_jobs=-1, verbose=True
-        ).fit(*training.Xy)
-        tend_par = time.time() - tinit
-        best_c_par = modsel.best_params_['classifier__C']
-        print(f'[done] took {tend_par:.2f}s best C = {best_c_par}')
+        tend_seq, best_c_seq = do_gridsearch(n_jobs=1)
+        tend_par, best_c_par = do_gridsearch(n_jobs=-1)
+
+        print(tend_seq, best_c_seq)
+        print(tend_par, best_c_par)

         self.assertEqual(best_c_seq, best_c_par)
         self.assertLess(tend_par, tend_seq)