diff --git a/examples/15.composable_methods.py b/examples/15.composable_methods.py index e8340d4..df3b34c 100644 --- a/examples/15.composable_methods.py +++ b/examples/15.composable_methods.py @@ -22,6 +22,7 @@ data = qp.data.preprocessing.text2tfidf( min_df = 5, ) training, testing = data.train_test +Xtr, ytr = training.Xy # We start by recovering PACC from its building blocks, a LeastSquaresLoss and # a probabilistic ClassRepresentation. A 5-fold cross-validation is implemented @@ -46,7 +47,7 @@ pacc = ComposableQuantifier( # Let's evaluate this quantifier. print(f"Evaluating PACC: {pacc}") -pacc.fit(training) +pacc.fit(Xtr, ytr) app = qp.protocol.APP(testing, sample_size=100, n_prevalences=21, repeats=1) absolute_errors = qp.evaluation.evaluate( model = pacc, @@ -70,7 +71,7 @@ model = ComposableQuantifier( ) print(f"Evaluating {model}") -model.fit(training) +model.fit(Xtr, ytr) absolute_errors = qp.evaluation.evaluate( model = model, protocol = app, # use the same protocol for evaluation @@ -125,7 +126,7 @@ grid_search = qp.model_selection.GridSearchQ( error = "mae", refit = False, verbose = True, -).fit(training) +).fit(Xtr, ytr) print( f"Best hyper-parameters = {grid_search.best_params_}", f"Best MAE = {grid_search.best_score_}", diff --git a/quapy/method/__init__.py b/quapy/method/__init__.py index b95cb24..4c2ec1c 100644 --- a/quapy/method/__init__.py +++ b/quapy/method/__init__.py @@ -27,7 +27,8 @@ AGGREGATIVE_METHODS = { aggregative.KDEyML, aggregative.KDEyCS, aggregative.KDEyHD, - confidence.BayesianCC + # aggregative.OneVsAllAggregative, + confidence.BayesianCC, } BINARY_METHODS = { diff --git a/quapy/method/aggregative.py b/quapy/method/aggregative.py index aa4d816..25fc1ef 100644 --- a/quapy/method/aggregative.py +++ b/quapy/method/aggregative.py @@ -1406,18 +1406,20 @@ class OneVsAllAggregative(OneVsAllGeneric, AggregativeQuantifier): `Gao and Sebastiani, 2016 `_. 
:param binary_quantifier: a quantifier (binary) that will be employed to work on multiclass model in a - one-vs-all manner + one-vs-all manner (default PACC(LogisticRegression())) :param n_jobs: number of parallel workers :param parallel_backend: the parallel backend for joblib (default "loky"); this is helpful for some quantifiers (e.g., ELM-based ones) that cannot be run with multiprocessing, since the temp dir they create during fit will is removed and no longer available at predict time. """ - def __init__(self, binary_quantifier, n_jobs=None, parallel_backend='multiprocessing'): + def __init__(self, binary_quantifier=None, n_jobs=None, parallel_backend='multiprocessing'): + if binary_quantifier is None: + binary_quantifier = PACC() assert isinstance(binary_quantifier, BaseQuantifier), \ - f'{self.binary_quantifier} does not seem to be a Quantifier' + f'{binary_quantifier} does not seem to be a Quantifier' assert isinstance(binary_quantifier, AggregativeQuantifier), \ - f'{self.binary_quantifier} does not seem to be of type Aggregative' + f'{binary_quantifier} does not seem to be of type Aggregative' self.binary_quantifier = binary_quantifier self.n_jobs = qp._get_njobs(n_jobs) self.parallel_backend = parallel_backend diff --git a/quapy/method/composable.py b/quapy/method/composable.py index 3aacab6..c40e3bb 100644 --- a/quapy/method/composable.py +++ b/quapy/method/composable.py @@ -1,27 +1,28 @@ """This module allows the composition of quantification methods from loss functions and feature transformations. This functionality is realized through an integration of the qunfold package: https://github.com/mirkobunse/qunfold.""" -__install_istructions = """ +from dataclasses import dataclass +from packaging.version import Version + +from .base import BaseQuantifier + +# what to display when an ImportError is thrown +_IMPORT_ERROR_MESSAGE = """qunfold, the back-end of quapy.method.composable, is not properly installed. 
+ To fix this error, call: pip install --upgrade pip setuptools wheel pip install "jax[cpu]" pip install "qunfold @ git+https://github.com/mirkobunse/qunfold@v0.1.5" """ -__import_error_message = ( - "qunfold, the back-end of quapy.method.composable, is not properly installed." + __install_istructions -) -__old_version_message = ( - "The version of qunfold you have installed is not compatible with current quapy's version, " - "which requires qunfold>=0.1.5. " + __install_istructions -) - -from packaging.version import Version +# try to import members of qunfold as members of this module try: import qunfold - from qunfold.quapy import QuaPyWrapper + from qunfold.base import BaseMixin + from qunfold.methods import AbstractMethod from qunfold.sklearn import CVClassifier from qunfold import ( + LinearMethod, # methods LeastSquaresLoss, # losses BlobelLoss, EnergyLoss, @@ -29,37 +30,38 @@ try: CombinedLoss, TikhonovRegularization, TikhonovRegularized, - ClassTransformer, # transformers - HistogramTransformer, - DistanceTransformer, - KernelTransformer, - EnergyKernelTransformer, - LaplacianKernelTransformer, - GaussianKernelTransformer, - GaussianRFFKernelTransformer, + ClassRepresentation, # representations + HistogramRepresentation, + DistanceRepresentation, + KernelRepresentation, + EnergyKernelRepresentation, + LaplacianKernelRepresentation, + GaussianKernelRepresentation, + GaussianRFFKernelRepresentation, ) - - __all__ = [ # control public members, e.g., for auto-documentation in sphinx; omit QuaPyWrapper - "ComposableQuantifier", - "CVClassifier", - "LeastSquaresLoss", - "BlobelLoss", - "EnergyLoss", - "HellingerSurrogateLoss", - "CombinedLoss", - "TikhonovRegularization", - "TikhonovRegularized", - "ClassTransformer", - "HistogramTransformer", - "DistanceTransformer", - "KernelTransformer", - "EnergyKernelTransformer", - "LaplacianKernelTransformer", - "GaussianKernelTransformer", - "GaussianRFFKernelTransformer", - ] except ImportError as e: - raise 
ImportError(__import_error_message) from e + raise ImportError(_IMPORT_ERROR_MESSAGE) from e + +__all__ = [ # control public members, e.g., for auto-documentation in sphinx + "QUnfoldWrapper", + "ComposableQuantifier", + "CVClassifier", + "LeastSquaresLoss", + "BlobelLoss", + "EnergyLoss", + "HellingerSurrogateLoss", + "CombinedLoss", + "TikhonovRegularization", + "TikhonovRegularized", + "ClassRepresentation", + "HistogramRepresentation", + "DistanceRepresentation", + "KernelRepresentation", + "EnergyKernelRepresentation", + "LaplacianKernelRepresentation", + "GaussianKernelRepresentation", + "GaussianRFFKernelRepresentation", +] def check_compatible_qunfold_version(): @@ -69,18 +71,54 @@ def check_compatible_qunfold_version(): # versions of qunfold <= 0.1.4 did not declare __version__ in the __init__.py but only in the setup.py version_str = "0.1.4" - compatible = Version(version_str) >= Version("0.1.5") + installed_ver = Version(version_str) + required_ver = Version("0.1.5") + compatible = installed_ver.base_version == required_ver.base_version or installed_ver>=required_ver return compatible -def ComposableQuantifier(loss, transformer, **kwargs): +@dataclass +class QUnfoldWrapper(BaseQuantifier,BaseMixin): + """A thin wrapper for using qunfold methods in QuaPy. + + Args: + _method: An instance of `qunfold.methods.AbstractMethod` to wrap. + + Examples: + Here, we wrap an instance of ACC to perform a grid search with QuaPy. + + >>> from qunfold import ACC + >>> qunfold_method = QUnfoldWrapper(ACC(RandomForestClassifier(oob_score=True))) + >>> quapy.model_selection.GridSearchQ( + >>> model = qunfold_method, + >>> param_grid = { # try both splitting criteria + >>> "representation__classifier__estimator__criterion": ["gini", "entropy"], + >>> }, + >>> # ... 
+ >>> ) + """ + _method: AbstractMethod + def fit(self, X, y): # X, y: training instances and labels (e.g., from LabelledCollection.Xy) + self._method.fit(X, y) + return self + def predict(self, X): + return self._method.predict(X) + def set_params(self, **params): + self._method.set_params(**params) + return self + def get_params(self, deep=True): + return self._method.get_params(deep) + def __str__(self): + return self._method.__str__() + +def ComposableQuantifier(loss, representation, **kwargs): """A generic quantification / unfolding method that solves a linear system of equations. This class represents any quantifier that can be described in terms of a loss function, a feature transformation, and a regularization term. In this implementation, the loss is minimized through unconstrained second-order minimization. Valid probability estimates are ensured through a soft-max trick by Bunse (2022). Args: loss: An instance of a loss class from `quapy.methods.composable`. - transformer: An instance of a transformer class from `quapy.methods.composable`. + representation: An instance of a representation class from `quapy.methods.composable`. solver (optional): The `method` argument in `scipy.optimize.minimize`. Defaults to `"trust-ncg"`. solver_options (optional): The `options` argument in `scipy.optimize.minimize`. Defaults to `{"gtol": 1e-8, "maxiter": 1000}`. seed (optional): A random number generator seed from which a numpy RandomState is created. Defaults to `None`. 
@@ -92,12 +130,12 @@ def ComposableQuantifier(loss, transformer, **kwargs): >>> ComposableQuantifier, >>> TikhonovRegularized, >>> LeastSquaresLoss, - >>> ClassTransformer, + >>> ClassRepresentation, >>> ) >>> from sklearn.ensemble import RandomForestClassifier >>> o_acc = ComposableQuantifier( >>> TikhonovRegularized(LeastSquaresLoss(), 0.01), - >>> ClassTransformer(RandomForestClassifier(oob_score=True)) + >>> ClassRepresentation(RandomForestClassifier(oob_score=True)) >>> ) Here, we perform hyper-parameter optimization with the ordinal ACC. @@ -105,21 +143,18 @@ def ComposableQuantifier(loss, transformer, **kwargs): >>> quapy.model_selection.GridSearchQ( >>> model = o_acc, >>> param_grid = { # try both splitting criteria - >>> "transformer__classifier__estimator__criterion": ["gini", "entropy"], + >>> "representation__classifier__estimator__criterion": ["gini", "entropy"], >>> }, >>> # ... >>> ) - + To use a classifier that does not provide the `oob_score` argument, such as logistic regression, you have to configure a cross validation of this classifier. Here, we employ 10 cross validation folds. 5 folds are the default. 
>>> from quapy.method.composable import CVClassifier >>> from sklearn.linear_model import LogisticRegression >>> acc_lr = ComposableQuantifier( >>> LeastSquaresLoss(), - >>> ClassTransformer(CVClassifier(LogisticRegression(), 10)) + >>> ClassRepresentation(CVClassifier(LogisticRegression(), 10)) >>> ) """ - if not check_compatible_qunfold_version(): - raise ImportError(__old_version_message) - - return QuaPyWrapper(qunfold.GenericMethod(loss, transformer, **kwargs)) + return QUnfoldWrapper(LinearMethod(loss, representation, **kwargs)) \ No newline at end of file diff --git a/quapy/tests/test_datasets.py b/quapy/tests/test_datasets.py index cc09f16..de5f61a 100644 --- a/quapy/tests/test_datasets.py +++ b/quapy/tests/test_datasets.py @@ -15,8 +15,11 @@ class TestDatasets(unittest.TestCase): return PCC(LogisticRegression(C=0.001, max_iter=100)) def _check_dataset(self, dataset): + train, test = dataset.reduce().train_test q = self.new_quantifier() print(f'testing method {q} in {dataset.name}...', end='') + if len(train)>500: + train = train.sampling(500) q.fit(*dataset.training.Xy) estim_prevalences = q.predict(dataset.test.instances) self.assertTrue(F.check_prevalence_vector(estim_prevalences)) @@ -42,7 +45,9 @@ class TestDatasets(unittest.TestCase): self._check_dataset(dataset) def test_twitter(self): - for dataset_name in TWITTER_SENTIMENT_DATASETS_TEST: + # all the datasets are contained in the same resource; if the first one + # works, there is no need to test for the rest + for dataset_name in TWITTER_SENTIMENT_DATASETS_TEST[:1]: print(f'loading dataset {dataset_name}...', end='') dataset = fetch_twitter(dataset_name, min_df=10) dataset.stats() @@ -129,7 +134,7 @@ class TestDatasets(unittest.TestCase): n_classes = train.n_classes train = train.sampling(100, *F.uniform_prevalence(n_classes)) q = self.new_quantifier() - q.fit(train) + q.fit(*train.Xy) self._check_samples(gen, q, max_samples_test=5) diff --git a/quapy/tests/test_hierarchy.py 
b/quapy/tests/test_hierarchy.py index 0cf9b9b..450e3cf 100644 --- a/quapy/tests/test_hierarchy.py +++ b/quapy/tests/test_hierarchy.py @@ -9,9 +9,8 @@ import inspect class HierarchyTestCase(unittest.TestCase): def test_aggregative(self): - lr = LogisticRegression() for m in AGGREGATIVE_METHODS: - self.assertEqual(isinstance(m(lr), AggregativeQuantifier), True) + self.assertEqual(isinstance(m(), AggregativeQuantifier), True) def test_inspect_aggregative(self): diff --git a/quapy/tests/test_methods.py b/quapy/tests/test_methods.py index 533bf1a..d8f8da7 100644 --- a/quapy/tests/test_methods.py +++ b/quapy/tests/test_methods.py @@ -4,6 +4,7 @@ import unittest from sklearn.linear_model import LogisticRegression import quapy as qp +from method.aggregative import OneVsAllAggregative from quapy.method.aggregative import ACC from quapy.method.meta import Ensemble from quapy.method import AGGREGATIVE_METHODS, BINARY_METHODS, NON_AGGREGATIVE_METHODS @@ -16,21 +17,21 @@ from quapy.method.composable import ( ComposableQuantifier, LeastSquaresLoss, HellingerSurrogateLoss, - ClassTransformer, - HistogramTransformer, + ClassRepresentation, + HistogramRepresentation, CVClassifier ) COMPOSABLE_METHODS = [ ComposableQuantifier( # ACC LeastSquaresLoss(), - ClassTransformer(CVClassifier(LogisticRegression())) + ClassRepresentation(CVClassifier(LogisticRegression())) ), ComposableQuantifier( # HDy HellingerSurrogateLoss(), - HistogramTransformer( + HistogramRepresentation( 3, # 3 bins per class - preprocessor = ClassTransformer(CVClassifier(LogisticRegression())) + preprocessor = ClassRepresentation(CVClassifier(LogisticRegression())) ) ), ] @@ -113,7 +114,6 @@ class TestMethods(unittest.TestCase): self.assertTrue(check_prevalence_vector(estim_prevalences)) def test_composable(self): - from packaging.version import Version if check_compatible_qunfold_version(): for dataset in TestMethods.datasets: for q in COMPOSABLE_METHODS: diff --git a/quapy/tests/test_modsel.py 
b/quapy/tests/test_modsel.py index c13b665..6423b4e 100644 --- a/quapy/tests/test_modsel.py +++ b/quapy/tests/test_modsel.py @@ -39,31 +39,30 @@ class ModselTestCase(unittest.TestCase): obtains the same optimal parameters """ - q = PACC(LogisticRegression(random_state=1, max_iter=500)) + q = PACC(LogisticRegression(random_state=1, max_iter=3000)) - data = qp.datasets.fetch_reviews('imdb', tfidf=True, min_df=50).reduce(n_train=500, random_state=1) + data = qp.datasets.fetch_reviews('imdb', tfidf=True, min_df=50) training, validation = data.training.split_stratified(0.7, random_state=1) - param_grid = {'classifier__C': np.logspace(-3,3,7)} + param_grid = {'classifier__C': np.logspace(-3,3,7), 'classifier__class_weight': ['balanced', None]} app = APP(validation, sample_size=100, random_state=1) - print('starting model selection in sequential exploration') - tinit = time.time() - modsel = GridSearchQ( - q, param_grid, protocol=app, error='mae', refit=False, timeout=-1, n_jobs=1, verbose=True - ).fit(*training.Xy) - tend_seq = time.time()-tinit - best_c_seq = modsel.best_params_['classifier__C'] - print(f'[done] took {tend_seq:.2f}s best C = {best_c_seq}') + def do_gridsearch(n_jobs): + print('starting model selection in sequential exploration') + t_init = time.time() + modsel = GridSearchQ( + q, param_grid, protocol=app, error='mae', refit=False, timeout=-1, n_jobs=n_jobs, verbose=True + ).fit(*training.Xy) + t_end = time.time()-t_init + best_c = modsel.best_params_['classifier__C'] + print(f'[done] took {t_end:.2f}s best C = {best_c}') + return t_end, best_c - print('starting model selection in parallel exploration') - tinit = time.time() - modsel = GridSearchQ( - q, param_grid, protocol=app, error='mae', refit=False, timeout=-1, n_jobs=-1, verbose=True - ).fit(*training.Xy) - tend_par = time.time() - tinit - best_c_par = modsel.best_params_['classifier__C'] - print(f'[done] took {tend_par:.2f}s best C = {best_c_par}') + tend_seq, best_c_seq = do_gridsearch(n_jobs=1) 
+ tend_par, best_c_par = do_gridsearch(n_jobs=-1) + + print(tend_seq, best_c_seq) + print(tend_par, best_c_par) self.assertEqual(best_c_seq, best_c_par) self.assertLess(tend_par, tend_seq)