merging and solving pytests

Alejandro Moreo Fernandez 2025-10-06 12:03:31 +02:00
parent dbda25b09a
commit 3847db3838
8 changed files with 131 additions and 89 deletions

View File

@@ -22,6 +22,7 @@ data = qp.data.preprocessing.text2tfidf(
     min_df = 5,
 )
 training, testing = data.train_test
+Xtr, ytr = training.Xy

 # We start by recovering PACC from its building blocks, a LeastSquaresLoss and
 # a probabilistic ClassRepresentation. A 5-fold cross-validation is implemented
@@ -46,7 +47,7 @@ pacc = ComposableQuantifier(
 # Let's evaluate this quantifier.
 print(f"Evaluating PACC: {pacc}")
-pacc.fit(training)
+pacc.fit(Xtr, ytr)
 app = qp.protocol.APP(testing, sample_size=100, n_prevalences=21, repeats=1)
 absolute_errors = qp.evaluation.evaluate(
     model = pacc,
@@ -70,7 +71,7 @@ model = ComposableQuantifier(
 )
 print(f"Evaluating {model}")
-model.fit(training)
+model.fit(Xtr, ytr)
 absolute_errors = qp.evaluation.evaluate(
     model = model,
     protocol = app, # use the same protocol for evaluation
@@ -125,7 +126,7 @@ grid_search = qp.model_selection.GridSearchQ(
     error = "mae",
     refit = False,
     verbose = True,
-).fit(training)
+).fit(Xtr, ytr)
 print(
     f"Best hyper-parameters = {grid_search.best_params_}",
     f"Best MAE = {grid_search.best_score_}",
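For reference, a minimal sketch of the calling convention this commit adopts in the example: fit now takes the covariates and labels separately, and predict takes the raw test instances. The dataset name ('hp' reviews) and the PACC instance below are illustrative assumptions, not part of the diff:

    import quapy as qp
    from sklearn.linear_model import LogisticRegression
    from quapy.method.aggregative import PACC

    # illustrative data: a reviews dataset with tfidf features (assumption)
    data = qp.datasets.fetch_reviews('hp', tfidf=True, min_df=5)
    training, testing = data.train_test

    # new interface: pass covariates and labels explicitly
    Xtr, ytr = training.Xy
    pacc = PACC(LogisticRegression()).fit(Xtr, ytr)

    # class prevalence estimates for the unlabelled test instances
    estim_prevalences = pacc.predict(testing.instances)
    print(estim_prevalences)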

View File

@@ -27,7 +27,8 @@ AGGREGATIVE_METHODS = {
     aggregative.KDEyML,
     aggregative.KDEyCS,
     aggregative.KDEyHD,
-    confidence.BayesianCC
+    # aggregative.OneVsAllAggregative,
+    confidence.BayesianCC,
 }

 BINARY_METHODS = {

View File

@@ -1406,18 +1406,20 @@ class OneVsAllAggregative(OneVsAllGeneric, AggregativeQuantifier):
     `Gao and Sebastiani, 2016 <https://link.springer.com/content/pdf/10.1007/s13278-016-0327-z.pdf>`_.

     :param binary_quantifier: a quantifier (binary) that will be employed to work on multiclass model in a
-        one-vs-all manner
+        one-vs-all manner (default PACC(LogisticRegression()))
     :param n_jobs: number of parallel workers
     :param parallel_backend: the parallel backend for joblib (default "loky"); this is helpful for some quantifiers
         (e.g., ELM-based ones) that cannot be run with multiprocessing, since the temp dir they create during fit
         is removed and no longer available at predict time.
     """

-    def __init__(self, binary_quantifier, n_jobs=None, parallel_backend='multiprocessing'):
+    def __init__(self, binary_quantifier=None, n_jobs=None, parallel_backend='multiprocessing'):
+        if binary_quantifier is None:
+            binary_quantifier = PACC()
         assert isinstance(binary_quantifier, BaseQuantifier), \
-            f'{self.binary_quantifier} does not seem to be a Quantifier'
+            f'{binary_quantifier} does not seem to be a Quantifier'
         assert isinstance(binary_quantifier, AggregativeQuantifier), \
-            f'{self.binary_quantifier} does not seem to be of type Aggregative'
+            f'{binary_quantifier} does not seem to be of type Aggregative'
         self.binary_quantifier = binary_quantifier
         self.n_jobs = qp._get_njobs(n_jobs)
         self.parallel_backend = parallel_backend
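A brief usage sketch of the relaxed constructor (the classifier choice is an illustrative assumption): with no argument, the wrapper now falls back to a default PACC binary quantifier.

    from sklearn.linear_model import LogisticRegression
    from quapy.method.aggregative import OneVsAllAggregative, PACC

    # the binary quantifier can now be omitted...
    ova_default = OneVsAllAggregative()

    # ...or passed explicitly, as before
    ova_custom = OneVsAllAggregative(binary_quantifier=PACC(LogisticRegression()), n_jobs=-1)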

View File

@@ -1,27 +1,28 @@
 """This module allows the composition of quantification methods from loss functions and feature transformations. This functionality is realized through an integration of the qunfold package: https://github.com/mirkobunse/qunfold."""

-__install_istructions = """
+from dataclasses import dataclass
+from packaging.version import Version
+
+from .base import BaseQuantifier
+
+# what to display when an ImportError is thrown
+_IMPORT_ERROR_MESSAGE = """qunfold, the back-end of quapy.method.composable, is not properly installed.
+
 To fix this error, call:

     pip install --upgrade pip setuptools wheel
     pip install "jax[cpu]"
     pip install "qunfold @ git+https://github.com/mirkobunse/qunfold@v0.1.5"
 """
-__import_error_message = (
-    "qunfold, the back-end of quapy.method.composable, is not properly installed." + __install_istructions
-)
-__old_version_message = (
-    "The version of qunfold you have installed is not compatible with current quapy's version, "
-    "which requires qunfold>=0.1.5. " + __install_istructions
-)
-
-from packaging.version import Version

+# try to import members of qunfold as members of this module
 try:
     import qunfold
-    from qunfold.quapy import QuaPyWrapper
+    from qunfold.base import BaseMixin
+    from qunfold.methods import AbstractMethod
     from qunfold.sklearn import CVClassifier
     from qunfold import (
+        LinearMethod, # methods
         LeastSquaresLoss, # losses
         BlobelLoss,
         EnergyLoss,
@@ -29,37 +30,38 @@ try:
         CombinedLoss,
         TikhonovRegularization,
         TikhonovRegularized,
-        ClassTransformer, # transformers
-        HistogramTransformer,
-        DistanceTransformer,
-        KernelTransformer,
-        EnergyKernelTransformer,
-        LaplacianKernelTransformer,
-        GaussianKernelTransformer,
-        GaussianRFFKernelTransformer,
+        ClassRepresentation, # representations
+        HistogramRepresentation,
+        DistanceRepresentation,
+        KernelRepresentation,
+        EnergyKernelRepresentation,
+        LaplacianKernelRepresentation,
+        GaussianKernelRepresentation,
+        GaussianRFFKernelRepresentation,
     )
-    __all__ = [ # control public members, e.g., for auto-documentation in sphinx; omit QuaPyWrapper
-        "ComposableQuantifier",
-        "CVClassifier",
-        "LeastSquaresLoss",
-        "BlobelLoss",
-        "EnergyLoss",
-        "HellingerSurrogateLoss",
-        "CombinedLoss",
-        "TikhonovRegularization",
-        "TikhonovRegularized",
-        "ClassTransformer",
-        "HistogramTransformer",
-        "DistanceTransformer",
-        "KernelTransformer",
-        "EnergyKernelTransformer",
-        "LaplacianKernelTransformer",
-        "GaussianKernelTransformer",
-        "GaussianRFFKernelTransformer",
-    ]
 except ImportError as e:
-    raise ImportError(__import_error_message) from e
+    raise ImportError(_IMPORT_ERROR_MESSAGE) from e
+
+__all__ = [ # control public members, e.g., for auto-documentation in sphinx
+    "QUnfoldWrapper",
+    "ComposableQuantifier",
+    "CVClassifier",
+    "LeastSquaresLoss",
+    "BlobelLoss",
+    "EnergyLoss",
+    "HellingerSurrogateLoss",
+    "CombinedLoss",
+    "TikhonovRegularization",
+    "TikhonovRegularized",
+    "ClassRepresentation",
+    "HistogramRepresentation",
+    "DistanceRepresentation",
+    "KernelRepresentation",
+    "EnergyKernelRepresentation",
+    "LaplacianKernelRepresentation",
+    "GaussianKernelRepresentation",
+    "GaussianRFFKernelRepresentation",
+]


 def check_compatible_qunfold_version():
@@ -69,18 +71,54 @@ def check_compatible_qunfold_version():
         # versions of qunfold <= 0.1.4 did not declare __version__ in the __init__.py but only in the setup.py
         version_str = "0.1.4"
-    compatible = Version(version_str) >= Version("0.1.5")
+    installed_ver = Version(version_str)
+    required_ver = Version("0.1.5")
+    compatible = installed_ver.base_version == required_ver.base_version or installed_ver>=required_ver
     return compatible


-def ComposableQuantifier(loss, transformer, **kwargs):
+@dataclass
+class QUnfoldWrapper(BaseQuantifier,BaseMixin):
+    """A thin wrapper for using qunfold methods in QuaPy.
+
+    Args:
+        _method: An instance of `qunfold.methods.AbstractMethod` to wrap.
+
+    Examples:
+        Here, we wrap an instance of ACC to perform a grid search with QuaPy.
+
+        >>> from qunfold import ACC
+        >>> qunfold_method = QUnfoldWrapper(ACC(RandomForestClassifier(oob_score=True)))
+        >>> quapy.model_selection.GridSearchQ(
+        >>>     model = qunfold_method,
+        >>>     param_grid = { # try both splitting criteria
+        >>>         "representation__classifier__estimator__criterion": ["gini", "entropy"],
+        >>>     },
+        >>>     # ...
+        >>> )
+    """
+    _method: AbstractMethod
+
+    def fit(self, X, y):
+        self._method.fit(X, y)
+        return self
+
+    def predict(self, X):
+        return self._method.predict(X)
+
+    def set_params(self, **params):
+        self._method.set_params(**params)
+        return self
+
+    def get_params(self, deep=True):
+        return self._method.get_params(deep)
+
+    def __str__(self):
+        return self._method.__str__()
+
+
+def ComposableQuantifier(loss, representation, **kwargs):
     """A generic quantification / unfolding method that solves a linear system of equations.

     This class represents any quantifier that can be described in terms of a loss function, a feature transformation, and a regularization term. In this implementation, the loss is minimized through unconstrained second-order minimization. Valid probability estimates are ensured through a soft-max trick by Bunse (2022).

     Args:
         loss: An instance of a loss class from `quapy.methods.composable`.
-        transformer: An instance of a transformer class from `quapy.methods.composable`.
+        representation: An instance of a representation class from `quapy.methods.composable`.
         solver (optional): The `method` argument in `scipy.optimize.minimize`. Defaults to `"trust-ncg"`.
         solver_options (optional): The `options` argument in `scipy.optimize.minimize`. Defaults to `{"gtol": 1e-8, "maxiter": 1000}`.
         seed (optional): A random number generator seed from which a numpy RandomState is created. Defaults to `None`.
@@ -92,12 +130,12 @@ def ComposableQuantifier(loss, transformer, **kwargs):
     >>>   ComposableQuantifier,
     >>>   TikhonovRegularized,
     >>>   LeastSquaresLoss,
-    >>>   ClassTransformer,
+    >>>   ClassRepresentation,
     >>> )
     >>> from sklearn.ensemble import RandomForestClassifier
     >>> o_acc = ComposableQuantifier(
     >>>   TikhonovRegularized(LeastSquaresLoss(), 0.01),
-    >>>   ClassTransformer(RandomForestClassifier(oob_score=True))
+    >>>   ClassRepresentation(RandomForestClassifier(oob_score=True))
     >>> )

     Here, we perform hyper-parameter optimization with the ordinal ACC.
@@ -105,21 +143,18 @@ def ComposableQuantifier(loss, transformer, **kwargs):
     >>> quapy.model_selection.GridSearchQ(
     >>>     model = o_acc,
     >>>     param_grid = { # try both splitting criteria
-    >>>         "transformer__classifier__estimator__criterion": ["gini", "entropy"],
+    >>>         "representation__classifier__estimator__criterion": ["gini", "entropy"],
     >>>     },
     >>>     # ...
     >>> )

     To use a classifier that does not provide the `oob_score` argument, such as logistic regression, you have to configure a cross validation of this classifier. Here, we employ 10 cross validation folds. 5 folds are the default.

     >>> from quapy.method.composable import CVClassifier
     >>> from sklearn.linear_model import LogisticRegression
     >>> acc_lr = ComposableQuantifier(
     >>>     LeastSquaresLoss(),
-    >>>     ClassTransformer(CVClassifier(LogisticRegression(), 10))
+    >>>     ClassRepresentation(CVClassifier(LogisticRegression(), 10))
     >>> )
     """
-    if not check_compatible_qunfold_version():
-        raise ImportError(__old_version_message)
-
-    return QuaPyWrapper(qunfold.GenericMethod(loss, transformer, **kwargs))
+    return QUnfoldWrapper(LinearMethod(loss, representation, **kwargs))
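To see the renamed building blocks end to end, here is a minimal sketch that mirrors the ACC-like example from the docstring above; the dataset and classifier choices are illustrative assumptions, not part of the diff:

    import quapy as qp
    from sklearn.ensemble import RandomForestClassifier
    from quapy.method.composable import (
        ComposableQuantifier,
        LeastSquaresLoss,
        ClassRepresentation,
    )

    # illustrative data: a reviews dataset with tfidf features (assumption)
    data = qp.datasets.fetch_reviews('hp', tfidf=True, min_df=5)
    training, testing = data.train_test
    Xtr, ytr = training.Xy

    # ACC-like quantifier: least-squares loss on a classifier-based representation
    acc_like = ComposableQuantifier(
        LeastSquaresLoss(),
        ClassRepresentation(RandomForestClassifier(oob_score=True)),
    )
    acc_like.fit(Xtr, ytr)
    estim_prevalences = acc_like.predict(testing.instances)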

View File

@@ -15,8 +15,11 @@ class TestDatasets(unittest.TestCase):
         return PCC(LogisticRegression(C=0.001, max_iter=100))

     def _check_dataset(self, dataset):
+        train, test = dataset.reduce().train_test
         q = self.new_quantifier()
         print(f'testing method {q} in {dataset.name}...', end='')
+        if len(train)>500:
+            train = train.sampling(500)
         q.fit(*dataset.training.Xy)
         estim_prevalences = q.predict(dataset.test.instances)
         self.assertTrue(F.check_prevalence_vector(estim_prevalences))
@@ -42,7 +45,9 @@ class TestDatasets(unittest.TestCase):
             self._check_dataset(dataset)

     def test_twitter(self):
-        for dataset_name in TWITTER_SENTIMENT_DATASETS_TEST:
+        # all the datasets are contained in the same resource; if the first one
+        # works, there is no need to test for the rest
+        for dataset_name in TWITTER_SENTIMENT_DATASETS_TEST[:1]:
             print(f'loading dataset {dataset_name}...', end='')
             dataset = fetch_twitter(dataset_name, min_df=10)
             dataset.stats()
@@ -129,7 +134,7 @@ class TestDatasets(unittest.TestCase):
         n_classes = train.n_classes
         train = train.sampling(100, *F.uniform_prevalence(n_classes))
         q = self.new_quantifier()
-        q.fit(train)
+        q.fit(*train.Xy)
         self._check_samples(gen, q, max_samples_test=5)
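A condensed sketch of the reduced-data pattern these tests now follow (the dataset name below is an illustrative assumption):

    import quapy as qp
    import quapy.functional as F
    from sklearn.linear_model import LogisticRegression
    from quapy.method.aggregative import PCC

    dataset = qp.datasets.fetch_reviews('kindle', tfidf=True, min_df=10)
    train, test = dataset.reduce().train_test   # cap the dataset size to keep the test fast
    if len(train) > 500:
        train = train.sampling(500)             # further subsample the training set

    q = PCC(LogisticRegression(C=0.001, max_iter=100))
    q.fit(*train.Xy)                            # fit on covariates and labels
    estim_prevalences = q.predict(test.instances)
    assert F.check_prevalence_vector(estim_prevalences)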

View File

@@ -9,9 +9,8 @@ import inspect
 class HierarchyTestCase(unittest.TestCase):

     def test_aggregative(self):
-        lr = LogisticRegression()
         for m in AGGREGATIVE_METHODS:
-            self.assertEqual(isinstance(m(lr), AggregativeQuantifier), True)
+            self.assertEqual(isinstance(m(), AggregativeQuantifier), True)

     def test_inspect_aggregative(self):
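A standalone version of what the updated check asserts: every registered aggregative method should now be constructible with its default classifier. This mirrors the test above and needs no data:

    from quapy.method import AGGREGATIVE_METHODS
    from quapy.method.aggregative import AggregativeQuantifier

    # each method is expected to ship a sensible default classifier, so m() takes no arguments
    for m in AGGREGATIVE_METHODS:
        assert isinstance(m(), AggregativeQuantifier)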

View File

@@ -4,6 +4,7 @@ import unittest
 from sklearn.linear_model import LogisticRegression

 import quapy as qp
+from method.aggregative import OneVsAllAggregative
 from quapy.method.aggregative import ACC
 from quapy.method.meta import Ensemble
 from quapy.method import AGGREGATIVE_METHODS, BINARY_METHODS, NON_AGGREGATIVE_METHODS
@@ -16,21 +17,21 @@ from quapy.method.composable import (
     ComposableQuantifier,
     LeastSquaresLoss,
     HellingerSurrogateLoss,
-    ClassTransformer,
-    HistogramTransformer,
+    ClassRepresentation,
+    HistogramRepresentation,
     CVClassifier
 )

 COMPOSABLE_METHODS = [
     ComposableQuantifier( # ACC
         LeastSquaresLoss(),
-        ClassTransformer(CVClassifier(LogisticRegression()))
+        ClassRepresentation(CVClassifier(LogisticRegression()))
     ),
     ComposableQuantifier( # HDy
         HellingerSurrogateLoss(),
-        HistogramTransformer(
+        HistogramRepresentation(
             3, # 3 bins per class
-            preprocessor = ClassTransformer(CVClassifier(LogisticRegression()))
+            preprocessor = ClassRepresentation(CVClassifier(LogisticRegression()))
         )
     ),
 ]
@@ -113,7 +114,6 @@ class TestMethods(unittest.TestCase):
         self.assertTrue(check_prevalence_vector(estim_prevalences))

     def test_composable(self):
-        from packaging.version import Version
         if check_compatible_qunfold_version():
             for dataset in TestMethods.datasets:
                 for q in COMPOSABLE_METHODS:

View File

@@ -39,31 +39,30 @@ class ModselTestCase(unittest.TestCase):
         obtains the same optimal parameters
         """
-        q = PACC(LogisticRegression(random_state=1, max_iter=500))
-        data = qp.datasets.fetch_reviews('imdb', tfidf=True, min_df=50).reduce(n_train=500, random_state=1)
+        q = PACC(LogisticRegression(random_state=1, max_iter=3000))
+        data = qp.datasets.fetch_reviews('imdb', tfidf=True, min_df=50)
         training, validation = data.training.split_stratified(0.7, random_state=1)
-        param_grid = {'classifier__C': np.logspace(-3,3,7)}
+        param_grid = {'classifier__C': np.logspace(-3,3,7), 'classifier__class_weight': ['balanced', None]}
         app = APP(validation, sample_size=100, random_state=1)

-        print('starting model selection in sequential exploration')
-        tinit = time.time()
-        modsel = GridSearchQ(
-            q, param_grid, protocol=app, error='mae', refit=False, timeout=-1, n_jobs=1, verbose=True
-        ).fit(*training.Xy)
-        tend_seq = time.time()-tinit
-        best_c_seq = modsel.best_params_['classifier__C']
-        print(f'[done] took {tend_seq:.2f}s best C = {best_c_seq}')
+        def do_gridsearch(n_jobs):
+            print(f'starting model selection with n_jobs={n_jobs}')
+            t_init = time.time()
+            modsel = GridSearchQ(
+                q, param_grid, protocol=app, error='mae', refit=False, timeout=-1, n_jobs=n_jobs, verbose=True
+            ).fit(*training.Xy)
+            t_end = time.time()-t_init
+            best_c = modsel.best_params_['classifier__C']
+            print(f'[done] took {t_end:.2f}s best C = {best_c}')
+            return t_end, best_c

-        print('starting model selection in parallel exploration')
-        tinit = time.time()
-        modsel = GridSearchQ(
-            q, param_grid, protocol=app, error='mae', refit=False, timeout=-1, n_jobs=-1, verbose=True
-        ).fit(*training.Xy)
-        tend_par = time.time() - tinit
-        best_c_par = modsel.best_params_['classifier__C']
-        print(f'[done] took {tend_par:.2f}s best C = {best_c_par}')
+        tend_seq, best_c_seq = do_gridsearch(n_jobs=1)
+        tend_par, best_c_par = do_gridsearch(n_jobs=-1)
+
+        print(tend_seq, best_c_seq)
+        print(tend_par, best_c_par)

         self.assertEqual(best_c_seq, best_c_par)
         self.assertLess(tend_par, tend_seq)