going through examples, currently working on second one

Alejandro Moreo Fernandez 2025-06-15 14:57:40 +02:00
parent 5b7f7d4f70
commit 24a91b6e9b
7 changed files with 59 additions and 27 deletions

View File

@@ -37,7 +37,7 @@ classifier = LogisticRegression()
 pacc = qp.method.aggregative.PACC(classifier)
 print(f'training {pacc}')
-pacc.fit(train)
+pacc.fit(X, y)
 # let's now test our quantifier on the test data (of course, we should not use the test labels y at this point, only X)
 X_test = test.X
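For context, a minimal sketch of the interface this hunk switches to (hedged: it assumes the new fit(X, y) signature introduced by this commit; the dataset and classifier choices are illustrative):

    import quapy as qp
    from sklearn.linear_model import LogisticRegression

    # load a sentiment dataset and split it into training and test collections
    train, test = qp.datasets.fetch_reviews('imdb', tfidf=True, min_df=5).train_test
    X, y = train.Xy                            # covariates and labels of the training collection

    pacc = qp.method.aggregative.PACC(LogisticRegression())
    pacc.fit(X, y)                             # new signature: covariates and labels instead of a LabelledCollection
    estim_prevalence = pacc.quantify(test.X)   # only the test covariates are used at prediction time
    print(estim_prevalence)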

View File

@@ -12,9 +12,11 @@ In this example, we show how to perform model selection on a DistributionMatchin
 model = DMy()
 qp.environ['SAMPLE_SIZE'] = 100
+qp.environ['N_JOBS'] = -1
 print(f'running model selection with N_JOBS={qp.environ["N_JOBS"]}; '
-      f'to increase the number of jobs use:\n> N_JOBS=-1 python3 1.model_selection.py\n'
+      f'to increase/decrease the number of jobs use:\n'
+      f'> N_JOBS=-1 python3 1.model_selection.py\n'
       f'alternatively, you can set this variable within the script as:\n'
       f'import quapy as qp\n'
       f'qp.environ["N_JOBS"]=-1')
@@ -50,6 +52,7 @@ with qp.util.temp_seed(0):
     tinit = time()
+    Xtr, ytr = training.Xy
     model = qp.model_selection.GridSearchQ(
         model=model,
         param_grid=param_grid,
@@ -58,7 +61,7 @@ with qp.util.temp_seed(0):
         refit=False,  # retrain on the whole labelled set once done
         # raise_errors=False,
         verbose=True  # show information as the process goes on
-    ).fit(training)
+    ).fit(Xtr, ytr)
     tend = time()
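Putting the pieces of this example together, a hedged sketch of model selection under the (X, y) interface (the grid values and the DMy/LogisticRegression choices are illustrative, not necessarily those used in the script):

    import numpy as np
    import quapy as qp
    from quapy.method.aggregative import DMy
    from quapy.protocol import APP
    from sklearn.linear_model import LogisticRegression

    qp.environ['SAMPLE_SIZE'] = 100
    qp.environ['N_JOBS'] = -1

    training, test = qp.datasets.fetch_reviews('imdb', tfidf=True, min_df=5).train_test
    training, validation = training.split_stratified(train_prop=0.75)

    param_grid = {
        'classifier__C': np.logspace(-3, 3, 7),  # classifier-dependent hyperparameter
        'nbins': [4, 8, 16, 32],                 # quantifier-dependent hyperparameter
    }

    Xtr, ytr = training.Xy
    model = qp.model_selection.GridSearchQ(
        model=DMy(LogisticRegression()),
        param_grid=param_grid,
        protocol=APP(validation),  # generates validation samples at widely varying prevalences
        error='mae',
        refit=False,
        verbose=True
    ).fit(Xtr, ytr)                # fit now receives covariates and labels separately
    print(f'best hyperparameters: {model.best_params_}')
    print(f'best MAE on validation: {model.best_score_:.4f}')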

View File

@@ -4,6 +4,7 @@ from quapy.method.base import BinaryQuantifier, BaseQuantifier
 from quapy.model_selection import GridSearchQ
 from quapy.method.aggregative import AggregativeSoftQuantifier
 from quapy.protocol import APP
+import quapy.functional as F
 import numpy as np
 from sklearn.linear_model import LogisticRegression
 from time import time
@@ -32,10 +33,11 @@ class MyQuantifier(BaseQuantifier):
     # in general, we would need to implement the method fit(self, data: LabelledCollection, fit_classifier=True,
     # val_split=None); this would amount to:
-    def fit(self, data: LabelledCollection):
-        assert data.n_classes==2, \
+    def fit(self, X, y):
+        n_classes = F.num_classes_from_labels(y)
+        assert n_classes==2, \
             'this quantifier is only valid for binary problems [abort]'
-        self.classifier.fit(*data.Xy)
+        self.classifier.fit(X, y)
         return self
     # in general, we would need to implement the method quantify(self, instances); this would amount to:
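As a companion to this hunk, a hedged skeleton of a custom quantifier written directly against the new (X, y) interface (the class name and the threshold-counting logic are illustrative, not part of the repository):

    import numpy as np
    import quapy.functional as F
    from quapy.method.base import BaseQuantifier
    from sklearn.linear_model import LogisticRegression

    class ThresholdCountQuantifier(BaseQuantifier):
        # toy binary quantifier: estimates the positive prevalence as the fraction of
        # instances whose posterior probability exceeds a fixed threshold
        def __init__(self, classifier, threshold=0.5):
            self.classifier = classifier
            self.threshold = threshold

        def fit(self, X, y):
            # the helper added by this commit recovers the number of classes from the labels
            assert F.num_classes_from_labels(y) == 2, 'only binary problems are supported'
            self.classifier.fit(X, y)
            return self

        def quantify(self, X):
            posteriors = self.classifier.predict_proba(X)[:, 1]
            prev_pos = float(np.mean(posteriors > self.threshold))
            return np.asarray([1 - prev_pos, prev_pos])

    quantifier = ThresholdCountQuantifier(LogisticRegression(), threshold=0.5)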
@@ -57,6 +59,7 @@ class MyQuantifier(BaseQuantifier):
 # of the method, now adhering to the AggregativeSoftQuantifier:
 class MyAggregativeSoftQuantifier(AggregativeSoftQuantifier, BinaryQuantifier):
     def __init__(self, classifier, alpha=0.5):
         # aggregative quantifiers have an internal attribute called self.classifier
         self.classifier = classifier
@@ -68,7 +71,7 @@ class MyAggregativeSoftQuantifier(AggregativeSoftQuantifier, BinaryQuantifier):
     # k-fold cross validation strategy). What remains ahead is to learn an aggregation function. In our case
     # this amounts to doing... nothing, since our method was pretty basic. BinaryQuantifier also add some
     # basic functionality for checking binary consistency.
-    def aggregation_fit(self, classif_predictions: LabelledCollection, data: LabelledCollection):
+    def aggregation_fit(self, classif_predictions, labels):
         pass
     # since this method is of type aggregative, we can simply implement the method aggregate (i.e., we should
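Reading the new signature together with the GridSearchQ change further below (P, y = predictions), classif_predictions now carries the classifier's posterior probabilities and labels the corresponding true labels. A hedged sketch of a complete aggregative variant under this interface (again with an illustrative name and a toy aggregation rule):

    import numpy as np
    from quapy.method.aggregative import AggregativeSoftQuantifier
    from quapy.method.base import BinaryQuantifier
    from sklearn.linear_model import LogisticRegression

    class ThresholdCountSoftQuantifier(AggregativeSoftQuantifier, BinaryQuantifier):
        def __init__(self, classifier, alpha=0.5):
            self.classifier = classifier   # trained by QuaPy's aggregative machinery
            self.alpha = alpha

        def aggregation_fit(self, classif_predictions, labels):
            # nothing to learn for this toy aggregation rule; posteriors and labels are
            # available here in case the aggregation step needed any calibration
            pass

        def aggregate(self, classif_predictions):
            # classif_predictions is an (n_instances, n_classes) array of posterior probabilities
            prev_pos = float(np.mean(classif_predictions[:, 1] > self.alpha))
            return np.asarray([1 - prev_pos, prev_pos])

    quantifier = ThresholdCountSoftQuantifier(LogisticRegression(), alpha=0.5)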
@@ -94,7 +97,7 @@ if __name__ == '__main__':
     train, test = qp.datasets.fetch_reviews('imdb', tfidf=True, min_df=5).train_test
     train, val = train.split_stratified(train_prop=0.75)  # let's create a validation set for optimizing hyperparams
-    def test_implementation(quantifier):
+    def try_implementation(quantifier):
         class_name = quantifier.__class__.__name__
         print(f'\ntesting implementation {class_name}...')
         # model selection
@@ -104,7 +107,7 @@ if __name__ == '__main__':
             'alpha': np.linspace(0, 1, 11),  # quantifier-dependent hyperparameter
             'classifier__C': np.logspace(-2, 2, 5)  # classifier-dependent hyperparameter
         }
-        gridsearch = GridSearchQ(quantifier, param_grid, protocol=APP(val), n_jobs=-1, verbose=False).fit(train)
+        gridsearch = GridSearchQ(quantifier, param_grid, protocol=APP(val), n_jobs=-1, verbose=True).fit(*train.Xy)
         t_modsel = time() - tinit
         print(f'\tmodel selection took {t_modsel:.2f}s', flush=True)
@@ -112,7 +115,7 @@ if __name__ == '__main__':
         optimized_model = gridsearch.best_model_
         mae = qp.evaluation.evaluate(
             optimized_model,
-            protocol=APP(test, repeats=5000, sanity_check=None),  # disable the check, we want to generate many tests!
+            protocol=APP(test, repeats=500, sanity_check=None),  # disable the check, we want to generate many tests!
             error_metric='mae',
             verbose=True)
@@ -121,11 +124,11 @@ if __name__ == '__main__':
     # define an instance of our custom quantifier and test it!
     quantifier = MyQuantifier(LogisticRegression(), alpha=0.5)
-    test_implementation(quantifier)
+    try_implementation(quantifier)
     # define an instance of our custom quantifier, with the second implementation, and test it!
     quantifier = MyAggregativeSoftQuantifier(LogisticRegression(), alpha=0.5)
-    test_implementation(quantifier)
+    try_implementation(quantifier)
     # the output should look like this:
     """

View File

View File

@@ -318,6 +318,15 @@ class LabelledCollection:
         classes = np.unique(labels).sort()
         return LabelledCollection(instances, labels, classes=classes)
+    @property
+    def classes(self):
+        """
+        Gets an array-like with the classes used in this collection
+        :return: array-like
+        """
+        return self.classes_
     @property
     def Xy(self):
         """

View File

@@ -14,13 +14,22 @@ import numpy as np
 def classes_from_labels(labels):
     """
     Obtains a np.ndarray with the (sorted) classes
-    :param labels:
-    :return:
+    :param labels: array-like with the instances' labels
+    :return: a sorted np.ndarray with the class labels
     """
     classes = np.unique(labels)
     classes.sort()
     return classes
+def num_classes_from_labels(labels):
+    """
+    Obtains the number of classes from an array-like of instance's labels
+    :param labels: array-like with the instances' labels
+    :return: int, the number of classes
+    """
+    return len(classes_from_labels(labels))
 # ------------------------------------------------------------------------------------------
 # Counter utils
 # ------------------------------------------------------------------------------------------
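A brief, hedged usage sketch of these two helpers (toy labels, illustrative only):

    import quapy.functional as F

    labels = ['pos', 'neg', 'neg', 'pos', 'neutral']
    print(F.classes_from_labels(labels))      # sorted classes, e.g. ['neg' 'neutral' 'pos']
    print(F.num_classes_from_labels(labels))  # 3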

View File

@@ -109,7 +109,7 @@ class GridSearchQ(BaseQuantifier):
         def job(cls_params):
             model.set_params(**cls_params)
-            predictions = model.classifier_fit_predict(self._training)
+            predictions = model.classifier_fit_predict(self._training_X, self._training_y)
             return predictions
         predictions, status, took = self._error_handler(job, cls_params)
@@ -123,7 +123,8 @@ class GridSearchQ(BaseQuantifier):
         def job(q_params):
             model.set_params(**q_params)
-            model.aggregation_fit(predictions, self._training)
+            P, y = predictions
+            model.aggregation_fit(P, y)
             score = evaluation.evaluate(model, protocol=self.protocol, error_metric=self.error)
             return score
@@ -136,7 +137,7 @@ class GridSearchQ(BaseQuantifier):
         def job(params):
             model.set_params(**params)
-            model.fit(self._training)
+            model.fit(self._training_X, self._training_y)
             score = evaluation.evaluate(model, protocol=self.protocol, error_metric=self.error)
             return score
@@ -159,17 +160,19 @@ class GridSearchQ(BaseQuantifier):
             return False
         return True
-    def _compute_scores_aggregative(self, training):
+    def _compute_scores_aggregative(self, X, y):
         # break down the set of hyperparameters into two: classifier-specific, quantifier-specific
         cls_configs, q_configs = group_params(self.param_grid)
         # train all classifiers and get the predictions
-        self._training = training
+        self._training_X = X
+        self._training_y = y
         cls_outs = qp.util.parallel(
             self._prepare_classifier,
             cls_configs,
             seed=qp.environ.get('_R_SEED', None),
-            n_jobs=self.n_jobs
+            n_jobs=self.n_jobs,
+            asarray=False
         )
         # filter out classifier configurations that yielded any error
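To make the classifier/quantifier split above concrete, a hedged illustration of how a parameter grid is grouped (it assumes group_params and expand_grid are the module-level helpers used in this file and that keys prefixed with 'classifier__' are routed to the classifier; the grid values are illustrative):

    from quapy.model_selection import expand_grid, group_params

    param_grid = {
        'classifier__C': [0.1, 1, 10],   # classifier-specific: each value trained once and reused
        'alpha': [0.25, 0.5, 0.75],      # quantifier-specific: evaluated on top of each trained classifier
    }

    cls_configs, q_configs = group_params(param_grid)
    print(cls_configs)              # expected: [{'classifier__C': 0.1}, {'classifier__C': 1}, {'classifier__C': 10}]
    print(q_configs)                # expected: [{'alpha': 0.25}, {'alpha': 0.5}, {'alpha': 0.75}]
    print(expand_grid(param_grid))  # the full cartesian product, as used in the non-aggregative path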
@@ -194,9 +197,10 @@ class GridSearchQ(BaseQuantifier):
         return aggr_outs
-    def _compute_scores_nonaggregative(self, training):
+    def _compute_scores_nonaggregative(self, X, y):
         configs = expand_grid(self.param_grid)
-        self._training = training
+        self._training_X = X
+        self._training_y = y
         scores = qp.util.parallel(
             self._prepare_nonaggr_model,
             configs,
@@ -211,11 +215,12 @@ class GridSearchQ(BaseQuantifier):
         else:
             self._sout(f'error={status}')
-    def fit(self, training: LabelledCollection):
+    def fit(self, X, y):
         """ Learning routine. Fits methods with all combinations of hyperparameters and selects the one minimizing
         the error metric.
-        :param training: the training set on which to optimize the hyperparameters
+        :param X: array-like, training covariates
+        :param y: array-like, labels of training data
         :return: self
         """
@@ -231,9 +236,9 @@ class GridSearchQ(BaseQuantifier):
         self._sout(f'starting model selection with n_jobs={self.n_jobs}')
         if self._break_down_fit():
-            results = self._compute_scores_aggregative(training)
+            results = self._compute_scores_aggregative(X, y)
         else:
-            results = self._compute_scores_nonaggregative(training)
+            results = self._compute_scores_nonaggregative(X, y)
         self.param_scores_ = {}
         self.best_score_ = None
@@ -266,7 +271,10 @@ class GridSearchQ(BaseQuantifier):
             if isinstance(self.protocol, OnLabelledCollectionProtocol):
                 tinit = time()
                 self._sout(f'refitting on the whole development set')
-                self.best_model_.fit(training + self.protocol.get_labelled_collection())
+                validation_collection = self.protocol.get_labelled_collection()
+                training_collection = LabelledCollection(X, y, classes=validation_collection.classes)
+                devel_collection = training_collection + validation_collection
+                self.best_model_.fit(*devel_collection.Xy)
                 tend = time() - tinit
                 self.refit_time_ = tend
             else: