going through examples, currently working on second one

Alejandro Moreo Fernandez 2025-06-15 14:57:40 +02:00
parent 5b7f7d4f70
commit 24a91b6e9b
7 changed files with 59 additions and 27 deletions

View File

@@ -37,7 +37,7 @@ classifier = LogisticRegression()
pacc = qp.method.aggregative.PACC(classifier)
print(f'training {pacc}')
-pacc.fit(train)
+pacc.fit(X, y)
# let's now test our quantifier on the test data (of course, we should not use the test labels y at this point, only X)
X_test = test.X

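For reference, a minimal end-to-end sketch of the updated basics example under the new fit(X, y) interface (same imdb dataset and PACC quantifier as above; the mae comparison at the end is added here only for illustration):

import quapy as qp
from sklearn.linear_model import LogisticRegression

# load the reviews dataset and unpack the training covariates X and labels y
train, test = qp.datasets.fetch_reviews('imdb', tfidf=True, min_df=5).train_test
X, y = train.Xy

# the quantifier is now trained on (X, y) instead of on a LabelledCollection
pacc = qp.method.aggregative.PACC(LogisticRegression())
pacc.fit(X, y)

# at test time only the covariates are used; the test labels serve for evaluation only
estim_prevalence = pacc.quantify(test.X)
true_prevalence = test.prevalence()
print(f'MAE = {qp.error.mae(true_prevalence, estim_prevalence):.4f}')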
View File

@@ -12,9 +12,11 @@ In this example, we show how to perform model selection on a DistributionMatchin
model = DMy()
qp.environ['SAMPLE_SIZE'] = 100
qp.environ['N_JOBS'] = -1
print(f'running model selection with N_JOBS={qp.environ["N_JOBS"]}; '
-f'to increase the number of jobs use:\n> N_JOBS=-1 python3 1.model_selection.py\n'
+f'to increase/decrease the number of jobs use:\n'
+f'> N_JOBS=-1 python3 1.model_selection.py\n'
f'alternatively, you can set this variable within the script as:\n'
f'import quapy as qp\n'
f'qp.environ["N_JOBS"]=-1')
@@ -50,6 +52,7 @@ with qp.util.temp_seed(0):
tinit = time()
+Xtr, ytr = training.Xy
model = qp.model_selection.GridSearchQ(
model=model,
param_grid=param_grid,
@@ -58,7 +61,7 @@ with qp.util.temp_seed(0):
refit=False, # retrain on the whole labelled set once done
# raise_errors=False,
verbose=True # show information as the process goes on
-).fit(training)
+).fit(Xtr, ytr)
tend = time()

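A condensed sketch of the refactored model-selection flow shown above; the hyperparameter grid is illustrative and not the grid explored by the actual script:

import numpy as np
import quapy as qp
from quapy.method.aggregative import DMy
from quapy.protocol import APP

qp.environ['SAMPLE_SIZE'] = 100

# split the labelled data: the validation part feeds the sample-generation protocol
train, test = qp.datasets.fetch_reviews('imdb', tfidf=True, min_df=5).train_test
training, validation = train.split_stratified(train_prop=0.7)
Xtr, ytr = training.Xy

model = qp.model_selection.GridSearchQ(
    model=DMy(),
    param_grid={'classifier__C': np.logspace(-3, 3, 7)},  # illustrative grid
    protocol=APP(validation),  # validation samples drawn at artificial prevalences
    error='mae',
    refit=False,
    verbose=True
).fit(Xtr, ytr)  # model selection itself now receives (X, y)

print(f'best hyper-parameters = {model.best_params_}')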
View File

@@ -4,6 +4,7 @@ from quapy.method.base import BinaryQuantifier, BaseQuantifier
from quapy.model_selection import GridSearchQ
from quapy.method.aggregative import AggregativeSoftQuantifier
from quapy.protocol import APP
+import quapy.functional as F
import numpy as np
from sklearn.linear_model import LogisticRegression
from time import time
@@ -32,10 +33,11 @@ class MyQuantifier(BaseQuantifier):
# in general, we would need to implement the method fit(self, data: LabelledCollection, fit_classifier=True,
# val_split=None); this would amount to:
-def fit(self, data: LabelledCollection):
-assert data.n_classes==2, \
+def fit(self, X, y):
+n_classes = F.num_classes_from_labels(y)
+assert n_classes==2, \
'this quantifier is only valid for binary problems [abort]'
-self.classifier.fit(*data.Xy)
+self.classifier.fit(X, y)
return self
# in general, we would need to implement the method quantify(self, instances); this would amount to:
@@ -57,6 +59,7 @@ class MyQuantifier(BaseQuantifier):
# of the method, now adhering to the AggregativeSoftQuantifier:
class MyAggregativeSoftQuantifier(AggregativeSoftQuantifier, BinaryQuantifier):
def __init__(self, classifier, alpha=0.5):
# aggregative quantifiers have an internal attribute called self.classifier
self.classifier = classifier
@@ -68,7 +71,7 @@ class MyAggregativeSoftQuantifier(AggregativeSoftQuantifier, BinaryQuantifier):
# k-fold cross validation strategy). What remains ahead is to learn an aggregation function. In our case
# this amounts to doing... nothing, since our method was pretty basic. BinaryQuantifier also adds some
# basic functionality for checking binary consistency.
-def aggregation_fit(self, classif_predictions: LabelledCollection, data: LabelledCollection):
+def aggregation_fit(self, classif_predictions, labels):
pass
# since this method is of type aggregative, we can simply implement the method aggregate (i.e., we should
@@ -94,7 +97,7 @@ if __name__ == '__main__':
train, test = qp.datasets.fetch_reviews('imdb', tfidf=True, min_df=5).train_test
train, val = train.split_stratified(train_prop=0.75) # let's create a validation set for optimizing hyperparams
-def test_implementation(quantifier):
+def try_implementation(quantifier):
class_name = quantifier.__class__.__name__
print(f'\ntesting implementation {class_name}...')
# model selection
@@ -104,7 +107,7 @@ if __name__ == '__main__':
'alpha': np.linspace(0, 1, 11), # quantifier-dependent hyperparameter
'classifier__C': np.logspace(-2, 2, 5) # classifier-dependent hyperparameter
}
-gridsearch = GridSearchQ(quantifier, param_grid, protocol=APP(val), n_jobs=-1, verbose=False).fit(train)
+gridsearch = GridSearchQ(quantifier, param_grid, protocol=APP(val), n_jobs=-1, verbose=True).fit(*train.Xy)
t_modsel = time() - tinit
print(f'\tmodel selection took {t_modsel:.2f}s', flush=True)
@@ -112,7 +115,7 @@ if __name__ == '__main__':
optimized_model = gridsearch.best_model_
mae = qp.evaluation.evaluate(
optimized_model,
-protocol=APP(test, repeats=5000, sanity_check=None), # disable the check, we want to generate many tests!
+protocol=APP(test, repeats=500, sanity_check=None), # disable the check, we want to generate many tests!
error_metric='mae',
verbose=True)
@@ -121,11 +124,11 @@ if __name__ == '__main__':
# define an instance of our custom quantifier and test it!
quantifier = MyQuantifier(LogisticRegression(), alpha=0.5)
-test_implementation(quantifier)
+try_implementation(quantifier)
# define an instance of our custom quantifier, with the second implementation, and test it!
quantifier = MyAggregativeSoftQuantifier(LogisticRegression(), alpha=0.5)
-test_implementation(quantifier)
+try_implementation(quantifier)
# the output should look like this:
"""

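As a usage sketch (not part of this commit), either of the two custom classes defined in this example can be driven through the same (X, y)-style interface:

from sklearn.linear_model import LogisticRegression
import quapy as qp

train, test = qp.datasets.fetch_reviews('imdb', tfidf=True, min_df=5).train_test

# MyQuantifier and MyAggregativeSoftQuantifier expose the same interface
quantifier = MyAggregativeSoftQuantifier(LogisticRegression(), alpha=0.5)
quantifier.fit(*train.Xy)                 # unpack the LabelledCollection into (X, y)

estim_prev = quantifier.quantify(test.X)  # only covariates are needed at inference time
print(f'estimated prevalence: {estim_prev}')
print(f'true prevalence     : {test.prevalence()}')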
View File

View File

@@ -318,6 +318,15 @@ class LabelledCollection:
classes = np.unique(labels).sort()
return LabelledCollection(instances, labels, classes=classes)
+@property
+def classes(self):
+"""
+Gets an array-like with the classes used in this collection
+:return: array-like
+"""
+return self.classes_
@property
def Xy(self):
"""

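A toy sketch of the new classes property (data invented for illustration):

import numpy as np
from quapy.data import LabelledCollection

# a tiny binary collection: 4 instances with 2 features each
X = np.asarray([[0.1, 0.9], [0.8, 0.2], [0.4, 0.6], [0.7, 0.3]])
y = np.asarray([1, 0, 1, 0])

collection = LabelledCollection(X, y)
print(collection.classes)    # -> [0 1], an alias for collection.classes_
print(collection.n_classes)  # -> 2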
View File

@@ -14,13 +14,22 @@ import numpy as np
def classes_from_labels(labels):
"""
Obtains a np.ndarray with the (sorted) classes
-:param labels:
-:return:
+:param labels: array-like with the instances' labels
+:return: a sorted np.ndarray with the class labels
"""
classes = np.unique(labels)
classes.sort()
return classes
+def num_classes_from_labels(labels):
+"""
+Obtains the number of classes from an array-like of instances' labels
+:param labels: array-like with the instances' labels
+:return: int, the number of classes
+"""
+return len(classes_from_labels(labels))
# ------------------------------------------------------------------------------------------
# Counter utils
# ------------------------------------------------------------------------------------------

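For illustration, the existing helper and the newly added one compose as follows (toy labels):

import quapy.functional as F

labels = ['neg', 'pos', 'pos', 'neg', 'neu']
print(F.classes_from_labels(labels))      # -> ['neg' 'neu' 'pos'], a sorted np.ndarray
print(F.num_classes_from_labels(labels))  # -> 3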
View File

@@ -109,7 +109,7 @@ class GridSearchQ(BaseQuantifier):
def job(cls_params):
model.set_params(**cls_params)
-predictions = model.classifier_fit_predict(self._training)
+predictions = model.classifier_fit_predict(self._training_X, self._training_y)
return predictions
predictions, status, took = self._error_handler(job, cls_params)
@@ -123,7 +123,8 @@ class GridSearchQ(BaseQuantifier):
def job(q_params):
model.set_params(**q_params)
-model.aggregation_fit(predictions, self._training)
+P, y = predictions
+model.aggregation_fit(P, y)
score = evaluation.evaluate(model, protocol=self.protocol, error_metric=self.error)
return score
@@ -136,7 +137,7 @@ class GridSearchQ(BaseQuantifier):
def job(params):
model.set_params(**params)
-model.fit(self._training)
+model.fit(self._training_X, self._training_y)
score = evaluation.evaluate(model, protocol=self.protocol, error_metric=self.error)
return score
@@ -159,17 +160,19 @@ class GridSearchQ(BaseQuantifier):
return False
return True
-def _compute_scores_aggregative(self, training):
+def _compute_scores_aggregative(self, X, y):
# break down the set of hyperparameters into two: classifier-specific, quantifier-specific
cls_configs, q_configs = group_params(self.param_grid)
# train all classifiers and get the predictions
-self._training = training
+self._training_X = X
+self._training_y = y
cls_outs = qp.util.parallel(
self._prepare_classifier,
cls_configs,
seed=qp.environ.get('_R_SEED', None),
-n_jobs=self.n_jobs
+n_jobs=self.n_jobs,
+asarray=False
)
# filter out classifier configurations that yielded any error
@@ -194,9 +197,10 @@ class GridSearchQ(BaseQuantifier):
return aggr_outs
-def _compute_scores_nonaggregative(self, training):
+def _compute_scores_nonaggregative(self, X, y):
configs = expand_grid(self.param_grid)
-self._training = training
+self._training_X = X
+self._training_y = y
scores = qp.util.parallel(
self._prepare_nonaggr_model,
configs,
@@ -211,11 +215,12 @@ class GridSearchQ(BaseQuantifier):
else:
self._sout(f'error={status}')
-def fit(self, training: LabelledCollection):
+def fit(self, X, y):
""" Learning routine. Fits methods with all combinations of hyperparameters and selects the one minimizing
the error metric.
-:param training: the training set on which to optimize the hyperparameters
+:param X: array-like, training covariates
+:param y: array-like, labels of training data
:return: self
"""
@@ -231,9 +236,9 @@ class GridSearchQ(BaseQuantifier):
self._sout(f'starting model selection with n_jobs={self.n_jobs}')
if self._break_down_fit():
-results = self._compute_scores_aggregative(training)
+results = self._compute_scores_aggregative(X, y)
else:
-results = self._compute_scores_nonaggregative(training)
+results = self._compute_scores_nonaggregative(X, y)
self.param_scores_ = {}
self.best_score_ = None
@@ -266,7 +271,10 @@ class GridSearchQ(BaseQuantifier):
if isinstance(self.protocol, OnLabelledCollectionProtocol):
tinit = time()
self._sout(f'refitting on the whole development set')
-self.best_model_.fit(training + self.protocol.get_labelled_collection())
+validation_collection = self.protocol.get_labelled_collection()
+training_collection = LabelledCollection(X, y, classes=validation_collection.classes)
+devel_collection = training_collection + validation_collection
+self.best_model_.fit(*devel_collection.Xy)
tend = time() - tinit
self.refit_time_ = tend
else:
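The refit branch above merges the raw training pair (X, y) with the protocol's validation collection before retraining; a toy sketch of that recombination (random data, names chosen for illustration):

import numpy as np
from quapy.data import LabelledCollection

# toy training data as plain arrays, plus a toy validation collection
X = np.random.rand(8, 2)
y = np.asarray([0, 1, 0, 1, 0, 1, 0, 1])
validation_collection = LabelledCollection(np.random.rand(4, 2), np.asarray([0, 1, 0, 1]))

# rebuild a LabelledCollection from (X, y), reusing the validation class order,
# and join both parts into the full development set used for refitting
training_collection = LabelledCollection(X, y, classes=validation_collection.classes)
devel_collection = training_collection + validation_collection
print(len(devel_collection), devel_collection.prevalence())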