going through examples, currently working on second one
This commit is contained in:
parent
5b7f7d4f70
commit
24a91b6e9b
|
|
@ -37,7 +37,7 @@ classifier = LogisticRegression()
|
|||
pacc = qp.method.aggregative.PACC(classifier)
|
||||
|
||||
print(f'training {pacc}')
|
||||
pacc.fit(train)
|
||||
pacc.fit(X, y)
|
||||
|
||||
# let's now test our quantifier on the test data (of course, we should not use the test labels y at this point, only X)
|
||||
X_test = test.X
|
||||
|
|
|
|||
|
|
@ -12,9 +12,11 @@ In this example, we show how to perform model selection on a DistributionMatchin
|
|||
model = DMy()
|
||||
|
||||
qp.environ['SAMPLE_SIZE'] = 100
|
||||
qp.environ['N_JOBS'] = -1
|
||||
|
||||
print(f'running model selection with N_JOBS={qp.environ["N_JOBS"]}; '
|
||||
f'to increase the number of jobs use:\n> N_JOBS=-1 python3 1.model_selection.py\n'
|
||||
f'to increase/decrease the number of jobs use:\n'
|
||||
f'> N_JOBS=-1 python3 1.model_selection.py\n'
|
||||
f'alternatively, you can set this variable within the script as:\n'
|
||||
f'import quapy as qp\n'
|
||||
f'qp.environ["N_JOBS"]=-1')
|
||||
|
|
@ -50,6 +52,7 @@ with qp.util.temp_seed(0):
|
|||
|
||||
tinit = time()
|
||||
|
||||
Xtr, ytr = training.Xy
|
||||
model = qp.model_selection.GridSearchQ(
|
||||
model=model,
|
||||
param_grid=param_grid,
|
||||
|
|
@ -58,7 +61,7 @@ with qp.util.temp_seed(0):
|
|||
refit=False, # retrain on the whole labelled set once done
|
||||
# raise_errors=False,
|
||||
verbose=True # show information as the process goes on
|
||||
).fit(training)
|
||||
).fit(Xtr, ytr)
|
||||
|
||||
tend = time()
|
||||
|
||||
|
|
|
|||
|
|
@ -4,6 +4,7 @@ from quapy.method.base import BinaryQuantifier, BaseQuantifier
|
|||
from quapy.model_selection import GridSearchQ
|
||||
from quapy.method.aggregative import AggregativeSoftQuantifier
|
||||
from quapy.protocol import APP
|
||||
import quapy.functional as F
|
||||
import numpy as np
|
||||
from sklearn.linear_model import LogisticRegression
|
||||
from time import time
|
||||
|
|
@ -32,10 +33,11 @@ class MyQuantifier(BaseQuantifier):
|
|||
|
||||
# in general, we would need to implement the method fit(self, data: LabelledCollection, fit_classifier=True,
|
||||
# val_split=None); this would amount to:
|
||||
def fit(self, data: LabelledCollection):
|
||||
assert data.n_classes==2, \
|
||||
def fit(self, X, y):
|
||||
n_classes = F.num_classes_from_labels(y)
|
||||
assert n_classes==2, \
|
||||
'this quantifier is only valid for binary problems [abort]'
|
||||
self.classifier.fit(*data.Xy)
|
||||
self.classifier.fit(X, y)
|
||||
return self
|
||||
|
||||
# in general, we would need to implement the method quantify(self, instances); this would amount to:
|
||||
|
|
@ -57,6 +59,7 @@ class MyQuantifier(BaseQuantifier):
|
|||
# of the method, now adhering to the AggregativeSoftQuantifier:
|
||||
|
||||
class MyAggregativeSoftQuantifier(AggregativeSoftQuantifier, BinaryQuantifier):
|
||||
|
||||
def __init__(self, classifier, alpha=0.5):
|
||||
# aggregative quantifiers have an internal attribute called self.classifier
|
||||
self.classifier = classifier
|
||||
|
|
@ -68,7 +71,7 @@ class MyAggregativeSoftQuantifier(AggregativeSoftQuantifier, BinaryQuantifier):
|
|||
# k-fold cross validation strategy). What remains ahead is to learn an aggregation function. In our case
|
||||
# this amounts to doing... nothing, since our method was pretty basic. BinaryQuantifier also adds some
|
||||
# basic functionality for checking binary consistency.
|
||||
def aggregation_fit(self, classif_predictions: LabelledCollection, data: LabelledCollection):
|
||||
def aggregation_fit(self, classif_predictions, labels):
|
||||
pass
|
||||
|
||||
# since this method is of type aggregative, we can simply implement the method aggregate (i.e., we should
|
||||
|
|
@ -94,7 +97,7 @@ if __name__ == '__main__':
|
|||
train, test = qp.datasets.fetch_reviews('imdb', tfidf=True, min_df=5).train_test
|
||||
train, val = train.split_stratified(train_prop=0.75) # let's create a validation set for optimizing hyperparams
|
||||
|
||||
def test_implementation(quantifier):
|
||||
def try_implementation(quantifier):
|
||||
class_name = quantifier.__class__.__name__
|
||||
print(f'\ntesting implementation {class_name}...')
|
||||
# model selection
|
||||
|
|
@ -104,7 +107,7 @@ if __name__ == '__main__':
|
|||
'alpha': np.linspace(0, 1, 11), # quantifier-dependent hyperparameter
|
||||
'classifier__C': np.logspace(-2, 2, 5) # classifier-dependent hyperparameter
|
||||
}
|
||||
gridsearch = GridSearchQ(quantifier, param_grid, protocol=APP(val), n_jobs=-1, verbose=False).fit(train)
|
||||
gridsearch = GridSearchQ(quantifier, param_grid, protocol=APP(val), n_jobs=-1, verbose=True).fit(*train.Xy)
|
||||
t_modsel = time() - tinit
|
||||
print(f'\tmodel selection took {t_modsel:.2f}s', flush=True)
|
||||
|
||||
|
|
@ -112,7 +115,7 @@ if __name__ == '__main__':
|
|||
optimized_model = gridsearch.best_model_
|
||||
mae = qp.evaluation.evaluate(
|
||||
optimized_model,
|
||||
protocol=APP(test, repeats=5000, sanity_check=None), # disable the check, we want to generate many tests!
|
||||
protocol=APP(test, repeats=500, sanity_check=None), # disable the check, we want to generate many tests!
|
||||
error_metric='mae',
|
||||
verbose=True)
|
||||
|
||||
|
|
@ -121,11 +124,11 @@ if __name__ == '__main__':
|
|||
|
||||
# define an instance of our custom quantifier and test it!
|
||||
quantifier = MyQuantifier(LogisticRegression(), alpha=0.5)
|
||||
test_implementation(quantifier)
|
||||
try_implementation(quantifier)
|
||||
|
||||
# define an instance of our custom quantifier, with the second implementation, and test it!
|
||||
quantifier = MyAggregativeSoftQuantifier(LogisticRegression(), alpha=0.5)
|
||||
test_implementation(quantifier)
|
||||
try_implementation(quantifier)
|
||||
|
||||
# the output should look like this:
|
||||
"""
|
||||
|
|
|
|||
|
|
@ -318,6 +318,15 @@ class LabelledCollection:
|
|||
classes = np.unique(labels).sort()
|
||||
return LabelledCollection(instances, labels, classes=classes)
|
||||
|
||||
@property
def classes(self):
    """
    Gets an array-like with the classes used in this collection.

    This is a read-only accessor that exposes the value stored in the
    ``classes_`` attribute of the instance.

    :return: array-like
    """
    return self.classes_
|
||||
|
||||
@property
|
||||
def Xy(self):
|
||||
"""
|
||||
|
|
|
|||
|
|
@ -14,13 +14,22 @@ import numpy as np
|
|||
def classes_from_labels(labels):
    """
    Obtains a np.ndarray with the (sorted) classes

    :param labels: array-like with the instances' labels
    :return: a sorted np.ndarray with the class labels
    """
    # np.unique already returns the distinct values in ascending order, so no
    # additional in-place sort is required
    return np.unique(labels)
|
||||
|
||||
|
||||
def num_classes_from_labels(labels):
    """
    Obtains the number of classes from an array-like of instances' labels

    :param labels: array-like with the instances' labels
    :return: int, the number of classes
    """
    # the number of classes is the number of distinct label values
    return len(classes_from_labels(labels))
|
||||
|
||||
# ------------------------------------------------------------------------------------------
|
||||
# Counter utils
|
||||
# ------------------------------------------------------------------------------------------
|
||||
|
|
|
|||
|
|
@ -109,7 +109,7 @@ class GridSearchQ(BaseQuantifier):
|
|||
|
||||
def job(cls_params):
|
||||
model.set_params(**cls_params)
|
||||
predictions = model.classifier_fit_predict(self._training)
|
||||
predictions = model.classifier_fit_predict(self._training_X, self._training_y)
|
||||
return predictions
|
||||
|
||||
predictions, status, took = self._error_handler(job, cls_params)
|
||||
|
|
@ -123,7 +123,8 @@ class GridSearchQ(BaseQuantifier):
|
|||
|
||||
def job(q_params):
|
||||
model.set_params(**q_params)
|
||||
model.aggregation_fit(predictions, self._training)
|
||||
P, y = predictions
|
||||
model.aggregation_fit(P, y)
|
||||
score = evaluation.evaluate(model, protocol=self.protocol, error_metric=self.error)
|
||||
return score
|
||||
|
||||
|
|
@ -136,7 +137,7 @@ class GridSearchQ(BaseQuantifier):
|
|||
|
||||
def job(params):
|
||||
model.set_params(**params)
|
||||
model.fit(self._training)
|
||||
model.fit(self._training_X, self._training_y)
|
||||
score = evaluation.evaluate(model, protocol=self.protocol, error_metric=self.error)
|
||||
return score
|
||||
|
||||
|
|
@ -159,17 +160,19 @@ class GridSearchQ(BaseQuantifier):
|
|||
return False
|
||||
return True
|
||||
|
||||
def _compute_scores_aggregative(self, training):
|
||||
def _compute_scores_aggregative(self, X, y):
|
||||
# break down the set of hyperparameters into two: classifier-specific, quantifier-specific
|
||||
cls_configs, q_configs = group_params(self.param_grid)
|
||||
|
||||
# train all classifiers and get the predictions
|
||||
self._training = training
|
||||
self._training_X = X
|
||||
self._training_y = y
|
||||
cls_outs = qp.util.parallel(
|
||||
self._prepare_classifier,
|
||||
cls_configs,
|
||||
seed=qp.environ.get('_R_SEED', None),
|
||||
n_jobs=self.n_jobs
|
||||
n_jobs=self.n_jobs,
|
||||
asarray=False
|
||||
)
|
||||
|
||||
# filter out classifier configurations that yielded any error
|
||||
|
|
@ -194,9 +197,10 @@ class GridSearchQ(BaseQuantifier):
|
|||
|
||||
return aggr_outs
|
||||
|
||||
def _compute_scores_nonaggregative(self, training):
|
||||
def _compute_scores_nonaggregative(self, X, y):
|
||||
configs = expand_grid(self.param_grid)
|
||||
self._training = training
|
||||
self._training_X = X
|
||||
self._training_y = y
|
||||
scores = qp.util.parallel(
|
||||
self._prepare_nonaggr_model,
|
||||
configs,
|
||||
|
|
@ -211,11 +215,12 @@ class GridSearchQ(BaseQuantifier):
|
|||
else:
|
||||
self._sout(f'error={status}')
|
||||
|
||||
def fit(self, training: LabelledCollection):
|
||||
def fit(self, X, y):
|
||||
""" Learning routine. Fits methods with all combinations of hyperparameters and selects the one minimizing
|
||||
the error metric.
|
||||
|
||||
:param training: the training set on which to optimize the hyperparameters
|
||||
:param X: array-like, training covariates
|
||||
:param y: array-like, labels of training data
|
||||
:return: self
|
||||
"""
|
||||
|
||||
|
|
@ -231,9 +236,9 @@ class GridSearchQ(BaseQuantifier):
|
|||
|
||||
self._sout(f'starting model selection with n_jobs={self.n_jobs}')
|
||||
if self._break_down_fit():
|
||||
results = self._compute_scores_aggregative(training)
|
||||
results = self._compute_scores_aggregative(X, y)
|
||||
else:
|
||||
results = self._compute_scores_nonaggregative(training)
|
||||
results = self._compute_scores_nonaggregative(X, y)
|
||||
|
||||
self.param_scores_ = {}
|
||||
self.best_score_ = None
|
||||
|
|
@ -266,7 +271,10 @@ class GridSearchQ(BaseQuantifier):
|
|||
if isinstance(self.protocol, OnLabelledCollectionProtocol):
|
||||
tinit = time()
|
||||
self._sout(f'refitting on the whole development set')
|
||||
self.best_model_.fit(training + self.protocol.get_labelled_collection())
|
||||
validation_collection = self.protocol.get_labelled_collection()
|
||||
training_collection = LabelledCollection(X, y, classes=validation_collection.classes)
|
||||
devel_collection = training_collection + validation_collection
|
||||
self.best_model_.fit(*devel_collection.Xy)
|
||||
tend = time() - tinit
|
||||
self.refit_time_ = tend
|
||||
else:
|
||||
|
|
|
|||
Loading…
Reference in New Issue