going through examples, currently working on second one

parent 5b7f7d4f70
commit 24a91b6e9b
@@ -37,7 +37,7 @@ classifier = LogisticRegression()
 pacc = qp.method.aggregative.PACC(classifier)
 
 print(f'training {pacc}')
-pacc.fit(train)
+pacc.fit(X, y)
 
 # let's now test our quantifier on the test data (of course, we should not use the test labels y at this point, only X)
 X_test = test.X
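For illustration (this note and sketch are not part of the commit): the change above moves the basics example from pacc.fit(train), which consumed a LabelledCollection, to the array-based pacc.fit(X, y). A minimal sketch of how the surrounding example presumably reads after the change; the dataset loader is borrowed from the custom-quantifier example further down, and the closing quantify call follows the standard QuaPy API rather than anything shown in this hunk:

import quapy as qp
from sklearn.linear_model import LogisticRegression

# load a dataset as a pair of LabelledCollection objects (loader borrowed from the example below)
train, test = qp.datasets.fetch_reviews('imdb', tfidf=True, min_df=5).train_test
X, y = train.Xy                               # raw covariates and labels of the training collection

classifier = LogisticRegression()
pacc = qp.method.aggregative.PACC(classifier)
print(f'training {pacc}')
pacc.fit(X, y)                                # new array-based signature introduced by this commit

X_test = test.X                               # at test time, only the covariates are used
estim_prevalence = pacc.quantify(X_test)      # class prevalence estimates for the test sample
print(estim_prevalence)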
@@ -12,9 +12,11 @@ In this example, we show how to perform model selection on a DistributionMatchin
 model = DMy()
 
 qp.environ['SAMPLE_SIZE'] = 100
+qp.environ['N_JOBS'] = -1
 
 print(f'running model selection with N_JOBS={qp.environ["N_JOBS"]}; '
-      f'to increase the number of jobs use:\n> N_JOBS=-1 python3 1.model_selection.py\n'
+      f'to increase/decrease the number of jobs use:\n'
+      f'> N_JOBS=-1 python3 1.model_selection.py\n'
       f'alternatively, you can set this variable within the script as:\n'
       f'import quapy as qp\n'
       f'qp.environ["N_JOBS"]=-1')
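Aside (not part of the commit): the reworded message spells out the two ways of controlling parallelism that the example supports, recapped below; reading -1 as "use all available cores" is an assumption based on the usual joblib/scikit-learn convention:

# Option 1: set the variable from the shell when launching the script
#   N_JOBS=-1 python3 1.model_selection.py
# Option 2: set it programmatically, which is what the example now does by default
import quapy as qp
qp.environ['SAMPLE_SIZE'] = 100   # size of the samples drawn by the evaluation protocol
qp.environ['N_JOBS'] = -1         # -1 presumably means "use all available cores"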
@@ -50,6 +52,7 @@ with qp.util.temp_seed(0):
     tinit = time()
 
+    Xtr, ytr = training.Xy
     model = qp.model_selection.GridSearchQ(
         model=model,
         param_grid=param_grid,
@@ -58,7 +61,7 @@ with qp.util.temp_seed(0):
         refit=False,  # retrain on the whole labelled set once done
         # raise_errors=False,
         verbose=True  # show information as the process goes on
-    ).fit(training)
+    ).fit(Xtr, ytr)
 
     tend = time()
 
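For illustration (not part of the commit): taken together, the two hunks above make the model-selection example unpack the training LabelledCollection and feed arrays to GridSearchQ. A condensed, self-contained sketch of the same pattern; the DMy import path and the 'nbins' hyperparameter grid are assumptions used only to make the sketch runnable:

import quapy as qp
from quapy.method.aggregative import DMy   # assumption: DMy is importable from quapy.method.aggregative
from quapy.protocol import APP

qp.environ['SAMPLE_SIZE'] = 100
qp.environ['N_JOBS'] = -1

train, test = qp.datasets.fetch_reviews('imdb', tfidf=True, min_df=5).train_test
training, validation = train.split_stratified(train_prop=0.75)
Xtr, ytr = training.Xy                     # unpack the labelled training collection into arrays

param_grid = {'nbins': [8, 16, 32]}        # assumption: an illustrative DMy hyperparameter grid

model = qp.model_selection.GridSearchQ(
    model=DMy(),
    param_grid=param_grid,
    protocol=APP(validation),
    refit=False,       # skip retraining on train+validation after selection
    verbose=True       # show information as the process goes on
).fit(Xtr, ytr)        # array-based fit introduced by this commit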
@@ -4,6 +4,7 @@ from quapy.method.base import BinaryQuantifier, BaseQuantifier
 from quapy.model_selection import GridSearchQ
 from quapy.method.aggregative import AggregativeSoftQuantifier
 from quapy.protocol import APP
+import quapy.functional as F
 import numpy as np
 from sklearn.linear_model import LogisticRegression
 from time import time
@@ -32,10 +33,11 @@ class MyQuantifier(BaseQuantifier):
 
     # in general, we would need to implement the method fit(self, data: LabelledCollection, fit_classifier=True,
     # val_split=None); this would amount to:
-    def fit(self, data: LabelledCollection):
-        assert data.n_classes==2, \
+    def fit(self, X, y):
+        n_classes = F.num_classes_from_labels(y)
+        assert n_classes==2, \
             'this quantifier is only valid for binary problems [abort]'
-        self.classifier.fit(*data.Xy)
+        self.classifier.fit(X, y)
         return self
 
     # in general, we would need to implement the method quantify(self, instances); this would amount to:
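For illustration (not part of the commit): under the old API the custom quantifier consumed a LabelledCollection, whereas it now receives covariates and labels separately, so the call site changes accordingly. A minimal sketch, assuming train is the LabelledCollection loaded in the example's __main__ block:

# before this commit: quantifier.fit(train), with train a LabelledCollection
# after this commit: covariates and labels are passed as arrays
X, y = train.Xy
quantifier = MyQuantifier(LogisticRegression(), alpha=0.5)
quantifier.fit(X, y)   # internally checks binary consistency via F.num_classes_from_labels(y)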
@@ -57,6 +59,7 @@ class MyQuantifier(BaseQuantifier):
 # of the method, now adhering to the AggregativeSoftQuantifier:
 
 class MyAggregativeSoftQuantifier(AggregativeSoftQuantifier, BinaryQuantifier):
 
     def __init__(self, classifier, alpha=0.5):
         # aggregative quantifiers have an internal attribute called self.classifier
         self.classifier = classifier
@@ -68,7 +71,7 @@ class MyAggregativeSoftQuantifier(AggregativeSoftQuantifier, BinaryQuantifier):
     # k-fold cross validation strategy). What remains ahead is to learn an aggregation function. In our case
     # this amounts to doing... nothing, since our method was pretty basic. BinaryQuantifier also add some
     # basic functionality for checking binary consistency.
-    def aggregation_fit(self, classif_predictions: LabelledCollection, data: LabelledCollection):
+    def aggregation_fit(self, classif_predictions, labels):
         pass
 
     # since this method is of type aggregative, we can simply implement the method aggregate (i.e., we should
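For illustration (not part of the commit): the new aggregation_fit signature receives the classifier's predictions and the corresponding labels as two plain arguments (the model-selection hunk further down unpacks them as P, y). A hedged sketch of what a non-trivial aggregation step could look like under this signature, in the same style as the example file; the threshold logic is purely illustrative and not part of QuaPy:

import numpy as np
from quapy.method.base import BinaryQuantifier
from quapy.method.aggregative import AggregativeSoftQuantifier

class MyThresholdedQuantifier(AggregativeSoftQuantifier, BinaryQuantifier):

    def __init__(self, classifier):
        self.classifier = classifier

    def aggregation_fit(self, classif_predictions, labels):
        # classif_predictions: posterior probabilities for the training instances (assumption,
        # consistent with AggregativeSoftQuantifier and with how GridSearchQ unpacks P, y)
        # labels: the true labels aligned with those predictions
        # here one could, e.g., calibrate a decision threshold on the held-out posteriors
        self.threshold = 0.5  # illustrative placeholder

    def aggregate(self, classif_predictions):
        # fraction of test instances whose positive posterior exceeds the threshold
        prev_pos = (classif_predictions[:, 1] > self.threshold).mean()
        return np.asarray([1 - prev_pos, prev_pos])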
@@ -94,7 +97,7 @@ if __name__ == '__main__':
     train, test = qp.datasets.fetch_reviews('imdb', tfidf=True, min_df=5).train_test
     train, val = train.split_stratified(train_prop=0.75)  # let's create a validation set for optimizing hyperparams
 
-    def test_implementation(quantifier):
+    def try_implementation(quantifier):
         class_name = quantifier.__class__.__name__
         print(f'\ntesting implementation {class_name}...')
         # model selection
@@ -104,7 +107,7 @@ if __name__ == '__main__':
             'alpha': np.linspace(0, 1, 11),  # quantifier-dependent hyperparameter
             'classifier__C': np.logspace(-2, 2, 5)  # classifier-dependent hyperparameter
         }
-        gridsearch = GridSearchQ(quantifier, param_grid, protocol=APP(val), n_jobs=-1, verbose=False).fit(train)
+        gridsearch = GridSearchQ(quantifier, param_grid, protocol=APP(val), n_jobs=-1, verbose=True).fit(*train.Xy)
         t_modsel = time() - tinit
         print(f'\tmodel selection took {t_modsel:.2f}s', flush=True)
 
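Aside (not part of the commit): in the grid above, 'alpha' is tuned on the quantifier itself while the classifier__ prefix routes 'C' to the wrapped LogisticRegression, following scikit-learn's nested-parameter naming, which is what GridSearchQ's set_params calls rely on in the hunks further down. A small self-contained sketch of that routing using the built-in PACC quantifier, assuming quantifiers expose scikit-learn's get_params/set_params protocol:

from sklearn.linear_model import LogisticRegression
from quapy.method.aggregative import PACC

pacc = PACC(LogisticRegression())
pacc.set_params(classifier__C=10.0)          # routed to the inner LogisticRegression
print(pacc.get_params()['classifier__C'])    # -> 10.0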
@@ -112,7 +115,7 @@ if __name__ == '__main__':
         optimized_model = gridsearch.best_model_
         mae = qp.evaluation.evaluate(
             optimized_model,
-            protocol=APP(test, repeats=5000, sanity_check=None),  # disable the check, we want to generate many tests!
+            protocol=APP(test, repeats=500, sanity_check=None),  # disable the check, we want to generate many tests!
             error_metric='mae',
             verbose=True)
 
@@ -121,11 +124,11 @@ if __name__ == '__main__':
 
     # define an instance of our custom quantifier and test it!
     quantifier = MyQuantifier(LogisticRegression(), alpha=0.5)
-    test_implementation(quantifier)
+    try_implementation(quantifier)
 
     # define an instance of our custom quantifier, with the second implementation, and test it!
     quantifier = MyAggregativeSoftQuantifier(LogisticRegression(), alpha=0.5)
-    test_implementation(quantifier)
+    try_implementation(quantifier)
 
     # the output should look like this:
     """
@@ -318,6 +318,15 @@ class LabelledCollection:
         classes = np.unique(labels).sort()
         return LabelledCollection(instances, labels, classes=classes)
 
+    @property
+    def classes(self):
+        """
+        Gets an array-like with the classes used in this collection
+
+        :return: array-like
+        """
+        return self.classes_
+
     @property
     def Xy(self):
         """
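Aside (not part of the commit): the new classes property is a read-only alias of the existing classes_ attribute, so user code can query the class space without the underscore-suffixed name. A minimal sketch on a toy collection:

from quapy.data import LabelledCollection

lc = LabelledCollection(instances=[[0.1], [0.9], [0.5]], labels=[0, 1, 0])
print(lc.classes_)   # pre-existing attribute holding the sorted class labels
print(lc.classes)    # new property added by this commit; returns the same array-like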
@@ -14,13 +14,22 @@ import numpy as np
 def classes_from_labels(labels):
     """
     Obtains a np.ndarray with the (sorted) classes
-    :param labels:
-    :return:
+    :param labels: array-like with the instances' labels
+    :return: a sorted np.ndarray with the class labels
     """
     classes = np.unique(labels)
     classes.sort()
     return classes
 
+
+def num_classes_from_labels(labels):
+    """
+    Obtains the number of classes from an array-like of instance's labels
+    :param labels: array-like with the instances' labels
+    :return: int, the number of classes
+    """
+    return len(classes_from_labels(labels))
+
 # ------------------------------------------------------------------------------------------
 # Counter utils
 # ------------------------------------------------------------------------------------------
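Aside (not part of the commit): these helpers let callers recover the class space, and its size, directly from a label array now that several fit methods no longer receive a LabelledCollection. A minimal usage sketch:

import numpy as np
import quapy.functional as F

y = np.array([1, 0, 1, 1, 0])
print(F.classes_from_labels(y))       # -> array([0, 1]), the sorted distinct labels
print(F.num_classes_from_labels(y))   # -> 2, as used by MyQuantifier.fit in the example above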
@@ -109,7 +109,7 @@ class GridSearchQ(BaseQuantifier):
 
         def job(cls_params):
             model.set_params(**cls_params)
-            predictions = model.classifier_fit_predict(self._training)
+            predictions = model.classifier_fit_predict(self._training_X, self._training_y)
             return predictions
 
         predictions, status, took = self._error_handler(job, cls_params)
@@ -123,7 +123,8 @@ class GridSearchQ(BaseQuantifier):
 
         def job(q_params):
             model.set_params(**q_params)
-            model.aggregation_fit(predictions, self._training)
+            P, y = predictions
+            model.aggregation_fit(P, y)
             score = evaluation.evaluate(model, protocol=self.protocol, error_metric=self.error)
             return score
 
@@ -136,7 +137,7 @@ class GridSearchQ(BaseQuantifier):
 
         def job(params):
             model.set_params(**params)
-            model.fit(self._training)
+            model.fit(self._training_X, self._training_y)
             score = evaluation.evaluate(model, protocol=self.protocol, error_metric=self.error)
             return score
 
@@ -159,17 +160,19 @@ class GridSearchQ(BaseQuantifier):
             return False
         return True
 
-    def _compute_scores_aggregative(self, training):
+    def _compute_scores_aggregative(self, X, y):
         # break down the set of hyperparameters into two: classifier-specific, quantifier-specific
         cls_configs, q_configs = group_params(self.param_grid)
 
         # train all classifiers and get the predictions
-        self._training = training
+        self._training_X = X
+        self._training_y = y
         cls_outs = qp.util.parallel(
             self._prepare_classifier,
             cls_configs,
             seed=qp.environ.get('_R_SEED', None),
-            n_jobs=self.n_jobs
+            n_jobs=self.n_jobs,
+            asarray=False
         )
 
         # filter out classifier configurations that yielded any error
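Aside (not part of the commit): for aggregative quantifiers, only the classifier__*-prefixed hyperparameters require refitting the classifier, so group_params splits the grid in two and each classifier configuration is trained once, with its predictions reused across all quantifier configurations. A rough, self-contained sketch of the resulting saving; the exact output format of group_params is an assumption:

param_grid = {
    'classifier__C': [0.1, 1.0, 10.0],   # classifier-specific: each value needs a classifier fit
    'alpha': [0.25, 0.5, 0.75],          # quantifier-specific: reuses the stored predictions
}

# conceptually, the grid is broken down into two independent sub-grids:
cls_configs = [{'classifier__C': c} for c in param_grid['classifier__C']]
q_configs = [{'alpha': a} for a in param_grid['alpha']]

# 3 classifier fits + 3*3 cheap aggregation fits, instead of 9 full fits
print(len(cls_configs), len(q_configs), len(cls_configs) * len(q_configs))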
@@ -194,9 +197,10 @@ class GridSearchQ(BaseQuantifier):
 
         return aggr_outs
 
-    def _compute_scores_nonaggregative(self, training):
+    def _compute_scores_nonaggregative(self, X, y):
         configs = expand_grid(self.param_grid)
-        self._training = training
+        self._training_X = X
+        self._training_y = y
         scores = qp.util.parallel(
             self._prepare_nonaggr_model,
             configs,
@@ -211,11 +215,12 @@ class GridSearchQ(BaseQuantifier):
             else:
                 self._sout(f'error={status}')
 
-    def fit(self, training: LabelledCollection):
+    def fit(self, X, y):
         """ Learning routine. Fits methods with all combinations of hyperparameters and selects the one minimizing
         the error metric.
 
-        :param training: the training set on which to optimize the hyperparameters
+        :param X: array-like, training covariates
+        :param y: array-like, labels of training data
         :return: self
         """
 
@@ -231,9 +236,9 @@ class GridSearchQ(BaseQuantifier):
 
         self._sout(f'starting model selection with n_jobs={self.n_jobs}')
         if self._break_down_fit():
-            results = self._compute_scores_aggregative(training)
+            results = self._compute_scores_aggregative(X, y)
         else:
-            results = self._compute_scores_nonaggregative(training)
+            results = self._compute_scores_nonaggregative(X, y)
 
         self.param_scores_ = {}
         self.best_score_ = None
@@ -266,7 +271,10 @@ class GridSearchQ(BaseQuantifier):
             if isinstance(self.protocol, OnLabelledCollectionProtocol):
                 tinit = time()
                 self._sout(f'refitting on the whole development set')
-                self.best_model_.fit(training + self.protocol.get_labelled_collection())
+                validation_collection = self.protocol.get_labelled_collection()
+                training_collection = LabelledCollection(X, y, classes=validation_collection.classes)
+                devel_collection = training_collection + validation_collection
+                self.best_model_.fit(*devel_collection.Xy)
                 tend = time() - tinit
                 self.refit_time_ = tend
             else: