diff --git a/examples/0.basics.py b/examples/0.basics.py
index be18109..a5ce67d 100644
--- a/examples/0.basics.py
+++ b/examples/0.basics.py
@@ -37,7 +37,7 @@ classifier = LogisticRegression()
 pacc = qp.method.aggregative.PACC(classifier)
 
 print(f'training {pacc}')
-pacc.fit(train)
+pacc.fit(X, y)
 
 # let's now test our quantifier on the test data (of course, we should not use the test labels y at this point, only X)
 X_test = test.X
diff --git a/examples/1.model_selection.py b/examples/1.model_selection.py
index 61b7087..94225df 100644
--- a/examples/1.model_selection.py
+++ b/examples/1.model_selection.py
@@ -12,9 +12,11 @@ In this example, we show how to perform model selection on a DistributionMatchin
 model = DMy()
 
 qp.environ['SAMPLE_SIZE'] = 100
+qp.environ['N_JOBS'] = -1
 print(f'running model selection with N_JOBS={qp.environ["N_JOBS"]}; '
-      f'to increase the number of jobs use:\n> N_JOBS=-1 python3 1.model_selection.py\n'
+      f'to increase/decrease the number of jobs use:\n'
+      f'> N_JOBS=-1 python3 1.model_selection.py\n'
       f'alternatively, you can set this variable within the script as:\n'
       f'import quapy as qp\n'
       f'qp.environ["N_JOBS"]=-1')
 
@@ -50,6 +52,7 @@ with qp.util.temp_seed(0):
 
     tinit = time()
 
+    Xtr, ytr = training.Xy
     model = qp.model_selection.GridSearchQ(
         model=model,
         param_grid=param_grid,
@@ -58,7 +61,7 @@ with qp.util.temp_seed(0):
         refit=False,  # retrain on the whole labelled set once done
         # raise_errors=False,
         verbose=True  # show information as the process goes on
-    ).fit(training)
+    ).fit(Xtr, ytr)
 
     tend = time()
 
diff --git a/examples/2.custom_quantifier.py b/examples/2.custom_quantifier.py
index 4f6c627..09fa71f 100644
--- a/examples/2.custom_quantifier.py
+++ b/examples/2.custom_quantifier.py
@@ -4,6 +4,7 @@ from quapy.method.base import BinaryQuantifier, BaseQuantifier
 from quapy.model_selection import GridSearchQ
 from quapy.method.aggregative import AggregativeSoftQuantifier
 from quapy.protocol import APP
+import quapy.functional as F
 import numpy as np
 from sklearn.linear_model import LogisticRegression
 from time import time
@@ -32,10 +33,11 @@ class MyQuantifier(BaseQuantifier):
 
     # in general, we would need to implement the method fit(self, data: LabelledCollection, fit_classifier=True,
    # val_split=None); this would amount to:
-    def fit(self, data: LabelledCollection):
-        assert data.n_classes==2, \
+    def fit(self, X, y):
+        n_classes = F.num_classes_from_labels(y)
+        assert n_classes==2, \
             'this quantifier is only valid for binary problems [abort]'
-        self.classifier.fit(*data.Xy)
+        self.classifier.fit(X, y)
         return self
 
     # in general, we would need to implement the method quantify(self, instances); this would amount to:
@@ -57,6 +59,7 @@ class MyQuantifier(BaseQuantifier):
 
 # of the method, now adhering to the AggregativeSoftQuantifier:
 class MyAggregativeSoftQuantifier(AggregativeSoftQuantifier, BinaryQuantifier):
+
     def __init__(self, classifier, alpha=0.5):
         # aggregative quantifiers have an internal attribute called self.classifier
         self.classifier = classifier
@@ -68,7 +71,7 @@ class MyAggregativeSoftQuantifier(AggregativeSoftQuantifier, BinaryQuantifier):
     # k-fold cross validation strategy). What remains ahead is to learn an aggregation function. In our case
     # this amounts to doing... nothing, since our method was pretty basic. BinaryQuantifier also add some
     # basic functionality for checking binary consistency.
-    def aggregation_fit(self, classif_predictions: LabelledCollection, data: LabelledCollection):
+    def aggregation_fit(self, classif_predictions, labels):
         pass
 
     # since this method is of type aggregative, we can simply implement the method aggregate (i.e., we should
@@ -94,7 +97,7 @@ if __name__ == '__main__':
     train, test = qp.datasets.fetch_reviews('imdb', tfidf=True, min_df=5).train_test
     train, val = train.split_stratified(train_prop=0.75)  # let's create a validation set for optimizing hyperparams
 
-    def test_implementation(quantifier):
+    def try_implementation(quantifier):
         class_name = quantifier.__class__.__name__
         print(f'\ntesting implementation {class_name}...')
         # model selection
@@ -104,7 +107,7 @@ if __name__ == '__main__':
             'alpha': np.linspace(0, 1, 11),  # quantifier-dependent hyperparameter
             'classifier__C': np.logspace(-2, 2, 5)  # classifier-dependent hyperparameter
         }
-        gridsearch = GridSearchQ(quantifier, param_grid, protocol=APP(val), n_jobs=-1, verbose=False).fit(train)
+        gridsearch = GridSearchQ(quantifier, param_grid, protocol=APP(val), n_jobs=-1, verbose=True).fit(*train.Xy)
         t_modsel = time() - tinit
         print(f'\tmodel selection took {t_modsel:.2f}s', flush=True)
 
@@ -112,7 +115,7 @@ if __name__ == '__main__':
         optimized_model = gridsearch.best_model_
         mae = qp.evaluation.evaluate(
             optimized_model,
-            protocol=APP(test, repeats=5000, sanity_check=None),  # disable the check, we want to generate many tests!
+            protocol=APP(test, repeats=500, sanity_check=None),  # disable the check, we want to generate many tests!
             error_metric='mae',
             verbose=True)
 
@@ -121,11 +124,11 @@ if __name__ == '__main__':
 
     # define an instance of our custom quantifier and test it!
     quantifier = MyQuantifier(LogisticRegression(), alpha=0.5)
-    test_implementation(quantifier)
+    try_implementation(quantifier)
 
     # define an instance of our custom quantifier, with the second implementation, and test it!
     quantifier = MyAggregativeSoftQuantifier(LogisticRegression(), alpha=0.5)
-    test_implementation(quantifier)
+    try_implementation(quantifier)
 
     # the output should look like this:
     """
diff --git a/examples/3.custom_collection.py b/examples/3.custom_collection.py
new file mode 100644
index 0000000..e69de29
diff --git a/quapy/data/base.py b/quapy/data/base.py
index 72561e4..c22e895 100644
--- a/quapy/data/base.py
+++ b/quapy/data/base.py
@@ -318,6 +318,15 @@ class LabelledCollection:
         classes = np.unique(labels).sort()
         return LabelledCollection(instances, labels, classes=classes)
 
+    @property
+    def classes(self):
+        """
+        Gets an array-like with the classes used in this collection
+
+        :return: array-like
+        """
+        return self.classes_
+
     @property
     def Xy(self):
         """
diff --git a/quapy/functional.py b/quapy/functional.py
index b508d76..2e477e0 100644
--- a/quapy/functional.py
+++ b/quapy/functional.py
@@ -14,13 +14,22 @@ import numpy as np
 def classes_from_labels(labels):
     """
     Obtains a np.ndarray with the (sorted) classes
-    :param labels:
-    :return:
+    :param labels: array-like with the instances' labels
+    :return: a sorted np.ndarray with the class labels
     """
     classes = np.unique(labels)
     classes.sort()
     return classes
 
+
+def num_classes_from_labels(labels):
+    """
+    Obtains the number of classes from an array-like of instance's labels
+    :param labels: array-like with the instances' labels
+    :return: int, the number of classes
+    """
+    return len(classes_from_labels(labels))
+
 # ------------------------------------------------------------------------------------------
 # Counter utils
 # ------------------------------------------------------------------------------------------
diff --git a/quapy/model_selection.py b/quapy/model_selection.py
index 75828ac..c8183f2 100644
--- a/quapy/model_selection.py
+++ b/quapy/model_selection.py
@@ -109,7 +109,7 @@ class GridSearchQ(BaseQuantifier):
 
         def job(cls_params):
             model.set_params(**cls_params)
-            predictions = model.classifier_fit_predict(self._training)
+            predictions = model.classifier_fit_predict(self._training_X, self._training_y)
             return predictions
 
         predictions, status, took = self._error_handler(job, cls_params)
@@ -123,7 +123,8 @@ class GridSearchQ(BaseQuantifier):
 
         def job(q_params):
             model.set_params(**q_params)
-            model.aggregation_fit(predictions, self._training)
+            P, y = predictions
+            model.aggregation_fit(P, y)
             score = evaluation.evaluate(model, protocol=self.protocol, error_metric=self.error)
             return score
 
@@ -136,7 +137,7 @@ class GridSearchQ(BaseQuantifier):
 
         def job(params):
             model.set_params(**params)
-            model.fit(self._training)
+            model.fit(self._training_X, self._training_y)
             score = evaluation.evaluate(model, protocol=self.protocol, error_metric=self.error)
             return score
 
@@ -159,17 +160,19 @@ class GridSearchQ(BaseQuantifier):
                 return False
         return True
 
-    def _compute_scores_aggregative(self, training):
+    def _compute_scores_aggregative(self, X, y):
         # break down the set of hyperparameters into two: classifier-specific, quantifier-specific
         cls_configs, q_configs = group_params(self.param_grid)
 
         # train all classifiers and get the predictions
-        self._training = training
+        self._training_X = X
+        self._training_y = y
         cls_outs = qp.util.parallel(
             self._prepare_classifier,
             cls_configs,
             seed=qp.environ.get('_R_SEED', None),
-            n_jobs=self.n_jobs
+            n_jobs=self.n_jobs,
+            asarray=False
         )
 
         # filter out classifier configurations that yielded any error
@@ -194,9 +197,10 @@ class GridSearchQ(BaseQuantifier):
 
         return aggr_outs
 
-    def _compute_scores_nonaggregative(self, training):
+    def _compute_scores_nonaggregative(self, X, y):
         configs = expand_grid(self.param_grid)
-        self._training = training
+        self._training_X = X
+        self._training_y = y
         scores = qp.util.parallel(
             self._prepare_nonaggr_model,
             configs,
@@ -211,11 +215,12 @@ class GridSearchQ(BaseQuantifier):
         else:
             self._sout(f'error={status}')
 
-    def fit(self, training: LabelledCollection):
+    def fit(self, X, y):
         """ Learning routine. Fits methods with all combinations of hyperparameters and selects the one minimizing
         the error metric.
 
-        :param training: the training set on which to optimize the hyperparameters
+        :param X: array-like, training covariates
+        :param y: array-like, labels of training data
         :return: self
         """
 
@@ -231,9 +236,9 @@ class GridSearchQ(BaseQuantifier):
         self._sout(f'starting model selection with n_jobs={self.n_jobs}')
 
         if self._break_down_fit():
-            results = self._compute_scores_aggregative(training)
+            results = self._compute_scores_aggregative(X, y)
         else:
-            results = self._compute_scores_nonaggregative(training)
+            results = self._compute_scores_nonaggregative(X, y)
 
         self.param_scores_ = {}
         self.best_score_ = None
@@ -266,7 +271,10 @@ class GridSearchQ(BaseQuantifier):
         if isinstance(self.protocol, OnLabelledCollectionProtocol):
             tinit = time()
             self._sout(f'refitting on the whole development set')
-            self.best_model_.fit(training + self.protocol.get_labelled_collection())
+            validation_collection = self.protocol.get_labelled_collection()
+            training_collection = LabelledCollection(X, y, classes=validation_collection.classes)
+            devel_collection = training_collection + validation_collection
+            self.best_model_.fit(*devel_collection.Xy)
             tend = time() - tinit
             self.refit_time_ = tend
         else:
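
For context, a minimal usage sketch (not part of the patch) of the array-based fit interface that these changes introduce; it reuses the imdb reviews dataset and the PACC quantifier already shown in examples/0.basics.py:

# usage sketch, not part of the diff: quantifiers are now trained sklearn-style on (X, y)
import quapy as qp
from sklearn.linear_model import LogisticRegression

# same dataset used in examples/0.basics.py and examples/2.custom_quantifier.py
train, test = qp.datasets.fetch_reviews('imdb', tfidf=True, min_df=5).train_test

# train on raw covariates and labels instead of passing a LabelledCollection
X, y = train.Xy
pacc = qp.method.aggregative.PACC(LogisticRegression())
pacc.fit(X, y)

# quantification (prevalence estimation) still only needs the test covariates
estim_prevalence = pacc.quantify(test.X)
print(estim_prevalence)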