going through examples, currently working on second one

Alejandro Moreo Fernandez 2025-06-15 14:57:40 +02:00
parent 5b7f7d4f70
commit 24a91b6e9b
7 changed files with 59 additions and 27 deletions

View File

@@ -37,7 +37,7 @@ classifier = LogisticRegression()
pacc = qp.method.aggregative.PACC(classifier)
print(f'training {pacc}')
-pacc.fit(train)
+pacc.fit(X, y)
# let's now test our quantifier on the test data (of course, we should not use the test labels y at this point, only X)
X_test = test.X

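For reference, a minimal end-to-end sketch of the updated basics example under the new fit(X, y) interface (same imdb dataset and PACC quantifier as above; the mae comparison at the end is added here only for illustration):

import quapy as qp
from sklearn.linear_model import LogisticRegression

# load the reviews dataset and unpack the training covariates X and labels y
train, test = qp.datasets.fetch_reviews('imdb', tfidf=True, min_df=5).train_test
X, y = train.Xy

# the quantifier is now trained on (X, y) instead of on a LabelledCollection
pacc = qp.method.aggregative.PACC(LogisticRegression())
pacc.fit(X, y)

# at test time only the covariates are used; the test labels serve for evaluation only
estim_prevalence = pacc.quantify(test.X)
true_prevalence = test.prevalence()
print(f'MAE = {qp.error.mae(true_prevalence, estim_prevalence):.4f}')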
View File

@@ -12,9 +12,11 @@ In this example, we show how to perform model selection on a DistributionMatchin
model = DMy()
qp.environ['SAMPLE_SIZE'] = 100
qp.environ['N_JOBS'] = -1
print(f'running model selection with N_JOBS={qp.environ["N_JOBS"]}; '
-f'to increase the number of jobs use:\n> N_JOBS=-1 python3 1.model_selection.py\n'
+f'to increase/decrease the number of jobs use:\n'
+f'> N_JOBS=-1 python3 1.model_selection.py\n'
f'alternatively, you can set this variable within the script as:\n'
f'import quapy as qp\n'
f'qp.environ["N_JOBS"]=-1')
@@ -50,6 +52,7 @@ with qp.util.temp_seed(0):
tinit = time()
+Xtr, ytr = training.Xy
model = qp.model_selection.GridSearchQ(
model=model,
param_grid=param_grid,
@@ -58,7 +61,7 @@ with qp.util.temp_seed(0):
refit=False, # retrain on the whole labelled set once done
# raise_errors=False,
verbose=True # show information as the process goes on
-).fit(training)
+).fit(Xtr, ytr)
tend = time()

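A condensed sketch of the refactored model-selection flow shown above; the hyperparameter grid is illustrative and not the grid explored by the actual script:

import numpy as np
import quapy as qp
from quapy.method.aggregative import DMy
from quapy.protocol import APP

qp.environ['SAMPLE_SIZE'] = 100

# split the labelled data: the validation part feeds the sample-generation protocol
train, test = qp.datasets.fetch_reviews('imdb', tfidf=True, min_df=5).train_test
training, validation = train.split_stratified(train_prop=0.7)
Xtr, ytr = training.Xy

model = qp.model_selection.GridSearchQ(
    model=DMy(),
    param_grid={'classifier__C': np.logspace(-3, 3, 7)},  # illustrative grid
    protocol=APP(validation),  # validation samples drawn at artificial prevalences
    error='mae',
    refit=False,
    verbose=True
).fit(Xtr, ytr)  # model selection itself now receives (X, y)

print(f'best hyper-parameters = {model.best_params_}')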
View File

@@ -4,6 +4,7 @@ from quapy.method.base import BinaryQuantifier, BaseQuantifier
from quapy.model_selection import GridSearchQ
from quapy.method.aggregative import AggregativeSoftQuantifier
from quapy.protocol import APP
+import quapy.functional as F
import numpy as np
from sklearn.linear_model import LogisticRegression
from time import time
@@ -32,10 +33,11 @@ class MyQuantifier(BaseQuantifier):
# in general, we would need to implement the method fit(self, data: LabelledCollection, fit_classifier=True,
# val_split=None); this would amount to:
-def fit(self, data: LabelledCollection):
-assert data.n_classes==2, \
+def fit(self, X, y):
+n_classes = F.num_classes_from_labels(y)
+assert n_classes==2, \
'this quantifier is only valid for binary problems [abort]'
-self.classifier.fit(*data.Xy)
+self.classifier.fit(X, y)
return self
# in general, we would need to implement the method quantify(self, instances); this would amount to:
@@ -57,6 +59,7 @@ class MyQuantifier(BaseQuantifier):
# of the method, now adhering to the AggregativeSoftQuantifier:
class MyAggregativeSoftQuantifier(AggregativeSoftQuantifier, BinaryQuantifier):
def __init__(self, classifier, alpha=0.5):
# aggregative quantifiers have an internal attribute called self.classifier
self.classifier = classifier
@@ -68,7 +71,7 @@ class MyAggregativeSoftQuantifier(AggregativeSoftQuantifier, BinaryQuantifier):
# k-fold cross validation strategy). What remains ahead is to learn an aggregation function. In our case
# this amounts to doing... nothing, since our method was pretty basic. BinaryQuantifier also adds some
# basic functionality for checking binary consistency.
-def aggregation_fit(self, classif_predictions: LabelledCollection, data: LabelledCollection):
+def aggregation_fit(self, classif_predictions, labels):
pass
# since this method is of type aggregative, we can simply implement the method aggregate (i.e., we should
@@ -94,7 +97,7 @@ if __name__ == '__main__':
train, test = qp.datasets.fetch_reviews('imdb', tfidf=True, min_df=5).train_test
train, val = train.split_stratified(train_prop=0.75) # let's create a validation set for optimizing hyperparams
-def test_implementation(quantifier):
+def try_implementation(quantifier):
class_name = quantifier.__class__.__name__
print(f'\ntesting implementation {class_name}...')
# model selection
@@ -104,7 +107,7 @@ if __name__ == '__main__':
'alpha': np.linspace(0, 1, 11), # quantifier-dependent hyperparameter
'classifier__C': np.logspace(-2, 2, 5) # classifier-dependent hyperparameter
}
-gridsearch = GridSearchQ(quantifier, param_grid, protocol=APP(val), n_jobs=-1, verbose=False).fit(train)
+gridsearch = GridSearchQ(quantifier, param_grid, protocol=APP(val), n_jobs=-1, verbose=True).fit(*train.Xy)
t_modsel = time() - tinit
print(f'\tmodel selection took {t_modsel:.2f}s', flush=True)
@@ -112,7 +115,7 @@ if __name__ == '__main__':
optimized_model = gridsearch.best_model_
mae = qp.evaluation.evaluate(
optimized_model,
-protocol=APP(test, repeats=5000, sanity_check=None), # disable the check, we want to generate many tests!
+protocol=APP(test, repeats=500, sanity_check=None), # disable the check, we want to generate many tests!
error_metric='mae',
verbose=True)
@@ -121,11 +124,11 @@ if __name__ == '__main__':
# define an instance of our custom quantifier and test it!
quantifier = MyQuantifier(LogisticRegression(), alpha=0.5)
-test_implementation(quantifier)
+try_implementation(quantifier)
# define an instance of our custom quantifier, with the second implementation, and test it!
quantifier = MyAggregativeSoftQuantifier(LogisticRegression(), alpha=0.5)
-test_implementation(quantifier)
+try_implementation(quantifier)
# the output should look like this:
"""

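As a usage sketch (not part of this commit), either of the two custom classes defined in this example can be driven through the same (X, y)-style interface:

from sklearn.linear_model import LogisticRegression
import quapy as qp

train, test = qp.datasets.fetch_reviews('imdb', tfidf=True, min_df=5).train_test

# MyQuantifier and MyAggregativeSoftQuantifier expose the same interface
quantifier = MyAggregativeSoftQuantifier(LogisticRegression(), alpha=0.5)
quantifier.fit(*train.Xy)                 # unpack the LabelledCollection into (X, y)

estim_prev = quantifier.quantify(test.X)  # only covariates are needed at inference time
print(f'estimated prevalence: {estim_prev}')
print(f'true prevalence     : {test.prevalence()}')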
View File

View File

@@ -318,6 +318,15 @@ class LabelledCollection:
classes = np.unique(labels).sort()
return LabelledCollection(instances, labels, classes=classes)
+@property
+def classes(self):
+"""
+Gets an array-like with the classes used in this collection
+:return: array-like
+"""
+return self.classes_
@property
def Xy(self):
"""

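A toy sketch of the new classes property (data invented for illustration):

import numpy as np
from quapy.data import LabelledCollection

# a tiny binary collection: 4 instances with 2 features each
X = np.asarray([[0.1, 0.9], [0.8, 0.2], [0.4, 0.6], [0.7, 0.3]])
y = np.asarray([1, 0, 1, 0])

collection = LabelledCollection(X, y)
print(collection.classes)    # -> [0 1], an alias for collection.classes_
print(collection.n_classes)  # -> 2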
View File

@@ -14,13 +14,22 @@ import numpy as np
def classes_from_labels(labels):
"""
Obtains a np.ndarray with the (sorted) classes
-:param labels:
-:return:
+:param labels: array-like with the instances' labels
+:return: a sorted np.ndarray with the class labels
"""
classes = np.unique(labels)
classes.sort()
return classes
+def num_classes_from_labels(labels):
+"""
+Obtains the number of classes from an array-like of instances' labels
+:param labels: array-like with the instances' labels
+:return: int, the number of classes
+"""
+return len(classes_from_labels(labels))
# ------------------------------------------------------------------------------------------
# Counter utils
# ------------------------------------------------------------------------------------------

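For illustration, the existing helper and the newly added one compose as follows (toy labels):

import quapy.functional as F

labels = ['neg', 'pos', 'pos', 'neg', 'neu']
print(F.classes_from_labels(labels))      # -> ['neg' 'neu' 'pos'], a sorted np.ndarray
print(F.num_classes_from_labels(labels))  # -> 3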
View File

@@ -109,7 +109,7 @@ class GridSearchQ(BaseQuantifier):
def job(cls_params):
model.set_params(**cls_params)
-predictions = model.classifier_fit_predict(self._training)
+predictions = model.classifier_fit_predict(self._training_X, self._training_y)
return predictions
predictions, status, took = self._error_handler(job, cls_params)
@@ -123,7 +123,8 @@ class GridSearchQ(BaseQuantifier):
def job(q_params):
model.set_params(**q_params)
-model.aggregation_fit(predictions, self._training)
+P, y = predictions
+model.aggregation_fit(P, y)
score = evaluation.evaluate(model, protocol=self.protocol, error_metric=self.error)
return score
@@ -136,7 +137,7 @@ class GridSearchQ(BaseQuantifier):
def job(params):
model.set_params(**params)
-model.fit(self._training)
+model.fit(self._training_X, self._training_y)
score = evaluation.evaluate(model, protocol=self.protocol, error_metric=self.error)
return score
@@ -159,17 +160,19 @@ class GridSearchQ(BaseQuantifier):
return False
return True
-def _compute_scores_aggregative(self, training):
+def _compute_scores_aggregative(self, X, y):
# break down the set of hyperparameters into two: classifier-specific, quantifier-specific
cls_configs, q_configs = group_params(self.param_grid)
# train all classifiers and get the predictions
-self._training = training
+self._training_X = X
+self._training_y = y
cls_outs = qp.util.parallel(
self._prepare_classifier,
cls_configs,
seed=qp.environ.get('_R_SEED', None),
-n_jobs=self.n_jobs
+n_jobs=self.n_jobs,
+asarray=False
)
# filter out classifier configurations that yielded any error
@@ -194,9 +197,10 @@ class GridSearchQ(BaseQuantifier):
return aggr_outs
-def _compute_scores_nonaggregative(self, training):
+def _compute_scores_nonaggregative(self, X, y):
configs = expand_grid(self.param_grid)
-self._training = training
+self._training_X = X
+self._training_y = y
scores = qp.util.parallel(
self._prepare_nonaggr_model,
configs,
@@ -211,11 +215,12 @@ class GridSearchQ(BaseQuantifier):
else:
self._sout(f'error={status}')
-def fit(self, training: LabelledCollection):
+def fit(self, X, y):
""" Learning routine. Fits methods with all combinations of hyperparameters and selects the one minimizing
the error metric.
-:param training: the training set on which to optimize the hyperparameters
+:param X: array-like, training covariates
+:param y: array-like, labels of training data
:return: self
"""
@@ -231,9 +236,9 @@ class GridSearchQ(BaseQuantifier):
self._sout(f'starting model selection with n_jobs={self.n_jobs}')
if self._break_down_fit():
-results = self._compute_scores_aggregative(training)
+results = self._compute_scores_aggregative(X, y)
else:
-results = self._compute_scores_nonaggregative(training)
+results = self._compute_scores_nonaggregative(X, y)
self.param_scores_ = {}
self.best_score_ = None
@@ -266,7 +271,10 @@ class GridSearchQ(BaseQuantifier):
if isinstance(self.protocol, OnLabelledCollectionProtocol):
tinit = time()
self._sout(f'refitting on the whole development set')
-self.best_model_.fit(training + self.protocol.get_labelled_collection())
+validation_collection = self.protocol.get_labelled_collection()
+training_collection = LabelledCollection(X, y, classes=validation_collection.classes)
+devel_collection = training_collection + validation_collection
+self.best_model_.fit(*devel_collection.Xy)
tend = time() - tinit
self.refit_time_ = tend
else:
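The refit branch above merges the raw training pair (X, y) with the protocol's validation collection before retraining; a toy sketch of that recombination (random data, names chosen for illustration):

import numpy as np
from quapy.data import LabelledCollection

# toy training data as plain arrays, plus a toy validation collection
X = np.random.rand(8, 2)
y = np.asarray([0, 1, 0, 1, 0, 1, 0, 1])
validation_collection = LabelledCollection(np.random.rand(4, 2), np.asarray([0, 1, 0, 1]))

# rebuild a LabelledCollection from (X, y), reusing the validation class order,
# and join both parts into the full development set used for refitting
training_collection = LabelledCollection(X, y, classes=validation_collection.classes)
devel_collection = training_collection + validation_collection
print(len(devel_collection), devel_collection.prevalence())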