cleaning examples
parent bf71aecf91
commit 3c16536b3d

TODO.txt | 3
@@ -1,4 +1,5 @@
-Adapt examples; remaining: example 4-onwards
+not working: 4, 4b, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
+
 
 Add 'platt' to calib options in EMQ?
 
@@ -8,7 +9,7 @@ Update READMEs, wiki, & examples for new fit-predict interface
 
 Add the fix suggested by Alexander:
 
-For a more general application, I would maybe first stablish a per-class threshold value of plausible prevalence
+For a more general application, I would maybe first establish a per-class threshold value of plausible prevalence
 based on the number of actual positives and the required sample size; e.g., for sample_size=100 and actual
 positives [10, 100, 500] -> [0.1, 1.0, 1.0], meaning that class 0 can be sampled at most at 0.1 prevalence, while
 the others can be sampled up to 1. prevalence. Then, when a prevalence value is requested, e.g., [0.33, 0.33, 0.33],
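A minimal sketch of the idea in that note (hypothetical helper, not part of QuaPy):

    import numpy as np

    def clip_prevalence(requested, n_positives, sample_size):
        # per-class ceiling: a class with k actual positives can fill at most k/sample_size of a sample
        max_prev = np.minimum(np.asarray(n_positives) / sample_size, 1.0)
        clipped = np.minimum(np.asarray(requested), max_prev)
        return clipped / clipped.sum()  # renormalize so the prevalence vector sums to 1

    # e.g., sample_size=100 and actual positives [10, 100, 500] -> ceilings [0.1, 1.0, 1.0]
    print(clip_prevalence([0.33, 0.33, 0.33], [10, 100, 500], 100))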
@@ -37,7 +37,7 @@ quantifier = EMQ(classifier=LogisticRegression())
 param_grid = {
     'classifier__C': np.logspace(-3, 3, 7),          # classifier-dependent: inverse of regularization strength
     'classifier__class_weight': ['balanced', None],  # classifier-dependent: weights of each class
-    'calib': ['bcts', None]                          # quantifier-dependent: recalibration method (new in v0.1.7)
+    # 'calib': ['bcts', None]                        # quantifier-dependent: recalibration method (new in v0.1.7)
 }
 model_selection = GridSearchQ(quantifier, param_grid, protocol=val_generator, error='mrae', refit=False, verbose=True)
 quantifier = model_selection.fit(Xtr, ytr)
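A sketch of the follow-up step under the new fit-predict interface (Xte, a held-out covariate matrix, is assumed):

    best = model_selection.best_params_   # inspect the winning configuration
    estim_prev = quantifier.predict(Xte)  # class prevalence estimates for the test sample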
@@ -51,4 +51,4 @@ report['estim-prev'] = report['estim-prev'].map(F.strprev)
 print(report)
 
 print('Averaged values:')
-print(report.mean())
+print(report.mean(numeric_only=True))
@@ -50,7 +50,7 @@ train_modsel, val = qp.datasets.fetch_twitter('hcr', for_model_selection=True, p
 model selection:
 We explore the classifier's loss and the classifier's C hyperparameters.
 Since our model is actually an instance of OneVsAllAggregative, we need to add the prefix "binary_quantifier", and
-since our binary quantifier is an instance of CC, we need to add the prefix "classifier".
+since our binary quantifier is an instance of CC (an aggregative quantifier), we need to add the prefix "classifier".
 """
 param_grid = {
     'binary_quantifier__classifier__loss': ['q', 'kld', 'mae'], # classifier-dependent hyperparameter
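A sketch of how the prefixes compose (the C key is an assumption, formed by the same convention as the loss key above):

    import numpy as np
    param_grid = {
        'binary_quantifier__classifier__loss': ['q', 'kld', 'mae'],  # OneVsAll -> binary CC -> its classifier
        'binary_quantifier__classifier__C': np.logspace(-3, 3, 7),   # assumed key, same pattern for C
    }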
@@ -20,11 +20,10 @@ train, test = dataset.train_test
 # train the text classifier:
 cnn_module = CNNnet(dataset.vocabulary_size, dataset.training.n_classes)
 cnn_classifier = NeuralClassifierTrainer(cnn_module, device='cuda')
 cnn_classifier.fit(*dataset.training.Xy)
 
 # train QuaNet (alternatively, we can set fit_classifier=True and let QuaNet train the classifier)
 quantifier = QuaNet(cnn_classifier, device='cuda')
-quantifier.fit(train, fit_classifier=False)
+quantifier.fit(*train.Xy)
 
 # prediction and evaluation
 estim_prevalence = quantifier.predict(test.instances)
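For reference, the *train.Xy unpacking above is shorthand for passing covariates and labels separately:

    X, y = train.Xy       # LabelledCollection.Xy == (instances, labels)
    quantifier.fit(X, y)  # equivalent to quantifier.fit(*train.Xy)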
@@ -50,7 +50,7 @@ def quantification_models():
     yield 'MAX', MAX(newLR()), lr_params
     yield 'MS', MS(newLR()), lr_params
     yield 'MS2', MS2(newLR()), lr_params
-    yield 'sldc', EMQ(newLR(), calib='platt'), lr_params
+    yield 'sldc', EMQ(newLR()), lr_params
     yield 'svmmae', newSVMAE(), svmperf_params
     yield 'hdy', HDy(newLR()), lr_params
 
@@ -98,8 +98,8 @@ def run(experiment):
     print(f'running dataset={dataset_name} model={model_name} loss={optim_loss} run={run+1}/5')
     # model selection (hyperparameter optimization for a quantification-oriented loss)
     train, test = data.train_test
-    train, val = train.split_stratified()
     if hyperparams is not None:
+        train, val = train.split_stratified()
         model_selection = qp.model_selection.GridSearchQ(
             deepcopy(model),
             param_grid=hyperparams,
@@ -109,11 +109,11 @@ def run(experiment):
             timeout=60*60,
             verbose=True
         )
-        model_selection.fit(train)
+        model_selection.fit(*train.Xy)
         model = model_selection.best_model()
         best_params = model_selection.best_params_
     else:
-        model.fit(data.training)
+        model.fit(*train.Xy)
         best_params = {}
 
     # model evaluation
@@ -121,19 +121,19 @@ def run(experiment):
         model,
         protocol=APP(test, n_prevalences=21, repeats=100)
     )
-    test_true_prevalence = data.test.prevalence()
+    test_true_prevalence = test.prevalence()
 
     evaluate_experiment(true_prevalences, estim_prevalences)
     save_results(dataset_name, model_name, run, optim_loss,
                  true_prevalences, estim_prevalences,
-                 data.training.prevalence(), test_true_prevalence,
+                 train.prevalence(), test_true_prevalence,
                  best_params)
 
 
 if __name__ == '__main__':
     parser = argparse.ArgumentParser(description='Run experiments for Tweeter Sentiment Quantification')
-    parser.add_argument('results', metavar='RESULT_PATH', type=str,
-                        help='path to the directory where to store the results')
+    parser.add_argument('--results', metavar='RESULT_PATH', type=str,
+                        help='path to the directory where to store the results', default='./uci_results')
     parser.add_argument('--svmperfpath', metavar='SVMPERF_PATH', type=str, default='../svm_perf_quantification',
                         help='path to the directory with svmperf')
     parser.add_argument('--checkpointdir', metavar='PATH', type=str, default='./checkpoint',
@@ -1401,7 +1401,7 @@ class OneVsAllAggregative(OneVsAllGeneric, AggregativeQuantifier):
         """
         If the base quantifier is not probabilistic, returns a matrix of shape `(n,m,)` with `n` the number of
         instances and `m` the number of classes. The entry `(i,j)` is a binary value indicating whether instance
-        `i `belongs to class `j`. The binary classifications are independent of each other, meaning that an instance
+        `i` belongs to class `j`. The binary classifications are independent of each other, meaning that an instance
         can end up be attributed to 0, 1, or more classes.
         If the base quantifier is probabilistic, returns a matrix of shape `(n,m,2)` with `n` the number of instances
         and `m` the number of classes. The entry `(i,j,1)` (resp. `(i,j,0)`) is a value in [0,1] indicating the
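A toy illustration of the two output shapes described in that docstring (values are made up, not QuaPy output):

    import numpy as np
    n, m = 5, 3                                 # 5 instances, 3 classes
    crisp = np.zeros((n, m), dtype=int)         # entry (i,j): does instance i belong to class j?
    crisp[0, [0, 2]] = 1                        # the binaries are independent: instance 0 gets classes 0 and 2
    probs = np.random.rand(n, m, 2)
    probs /= probs.sum(axis=2, keepdims=True)   # entry (i,j,1) = Pr(instance i in class j); (i,j,0) its complement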
@@ -1422,6 +1422,10 @@ class OneVsAllAggregative(OneVsAllGeneric, AggregativeQuantifier):
         prevalences = self._parallel(self._delayed_binary_aggregate, classif_predictions)
         return F.normalize_prevalence(prevalences)
 
+    def aggregation_fit(self, classif_predictions, labels):
+        self._parallel(self._delayed_binary_aggregate_fit, classif_predictions, labels)
+        return self
+
     def _delayed_binary_classification(self, c, X):
         return self.dict_binary_quantifiers[c].classify(X)
 
@@ -1429,6 +1433,10 @@ class OneVsAllAggregative(OneVsAllGeneric, AggregativeQuantifier):
         # the estimation for the positive class prevalence
         return self.dict_binary_quantifiers[c].aggregate(classif_predictions[:, c])[1]
 
+    def _delayed_binary_aggregate_fit(self, c, classif_predictions, labels):
+        # trains the aggregation function of the c-th binary quantifier
+        return self.dict_binary_quantifiers[c].aggregation_fit(classif_predictions[:, c], labels)
+
 
 class AggregativeMedianEstimator(BinaryQuantifier):
     """
@@ -89,18 +89,18 @@ class OneVsAllGeneric(OneVsAll, BaseQuantifier):
         self.binary_quantifier = binary_quantifier
         self.n_jobs = qp._get_njobs(n_jobs)
 
-    def fit(self, data: LabelledCollection, fit_classifier=True):
-        assert not data.binary, f'{self.__class__.__name__} expect non-binary data'
-        assert fit_classifier == True, 'fit_classifier must be True'
+    def fit(self, X, y):
+        self.classes = sorted(np.unique(y))
+        assert len(self.classes) != 2, f'{self.__class__.__name__} expect non-binary data'
 
-        self.dict_binary_quantifiers = {c: deepcopy(self.binary_quantifier) for c in data.classes_}
-        self._parallel(self._delayed_binary_fit, data)
+        self.dict_binary_quantifiers = {c: deepcopy(self.binary_quantifier) for c in self.classes}
+        self._parallel(self._delayed_binary_fit, X, y)
         return self
 
     def _parallel(self, func, *args, **kwargs):
         return np.asarray(
             Parallel(n_jobs=self.n_jobs, backend='threading')(
-                delayed(func)(c, *args, **kwargs) for c in self.classes_
+                delayed(func)(c, *args, **kwargs) for c in self.classes
             )
         )
 
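A minimal sketch of the per-class fan-out pattern that _parallel implements (toy function, not QuaPy):

    from joblib import Parallel, delayed
    import numpy as np

    classes = [0, 1, 2]

    def per_class_job(c, X):
        return X[:, c].sum()  # any per-class computation

    X = np.ones((4, 3))
    results = np.asarray(Parallel(n_jobs=2, backend='threading')(
        delayed(per_class_job)(c, X) for c in classes
    ))
    print(results)  # one result per class: [4. 4. 4.]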
@@ -108,13 +108,13 @@ class OneVsAllGeneric(OneVsAll, BaseQuantifier):
         prevalences = self._parallel(self._delayed_binary_predict, X)
         return qp.functional.normalize_prevalence(prevalences)
 
-    @property
-    def classes_(self):
-        return sorted(self.dict_binary_quantifiers.keys())
+    # @property
+    # def classes_(self):
+    #     return sorted(self.dict_binary_quantifiers.keys())
 
     def _delayed_binary_predict(self, c, X):
         return self.dict_binary_quantifiers[c].predict(X)[1]
 
-    def _delayed_binary_fit(self, c, data):
-        bindata = LabelledCollection(data.instances, data.labels == c, classes=[False, True])
-        self.dict_binary_quantifiers[c].fit(bindata)
+    def _delayed_binary_fit(self, c, X, y):
+        bindata = LabelledCollection(X, y == c, classes=[False, True])
+        self.dict_binary_quantifiers[c].fit(*bindata.Xy)
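The one-vs-all binarization behind _delayed_binary_fit, sketched with plain numpy (toy labels):

    import numpy as np
    y = np.array([0, 1, 2, 1, 0, 2])
    for c in np.unique(y):
        y_bin = (y == c)  # True where the instance belongs to class c
        # each binary quantifier is trained on (X, y_bin) with classes [False, True]
        print(c, y_bin.astype(int))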