some refactoring made in order to accommodate OneVsAll so that it can operate with aggregative probabilistic quantifiers; launching OneVsAll(HDy)

2021-01-18 16:52:19 +01:00 · 2021-01-18 16:52:19 +01:00 · b30c40b7a0
parent e2eb3b6f06
commit b30c40b7a0
7 changed files with 122 additions and 49 deletions
--- a/TweetSentQuant/experiments.py
+++ b/TweetSentQuant/experiments.py
@ -1,6 +1,6 @@
 from sklearn.linear_model import LogisticRegression
 import quapy as qp
-from quapy.method.aggregative import OneVsAll
+from quapy.method.aggregative import CC, ACC, PCC, PACC, EMQ, OneVsAll, SVMQ, SVMKLD, SVMNKLD, SVMAE, SVMRAE, HDy
 import quapy.functional as F
 import numpy as np
 import os
@ -22,19 +22,26 @@ def quantification_models():
    __C_range = np.logspace(-4, 5, 10)
    lr_params = {'C': __C_range, 'class_weight': [None, 'balanced']}
    svmperf_params = {'C': __C_range}
-    yield 'cc', qp.method.aggregative.CC(newLR()), lr_params
-    yield 'acc', qp.method.aggregative.ACC(newLR()), lr_params
-    yield 'pcc', qp.method.aggregative.PCC(newLR()), lr_params
-    yield 'pacc', qp.method.aggregative.PACC(newLR()), lr_params
-    yield 'sld', qp.method.aggregative.EMQ(newLR()), lr_params
-    yield 'svmq', OneVsAll(qp.method.aggregative.SVMQ(args.svmperfpath)), svmperf_params
-    yield 'svmkld', OneVsAll(qp.method.aggregative.SVMKLD(args.svmperfpath)), svmperf_params
-    yield 'svmnkld', OneVsAll(qp.method.aggregative.SVMNKLD(args.svmperfpath)), svmperf_params
-    yield 'svmmae', OneVsAll(qp.method.aggregative.SVMAE(args.svmperfpath)), svmperf_params
-    yield 'svmmrae', OneVsAll(qp.method.aggregative.SVMRAE(args.svmperfpath)), svmperf_params

-    #sld = qp.method.aggregative.EMQ(newLR())
-    #yield 'paccsld', qp.method.aggregative.PACC(sld), lr_params
+    # methods tested in Gao & Sebastiani 2016
+    yield 'cc', CC(newLR()), lr_params
+    yield 'acc', ACC(newLR()), lr_params
+    yield 'pcc', PCC(newLR()), lr_params
+    yield 'pacc', PACC(newLR()), lr_params
+    yield 'sld', EMQ(newLR()), lr_params
+    yield 'svmq', OneVsAll(SVMQ(args.svmperfpath)), svmperf_params
+    yield 'svmkld', OneVsAll(SVMKLD(args.svmperfpath)), svmperf_params
+    yield 'svmnkld', OneVsAll(SVMNKLD(args.svmperfpath)), svmperf_params
+
+    # methods added
+    yield 'svmmae', OneVsAll(SVMAE(args.svmperfpath)), svmperf_params
+    yield 'svmmrae', OneVsAll(SVMRAE(args.svmperfpath)), svmperf_params
+    yield 'hdy', OneVsAll(HDy(newLR())), lr_params
+
+    # to add:
+    # quapy
+    # ensembles
+    #

 #     'mlpe': lambda learner: MaximumLikelihoodPrevalenceEstimation(),

--- a/quapy/__init__.py
+++ b/quapy/__init__.py
@ -7,7 +7,7 @@ from . import evaluation
 from . import plot
 from . import util
 from . import model_selection
-from quapy.method.aggregative import isaggregative, isprobabilistic
+from quapy.method.base import isprobabilistic, isaggregative


 environ = {
@ -21,3 +21,5 @@ environ = {

 def isbinary(x):
    return x.binary
+
+
--- a/quapy/evaluation.py
+++ b/quapy/evaluation.py
@ -8,6 +8,7 @@ import quapy as qp
 from quapy.data import LabelledCollection
 from quapy.method.base import BaseQuantifier
 from quapy.util import temp_seed
+import quapy.functional as F


 def artificial_sampling_prediction(
@ -39,18 +40,18 @@ def artificial_sampling_prediction(
    with temp_seed(random_seed):
        indexes = list(test.artificial_sampling_index_generator(sample_size, n_prevpoints, n_repetitions))

-    if isinstance(model, qp.method.aggregative.AggregativeQuantifier):
-        # print('\tinstance of aggregative-quantifier')
+    if model.aggregative: #isinstance(model, qp.method.aggregative.AggregativeQuantifier):
+        print('\tinstance of aggregative-quantifier')
        quantification_func = model.aggregate
-        if isinstance(model, qp.method.aggregative.AggregativeProbabilisticQuantifier):
-            # print('\t\tinstance of probabilitstic-aggregative-quantifier')
+        if model.probabilistic: # isinstance(model, qp.method.aggregative.AggregativeProbabilisticQuantifier):
+            print('\t\tinstance of probabilitstic-aggregative-quantifier')
            preclassified_instances = model.posterior_probabilities(test.instances)
        else:
-            # print('\t\tinstance of hard-aggregative-quantifier')
+            print('\t\tinstance of hard-aggregative-quantifier')
            preclassified_instances = model.classify(test.instances)
        test = LabelledCollection(preclassified_instances, test.labels)
    else:
-        # print('\t\tinstance of base-quantifier')
+        print('\t\tinstance of base-quantifier')
        quantification_func = model.quantify

    def _predict_prevalences(index):
--- a/quapy/method/aggregative.py
+++ b/quapy/method/aggregative.py
@ -1,7 +1,6 @@
 from abc import abstractmethod
 from copy import deepcopy
 from typing import Union
-
 import numpy as np
 from joblib import Parallel, delayed
 from sklearn.base import BaseEstimator
@ -60,6 +59,10 @@ class AggregativeQuantifier(BaseQuantifier):
    def classes(self):
        return self.learner.classes_

+    @property
+    def aggregative(self):
+        return True
+

 class AggregativeProbabilisticQuantifier(AggregativeQuantifier):
    """
@ -84,6 +87,9 @@ class AggregativeProbabilisticQuantifier(AggregativeQuantifier):
            parameters={'base_estimator__'+k:v for k,v in parameters.items()}
        self.learner.set_params(**parameters)

+    @property
+    def probabilistic(self):
+        return True


 # Helper
@ -385,6 +391,10 @@ class HDy(AggregativeProbabilisticQuantifier, BinaryQuantifier):
        Px = self.posterior_probabilities(validation.instances)[:,1]  # takes only the P(y=+1|x)
        self.Pxy1 = Px[validation.labels == 1]
        self.Pxy0 = Px[validation.labels == 0]
+        # pre-compute the histogram for positive and negative examples
+        self.bins = np.linspace(10, 110, 11, dtype=int)  #[10, 20, 30, ..., 100, 110]
+        self.Pxy1_density = {bins: np.histogram(self.Pxy1, bins=bins, range=(0, 1), density=True)[0] for bins in self.bins}
+        self.Pxy0_density = {bins: np.histogram(self.Pxy0, bins=bins, range=(0, 1), density=True)[0] for bins in self.bins}
        return self

    def aggregate(self, classif_posteriors):
@ -395,9 +405,12 @@ class HDy(AggregativeProbabilisticQuantifier, BinaryQuantifier):
        Px = classif_posteriors[:,1]  # takes only the P(y=+1|x)

        prev_estimations = []
-        for bins in np.linspace(10, 110, 11, dtype=int):  #[10, 20, 30, ..., 100, 110]
-            Pxy0_density, _ = np.histogram(self.Pxy0, bins=bins, range=(0, 1), density=True)
-            Pxy1_density, _ = np.histogram(self.Pxy1, bins=bins, range=(0, 1), density=True)
+        #for bins in np.linspace(10, 110, 11, dtype=int):  #[10, 20, 30, ..., 100, 110]
+            #Pxy0_density, _ = np.histogram(self.Pxy0, bins=bins, range=(0, 1), density=True)
+            #Pxy1_density, _ = np.histogram(self.Pxy1, bins=bins, range=(0, 1), density=True)
+        for bins in self.bins:
+            Pxy0_density = self.Pxy0_density[bins]
+            Pxy1_density = self.Pxy1_density[bins]

            Px_test, _ = np.histogram(Px, bins=bins, range=(0, 1), density=True)

@ -488,9 +501,7 @@ class OneVsAll(AggregativeQuantifier):
        assert isinstance(self.binary_quantifier, BaseQuantifier), \
            f'{self.binary_quantifier} does not seem to be a Quantifier'
        assert fit_learner==True, 'fit_learner must be True'
-        if not isinstance(self.binary_quantifier, BinaryQuantifier):
-            raise ValueError(f'{self.binary_quantifier.__class__.__name__} does not seem to be an instance of '
-                             f'{BinaryQuantifier.__class__.__name__}')
+
        self.dict_binary_quantifiers = {c: deepcopy(self.binary_quantifier) for c in data.classes_}
        self.__parallel(self._delayed_binary_fit, data)
        return self
@ -502,20 +513,39 @@ class OneVsAll(AggregativeQuantifier):
        classif_predictions_bin = self.__parallel(self._delayed_binary_classification, instances)
        return classif_predictions_bin.T

+    def posterior_probabilities(self, instances):
+        # returns a matrix of shape (n,m,2) with n the number of instances and m the number of classes. The entry
+        # (i,j,1) (resp. (i,j,0)) is a value in [0,1] indicating the posterior probability that instance i belongs
+        # (resp. does not belong) to class j.
+        # The posterior probabilities are independent of each other, meaning that, in general, they do not sum
+        # up to one.
+        if not self.binary_quantifier.probabilistic:
+            raise NotImplementedError(f'{self.__class__.__name__} does not implement posterior_probabilities because '
+                                      f'the base quantifier {self.binary_quantifier.__class__.__name__} is not '
+                                      f'probabilistic')
+        posterior_predictions_bin = self.__parallel(self._delayed_binary_posteriors, instances)
+        return np.swapaxes(posterior_predictions_bin, 0, 1)
+
    def aggregate(self, classif_predictions_bin):
-        assert set(np.unique(classif_predictions_bin)).issubset({0,1}), \
-            'param classif_predictions_bin does not seem to be a valid matrix (ndarray) of binary ' \
-            'predictions for each document (row) and class (columns)'
+        if self.probabilistic:
+            assert classif_predictions_bin.shape[1]==self.n_classes and classif_predictions_bin.shape[2]==2, \
+                'param classif_predictions_bin does not seem to be a valid matrix (ndarray) of posterior ' \
+                'probabilities (2 dimensions) for each document (row) and class (columns)'
+        else:
+            assert set(np.unique(classif_predictions_bin)).issubset({0,1}), \
+                'param classif_predictions_bin does not seem to be a valid matrix (ndarray) of binary ' \
+                'predictions for each document (row) and class (columns)'
        prevalences = self.__parallel(self._delayed_binary_aggregate, classif_predictions_bin)
-        #prevalences = []
-        #for c in self.classes:
-        #    prevalences.append(self._delayed_binary_aggregate(c, classif_predictions_bin))
-        #prevalences = np.asarray(prevalences)
        return F.normalize_prevalence(prevalences)

    def quantify(self, X):
-        prevalences = self.__parallel(self._delayed_binary_quantify, X)
-        return F.normalize_prevalence(prevalences)
+        if self.probabilistic:
+            predictions = self.posterior_probabilities(X)
+        else:
+            predictions = self.classify(X)
+        return self.aggregate(predictions)
+        #prevalences = self.__parallel(self._delayed_binary_quantify, X)
+        #return F.normalize_prevalence(prevalences)

    def __parallel(self, func, *args, **kwargs):
        return np.asarray(
@ -537,9 +567,12 @@ class OneVsAll(AggregativeQuantifier):
    def _delayed_binary_classification(self, c, X):
        return self.dict_binary_quantifiers[c].classify(X)

-    def _delayed_binary_quantify(self, c, X):
+    def _delayed_binary_posteriors(self, c, X):
+        return self.dict_binary_quantifiers[c].posterior_probabilities(X)
+
+    #def _delayed_binary_quantify(self, c, X):
        # the estimation for the positive class prevalence
-        return self.dict_binary_quantifiers[c].quantify(X)[1]
+    #    return self.dict_binary_quantifiers[c].quantify(X)[1]

    def _delayed_binary_aggregate(self, c, classif_predictions):
        # the estimation for the positive class prevalence
@ -549,13 +582,14 @@ class OneVsAll(AggregativeQuantifier):
        bindata = LabelledCollection(data.instances, data.labels == c, n_classes=2)
        self.dict_binary_quantifiers[c].fit(bindata)

+    @property
+    def binary(self):
+        return False
+
+    @property
+    def probabilistic(self):
+        return self.binary_quantifier.probabilistic

-def isaggregative(model:BaseQuantifier):
-    return isinstance(model, AggregativeQuantifier)
-
-
-def isprobabilistic(model:BaseQuantifier):
-    return isinstance(model, AggregativeProbabilisticQuantifier)



--- a/quapy/method/base.py
+++ b/quapy/method/base.py
@ -5,12 +5,10 @@ from quapy.data import LabelledCollection

 # Base Quantifier abstract class
 # ------------------------------------
-
-
 class BaseQuantifier(metaclass=ABCMeta):

    @abstractmethod
-    def fit(self, data): ...
+    def fit(self, data: LabelledCollection): ...

    @abstractmethod
    def quantify(self, instances): ...
@ -21,10 +19,20 @@ class BaseQuantifier(metaclass=ABCMeta):
    @abstractmethod
    def get_params(self, deep=True): ...

+    # these properties allow meta-learners to reimplement the decision based on their constituents, and not
+    # based on class structure
    @property
    def binary(self):
        return False

+    @property
+    def aggregative(self):
+        return False
+
+    @property
+    def probabilistic(self):
+        return False
+

 class BinaryQuantifier(BaseQuantifier):
    def _check_binary(self, data: LabelledCollection, quantifier_name):
@ -40,7 +48,15 @@ def isbinary(model:BaseQuantifier):
    return model.binary


-# class OneVsAll(AggregativeQuantifier):
+def isaggregative(model:BaseQuantifier):
+    return model.aggregative
+
+
+def isprobabilistic(model:BaseQuantifier):
+    return model.probabilistic
+
+
+# class OneVsAll:
 #     """
 #     Allows any binary quantifier to perform quantification on single-label datasets. The method maintains one binary
 #     quantifier for each class, and then l1-normalizes the outputs so that the class prevelences sum up to 1.
--- a/quapy/method/meta.py
+++ b/quapy/method/meta.py
@ -152,6 +152,19 @@ class Ensemble(BaseQuantifier):
        order = np.argsort(dist)
        return select_k(predictions, order, k=self.red_size)

+    @property
+    def binary(self):
+        return self.base_quantifier.binary
+
+    @property
+    def aggregative(self):
+        raise NotImplementedError('aggregative functionality not yet supported for Ensemble')
+
+    @property
+    def probabilistic(self):
+        raise NotImplementedError('probabilistic functionality not yet supported for Ensemble')
+        #return self.base_quantifier.probabilistic
+

 def get_probability_distribution(posterior_probabilities, bins=8):
    assert posterior_probabilities.shape[1]==2, 'the posterior probabilities do not seem to be for a binary problem'
--- a/quapy/model_selection.py
+++ b/quapy/model_selection.py
@ -157,7 +157,7 @@ class GridSearchQ(BaseQuantifier):
                model.fit(training)
                true_prevalences, estim_prevalences = artificial_sampling_prediction(
                    model, validation, self.sample_size, self.n_prevpoints, self.n_repetitions, n_jobs, self.random_seed,
-                    verbose=False
+                    verbose=True
                )

                score = self.error(true_prevalences, estim_prevalences)