1
0
Fork 0

Some refactoring made in order to accommodate OneVsAll to operate with aggregative probabilistic quantifiers; launching OneVsAll(HDy)

This commit is contained in:
Alejandro Moreo Fernandez 2021-01-18 16:52:19 +01:00
parent e2eb3b6f06
commit b30c40b7a0
7 changed files with 122 additions and 49 deletions

View File

@ -1,6 +1,6 @@
from sklearn.linear_model import LogisticRegression from sklearn.linear_model import LogisticRegression
import quapy as qp import quapy as qp
from quapy.method.aggregative import OneVsAll from quapy.method.aggregative import CC, ACC, PCC, PACC, EMQ, OneVsAll, SVMQ, SVMKLD, SVMNKLD, SVMAE, SVMRAE, HDy
import quapy.functional as F import quapy.functional as F
import numpy as np import numpy as np
import os import os
@ -22,19 +22,26 @@ def quantification_models():
__C_range = np.logspace(-4, 5, 10) __C_range = np.logspace(-4, 5, 10)
lr_params = {'C': __C_range, 'class_weight': [None, 'balanced']} lr_params = {'C': __C_range, 'class_weight': [None, 'balanced']}
svmperf_params = {'C': __C_range} svmperf_params = {'C': __C_range}
yield 'cc', qp.method.aggregative.CC(newLR()), lr_params
yield 'acc', qp.method.aggregative.ACC(newLR()), lr_params
yield 'pcc', qp.method.aggregative.PCC(newLR()), lr_params
yield 'pacc', qp.method.aggregative.PACC(newLR()), lr_params
yield 'sld', qp.method.aggregative.EMQ(newLR()), lr_params
yield 'svmq', OneVsAll(qp.method.aggregative.SVMQ(args.svmperfpath)), svmperf_params
yield 'svmkld', OneVsAll(qp.method.aggregative.SVMKLD(args.svmperfpath)), svmperf_params
yield 'svmnkld', OneVsAll(qp.method.aggregative.SVMNKLD(args.svmperfpath)), svmperf_params
yield 'svmmae', OneVsAll(qp.method.aggregative.SVMAE(args.svmperfpath)), svmperf_params
yield 'svmmrae', OneVsAll(qp.method.aggregative.SVMRAE(args.svmperfpath)), svmperf_params
#sld = qp.method.aggregative.EMQ(newLR()) # methods tested in Gao & Sebastiani 2016
#yield 'paccsld', qp.method.aggregative.PACC(sld), lr_params yield 'cc', CC(newLR()), lr_params
yield 'acc', ACC(newLR()), lr_params
yield 'pcc', PCC(newLR()), lr_params
yield 'pacc', PACC(newLR()), lr_params
yield 'sld', EMQ(newLR()), lr_params
yield 'svmq', OneVsAll(SVMQ(args.svmperfpath)), svmperf_params
yield 'svmkld', OneVsAll(SVMKLD(args.svmperfpath)), svmperf_params
yield 'svmnkld', OneVsAll(SVMNKLD(args.svmperfpath)), svmperf_params
# methods added
yield 'svmmae', OneVsAll(SVMAE(args.svmperfpath)), svmperf_params
yield 'svmmrae', OneVsAll(SVMRAE(args.svmperfpath)), svmperf_params
yield 'hdy', OneVsAll(HDy(newLR())), lr_params
# to add:
# quapy
# ensembles
#
# 'mlpe': lambda learner: MaximumLikelihoodPrevalenceEstimation(), # 'mlpe': lambda learner: MaximumLikelihoodPrevalenceEstimation(),

View File

@ -7,7 +7,7 @@ from . import evaluation
from . import plot from . import plot
from . import util from . import util
from . import model_selection from . import model_selection
from quapy.method.aggregative import isaggregative, isprobabilistic from quapy.method.base import isprobabilistic, isaggregative
environ = { environ = {
@ -21,3 +21,5 @@ environ = {
def isbinary(x): def isbinary(x):
return x.binary return x.binary

View File

@ -8,6 +8,7 @@ import quapy as qp
from quapy.data import LabelledCollection from quapy.data import LabelledCollection
from quapy.method.base import BaseQuantifier from quapy.method.base import BaseQuantifier
from quapy.util import temp_seed from quapy.util import temp_seed
import quapy.functional as F
def artificial_sampling_prediction( def artificial_sampling_prediction(
@ -39,18 +40,18 @@ def artificial_sampling_prediction(
with temp_seed(random_seed): with temp_seed(random_seed):
indexes = list(test.artificial_sampling_index_generator(sample_size, n_prevpoints, n_repetitions)) indexes = list(test.artificial_sampling_index_generator(sample_size, n_prevpoints, n_repetitions))
if isinstance(model, qp.method.aggregative.AggregativeQuantifier): if model.aggregative: #isinstance(model, qp.method.aggregative.AggregativeQuantifier):
# print('\tinstance of aggregative-quantifier') print('\tinstance of aggregative-quantifier')
quantification_func = model.aggregate quantification_func = model.aggregate
if isinstance(model, qp.method.aggregative.AggregativeProbabilisticQuantifier): if model.probabilistic: # isinstance(model, qp.method.aggregative.AggregativeProbabilisticQuantifier):
# print('\t\tinstance of probabilitstic-aggregative-quantifier') print('\t\tinstance of probabilitstic-aggregative-quantifier')
preclassified_instances = model.posterior_probabilities(test.instances) preclassified_instances = model.posterior_probabilities(test.instances)
else: else:
# print('\t\tinstance of hard-aggregative-quantifier') print('\t\tinstance of hard-aggregative-quantifier')
preclassified_instances = model.classify(test.instances) preclassified_instances = model.classify(test.instances)
test = LabelledCollection(preclassified_instances, test.labels) test = LabelledCollection(preclassified_instances, test.labels)
else: else:
# print('\t\tinstance of base-quantifier') print('\t\tinstance of base-quantifier')
quantification_func = model.quantify quantification_func = model.quantify
def _predict_prevalences(index): def _predict_prevalences(index):

View File

@ -1,7 +1,6 @@
from abc import abstractmethod from abc import abstractmethod
from copy import deepcopy from copy import deepcopy
from typing import Union from typing import Union
import numpy as np import numpy as np
from joblib import Parallel, delayed from joblib import Parallel, delayed
from sklearn.base import BaseEstimator from sklearn.base import BaseEstimator
@ -60,6 +59,10 @@ class AggregativeQuantifier(BaseQuantifier):
def classes(self): def classes(self):
return self.learner.classes_ return self.learner.classes_
@property
def aggregative(self):
return True
class AggregativeProbabilisticQuantifier(AggregativeQuantifier): class AggregativeProbabilisticQuantifier(AggregativeQuantifier):
""" """
@ -84,6 +87,9 @@ class AggregativeProbabilisticQuantifier(AggregativeQuantifier):
parameters={'base_estimator__'+k:v for k,v in parameters.items()} parameters={'base_estimator__'+k:v for k,v in parameters.items()}
self.learner.set_params(**parameters) self.learner.set_params(**parameters)
@property
def probabilistic(self):
return True
# Helper # Helper
@ -385,6 +391,10 @@ class HDy(AggregativeProbabilisticQuantifier, BinaryQuantifier):
Px = self.posterior_probabilities(validation.instances)[:,1] # takes only the P(y=+1|x) Px = self.posterior_probabilities(validation.instances)[:,1] # takes only the P(y=+1|x)
self.Pxy1 = Px[validation.labels == 1] self.Pxy1 = Px[validation.labels == 1]
self.Pxy0 = Px[validation.labels == 0] self.Pxy0 = Px[validation.labels == 0]
# pre-compute the histogram for positive and negative examples
self.bins = np.linspace(10, 110, 11, dtype=int) #[10, 20, 30, ..., 100, 110]
self.Pxy1_density = {bins: np.histogram(self.Pxy1, bins=bins, range=(0, 1), density=True)[0] for bins in self.bins}
self.Pxy0_density = {bins: np.histogram(self.Pxy0, bins=bins, range=(0, 1), density=True)[0] for bins in self.bins}
return self return self
def aggregate(self, classif_posteriors): def aggregate(self, classif_posteriors):
@ -395,9 +405,12 @@ class HDy(AggregativeProbabilisticQuantifier, BinaryQuantifier):
Px = classif_posteriors[:,1] # takes only the P(y=+1|x) Px = classif_posteriors[:,1] # takes only the P(y=+1|x)
prev_estimations = [] prev_estimations = []
for bins in np.linspace(10, 110, 11, dtype=int): #[10, 20, 30, ..., 100, 110] #for bins in np.linspace(10, 110, 11, dtype=int): #[10, 20, 30, ..., 100, 110]
Pxy0_density, _ = np.histogram(self.Pxy0, bins=bins, range=(0, 1), density=True) #Pxy0_density, _ = np.histogram(self.Pxy0, bins=bins, range=(0, 1), density=True)
Pxy1_density, _ = np.histogram(self.Pxy1, bins=bins, range=(0, 1), density=True) #Pxy1_density, _ = np.histogram(self.Pxy1, bins=bins, range=(0, 1), density=True)
for bins in self.bins:
Pxy0_density = self.Pxy0_density[bins]
Pxy1_density = self.Pxy1_density[bins]
Px_test, _ = np.histogram(Px, bins=bins, range=(0, 1), density=True) Px_test, _ = np.histogram(Px, bins=bins, range=(0, 1), density=True)
@ -488,9 +501,7 @@ class OneVsAll(AggregativeQuantifier):
assert isinstance(self.binary_quantifier, BaseQuantifier), \ assert isinstance(self.binary_quantifier, BaseQuantifier), \
f'{self.binary_quantifier} does not seem to be a Quantifier' f'{self.binary_quantifier} does not seem to be a Quantifier'
assert fit_learner==True, 'fit_learner must be True' assert fit_learner==True, 'fit_learner must be True'
if not isinstance(self.binary_quantifier, BinaryQuantifier):
raise ValueError(f'{self.binary_quantifier.__class__.__name__} does not seem to be an instance of '
f'{BinaryQuantifier.__class__.__name__}')
self.dict_binary_quantifiers = {c: deepcopy(self.binary_quantifier) for c in data.classes_} self.dict_binary_quantifiers = {c: deepcopy(self.binary_quantifier) for c in data.classes_}
self.__parallel(self._delayed_binary_fit, data) self.__parallel(self._delayed_binary_fit, data)
return self return self
@ -502,20 +513,39 @@ class OneVsAll(AggregativeQuantifier):
classif_predictions_bin = self.__parallel(self._delayed_binary_classification, instances) classif_predictions_bin = self.__parallel(self._delayed_binary_classification, instances)
return classif_predictions_bin.T return classif_predictions_bin.T
def posterior_probabilities(self, instances):
# returns a matrix of shape (n,m,2) with n the number of instances and m the number of classes. The entry
# (i,j,1) (resp. (i,j,0)) is a value in [0,1] indicating the posterior probability that instance i belongs
# (resp. does not belong) to class j.
# The posterior probabilities are independent of each other, meaning that, in general, they do not sum
# up to one.
if not self.binary_quantifier.probabilistic:
raise NotImplementedError(f'{self.__class__.__name__} does not implement posterior_probabilities because '
f'the base quantifier {self.binary_quantifier.__class__.__name__} is not '
f'probabilistic')
posterior_predictions_bin = self.__parallel(self._delayed_binary_posteriors, instances)
return np.swapaxes(posterior_predictions_bin, 0, 1)
def aggregate(self, classif_predictions_bin): def aggregate(self, classif_predictions_bin):
assert set(np.unique(classif_predictions_bin)).issubset({0,1}), \ if self.probabilistic:
'param classif_predictions_bin does not seem to be a valid matrix (ndarray) of binary ' \ assert classif_predictions_bin.shape[1]==self.n_classes and classif_predictions_bin.shape[2]==2, \
'predictions for each document (row) and class (columns)' 'param classif_predictions_bin does not seem to be a valid matrix (ndarray) of posterior ' \
'probabilities (2 dimensions) for each document (row) and class (columns)'
else:
assert set(np.unique(classif_predictions_bin)).issubset({0,1}), \
'param classif_predictions_bin does not seem to be a valid matrix (ndarray) of binary ' \
'predictions for each document (row) and class (columns)'
prevalences = self.__parallel(self._delayed_binary_aggregate, classif_predictions_bin) prevalences = self.__parallel(self._delayed_binary_aggregate, classif_predictions_bin)
#prevalences = []
#for c in self.classes:
# prevalences.append(self._delayed_binary_aggregate(c, classif_predictions_bin))
#prevalences = np.asarray(prevalences)
return F.normalize_prevalence(prevalences) return F.normalize_prevalence(prevalences)
def quantify(self, X): def quantify(self, X):
prevalences = self.__parallel(self._delayed_binary_quantify, X) if self.probabilistic:
return F.normalize_prevalence(prevalences) predictions = self.posterior_probabilities(X)
else:
predictions = self.classify(X)
return self.aggregate(predictions)
#prevalences = self.__parallel(self._delayed_binary_quantify, X)
#return F.normalize_prevalence(prevalences)
def __parallel(self, func, *args, **kwargs): def __parallel(self, func, *args, **kwargs):
return np.asarray( return np.asarray(
@ -537,9 +567,12 @@ class OneVsAll(AggregativeQuantifier):
def _delayed_binary_classification(self, c, X): def _delayed_binary_classification(self, c, X):
return self.dict_binary_quantifiers[c].classify(X) return self.dict_binary_quantifiers[c].classify(X)
def _delayed_binary_quantify(self, c, X): def _delayed_binary_posteriors(self, c, X):
return self.dict_binary_quantifiers[c].posterior_probabilities(X)
#def _delayed_binary_quantify(self, c, X):
# the estimation for the positive class prevalence # the estimation for the positive class prevalence
return self.dict_binary_quantifiers[c].quantify(X)[1] # return self.dict_binary_quantifiers[c].quantify(X)[1]
def _delayed_binary_aggregate(self, c, classif_predictions): def _delayed_binary_aggregate(self, c, classif_predictions):
# the estimation for the positive class prevalence # the estimation for the positive class prevalence
@ -549,13 +582,14 @@ class OneVsAll(AggregativeQuantifier):
bindata = LabelledCollection(data.instances, data.labels == c, n_classes=2) bindata = LabelledCollection(data.instances, data.labels == c, n_classes=2)
self.dict_binary_quantifiers[c].fit(bindata) self.dict_binary_quantifiers[c].fit(bindata)
@property
def binary(self):
return False
@property
def probabilistic(self):
return self.binary_quantifier.probabilistic
def isaggregative(model:BaseQuantifier):
return isinstance(model, AggregativeQuantifier)
def isprobabilistic(model:BaseQuantifier):
return isinstance(model, AggregativeProbabilisticQuantifier)

View File

@ -5,12 +5,10 @@ from quapy.data import LabelledCollection
# Base Quantifier abstract class # Base Quantifier abstract class
# ------------------------------------ # ------------------------------------
class BaseQuantifier(metaclass=ABCMeta): class BaseQuantifier(metaclass=ABCMeta):
@abstractmethod @abstractmethod
def fit(self, data): ... def fit(self, data: LabelledCollection): ...
@abstractmethod @abstractmethod
def quantify(self, instances): ... def quantify(self, instances): ...
@ -21,10 +19,20 @@ class BaseQuantifier(metaclass=ABCMeta):
@abstractmethod @abstractmethod
def get_params(self, deep=True): ... def get_params(self, deep=True): ...
# these properties allow meta-learners to reimplement the decision based on their constituents, and not
# based on class structure
@property @property
def binary(self): def binary(self):
return False return False
@property
def aggregative(self):
return False
@property
def probabilistic(self):
return False
class BinaryQuantifier(BaseQuantifier): class BinaryQuantifier(BaseQuantifier):
def _check_binary(self, data: LabelledCollection, quantifier_name): def _check_binary(self, data: LabelledCollection, quantifier_name):
@ -40,7 +48,15 @@ def isbinary(model:BaseQuantifier):
return model.binary return model.binary
# class OneVsAll(AggregativeQuantifier): def isaggregative(model:BaseQuantifier):
return model.aggregative
def isprobabilistic(model:BaseQuantifier):
return model.probabilistic
# class OneVsAll:
# """ # """
# Allows any binary quantifier to perform quantification on single-label datasets. The method maintains one binary # Allows any binary quantifier to perform quantification on single-label datasets. The method maintains one binary
# quantifier for each class, and then l1-normalizes the outputs so that the class prevalences sum up to 1. # quantifier for each class, and then l1-normalizes the outputs so that the class prevalences sum up to 1.

View File

@ -152,6 +152,19 @@ class Ensemble(BaseQuantifier):
order = np.argsort(dist) order = np.argsort(dist)
return select_k(predictions, order, k=self.red_size) return select_k(predictions, order, k=self.red_size)
@property
def binary(self):
return self.base_quantifier.binary
@property
def aggregative(self):
raise NotImplementedError('aggregative functionality not yet supported for Ensemble')
@property
def probabilistic(self):
raise NotImplementedError('probabilistic functionality not yet supported for Ensemble')
#return self.base_quantifier.probabilistic
def get_probability_distribution(posterior_probabilities, bins=8): def get_probability_distribution(posterior_probabilities, bins=8):
assert posterior_probabilities.shape[1]==2, 'the posterior probabilities do not seem to be for a binary problem' assert posterior_probabilities.shape[1]==2, 'the posterior probabilities do not seem to be for a binary problem'

View File

@ -157,7 +157,7 @@ class GridSearchQ(BaseQuantifier):
model.fit(training) model.fit(training)
true_prevalences, estim_prevalences = artificial_sampling_prediction( true_prevalences, estim_prevalences = artificial_sampling_prediction(
model, validation, self.sample_size, self.n_prevpoints, self.n_repetitions, n_jobs, self.random_seed, model, validation, self.sample_size, self.n_prevpoints, self.n_repetitions, n_jobs, self.random_seed,
verbose=False verbose=True
) )
score = self.error(true_prevalences, estim_prevalences) score = self.error(true_prevalences, estim_prevalences)