forked from moreo/QuaPy
Refactor aggregative methods so that, besides 'classify' and 'quantify', they also implement 'aggregate'; by default, 'quantify' is now implemented as a pipeline of 'classify' followed by 'aggregate'. This speeds up evaluations A LOT, since the documents can be pre-classified once and the samples can then be drawn from the pre-classified values (labels or posterior probabilities), so that only 'aggregate' is called many times within the artificial sampling protocol.
This commit is contained in:
parent
e55caf82fd
commit
c8a1a70c8a
2
TODO.txt
2
TODO.txt
|
@ -7,5 +7,5 @@ Add readers for typical datasets used in Quantification
|
|||
Add NAE, NRAE
|
||||
Add "measures for evaluating ordinal"?
|
||||
Document methods with paper references
|
||||
The parallel training in svmperf seems not to work
|
||||
The parallel training in svmperf seems not to work (not sure...)
|
||||
|
||||
|
|
|
@ -1,4 +1,5 @@
|
|||
from data import LabelledCollection
|
||||
from quapy.method.aggregative import AggregativeQuantifier, AggregativeProbabilisticQuantifier
|
||||
from method.base import BaseQuantifier
|
||||
from utils.util import temp_seed
|
||||
import numpy as np
|
||||
|
@ -10,8 +11,8 @@ def artificial_sampling_prediction(
|
|||
model: BaseQuantifier,
|
||||
test: LabelledCollection,
|
||||
sample_size,
|
||||
prevalence_points=21,
|
||||
point_repetitions=1,
|
||||
n_prevpoints=210,
|
||||
n_repetitions=1,
|
||||
n_jobs=-1,
|
||||
random_seed=42):
|
||||
"""
|
||||
|
@ -19,27 +20,40 @@ def artificial_sampling_prediction(
|
|||
:param model: the model in charge of generating the class prevalence estimations
|
||||
:param test: the test set on which to perform artificial sampling
|
||||
:param sample_size: the size of the samples
|
||||
:param prevalence_points: the number of different prevalences to sample
|
||||
:param point_repetitions: the number of repetitions for each prevalence
|
||||
:param n_prevpoints: the number of different prevalences to sample
|
||||
:param n_repetitions: the number of repetitions for each prevalence
|
||||
:param n_jobs: number of jobs to be run in parallel
|
||||
:param random_seed: allows to replicate the samplings. The seed is local to the method and does not affect
|
||||
any other random process.
|
||||
:return: two ndarrays of [m,n] with m the number of samples (prevalence_points*point_repetitions) and n the
|
||||
:return: two ndarrays of [m,n] with m the number of samples (n_prevpoints*n_repetitions) and n the
|
||||
number of classes. The first one contains the true prevalences for the samples generated while the second one
|
||||
contains the prevalence estimations
|
||||
"""
|
||||
|
||||
with temp_seed(random_seed):
|
||||
indexes = list(test.artificial_sampling_index_generator(sample_size, prevalence_points, point_repetitions))
|
||||
indexes = list(test.artificial_sampling_index_generator(sample_size, n_prevpoints, n_repetitions))
|
||||
|
||||
if isinstance(model, AggregativeQuantifier):
|
||||
quantification_func = model.aggregate
|
||||
if isinstance(model, AggregativeProbabilisticQuantifier):
|
||||
print('\tpreclassifying with soft')
|
||||
preclassified_instances = model.posterior_probabilities(test.instances)
|
||||
else:
|
||||
print('\tpreclassifying with hard')
|
||||
preclassified_instances = model.classify(test.instances)
|
||||
test = LabelledCollection(preclassified_instances, test.labels)
|
||||
else:
|
||||
quantification_func = model.quantify
|
||||
print('not an aggregative')
|
||||
|
||||
def _predict_prevalences(index):
|
||||
sample = test.sampling_from_index(index)
|
||||
true_prevalence = sample.prevalence()
|
||||
estim_prevalence = model.quantify(sample.instances)
|
||||
estim_prevalence = quantification_func(sample.instances)
|
||||
return true_prevalence, estim_prevalence
|
||||
|
||||
results = Parallel(n_jobs=n_jobs)(
|
||||
delayed(_predict_prevalences)(index) for index in tqdm(indexes)
|
||||
delayed(_predict_prevalences)(index) for index in tqdm(indexes, desc='[artificial sampling protocol] predicting')
|
||||
)
|
||||
|
||||
true_prevalences, estim_prevalences = zip(*results)
|
||||
|
|
|
@ -36,6 +36,8 @@ def prevalence_linspace(n_prevalences=21, repeat=1, smooth_limits_epsilon=0.01):
|
|||
|
||||
|
||||
def prevalence_from_labels(labels, n_classes):
|
||||
if labels.ndim != 1:
|
||||
raise ValueError(f'param labels does not seem to be a ndarray of label predictions')
|
||||
unique, counts = np.unique(labels, return_counts=True)
|
||||
by_class = defaultdict(lambda:0, dict(zip(unique, counts)))
|
||||
prevalences = np.asarray([by_class[ci] for ci in range(n_classes)], dtype=np.float)
|
||||
|
@ -44,6 +46,8 @@ def prevalence_from_labels(labels, n_classes):
|
|||
|
||||
|
||||
def prevalence_from_probabilities(posteriors, binarize: bool = False):
|
||||
if posteriors.ndim != 2:
|
||||
raise ValueError(f'param posteriors does not seem to be a ndarray of posteior probabilities')
|
||||
if binarize:
|
||||
predictions = np.argmax(posteriors, axis=-1)
|
||||
return prevalence_from_labels(predictions, n_classes=posteriors.shape[1])
|
||||
|
@ -78,15 +82,15 @@ def normalize_prevalence(prevalences):
|
|||
|
||||
|
||||
|
||||
def num_prevalence_combinations(nclasses:int, nprevpoints:int, nrepeats:int):
|
||||
def num_prevalence_combinations(n_prevpoints:int, n_classes:int, n_repeats:int=1):
|
||||
"""
|
||||
Computes the number of prevalence combinations in the nclasses-dimensional simplex if nprevpoints equally distant
|
||||
prevalences are generated and nrepeats repetitions are requested
|
||||
:param nclasses: number of classes
|
||||
:param nprevpoints: number of prevalence points.
|
||||
:param nrepeats: number of repetitions for each prevalence combination
|
||||
:return: The number of possible combinations. For example, if nclasses=2, nprevpoints=5, nrepeats=1, then the number
|
||||
of possible combinations are 5, i.e.: [0,1], [0.25,0.75], [0.50,0.50], [0.75,0.25], and [1.0,0.0]
|
||||
Computes the number of prevalence combinations in the n_classes-dimensional simplex if n_prevpoints equally distant
|
||||
prevalences are generated and n_repeats repetitions are requested
|
||||
:param n_classes: number of classes
|
||||
:param n_prevpoints: number of prevalence points.
|
||||
:param n_repeats: number of repetitions for each prevalence combination
|
||||
:return: The number of possible combinations. For example, if n_classes=2, n_prevpoints=5, n_repeats=1, then the
|
||||
number of possible combinations are 5, i.e.: [0,1], [0.25,0.75], [0.50,0.50], [0.75,0.25], and [1.0,0.0]
|
||||
"""
|
||||
__cache={}
|
||||
def __f(nc,np):
|
||||
|
@ -98,25 +102,25 @@ def num_prevalence_combinations(nclasses:int, nprevpoints:int, nrepeats:int):
|
|||
x = sum([__f(nc-1, np-i) for i in range(np)])
|
||||
__cache[(nc,np)] = x
|
||||
return x
|
||||
return __f(nclasses, nprevpoints) * nrepeats
|
||||
return __f(n_classes, n_prevpoints) * n_repeats
|
||||
|
||||
|
||||
def get_nprevpoints_approximation(nclasses, nrepeats, combinations_budget):
|
||||
def get_nprevpoints_approximation(combinations_budget:int, n_classes:int, n_repeats:int=1):
|
||||
"""
|
||||
Searches for the largest number of (equidistant) prevalence points to define for each of the nclasses classe so that
|
||||
the number of valid prevalences generated as combinations of prevalence points (points in a nclasses-dimensional
|
||||
Searches for the largest number of (equidistant) prevalence points to define for each of the n_classes classes so that
|
||||
the number of valid prevalences generated as combinations of prevalence points (points in a n_classes-dimensional
|
||||
simplex) do not exceed combinations_budget.
|
||||
:param nclasses: number of classes
|
||||
:param nrepeats: number of repetitions for each prevalence combination
|
||||
:param n_classes: number of classes
|
||||
:param n_repeats: number of repetitions for each prevalence combination
|
||||
:param combinations_budget: maximum number of combinations allowed
|
||||
:return: the largest number of prevalence points that generate less than combinations_budget valid prevalences
|
||||
"""
|
||||
assert nclasses>0 and nrepeats>0 and combinations_budget>0, 'parameters must be positive integers'
|
||||
nprevpoints = 1
|
||||
assert n_classes > 0 and n_repeats > 0 and combinations_budget > 0, 'parameters must be positive integers'
|
||||
n_prevpoints = 1
|
||||
while True:
|
||||
combinations = num_prevalence_combinations(nclasses, nprevpoints, nrepeats)
|
||||
combinations = num_prevalence_combinations(n_prevpoints, n_classes, n_repeats)
|
||||
if combinations > combinations_budget:
|
||||
return nprevpoints-1
|
||||
return n_prevpoints-1
|
||||
else:
|
||||
nprevpoints+=1
|
||||
n_prevpoints += 1
|
||||
|
||||
|
|
|
@ -8,7 +8,7 @@ AGGREGATIVE_METHODS = {
|
|||
agg.AdjustedClassifyAndCount,
|
||||
agg.ProbabilisticClassifyAndCount,
|
||||
agg.ProbabilisticAdjustedClassifyAndCount,
|
||||
agg.ExplicitLossMinimisation,
|
||||
agg.ExplicitLossMinimisationBinary,
|
||||
agg.ExpectationMaximizationQuantifier,
|
||||
agg.HellingerDistanceY
|
||||
}
|
||||
|
|
|
@ -34,6 +34,13 @@ class AggregativeQuantifier(BaseQuantifier):
|
|||
def classify(self, instances):
|
||||
return self.learner.predict(instances)
|
||||
|
||||
def quantify(self, instances, *args):
|
||||
classif_predictions = self.classify(instances)
|
||||
return self.aggregate(classif_predictions, *args)
|
||||
|
||||
@abstractmethod
|
||||
def aggregate(self, classif_predictions:np.ndarray, *args): ...
|
||||
|
||||
def get_params(self, deep=True):
|
||||
return self.learner.get_params()
|
||||
|
||||
|
@ -53,13 +60,17 @@ class AggregativeProbabilisticQuantifier(AggregativeQuantifier):
|
|||
"""
|
||||
Abstract class for quantification methods that base their estimations on the aggregation of posterior probabilities
|
||||
as returned by a probabilistic classifier. Aggregative Probabilistic Quantifiers thus extend Aggregative
|
||||
Quantifiersimplement by implementing a _soft_classify_ method returning values in [0,1] -- the posterior
|
||||
Quantifiers by implementing a _posterior_probabilities_ method returning values in [0,1] -- the posterior
|
||||
probabilities.
|
||||
"""
|
||||
|
||||
def soft_classify(self, data):
|
||||
def posterior_probabilities(self, data):
|
||||
return self.learner.predict_proba(data)
|
||||
|
||||
def quantify(self, instances, *args):
|
||||
classif_posteriors = self.posterior_probabilities(instances)
|
||||
return self.aggregate(classif_posteriors, *args)
|
||||
|
||||
def set_params(self, **parameters):
|
||||
if isinstance(self.learner, CalibratedClassifierCV):
|
||||
parameters={'base_estimator__'+k:v for k,v in parameters.items()}
|
||||
|
@ -128,9 +139,8 @@ class ClassifyAndCount(AggregativeQuantifier):
|
|||
self.learner, _ = training_helper(self.learner, data, fit_learner)
|
||||
return self
|
||||
|
||||
def quantify(self, instances, *args):
|
||||
classification = self.classify(instances) # classify
|
||||
return F.prevalence_from_labels(classification, self.n_classes) # & count
|
||||
def aggregate(self, classif_predictions, *args):
|
||||
return F.prevalence_from_labels(classif_predictions, self.n_classes)
|
||||
|
||||
|
||||
class AdjustedClassifyAndCount(AggregativeQuantifier):
|
||||
|
@ -141,17 +151,24 @@ class AdjustedClassifyAndCount(AggregativeQuantifier):
|
|||
def fit(self, data: LabelledCollection, fit_learner=True, train_val_split=0.6):
|
||||
self.learner, validation = training_helper(self.learner, data, fit_learner, train_val_split=train_val_split)
|
||||
self.cc = ClassifyAndCount(self.learner)
|
||||
y_ = self.cc.classify(validation.instances)
|
||||
y_ = self.classify(validation.instances)
|
||||
y = validation.labels
|
||||
# estimate the matrix with entry (i,j) being the estimate of P(yi|yj), that is, the probability that a
|
||||
# document that belongs to yj ends up being classified as belonging to yi
|
||||
self.Pte_cond_estim_ = confusion_matrix(y,y_).T / validation.counts()
|
||||
return self
|
||||
|
||||
def quantify(self, instances, *args):
|
||||
prevs_estim = self.cc.quantify(instances)
|
||||
# solve for the linear system Ax = B with A=Pte_cond_estim and B = prevs_estim
|
||||
A = self.Pte_cond_estim_
|
||||
def classify(self, data):
|
||||
return self.cc.classify(data)
|
||||
|
||||
def aggregate(self, classif_predictions, *args):
|
||||
prevs_estim = self.cc.aggregate(classif_predictions)
|
||||
return AdjustedClassifyAndCount.solve_adjustment(self.Pte_cond_estim_, prevs_estim)
|
||||
|
||||
@classmethod
|
||||
def solve_adjustment(cls, PteCondEstim, prevs_estim):
|
||||
# solve for the linear system Ax = B with A=PteCondEstim and B = prevs_estim
|
||||
A = PteCondEstim
|
||||
B = prevs_estim
|
||||
try:
|
||||
adjusted_prevs = np.linalg.solve(A, B)
|
||||
|
@ -161,9 +178,6 @@ class AdjustedClassifyAndCount(AggregativeQuantifier):
|
|||
adjusted_prevs = prevs_estim # no way to adjust them!
|
||||
return adjusted_prevs
|
||||
|
||||
def classify(self, data):
|
||||
return self.cc.classify(data)
|
||||
|
||||
|
||||
class ProbabilisticClassifyAndCount(AggregativeProbabilisticQuantifier):
|
||||
def __init__(self, learner):
|
||||
|
@ -173,13 +187,11 @@ class ProbabilisticClassifyAndCount(AggregativeProbabilisticQuantifier):
|
|||
self.learner, _ = training_helper(self.learner, data, fit_learner, ensure_probabilistic=True)
|
||||
return self
|
||||
|
||||
def quantify(self, instances, *args):
|
||||
posteriors = self.soft_classify(instances) # classify
|
||||
prevalences = F.prevalence_from_probabilities(posteriors, binarize=False) # & count
|
||||
return prevalences
|
||||
def aggregate(self, classif_posteriors, *args):
|
||||
return F.prevalence_from_probabilities(classif_posteriors, binarize=False)
|
||||
|
||||
|
||||
class ProbabilisticAdjustedClassifyAndCount(AggregativeQuantifier):
|
||||
class ProbabilisticAdjustedClassifyAndCount(AggregativeProbabilisticQuantifier):
|
||||
|
||||
def __init__(self, learner):
|
||||
self.learner = learner
|
||||
|
@ -189,28 +201,23 @@ class ProbabilisticAdjustedClassifyAndCount(AggregativeQuantifier):
|
|||
self.learner, data, fit_learner, ensure_probabilistic=True, train_val_split=train_val_split
|
||||
)
|
||||
self.pcc = ProbabilisticClassifyAndCount(self.learner)
|
||||
y_ = self.pcc.classify(validation.instances)
|
||||
y_ = self.classify(validation.instances)
|
||||
y = validation.labels
|
||||
# estimate the matrix with entry (i,j) being the estimate of P(yi|yj), that is, the probability that a
|
||||
# document that belongs to yj ends up being classified as belonging to yi
|
||||
self.Pte_cond_estim_ = confusion_matrix(y, y_).T / validation.counts()
|
||||
return self
|
||||
|
||||
def quantify(self, instances, *args):
|
||||
prevs_estim = self.pcc.quantify(instances)
|
||||
A = self.Pte_cond_estim_
|
||||
B = prevs_estim
|
||||
try:
|
||||
adjusted_prevs = np.linalg.solve(A, B)
|
||||
adjusted_prevs = np.clip(adjusted_prevs, 0, 1)
|
||||
adjusted_prevs /= adjusted_prevs.sum()
|
||||
except np.linalg.LinAlgError:
|
||||
adjusted_prevs = prevs_estim # no way to adjust them!
|
||||
return adjusted_prevs
|
||||
def aggregate(self, classif_posteriors, *args):
|
||||
prevs_estim = self.pcc.aggregate(classif_posteriors)
|
||||
return AdjustedClassifyAndCount.solve_adjustment(self.Pte_cond_estim_, prevs_estim)
|
||||
|
||||
def classify(self, data):
|
||||
return self.pcc.classify(data)
|
||||
|
||||
def soft_classify(self, data):
|
||||
return self.pcc.posterior_probabilities(data)
|
||||
|
||||
|
||||
class ExpectationMaximizationQuantifier(AggregativeProbabilisticQuantifier):
|
||||
|
||||
|
@ -226,10 +233,8 @@ class ExpectationMaximizationQuantifier(AggregativeProbabilisticQuantifier):
|
|||
self.train_prevalence = F.prevalence_from_labels(data.labels, self.n_classes)
|
||||
return self
|
||||
|
||||
def quantify(self, X, epsilon=EPSILON):
|
||||
tr_prev=self.train_prevalence
|
||||
posteriors = self.soft_classify(X)
|
||||
return self.EM(tr_prev, posteriors, self.verbose, epsilon)
|
||||
def aggregate(self, classif_posteriors, epsilon=EPSILON):
|
||||
return self.EM(self.train_prevalence, classif_posteriors, self.verbose, epsilon)
|
||||
|
||||
@classmethod
|
||||
def EM(cls, tr_prev, posterior_probabilities, verbose=False, epsilon=EPSILON):
|
||||
|
@ -277,17 +282,17 @@ class HellingerDistanceY(AggregativeProbabilisticQuantifier):
|
|||
f'Use the class OneVsAll to enable {self.__class__.__name__} work on single-label data.'
|
||||
self.learner, validation = training_helper(
|
||||
self.learner, data, fit_learner, ensure_probabilistic=True, train_val_split=train_val_split)
|
||||
Px = self.soft_classify(validation.instances)
|
||||
Px = self.posterior_probabilities(validation.instances)
|
||||
self.Pxy1 = Px[validation.labels == 1]
|
||||
self.Pxy0 = Px[validation.labels == 0]
|
||||
return self
|
||||
|
||||
def quantify(self, instances, *args):
|
||||
def aggregate(self, classif_posteriors, *args):
|
||||
# "In this work, the number of bins b used in HDx and HDy was chosen from 10 to 110 in steps of 10,
|
||||
# and the final estimated a priori probability was taken as the median of these 11 estimates."
|
||||
# (González-Castro, et al., 2013).
|
||||
|
||||
Px = self.soft_classify(instances)
|
||||
Px = classif_posteriors
|
||||
|
||||
prev_estimations = []
|
||||
for bins in np.linspace(10, 110, 11, dtype=int): #[10, 20, 30, ..., 100, 110]
|
||||
|
@ -318,71 +323,87 @@ class OneVsAll(AggregativeQuantifier):
|
|||
quantifier for each class, and then l1-normalizes the outputs so that the class prevalences sum up to 1.
|
||||
"""
|
||||
|
||||
def __init__(self, binary_method, n_jobs=-1):
|
||||
self.binary_method = binary_method
|
||||
def __init__(self, binary_quantifier, n_jobs=-1):
|
||||
self.binary_quantifier = binary_quantifier
|
||||
self.n_jobs = n_jobs
|
||||
|
||||
def fit(self, data: LabelledCollection, **kwargs):
|
||||
assert not data.binary, f'{self.__class__.__name__} expect non-binary data'
|
||||
assert isinstance(self.binary_method, BaseQuantifier), f'{self.binary_method} does not seem to be a Quantifier'
|
||||
self.class_method = {c: deepcopy(self.binary_method) for c in data.classes_}
|
||||
Parallel(n_jobs=self.n_jobs, backend='threading')(
|
||||
delayed(self._delayed_binary_fit)(c, self.class_method, data, **kwargs) for c in data.classes_
|
||||
)
|
||||
assert not data.binary, \
|
||||
f'{self.__class__.__name__} expect non-binary data'
|
||||
assert isinstance(self.binary_quantifier, BaseQuantifier), \
|
||||
f'{self.binary_quantifier} does not seem to be a Quantifier'
|
||||
self.dict_binary_quantifiers = {c: deepcopy(self.binary_quantifier) for c in data.classes_}
|
||||
self.__parallel(self._delayed_binary_fit, data, **kwargs)
|
||||
return self
|
||||
|
||||
def classify(self, instances):
|
||||
classif_predictions_bin = self.__parallel(self._delayed_binary_classification, instances)
|
||||
return classif_predictions_bin.T
|
||||
|
||||
def aggregate(self, classif_predictions_bin, *args):
|
||||
assert set(np.unique(classif_predictions_bin)) == {0,1}, \
|
||||
'param classif_predictions_bin does not seem to be a valid matrix (ndarray) of binary ' \
|
||||
'predictions for each document (row) and class (columns)'
|
||||
prevalences = self.__parallel(self._delayed_binary_aggregate, classif_predictions_bin)
|
||||
return F.normalize_prevalence(prevalences)
|
||||
|
||||
def quantify(self, X, *args):
|
||||
prevalences = np.asarray(
|
||||
prevalences = self.__parallel(self._delayed_binary_quantify, X)
|
||||
return F.normalize_prevalence(prevalences)
|
||||
|
||||
def __parallel(self, func, *args, **kwargs):
|
||||
return np.asarray(
|
||||
Parallel(n_jobs=self.n_jobs, backend='threading')(
|
||||
delayed(self._delayed_binary_predict)(c, self.class_method, X) for c in self.classes
|
||||
delayed(func)(c, *args, **kwargs) for c in self.classes
|
||||
)
|
||||
)
|
||||
<<<<<<< HEAD
|
||||
=======
|
||||
print('one vs all: ', prevalences)
|
||||
>>>>>>> 2361186a01c53e744f4291e2e2299700216ff139
|
||||
return F.normalize_prevalence(prevalences)
|
||||
|
||||
@property
|
||||
def classes(self):
|
||||
return sorted(self.class_method.keys())
|
||||
return sorted(self.dict_binary_quantifiers.keys())
|
||||
|
||||
def set_params(self, **parameters):
|
||||
self.binary_method.set_params(**parameters)
|
||||
self.binary_quantifier.set_params(**parameters)
|
||||
|
||||
def get_params(self, deep=True):
|
||||
return self.binary_method.get_params()
|
||||
return self.binary_quantifier.get_params()
|
||||
|
||||
def _delayed_binary_predict(self, c, learners, X):
|
||||
return learners[c].classify(X).mean() # the mean is the estimation for the positive class prevalence
|
||||
def _delayed_binary_classification(self, c, X):
|
||||
return self.dict_binary_quantifiers[c].classify(X)
|
||||
|
||||
def _delayed_binary_fit(self, c, learners, data, **kwargs):
|
||||
def _delayed_binary_quantify(self, c, X):
|
||||
return self.dict_binary_quantifiers[c].quantify(X)[1] # the estimation for the positive class prevalence
|
||||
|
||||
def _delayed_binary_aggregate(self, c, classif_predictions):
|
||||
return self.dict_binary_quantifiers[c].aggregate(classif_predictions[:,c])[1] # the estimation for the positive class prevalence
|
||||
|
||||
def _delayed_binary_fit(self, c, data, **kwargs):
|
||||
bindata = LabelledCollection(data.instances, data.labels == c, n_classes=2)
|
||||
learners[c].fit(bindata, **kwargs)
|
||||
self.dict_binary_quantifiers[c].fit(bindata, **kwargs)
|
||||
|
||||
|
||||
class ExplicitLossMinimisation(AggregativeQuantifier):
|
||||
"""
|
||||
A variant of Explicit Loss Minimisation based on SVMperf that works also on single-label data. It uses one binary
|
||||
quantifier for each class and then l1-normalizes the class predictions so that they sum up to one.
|
||||
This variant was used in Gao, W., Sebastiani, F.: From classification to quantification in tweet sentiment analysis.
|
||||
Social Network Analysis and Mining 6(19), 1–22 (2016)
|
||||
"""
|
||||
|
||||
def __init__(self, svmperf_base, loss, **kwargs):
|
||||
self.svmperf_base = svmperf_base
|
||||
self.loss = loss
|
||||
self.kwargs = kwargs
|
||||
|
||||
def fit(self, data: LabelledCollection, fit_learner=True, *args):
|
||||
assert fit_learner, 'the method requires that fit_learner=True'
|
||||
self.learner = ExplicitLossMinimisationBinary(self.svmperf_base, self.loss, **self.kwargs)
|
||||
if not data.binary:
|
||||
self.learner = OneVsAll(self.learner, n_jobs=-1)
|
||||
return self.learner.fit(data, *args)
|
||||
|
||||
def quantify(self, instances, *args):
|
||||
return self.learner.quantify(instances, *args)
|
||||
# class ExplicitLossMinimisation(AggregativeQuantifier):
|
||||
# """
|
||||
# A variant of Explicit Loss Minimisation based on SVMperf that works also on single-label data. It uses one binary
|
||||
# quantifier for each class and then l1-normalizes the class predictions so that they sum up to one.
|
||||
# This variant was used in Gao, W., Sebastiani, F.: From classification to quantification in tweet sentiment analysis.
|
||||
# Social Network Analysis and Mining6(19), 1–22 (2016)
|
||||
# """
|
||||
#
|
||||
# def __init__(self, svmperf_base, loss, **kwargs):
|
||||
# self.svmperf_base = svmperf_base
|
||||
# self.loss = loss
|
||||
# self.kwargs = kwargs
|
||||
#
|
||||
# def fit(self, data: LabelledCollection, fit_learner=True, *args):
|
||||
# assert fit_learner, 'the method requires that fit_learner=True'
|
||||
# self.learner = ExplicitLossMinimisationBinary(self.svmperf_base, self.loss, **self.kwargs)
|
||||
# if not data.binary:
|
||||
# self.learner = OneVsAll(self.learner, n_jobs=-1)
|
||||
# return self.learner.fit(data, *args)
|
||||
#
|
||||
# def aggregate(self, instances, *args):
|
||||
# return self.learner.aggregate(instances, *args)
|
||||
|
||||
|
||||
class ExplicitLossMinimisationBinary(AggregativeQuantifier):
|
||||
|
@ -398,38 +419,35 @@ class ExplicitLossMinimisationBinary(AggregativeQuantifier):
|
|||
self.learner = SVMperf(self.svmperf_base, loss=self.loss, **self.kwargs).fit(data.instances, data.labels)
|
||||
return self
|
||||
|
||||
def quantify(self, X, y=None):
|
||||
predictions = self.learner.predict(X)
|
||||
prev = F.prevalence_from_labels(predictions, self.learner.n_classes_)
|
||||
print('binary: ', prev)
|
||||
return prev
|
||||
def aggregate(self, classif_predictions:np.ndarray, *args):
|
||||
return F.prevalence_from_labels(classif_predictions, self.learner.n_classes_)
|
||||
|
||||
def classify(self, X, y=None):
|
||||
return self.learner.predict(X)
|
||||
|
||||
|
||||
|
||||
class SVMQ(ExplicitLossMinimisation):
|
||||
class SVMQ(ExplicitLossMinimisationBinary):
|
||||
def __init__(self, svmperf_base, **kwargs):
|
||||
super(SVMQ, self).__init__(svmperf_base, loss='q', **kwargs)
|
||||
|
||||
|
||||
class SVMKLD(ExplicitLossMinimisation):
|
||||
class SVMKLD(ExplicitLossMinimisationBinary):
|
||||
def __init__(self, svmperf_base, **kwargs):
|
||||
super(SVMKLD, self).__init__(svmperf_base, loss='kld', **kwargs)
|
||||
|
||||
|
||||
class SVMNKLD(ExplicitLossMinimisation):
|
||||
class SVMNKLD(ExplicitLossMinimisationBinary):
|
||||
def __init__(self, svmperf_base, **kwargs):
|
||||
super(SVMNKLD, self).__init__(svmperf_base, loss='nkld', **kwargs)
|
||||
|
||||
|
||||
class SVMAE(ExplicitLossMinimisation):
|
||||
class SVMAE(ExplicitLossMinimisationBinary):
|
||||
def __init__(self, svmperf_base, **kwargs):
|
||||
super(SVMAE, self).__init__(svmperf_base, loss='mae', **kwargs)
|
||||
|
||||
|
||||
class SVMRAE(ExplicitLossMinimisation):
|
||||
class SVMRAE(ExplicitLossMinimisationBinary):
|
||||
def __init__(self, svmperf_base, **kwargs):
|
||||
super(SVMRAE, self).__init__(svmperf_base, loss='mrae', **kwargs)
|
||||
|
||||
|
@ -438,7 +456,7 @@ CC = ClassifyAndCount
|
|||
ACC = AdjustedClassifyAndCount
|
||||
PCC = ProbabilisticClassifyAndCount
|
||||
PACC = ProbabilisticAdjustedClassifyAndCount
|
||||
ELM = ExplicitLossMinimisation
|
||||
ELM = ExplicitLossMinimisationBinary
|
||||
EMQ = ExpectationMaximizationQuantifier
|
||||
HDy = HellingerDistanceY
|
||||
|
||||
|
|
|
@ -18,3 +18,48 @@ class BaseQuantifier(metaclass=ABCMeta):
|
|||
def get_params(self, deep=True): ...
|
||||
|
||||
|
||||
# class OneVsAll(AggregativeQuantifier):
|
||||
# """
|
||||
# Allows any binary quantifier to perform quantification on single-label datasets. The method maintains one binary
|
||||
# quantifier for each class, and then l1-normalizes the outputs so that the class prevelences sum up to 1.
|
||||
# """
|
||||
#
|
||||
# def __init__(self, binary_method, n_jobs=-1):
|
||||
# self.binary_method = binary_method
|
||||
# self.n_jobs = n_jobs
|
||||
#
|
||||
# def fit(self, data: LabelledCollection, **kwargs):
|
||||
# assert not data.binary, f'{self.__class__.__name__} expect non-binary data'
|
||||
# assert isinstance(self.binary_method, BaseQuantifier), f'{self.binary_method} does not seem to be a Quantifier'
|
||||
# self.class_method = {c: deepcopy(self.binary_method) for c in data.classes_}
|
||||
# Parallel(n_jobs=self.n_jobs, backend='threading')(
|
||||
# delayed(self._delayed_binary_fit)(c, self.class_method, data, **kwargs) for c in data.classes_
|
||||
# )
|
||||
# return self
|
||||
#
|
||||
# def quantify(self, X, *args):
|
||||
# prevalences = np.asarray(
|
||||
# Parallel(n_jobs=self.n_jobs, backend='threading')(
|
||||
# delayed(self._delayed_binary_predict)(c, self.class_method, X) for c in self.classes
|
||||
# )
|
||||
# )
|
||||
# return F.normalize_prevalence(prevalences)
|
||||
#
|
||||
# @property
|
||||
# def classes(self):
|
||||
# return sorted(self.class_method.keys())
|
||||
#
|
||||
# def set_params(self, **parameters):
|
||||
# self.binary_method.set_params(**parameters)
|
||||
#
|
||||
# def get_params(self, deep=True):
|
||||
# return self.binary_method.get_params()
|
||||
#
|
||||
# def _delayed_binary_predict(self, c, learners, X):
|
||||
# return learners[c].quantify(X)[:,1] # the mean is the estimation for the positive class prevalence
|
||||
#
|
||||
# def _delayed_binary_fit(self, c, learners, data, **kwargs):
|
||||
# bindata = LabelledCollection(data.instances, data.labels == c, n_classes=2)
|
||||
# learners[c].fit(bindata, **kwargs)
|
||||
|
||||
|
||||
|
|
29
test.py
29
test.py
|
@ -6,6 +6,7 @@ import quapy.functional as F
|
|||
|
||||
SAMPLE_SIZE=500
|
||||
binary = False
|
||||
svmperf_home = './svm_perf_quantification'
|
||||
|
||||
if binary:
|
||||
# load a textual binary dataset and create a tfidf bag of words
|
||||
|
@ -20,19 +21,31 @@ else:
|
|||
train_path = './datasets/twitter/train/sst.train+dev.feature.txt'
|
||||
test_path = './datasets/twitter/test/sst.test.feature.txt'
|
||||
dataset = qp.Dataset.load(train_path, test_path, qp.reader.from_sparse)
|
||||
dataset.training = dataset.training.sampling(SAMPLE_SIZE, 0.2, 0.5, 0.3)
|
||||
qp.preprocessing.reduce_columns(dataset, min_df=10, inplace=True)
|
||||
print(dataset.training.instances.shape)
|
||||
|
||||
print('dataset loaded')
|
||||
|
||||
# training a quantifier
|
||||
learner = LogisticRegression()
|
||||
model = qp.method.aggregative.ClassifyAndCount(learner)
|
||||
# model = qp.method.aggregative.AdjustedClassifyAndCount(learner)
|
||||
# model = qp.method.aggregative.ClassifyAndCount(learner)
|
||||
# model = qp.method.aggregative.AdjustedClassifyAndCount(learner)
|
||||
# model = qp.method.aggregative.ProbabilisticClassifyAndCount(learner)
|
||||
# model = qp.method.aggregative.ProbabilisticAdjustedClassifyAndCount(learner)
|
||||
# model = qp.method.aggregative.ExpectationMaximizationQuantifier(learner)
|
||||
# model = qp.method.aggregative.ExplicitLossMinimisationBinary(svmperf_home, loss='q', C=100)
|
||||
model = qp.method.aggregative.SVMQ(svmperf_home, C=1)
|
||||
|
||||
if not binary:
|
||||
model = qp.method.aggregative.OneVsAll(model)
|
||||
|
||||
print('fitting model')
|
||||
model.fit(dataset.training)
|
||||
|
||||
|
||||
# estimating class prevalences
|
||||
print('quantifying')
|
||||
prevalences_estim = model.quantify(dataset.test.instances)
|
||||
prevalences_true = dataset.test.prevalence()
|
||||
|
||||
|
@ -46,9 +59,17 @@ print(f'true prevalence {F.strprev(prevalences_true)}')
|
|||
print(f'estim prevalence {F.strprev(prevalences_estim)}')
|
||||
print(f'mae={error:.3f}')
|
||||
|
||||
true_prev, estim_prev = qp.evaluation.artificial_sampling_prediction(model, dataset.test, SAMPLE_SIZE)
|
||||
|
||||
qp.error.SAMPLE_SIZE=SAMPLE_SIZE
|
||||
max_evaluations = 5000
|
||||
n_prevpoints = F.get_nprevpoints_approximation(combinations_budget=max_evaluations, n_classes=dataset.n_classes)
|
||||
n_evaluations = F.num_prevalence_combinations(n_prevpoints, dataset.n_classes)
|
||||
print(f'the prevalence interval [0,1] will be split in {n_prevpoints} prevalence points for each class, so that '
|
||||
f'the requested maximum number of sample evaluations ({max_evaluations}) is not exceeded. '
|
||||
f'For the {dataset.n_classes} classes this dataset has, this will yield a total of {n_evaluations} evaluations.')
|
||||
|
||||
true_prev, estim_prev = qp.evaluation.artificial_sampling_prediction(model, dataset.test, SAMPLE_SIZE, n_prevpoints)
|
||||
|
||||
qp.error.SAMPLE_SIZE = SAMPLE_SIZE
|
||||
print(f'Evaluation according to the artificial sampling protocol ({len(true_prev)} evals)')
|
||||
for error in qp.error.QUANTIFICATION_ERROR:
|
||||
score = error(true_prev, estim_prev)
|
||||
|
|
Loading…
Reference in New Issue