starting refactor...

Alejandro Moreo Fernandez 2024-04-24 18:01:05 +02:00
parent 9274ea21aa
commit 3051c08184
3 changed files with 110 additions and 103 deletions

View File

@@ -75,11 +75,12 @@ class AggregativeQuantifier(BaseQuantifier, ABC):
             empty_class_names = data.classes_[empty_classes]
             raise ValueError(f'classes {empty_class_names} have no training examples')
 
-    def fit(self, data: LabelledCollection, fit_classifier=True, val_split=None):
+    def fit(self, X, y, fit_classifier=True, val_split=None):
         """
         Trains the aggregative quantifier. This comes down to training a classifier and an aggregation function.
 
-        :param data: a :class:`quapy.data.base.LabelledCollection` consisting of the training data
+        :param X: `array-like` of shape `(n_samples, n_features)` consisting of the training covariates
+        :param y: `array-like` of shape `(n_samples,)` consisting of the instances labels
         :param fit_classifier: whether to train the learner (default is True). Set to False if the
             learner has been trained outside the quantifier.
         :param val_split: specifies the data used for generating classifier predictions. This specification
@@ -92,16 +93,17 @@ class AggregativeQuantifier(BaseQuantifier, ABC):
         :return: self
         """
         self._check_init_parameters()
-        classif_predictions = self.classifier_fit_predict(data, fit_classifier, predict_on=val_split)
-        self.aggregation_fit(classif_predictions, data)
+        P, y = self.classifier_fit_predict(X, y, fit_classifier, predict_on=val_split)
+        self.aggregation_fit(P, y)
         return self
 
-    def classifier_fit_predict(self, data: LabelledCollection, fit_classifier=True, predict_on=None):
+    def classifier_fit_predict(self, X, y, fit_classifier=True, predict_on=None):
         """
         Trains the classifier if requested (`fit_classifier=True`) and generate the necessary predictions to
         train the aggregation function.
 
-        :param data: a :class:`quapy.data.base.LabelledCollection` consisting of the training data
+        :param X: `array-like` of shape `(n_samples, n_features)` consisting of the training covariates
+        :param y: `array-like` of shape `(n_samples,)` consisting of the instances labels
         :param fit_classifier: whether to train the learner (default is True). Set to False if the
             learner has been trained outside the quantifier.
         :param predict_on: specifies the set on which predictions need to be issued. This parameter can
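For orientation, a minimal sketch of the refactored interface in use; the classifier choice and the X_train/y_train/X_test arrays are hypothetical placeholders, not part of the commit:

    # a sketch of the new (X, y) API, assuming a scikit-learn classifier
    import numpy as np
    from sklearn.linear_model import LogisticRegression
    from quapy.method.aggregative import PCC

    X_train = np.random.rand(1000, 10)            # synthetic covariates
    y_train = np.random.randint(0, 2, size=1000)  # synthetic binary labels
    X_test = np.random.rand(500, 10)

    quantifier = PCC(LogisticRegression())
    quantifier.fit(X_train, y_train)              # previously: fit(LabelledCollection(X, y))
    prevalence = quantifier.quantify(X_test)      # np.ndarray of shape (n_classes,)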
@@ -113,10 +115,11 @@ class AggregativeQuantifier(BaseQuantifier, ABC):
         """
         assert isinstance(fit_classifier, bool), 'unexpected type for "fit_classifier", must be boolean'
+        data = LabelledCollection(X, y)
 
         self._check_classifier(adapt_if_necessary=(self._classifier_method() == 'predict_proba'))
 
         if fit_classifier:
-            self._check_non_empty_classes(data)
+            self._check_non_empty_classes(y)
 
         if predict_on is None:
             if not fit_classifier:
@@ -170,16 +173,16 @@ class AggregativeQuantifier(BaseQuantifier, ABC):
                              f'use either a float indicating the split proportion, or a '
                              f'tuple (X,y) indicating the validation partition')
 
-        return predictions
+        return predictions.Xy
 
     @abstractmethod
-    def aggregation_fit(self, classif_predictions: LabelledCollection, data: LabelledCollection):
+    def aggregation_fit(self, classif_predictions: np.ndarray, y: np.ndarray):
         """
         Trains the aggregation function.
 
-        :param classif_predictions: a :class:`quapy.data.base.LabelledCollection` containing,
-            as instances, the predictions issued by the classifier and, as labels, the true labels
-        :param data: a :class:`quapy.data.base.LabelledCollection` consisting of the training data
+        :param classif_predictions: `array-like` of shape `(n_samples, n_classes)` consisting of the classifier
+            predictions for each class
+        :param y: `array-like` of shape `(n_samples,)` consisting of the instances labels
         """
         ...
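Under the new contract, subclasses receive plain arrays rather than a LabelledCollection; a sketch of a hypothetical toy subclass implementing this abstract method:

    # a toy subclass honoring the new aggregation_fit signature (illustrative only)
    import numpy as np
    from quapy.method.aggregative import AggregativeSoftQuantifier

    class MemorizeTrainPrevalence(AggregativeSoftQuantifier):
        def __init__(self, classifier):
            self.classifier = classifier

        def aggregation_fit(self, classif_predictions: np.ndarray, y: np.ndarray):
            # classif_predictions: posteriors of shape (n_samples, n_classes), unused here
            # y: true labels of shape (n_samples,), assumed to be 0..n_classes-1
            self.train_prev_ = np.bincount(y) / len(y)

        def aggregate(self, classif_posteriors):
            # toy aggregation: ignore the test posteriors and return the training prevalence
            return self.train_prev_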
@@ -201,16 +204,16 @@ class AggregativeQuantifier(BaseQuantifier, ABC):
         """
         self.classifier_ = classifier
 
-    def classify(self, instances):
+    def classify(self, X):
         """
         Provides the label predictions for the given instances. The predictions should respect the format expected by
         :meth:`aggregate`, e.g., posterior probabilities for probabilistic quantifiers, or crisp predictions for
         non-probabilistic quantifiers. The default one is "decision_function".
 
-        :param instances: array-like of shape `(n_instances, n_features,)`
+        :param X: array-like of shape `(n_instances, n_features,)`
         :return: np.ndarray of shape `(n_instances,)` with label predictions
         """
-        return getattr(self.classifier, self._classifier_method())(instances)
+        return getattr(self.classifier, self._classifier_method())(X)
 
     def _classifier_method(self):
         """
@@ -230,15 +233,15 @@ class AggregativeQuantifier(BaseQuantifier, ABC):
         assert hasattr(self.classifier, self._classifier_method()), \
             f"the method does not implement the required {self._classifier_method()} method"
 
-    def quantify(self, instances):
+    def quantify(self, X):
         """
         Generate class prevalence estimates for the sample's instances by aggregating the label predictions generated
         by the classifier.
 
-        :param instances: array-like
+        :param X: array-like
         :return: `np.ndarray` of shape `(n_classes)` with class prevalence estimates.
         """
-        classif_predictions = self.classify(instances)
+        classif_predictions = self.classify(X)
         return self.aggregate(classif_predictions)
 
     @abstractmethod
@@ -328,9 +331,9 @@ class BinaryAggregativeQuantifier(AggregativeQuantifier, BinaryQuantifier):
     def neg_label(self):
         return self.classifier.classes_[0]
 
-    def fit(self, data: LabelledCollection, fit_classifier=True, val_split=None):
-        self._check_binary(data, self.__class__.__name__)
-        return super().fit(data, fit_classifier, val_split)
+    def fit(self, X, y, fit_classifier=True, val_split=None):
+        self._check_binary(y, self.__class__.__name__)
+        return super().fit(X, y, fit_classifier, val_split)
 
 # Methods
@@ -346,12 +349,12 @@ class CC(AggregativeCrispQuantifier):
     def __init__(self, classifier: BaseEstimator):
         self.classifier = classifier
 
-    def aggregation_fit(self, classif_predictions: LabelledCollection, data: LabelledCollection):
+    def aggregation_fit(self, classif_predictions: np.ndarray, y: np.ndarray):
         """
         Nothing to do here!
 
         :param classif_predictions: not used
-        :param data: not used
+        :param y: not used
         """
         pass
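CC needs no aggregation fitting because its aggregate step is mere counting: the prevalence estimate is the relative frequency of each predicted label. Schematically, with hypothetical predictions:

    import numpy as np

    y_pred = np.array([0, 1, 1, 0, 1, 1])                        # crisp predictions
    prevalence = np.bincount(y_pred, minlength=2) / len(y_pred)  # classify & count
    print(prevalence)                                            # [0.333... 0.666...]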
@@ -376,12 +379,12 @@ class PCC(AggregativeSoftQuantifier):
     def __init__(self, classifier: BaseEstimator):
         self.classifier = classifier
 
-    def aggregation_fit(self, classif_predictions: LabelledCollection, data: LabelledCollection):
+    def aggregation_fit(self, classif_posteriors: np.ndarray, y: np.ndarray):
         """
         Nothing to do here!
 
-        :param classif_predictions: not used
-        :param data: not used
+        :param classif_posteriors: not used
+        :param y: not used
         """
         pass
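PCC likewise needs no fitting: its aggregate step averages the posterior probabilities over the sample. Schematically, with hypothetical posteriors:

    import numpy as np

    P = np.array([[0.9, 0.1],
                  [0.2, 0.8],
                  [0.4, 0.6]])    # posteriors, shape (n_samples, n_classes)
    prevalence = P.mean(axis=0)   # probabilistic classify & count
    print(prevalence)             # [0.5 0.5]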
@@ -482,17 +485,16 @@ class ACC(AggregativeCrispQuantifier):
         if self.norm not in ACC.NORMALIZATIONS:
             raise ValueError(f"unknown normalization; valid ones are {ACC.NORMALIZATIONS}")
 
-    def aggregation_fit(self, classif_predictions: LabelledCollection, data: LabelledCollection):
+    def aggregation_fit(self, classif_predictions: np.ndarray, y: np.ndarray):
         """
         Estimates the misclassification rates.
 
-        :param classif_predictions: a :class:`quapy.data.base.LabelledCollection` containing,
-            as instances, the label predictions issued by the classifier and, as labels, the true labels
-        :param data: a :class:`quapy.data.base.LabelledCollection` consisting of the training data
+        :param classif_predictions: `array-like` of shape `(n_samples,)` consisting of the label
+            predictions of the training examples
+        :param y: `array-like` of shape `(n_samples,)` consisting of the instances labels
         """
-        pred_labels, true_labels = classif_predictions.Xy
         self.cc = CC(self.classifier)
-        self.Pte_cond_estim_ = ACC.getPteCondEstim(self.classifier.classes_, true_labels, pred_labels)
+        self.Pte_cond_estim_ = ACC.getPteCondEstim(self.classifier.classes_, y, classif_predictions)
 
     @classmethod
     def getPteCondEstim(cls, classes, y, y_):
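The matrix getPteCondEstim returns holds the misclassification rates P(predicted=i | true=j), i.e., a confusion matrix normalized per true class. A sketch of that computation on hypothetical labels (the real method also guards against empty classes):

    import numpy as np
    from sklearn.metrics import confusion_matrix

    y_true = np.array([0, 0, 0, 1, 1, 1])
    y_pred = np.array([0, 0, 1, 1, 1, 0])

    conf = confusion_matrix(y_true, y_pred, labels=[0, 1]).T.astype(float)
    conf /= conf.sum(axis=0, keepdims=True)   # each column sums to 1
    print(conf)                               # [[0.667 0.333] [0.333 0.667]]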
@@ -593,17 +595,15 @@ class PACC(AggregativeSoftQuantifier):
         if self.norm not in ACC.NORMALIZATIONS:
             raise ValueError(f"unknown normalization; valid ones are {ACC.NORMALIZATIONS}")
 
-    def aggregation_fit(self, classif_predictions: LabelledCollection, data: LabelledCollection):
+    def aggregation_fit(self, classif_predictions: np.ndarray, y: np.ndarray):
         """
         Estimates the misclassification rates
 
-        :param classif_predictions: a :class:`quapy.data.base.LabelledCollection` containing,
-            as instances, the posterior probabilities issued by the classifier and, as labels, the true labels
-        :param data: a :class:`quapy.data.base.LabelledCollection` consisting of the training data
+        :param classif_predictions: `array-like` of shape `(n_samples, n_classes)` consisting of the
+            posterior probabilities of the training examples
+        :param y: `array-like` of shape `(n_samples,)` consisting of the instances labels
         """
-        posteriors, true_labels = classif_predictions.Xy
         self.pcc = PCC(self.classifier)
-        self.Pte_cond_estim_ = PACC.getPteCondEstim(self.classifier.classes_, true_labels, posteriors)
+        self.Pte_cond_estim_ = PACC.getPteCondEstim(self.classifier.classes_, y, classif_predictions)
 
     def aggregate(self, classif_posteriors):
         prevs_estim = self.pcc.aggregate(classif_posteriors)
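PACC then corrects the PCC estimate by solving the linear system these rates induce: the observed prevalence is modeled as Pte_cond_estim @ true_prevalence. A sketch with hypothetical values (clipping back onto the simplex is just one of the normalizations the class supports):

    import numpy as np

    Pte_cond_estim = np.array([[0.8, 0.3],
                               [0.2, 0.7]])   # P(predicted | true)
    prevs_estim = np.array([0.55, 0.45])      # PCC output on the test sample

    adjusted = np.linalg.solve(Pte_cond_estim, prevs_estim)
    adjusted = np.clip(adjusted, 0, 1)
    adjusted /= adjusted.sum()
    print(adjusted)                           # [0.5 0.5]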

View File

@@ -19,21 +19,22 @@ class BaseQuantifier(BaseEstimator):
     """
 
     @abstractmethod
-    def fit(self, data: LabelledCollection):
+    def fit(self, X, y):
         """
         Trains a quantifier.
 
-        :param data: a :class:`quapy.data.base.LabelledCollection` consisting of the training data
+        :param X: `array-like` of shape `(n_samples, n_features)` consisting of the training covariates
+        :param y: `array-like` of shape `(n_samples,)` consisting of the instances labels
         :return: self
         """
         ...
 
     @abstractmethod
-    def quantify(self, instances):
+    def quantify(self, X):
         """
         Generate class prevalence estimates for the sample's instances
 
-        :param instances: array-like
+        :param X: `array-like` of shape `(n_samples, n_features)` consisting of the test covariates
         :return: `np.ndarray` of shape `(n_classes,)` with class prevalence estimates.
         """
         ...
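Any quantifier now exposes the same sklearn-like (X, y) pair of methods; a minimal, hypothetical conforming implementation:

    import numpy as np
    from quapy.method.base import BaseQuantifier

    class UniformQuantifier(BaseQuantifier):
        """Toy quantifier: always returns the uniform prevalence (illustrative only)."""

        def fit(self, X, y):
            self.n_classes_ = len(np.unique(y))
            return self

        def quantify(self, X):
            return np.full(self.n_classes_, 1 / self.n_classes_)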
@@ -45,8 +46,9 @@ class BinaryQuantifier(BaseQuantifier):
     (typically, to be interpreted as one class and its complement).
     """
 
-    def _check_binary(self, data: LabelledCollection, quantifier_name):
-        assert data.binary, f'{quantifier_name} works only on problems of binary classification. ' \
+    def _check_binary(self, y, quantifier_name):
+        n_classes = len(np.unique(y))
+        assert n_classes == 2, f'{quantifier_name} works only on problems of binary classification. ' \
             f'Use the class OneVsAll to enable {quantifier_name} work on single-label data.'
@@ -78,7 +80,8 @@ class OneVsAllGeneric(OneVsAll, BaseQuantifier):
         self.binary_quantifier = binary_quantifier
         self.n_jobs = qp._get_njobs(n_jobs)
 
-    def fit(self, data: LabelledCollection, fit_classifier=True):
+    def fit(self, X, y, fit_classifier=True):
+        data = LabelledCollection(X, y)
         assert not data.binary, f'{self.__class__.__name__} expect non-binary data'
         assert fit_classifier == True, 'fit_classifier must be True'
@@ -93,8 +96,8 @@ class OneVsAllGeneric(OneVsAll, BaseQuantifier):
             )
         )
 
-    def quantify(self, instances):
-        prevalences = self._parallel(self._delayed_binary_predict, instances)
+    def quantify(self, X):
+        prevalences = self._parallel(self._delayed_binary_predict, X)
         return qp.functional.normalize_prevalence(prevalences)
 
     @property

View File

@@ -8,7 +8,7 @@ from quapy.method.base import BaseQuantifier, BinaryQuantifier
 import quapy.functional as F
 
-class MaximumLikelihoodPrevalenceEstimation(BaseQuantifier):
+class MLPE(BaseQuantifier):
     """
     The `Maximum Likelihood Prevalence Estimation` (MLPE) method is a lazy method that assumes there is no prior
     probability shift between training and test instances (put another way, that the i.i.d. assumption holds).
@@ -20,13 +20,15 @@ class MaximumLikelihoodPrevalenceEstimation(BaseQuantifier):
     def __init__(self):
         self._classes_ = None
 
-    def fit(self, data: LabelledCollection):
+    def fit(self, X, y):
         """
         Computes the training prevalence and stores it.
 
-        :param data: the training sample
+        :param X: `array-like` of shape `(n_samples, n_features)` consisting of the training covariates
+        :param y: `array-like` of shape `(n_samples,)` consisting of the instances labels
         :return: self
         """
+        data = LabelledCollection(X, y)
         self.estimated_prevalence = data.prevalence()
         return self
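What MLPE stores is simply the training prevalence, which it then returns for any test sample; in plain numpy the computation amounts to (hypothetical labels):

    import numpy as np

    y_train = np.array([0, 0, 0, 1, 1, 2])
    estimated_prevalence = np.bincount(y_train) / len(y_train)
    print(estimated_prevalence)   # [0.5 0.333 0.167], regardless of the test data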
@@ -100,7 +102,7 @@ class DMx(BaseQuantifier):
 
         return distributions
 
-    def fit(self, data: LabelledCollection):
+    def fit(self, X, y):
         """
         Generates the validation distributions out of the training data (covariates).
         The validation distributions have shape `(n, nfeats, nbins)`, with `n` the number of classes, `nfeats`
@@ -109,15 +111,16 @@ class DMx(BaseQuantifier):
         training data labelled with class `i`; while `dij = di[j]` is the discrete distribution for feature j in
         training data labelled with class `i`, and `dij[k]` is the fraction of instances with a value in the `k`-th bin.
 
-        :param data: the training set
+        :param X: `array-like` of shape `(n_samples, n_features)` consisting of the training covariates
+        :param y: `array-like` of shape `(n_samples,)` consisting of the instances labels
+        :return: self
         """
-        X, y = data.Xy
+        data = LabelledCollection(X, y)
         self.nfeats = X.shape[1]
         self.feat_ranges = _get_features_range(X)
 
         self.validation_distribution = np.asarray(
-            [self.__get_distributions(X[y==cat]) for cat in range(data.n_classes)]
+            [self.__get_distributions(X[y==cat]) for cat in data.classes_]
         )
 
         return self
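Each entry of the validation tensor is a per-feature histogram for one class; a sketch of how one (nfeats, nbins) block could be computed (the helper name and the nbins default are hypothetical):

    import numpy as np

    def class_distributions(Xc, feat_ranges, nbins=8):
        # Xc: instances of a single class, shape (m, nfeats)
        # feat_ranges: list of (min, max) pairs per feature, computed on the training set
        dists = []
        for j, (lo, hi) in enumerate(feat_ranges):
            hist, _ = np.histogram(Xc[:, j], bins=nbins, range=(lo, hi))
            dists.append(hist / max(1, Xc.shape[0]))   # fraction of instances per bin
        return np.vstack(dists)                        # shape (nfeats, nbins)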
@@ -147,53 +150,53 @@ class DMx(BaseQuantifier):
 
         return F.argmin_prevalence(loss, n_classes, method=self.search)
 
-class ReadMe(BaseQuantifier):
-
-    def __init__(self, bootstrap_trials=100, bootstrap_range=100, bagging_trials=100, bagging_range=25, **vectorizer_kwargs):
-        raise NotImplementedError('under development ...')
-        self.bootstrap_trials = bootstrap_trials
-        self.bootstrap_range = bootstrap_range
-        self.bagging_trials = bagging_trials
-        self.bagging_range = bagging_range
-        self.vectorizer_kwargs = vectorizer_kwargs
-
-    def fit(self, data: LabelledCollection):
-        X, y = data.Xy
-        self.vectorizer = CountVectorizer(binary=True, **self.vectorizer_kwargs)
-        X = self.vectorizer.fit_transform(X)
-        self.class_conditional_X = {i: X[y==i] for i in range(data.classes_)}
-
-    def quantify(self, instances):
-        X = self.vectorizer.transform(instances)
-
-        # number of features
-        num_docs, num_feats = X.shape
-
-        # bootstrap
-        p_boots = []
-        for _ in range(self.bootstrap_trials):
-            docs_idx = np.random.choice(num_docs, size=self.bootstra_range, replace=False)
-            class_conditional_X = {i: X[docs_idx] for i, X in self.class_conditional_X.items()}
-            Xboot = X[docs_idx]
-
-            # bagging
-            p_bags = []
-            for _ in range(self.bagging_trials):
-                feat_idx = np.random.choice(num_feats, size=self.bagging_range, replace=False)
-                class_conditional_Xbag = {i: X[:, feat_idx] for i, X in class_conditional_X.items()}
-                Xbag = Xboot[:,feat_idx]
-                p = self.std_constrained_linear_ls(Xbag, class_conditional_Xbag)
-                p_bags.append(p)
-            p_boots.append(np.mean(p_bags, axis=0))
-
-        p_mean = np.mean(p_boots, axis=0)
-        p_std = np.std(p_bags, axis=0)
-
-        return p_mean
-
-
-    def std_constrained_linear_ls(self, X, class_cond_X: dict):
-        pass
+# class ReadMe(BaseQuantifier):
+#
+#     def __init__(self, bootstrap_trials=100, bootstrap_range=100, bagging_trials=100, bagging_range=25, **vectorizer_kwargs):
+#         raise NotImplementedError('under development ...')
+#         self.bootstrap_trials = bootstrap_trials
+#         self.bootstrap_range = bootstrap_range
+#         self.bagging_trials = bagging_trials
+#         self.bagging_range = bagging_range
+#         self.vectorizer_kwargs = vectorizer_kwargs
+#
+#     def fit(self, data: LabelledCollection):
+#         X, y = data.Xy
+#         self.vectorizer = CountVectorizer(binary=True, **self.vectorizer_kwargs)
+#         X = self.vectorizer.fit_transform(X)
+#         self.class_conditional_X = {i: X[y==i] for i in range(data.classes_)}
+#
+#     def quantify(self, instances):
+#         X = self.vectorizer.transform(instances)
+#
+#         # number of features
+#         num_docs, num_feats = X.shape
+#
+#         # bootstrap
+#         p_boots = []
+#         for _ in range(self.bootstrap_trials):
+#             docs_idx = np.random.choice(num_docs, size=self.bootstrap_range, replace=False)
+#             class_conditional_X = {i: X[docs_idx] for i, X in self.class_conditional_X.items()}
+#             Xboot = X[docs_idx]
+#
+#             # bagging
+#             p_bags = []
+#             for _ in range(self.bagging_trials):
+#                 feat_idx = np.random.choice(num_feats, size=self.bagging_range, replace=False)
+#                 class_conditional_Xbag = {i: X[:, feat_idx] for i, X in class_conditional_X.items()}
+#                 Xbag = Xboot[:, feat_idx]
+#                 p = self.std_constrained_linear_ls(Xbag, class_conditional_Xbag)
+#                 p_bags.append(p)
+#             p_boots.append(np.mean(p_bags, axis=0))
+#
+#         p_mean = np.mean(p_boots, axis=0)
+#         p_std = np.std(p_bags, axis=0)
+#
+#         return p_mean
+#
+#
+#     def std_constrained_linear_ls(self, X, class_cond_X: dict):
+#         pass
 
 def _get_features_range(X):
@@ -209,4 +212,5 @@ def _get_features_range(X):
 
 # aliases
 #---------------------------------------------------------------
 
+MaximumLikelihoodPrevalenceEstimation = MLPE
 DistributionMatchingX = DMx