
refactoring aggregative

Alejandro Moreo Fernandez 2024-01-10 15:39:27 +01:00
parent 2d12ce12b9
commit 6d53b68d7f
2 changed files with 108 additions and 118 deletions

.gitignore vendored (28 changed lines)

@@ -130,3 +130,31 @@ dmypy.json
.pyre/
*__pycache__*
+*.pdf
+*.zip
+*.png
+*.csv
+*.pkl
+*.dataframe
+# other projects
+LeQua2022
+MultiLabel
+NewMethods
+Ordinal
+Retrieval
+eDiscovery
+poster-cikm
+slides-cikm
+slides-short-cikm
+quick_experiment
+svm_perf_quantification/svm_struct
+svm_perf_quantification/svm_light
+TweetSentQuant

quapy/method/aggregative.py

@@ -302,36 +302,20 @@ class AggregativeSoftQuantifier(AggregativeQuantifier, ABC):
                            f'fit_classifier is set to False')

+class BinaryAggregativeQuantifier(AggregativeQuantifier, BinaryQuantifier):
+
+    @property
+    def pos_label(self):
+        return self.classifier.classes_[1]
+
+    @property
+    def neg_label(self):
+        return self.classifier.classes_[0]
+
+    def fit(self, data: LabelledCollection, fit_classifier=True, val_split=None):
+        self._check_binary(data, self.__class__.__name__)
+        return super().fit(data, fit_classifier, val_split)
-# class CorrectionbasedAggregativeQuantifier(AggregativeQuantifier):
-#     """
-#     Abstract class for quantification methods that carry out an adjustment (or correction) that requires,
-#     at training time, the predictions to be issued in validation mode, i.e., on a set of held-out data that
-#     is not the training set. There are three ways in which this distinction can be made, depending on how
-#     the internal parameter `val_split` is specified, namely, (i) a float in (0, 1) indicating the proportion
-#     of training instances that should be devoted to validate, or (ii) an integer indicating the
-#     number of folds to consider in a k-fold cross-validation mode, or (iii) the specific set of data to
-#     use for validation.
-#     """
-#
-#     @property
-#     def val_split(self):
-#         return self.val_split_
-#
-#     @val_split.setter
-#     def val_split(self, val_split):
-#         if isinstance(val_split, LabelledCollection):
-#             print('warning: setting val_split with a LabelledCollection will be inefficient in'
-#                   'model selection. Rather pass the LabelledCollection at fit time')
-#         self.val_split_ = val_split
-#
-#     def fit(self, data: LabelledCollection, fit_classifier=True, predict_on=None):
-#         print('method from CorrectionbasedAggregativeQuantifier')
-#         if predict_on is None:
-#             predict_on = self.val_split
-#         classif_predictions = self.classifier_fit_predict(data, fit_classifier, predict_on)
-#         self.aggregation_fit(classif_predictions, data)
-#         return self
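The two properties added above simply alias scikit-learn's convention that `classes_` holds the sorted labels seen at fit time; a minimal sketch of that assumption (the toy classifier and data are illustrative, not part of the commit):

    from sklearn.linear_model import LogisticRegression
    import numpy as np

    # sklearn sorts the labels, so for binary {0, 1} the positive class sits at index 1
    clf = LogisticRegression().fit(np.random.rand(20, 2), [0] * 10 + [1] * 10)
    print(clf.classes_)     # [0 1]
    print(clf.classes_[1])  # 1 -> what pos_label returns
    print(clf.classes_[0])  # 0 -> what neg_label returns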
@@ -383,7 +367,7 @@ class ACC(AggregativeCrispQuantifier):
    :param n_jobs: number of parallel workers
    """

-    def __init__(self, classifier: BaseEstimator, val_split=0.4, n_jobs=None):
+    def __init__(self, classifier: BaseEstimator, val_split=5, n_jobs=None):
        self.classifier = classifier
        self.val_split = val_split
        self.n_jobs = qp._get_njobs(n_jobs)
@@ -476,7 +460,7 @@ class PACC(AggregativeSoftQuantifier):
    :param n_jobs: number of parallel workers
    """

-    def __init__(self, classifier: BaseEstimator, val_split=0.4, n_jobs=None):
+    def __init__(self, classifier: BaseEstimator, val_split=5, n_jobs=None):
        self.classifier = classifier
        self.val_split = val_split
        self.n_jobs = qp._get_njobs(n_jobs)
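Both defaults move from a 0.4 held-out split to 5-fold cross-validation. A hedged usage sketch of the three accepted forms of `val_split` (the quantifier and dataset calls follow QuaPy's public API; the chosen dataset is just an example, and `some_labelled_collection` is a placeholder):

    import quapy as qp
    from quapy.method.aggregative import ACC
    from sklearn.linear_model import LogisticRegression

    train = qp.datasets.fetch_reviews('kindle', tfidf=True, min_df=5).training

    acc = ACC(LogisticRegression(), val_split=5)    # new default: 5-fold CV
    # ACC(LogisticRegression(), val_split=0.4)      # old default: 40% held out
    # ACC(LogisticRegression(), val_split=some_labelled_collection)  # explicit split
    acc.fit(train)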
@@ -599,7 +583,7 @@ class EMQrecalib(AggregativeSoftQuantifier):
        can be made as float in (0, 1) indicating the proportion of stratified held-out validation set to
        be extracted from the training set (default 0.4); or as an integer, indicating that the predictions
        are to be generated in a `k`-fold cross-validation manner (with this integer indicating the value
-        for `k`); or as a collection defining the specific set of data to use for validation.
+        for `k`, default 5); or as a collection defining the specific set of data to use for validation.
        Alternatively, this set can be specified at fit time by indicating the exact set of data
        on which the predictions are to be generated.
    :param exact_train_prev: set to True (default) for using, as the initial observation, the true training prevalence;
@@ -671,7 +655,7 @@ class EMQrecalib(AggregativeSoftQuantifier):
        return posteriors

-class HDy(AggregativeSoftQuantifier, BinaryQuantifier):
+class HDy(AggregativeSoftQuantifier, BinaryAggregativeQuantifier):
    """
    `Hellinger Distance y <https://www.sciencedirect.com/science/article/pii/S0020025512004069>`_ (HDy).
    HDy is a probabilistic method for training binary quantifiers, that models quantification as the problem of
@@ -683,10 +667,10 @@ class HDy(AggregativeSoftQuantifier, BinaryQuantifier):
    :param classifier: a sklearn's Estimator that generates a binary classifier
    :param val_split: a float in range (0,1) indicating the proportion of data to be used as a stratified held-out
-        validation distribution, or a :class:`quapy.data.base.LabelledCollection` (the split itself).
+        validation distribution, or a :class:`quapy.data.base.LabelledCollection` (the split itself), or an integer indicating the number of folds (default 5).
    """

-    def __init__(self, classifier: BaseEstimator, val_split=0.4):
+    def __init__(self, classifier: BaseEstimator, val_split=5):
        self.classifier = classifier
        self.val_split = val_split
@@ -701,12 +685,10 @@ class HDy(AggregativeSoftQuantifier, BinaryQuantifier):
            :class:`quapy.data.base.LabelledCollection` indicating the validation set itself
        :return: self
        """
-        self._check_binary(data, self.__class__.__name__)
        P, y = classif_predictions.Xy
-        Px = P[:, 1]  # takes only the P(y=+1|x)
-        self.Pxy1 = Px[y == self.classifier.classes_[1]]
-        self.Pxy0 = Px[y == self.classifier.classes_[0]]
+        Px = P[:, self.pos_label]  # takes only the P(y=+1|x)
+        self.Pxy1 = Px[y == self.pos_label]
+        self.Pxy0 = Px[y == self.neg_label]

        # pre-compute the histogram for positive and negative examples
        self.bins = np.linspace(10, 110, 11, dtype=int)  # [10, 20, 30, ..., 100, 110]
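For orientation, HDy picks the class-1 prevalence alpha whose mixture of the two validation histograms is closest, in Hellinger distance, to the test histogram; a self-contained sketch of that criterion (function names are illustrative, not from the diff):

    import numpy as np

    def hellinger(p, q):
        # Hellinger distance between two normalized histograms
        return np.sqrt(np.sum((np.sqrt(p) - np.sqrt(q)) ** 2))

    def hdy_objective(alpha, hist_pos, hist_neg, hist_test):
        # distance of the alpha-mixture of class-conditional histograms to the test histogram
        mixture = alpha * hist_pos + (1 - alpha) * hist_neg
        return hellinger(mixture, hist_test)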
@@ -725,7 +707,7 @@ class HDy(AggregativeSoftQuantifier, BinaryQuantifier):
        # and the final estimated a priori probability was taken as the median of these 11 estimates."
        # (González-Castro, et al., 2013).

-        Px = classif_posteriors[:, 1]  # takes only the P(y=+1|x)
+        Px = classif_posteriors[:, self.pos_label]  # takes only the P(y=+1|x)

        prev_estimations = []
        # for bins in np.linspace(10, 110, 11, dtype=int): #[10, 20, 30, ..., 100, 110]
@@ -752,7 +734,7 @@ class HDy(AggregativeSoftQuantifier, BinaryQuantifier):
        return np.asarray([1 - class1_prev, class1_prev])

-class DyS(AggregativeSoftQuantifier, BinaryQuantifier):
+class DyS(AggregativeSoftQuantifier, BinaryAggregativeQuantifier):
    """
    `DyS framework <https://ojs.aaai.org/index.php/AAAI/article/view/4376>`_ (DyS).
    DyS is a generalization of HDy method, using a Ternary Search in order to find the prevalence that
@@ -761,14 +743,14 @@ class DyS(AggregativeSoftQuantifier, BinaryQuantifier):
    :param classifier: a sklearn's Estimator that generates a binary classifier
    :param val_split: a float in range (0,1) indicating the proportion of data to be used as a stratified held-out
-        validation distribution, or a :class:`quapy.data.base.LabelledCollection` (the split itself).
+        validation distribution, or a :class:`quapy.data.base.LabelledCollection` (the split itself), or an integer indicating the number of folds (default 5).
    :param n_bins: an int with the number of bins to use to compute the histograms.
    :param divergence: a str indicating the name of divergence (currently supported ones are "HD" or "topsoe"), or a
        callable function computes the divergence between two distributions (two equally sized arrays).
    :param tol: a float with the tolerance for the ternary search algorithm.
    """

-    def __init__(self, classifier: BaseEstimator, val_split=0.4, n_bins=8, divergence: Union[str, Callable]= 'HD', tol=1e-05):
+    def __init__(self, classifier: BaseEstimator, val_split=5, n_bins=8, divergence: Union[str, Callable]= 'HD', tol=1e-05):
        self.classifier = classifier
        self.val_split = val_split
        self.tol = tol
@@ -791,22 +773,17 @@ class DyS(AggregativeSoftQuantifier, BinaryQuantifier):
        # Left and right are the current bounds; the maximum is between them
        return (left + right) / 2

-    def fit(self, data: LabelledCollection, fit_classifier=True, val_split: Union[float, LabelledCollection] = None):
-        if val_split is None:
-            val_split = self.val_split
-
-        self._check_binary(data, self.__class__.__name__)
-        self.classifier, validation = _training_helper(
-            self.classifier, data, fit_classifier, ensure_probabilistic=True, val_split=val_split)
-        Px = self.classify(validation.instances)[:, 1]  # takes only the P(y=+1|x)
-        self.Pxy1 = Px[validation.labels == self.classifier.classes_[1]]
-        self.Pxy0 = Px[validation.labels == self.classifier.classes_[0]]
+    def aggregation_fit(self, classif_predictions: LabelledCollection, data: LabelledCollection):
+        Px, y = classif_predictions.Xy
+        Px = Px[:, self.pos_label]  # takes only the P(y=+1|x)
+        self.Pxy1 = Px[y == self.pos_label]
+        self.Pxy0 = Px[y == self.neg_label]
        self.Pxy1_density = np.histogram(self.Pxy1, bins=self.n_bins, range=(0, 1), density=True)[0]
        self.Pxy0_density = np.histogram(self.Pxy0, bins=self.n_bins, range=(0, 1), density=True)[0]
        return self

    def aggregate(self, classif_posteriors):
-        Px = classif_posteriors[:, 1]  # takes only the P(y=+1|x)
+        Px = classif_posteriors[:, self.pos_label]  # takes only the P(y=+1|x)

        Px_test = np.histogram(Px, bins=self.n_bins, range=(0, 1), density=True)[0]
        divergence = get_divergence(self.divergence)
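The `_ternary_search` helper whose closing lines appear above narrows the interval until its width falls below `tol`; a generic restatement of that search as a standalone sketch (not the committed code):

    def ternary_search(f, left=0.0, right=1.0, tol=1e-5):
        # shrinks [left, right] around the optimum of a unimodal function f
        while abs(right - left) >= tol:
            m1 = left + (right - left) / 3
            m2 = right - (right - left) / 3
            if f(m1) < f(m2):
                right = m2   # the minimizer cannot lie beyond m2
            else:
                left = m1
        # the optimum is between the current bounds
        return (left + right) / 2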
@@ -819,37 +796,32 @@ class DyS(AggregativeSoftQuantifier, BinaryQuantifier):
        return np.asarray([1 - class1_prev, class1_prev])

-class SMM(AggregativeSoftQuantifier, BinaryQuantifier):
+class SMM(AggregativeSoftQuantifier, BinaryAggregativeQuantifier):
    """
    `SMM method <https://ieeexplore.ieee.org/document/9260028>`_ (SMM).
    SMM is a simplification of matching distribution methods where the representation of the examples
-    is created using the mean instead of a histogram.
+    is created using the mean instead of a histogram (conceptually equivalent to PACC).

    :param classifier: a sklearn's Estimator that generates a binary classifier.
    :param val_split: a float in range (0,1) indicating the proportion of data to be used as a stratified held-out
-        validation distribution, or a :class:`quapy.data.base.LabelledCollection` (the split itself).
+        validation distribution, or a :class:`quapy.data.base.LabelledCollection` (the split itself), or an integer indicating the number of folds (default 5).
    """

-    def __init__(self, classifier: BaseEstimator, val_split=0.4):
+    def __init__(self, classifier: BaseEstimator, val_split=5):
        self.classifier = classifier
        self.val_split = val_split
-    def fit(self, data: LabelledCollection, fit_classifier=True, val_split: Union[float, LabelledCollection] = None):
-        if val_split is None:
-            val_split = self.val_split
-
-        self._check_binary(data, self.__class__.__name__)
-        self.classifier, validation = _training_helper(
-            self.classifier, data, fit_classifier, ensure_probabilistic=True, val_split=val_split)
-        Px = self.classify(validation.instances)[:, 1]  # takes only the P(y=+1|x)
-        self.Pxy1 = Px[validation.labels == self.classifier.classes_[1]]
-        self.Pxy0 = Px[validation.labels == self.classifier.classes_[0]]
-        self.Pxy1_mean = np.mean(self.Pxy1)
-        self.Pxy0_mean = np.mean(self.Pxy0)
+    def aggregation_fit(self, classif_predictions: LabelledCollection, data: LabelledCollection):
+        Px, y = classif_predictions.Xy
+        Px = Px[:, self.pos_label]  # takes only the P(y=+1|x)
+        self.Pxy1 = Px[y == self.pos_label]
+        self.Pxy0 = Px[y == self.neg_label]
+        self.Pxy1_mean = np.mean(self.Pxy1)  # equiv. TPR
+        self.Pxy0_mean = np.mean(self.Pxy0)  # equiv. FPR
        return self

    def aggregate(self, classif_posteriors):
-        Px = classif_posteriors[:, 1]  # takes only the P(y=+1|x)
+        Px = classif_posteriors[:, self.pos_label]  # takes only the P(y=+1|x)
        Px_mean = np.mean(Px)

        class1_prev = (Px_mean - self.Pxy0_mean)/(self.Pxy1_mean - self.Pxy0_mean)
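SMM's aggregate step inverts the identity mean(P(y=1|x)) = prev * TPR + (1 - prev) * FPR, with the validation means standing in for TPR and FPR; a worked instance with made-up numbers:

    Pxy1_mean, Pxy0_mean = 0.8, 0.2   # made-up validation means (TPR/FPR analogues)
    Px_mean = 0.5                     # made-up mean posterior on the test set
    class1_prev = (Px_mean - Pxy0_mean) / (Pxy1_mean - Pxy0_mean)
    print(class1_prev)                # 0.5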
@@ -867,9 +839,9 @@ class DMy(AggregativeSoftQuantifier):
    :param classifier: a `sklearn`'s Estimator that generates a probabilistic classifier
    :param val_split: indicates the proportion of data to be used as a stratified held-out validation set to model the
        validation distribution.
-        This parameter can be indicated as a real value (between 0 and 1, default 0.4), representing a proportion of
+        This parameter can be indicated as a real value (between 0 and 1), representing a proportion of
        validation data, or as an integer, indicating that the validation distribution should be estimated via
-        `k`-fold cross validation (this integer stands for the number of folds `k`), or as a
+        `k`-fold cross validation (this integer stands for the number of folds `k`, default 5), or as a
        :class:`quapy.data.base.LabelledCollection` (the split itself).
    :param nbins: number of bins used to discretize the distributions (default 8)
    :param divergence: a string representing a divergence measure (currently, "HD" and "topsoe" are implemented)
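The validation distribution the docstring refers to is, in the binary case, one histogram of posteriors per class; a hedged sketch of that modelling step (the helper name and the binary assumption are illustrative, not from the diff):

    import numpy as np

    def class_histograms(posteriors, labels, nbins=8):
        # binary case: column 1 is assumed to hold P(y=1|x)
        px = posteriors[:, 1]
        h1, _ = np.histogram(px[labels == 1], bins=nbins, range=(0, 1), density=True)
        h0, _ = np.histogram(px[labels == 0], bins=nbins, range=(0, 1), density=True)
        return h0, h1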
@@ -890,7 +862,7 @@ class DMy(AggregativeSoftQuantifier):
        self.n_jobs = n_jobs

    # @classmethod
-    # def HDy(cls, classifier, val_split=0.4, n_jobs=None):
+    # def HDy(cls, classifier, val_split=5, n_jobs=None):
    #     from quapy.method.meta import MedianEstimator
    #
    #     hdy = DMy(classifier=classifier, val_split=val_split, search='linear_search', divergence='HD')
@@ -1114,7 +1086,7 @@ def newSVMRAE(svmperf_base=None, C=1):
    return newELM(svmperf_base, loss='mrae', C=C)

-class ThresholdOptimization(AggregativeQuantifier, BinaryQuantifier):
+class ThresholdOptimization(AggregativeSoftQuantifier, BinaryAggregativeQuantifier):
    """
    Abstract class of Threshold Optimization variants for :class:`ACC` as proposed by
    `Forman 2006 <https://dl.acm.org/doi/abs/10.1145/1150402.1150423>`_ and
@@ -1127,31 +1099,20 @@ class ThresholdOptimization(AggregativeQuantifier, BinaryQuantifier):
    :param classifier: a sklearn's Estimator that generates a classifier
    :param val_split: indicates the proportion of data to be used as a stratified held-out validation set in which the
        misclassification rates are to be estimated.
-        This parameter can be indicated as a real value (between 0 and 1, default 0.4), representing a proportion of
+        This parameter can be indicated as a real value (between 0 and 1), representing a proportion of
        validation data, or as an integer, indicating that the misclassification rates should be estimated via
-        `k`-fold cross validation (this integer stands for the number of folds `k`), or as a
+        `k`-fold cross validation (this integer stands for the number of folds `k`, default 5), or as a
        :class:`quapy.data.base.LabelledCollection` (the split itself).
    """

-    def __init__(self, classifier: BaseEstimator, val_split=0.4, n_jobs=None):
+    def __init__(self, classifier: BaseEstimator, val_split=5, n_jobs=None):
        self.classifier = classifier
        self.val_split = val_split
        self.n_jobs = qp._get_njobs(n_jobs)

-    def fit(self, data: LabelledCollection, fit_classifier=True, val_split: Union[float, int, LabelledCollection] = None):
-        self._check_binary(data, "Threshold Optimization")
-        if val_split is None:
-            val_split = self.val_split
-        self.classifier, y, y_, classes, class_count = cross_generate_predictions(
-            data, self.classifier, val_split, probabilistic=True, fit_classifier=fit_classifier, n_jobs=self.n_jobs
-        )
-        self.cc = CC(self.classifier)
-        self.tpr, self.fpr = self._optimize_threshold(y, y_)
+    def aggregation_fit(self, classif_predictions: LabelledCollection, data: LabelledCollection):
+        P, y = classif_predictions.Xy
+        self.tpr, self.fpr, self.threshold = self._optimize_threshold(y, P)
        return self
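The rates being optimized come from the usual confusion-table definitions applied at each candidate threshold; a standalone sketch (standard formulas, which may differ cosmetically from the file's private helpers):

    import numpy as np

    def tpr_fpr_at(threshold, pos_posteriors, y):
        # y holds 0/1 labels; pos_posteriors holds P(y=1|x)
        y_ = (pos_posteriors > threshold).astype(int)
        TP = np.sum((y_ == 1) & (y == 1))
        FP = np.sum((y_ == 1) & (y == 0))
        FN = np.sum((y_ == 0) & (y == 1))
        TN = np.sum((y_ == 0) & (y == 0))
        tpr = TP / (TP + FN) if (TP + FN) > 0 else 0.0
        fpr = FP / (FP + TN) if (FP + TN) > 0 else 0.0
        return tpr, fpr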
    @abstractmethod

@@ -1173,14 +1134,15 @@ class ThresholdOptimization(AggregativeQuantifier, BinaryQuantifier):
        :param y: predicted labels for the validation set (or for the training set via `k`-fold cross validation)
        :param probabilities: array-like with the posterior probabilities
-        :return: best `tpr` and `fpr` according to `_condition`
+        :return: best `tpr` and `fpr` and `threshold` according to `_condition`
        """
        best_candidate_threshold_score = None
        best_tpr = 0
        best_fpr = 0
-        candidate_thresholds = np.unique(probabilities[:, 1])
+        candidate_thresholds = np.unique(probabilities[:, self.pos_label])
        for candidate_threshold in candidate_thresholds:
-            y_ = [self.classes_[1] if p > candidate_threshold else self.classes_[0] for p in probabilities[:, 1]]
+            y_ = self.classes_[1*(probabilities[:,1]>candidate_threshold)]
+            #y_ = [self.pos_label if p > candidate_threshold else self.neg_label for p in probabilities[:, 1]]
            TP, FP, FN, TN = self._compute_table(y, y_)
            tpr = self._compute_tpr(TP, FP)
            fpr = self._compute_fpr(FP, TN)
@@ -1190,15 +1152,15 @@ class ThresholdOptimization(AggregativeQuantifier, BinaryQuantifier):
                best_tpr = tpr
                best_fpr = fpr

-        return best_tpr, best_fpr
+        return best_tpr, best_fpr, best_candidate_threshold_score

    def aggregate(self, classif_predictions):
-        prevs_estim = self.cc.aggregate(classif_predictions)
-        if self.tpr - self.fpr == 0:
-            return prevs_estim
-        adjusted_prevs_estim = np.clip((prevs_estim[1] - self.fpr) / (self.tpr - self.fpr), 0, 1)
-        adjusted_prevs_estim = np.array((1 - adjusted_prevs_estim, adjusted_prevs_estim))
-        return adjusted_prevs_estim
+        class_scores = classif_predictions[:, self.pos_label]
+        prev_estim = np.mean(class_scores > self.threshold)
+        if self.tpr - self.fpr != 0:
+            prevs_estim = np.clip((prev_estim - self.fpr) / (self.tpr - self.fpr), 0, 1)
+            prevs_estim = np.array((1 - prevs_estim, prevs_estim))
+        return prevs_estim

    def _compute_table(self, y, y_):
        TP = np.logical_and(y == y_, y == self.classes_[1]).sum()
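The rewritten aggregate first counts the test scores above the learned threshold and then applies the ACC-style correction prev = (prev_thr - fpr) / (tpr - fpr); a worked instance with made-up rates:

    tpr, fpr = 0.9, 0.1   # made-up rates at the selected threshold
    prev_thr = 0.34       # fraction of test scores above the threshold
    prev = (prev_thr - fpr) / (tpr - fpr)
    print(prev)           # 0.3 (clipped to [0, 1] in the committed code)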
@@ -1229,13 +1191,13 @@ class T50(ThresholdOptimization):
    :param classifier: a sklearn's Estimator that generates a classifier
    :param val_split: indicates the proportion of data to be used as a stratified held-out validation set in which the
        misclassification rates are to be estimated.
-        This parameter can be indicated as a real value (between 0 and 1, default 0.4), representing a proportion of
+        This parameter can be indicated as a real value (between 0 and 1), representing a proportion of
        validation data, or as an integer, indicating that the misclassification rates should be estimated via
-        `k`-fold cross validation (this integer stands for the number of folds `k`), or as a
+        `k`-fold cross validation (this integer stands for the number of folds `k`, default 5), or as a
        :class:`quapy.data.base.LabelledCollection` (the split itself).
    """

-    def __init__(self, classifier: BaseEstimator, val_split=0.4):
+    def __init__(self, classifier: BaseEstimator, val_split=5):
        super().__init__(classifier, val_split)

    def _condition(self, tpr, fpr) -> float:
@@ -1253,13 +1215,13 @@ class MAX(ThresholdOptimization):
    :param classifier: a sklearn's Estimator that generates a classifier
    :param val_split: indicates the proportion of data to be used as a stratified held-out validation set in which the
        misclassification rates are to be estimated.
-        This parameter can be indicated as a real value (between 0 and 1, default 0.4), representing a proportion of
+        This parameter can be indicated as a real value (between 0 and 1), representing a proportion of
        validation data, or as an integer, indicating that the misclassification rates should be estimated via
-        `k`-fold cross validation (this integer stands for the number of folds `k`), or as a
+        `k`-fold cross validation (this integer stands for the number of folds `k`, default 5), or as a
        :class:`quapy.data.base.LabelledCollection` (the split itself).
    """

-    def __init__(self, classifier: BaseEstimator, val_split=0.4):
+    def __init__(self, classifier: BaseEstimator, val_split=5):
        super().__init__(classifier, val_split)

    def _condition(self, tpr, fpr) -> float:
@@ -1278,13 +1240,13 @@ class X(ThresholdOptimization):
    :param classifier: a sklearn's Estimator that generates a classifier
    :param val_split: indicates the proportion of data to be used as a stratified held-out validation set in which the
        misclassification rates are to be estimated.
-        This parameter can be indicated as a real value (between 0 and 1, default 0.4), representing a proportion of
+        This parameter can be indicated as a real value (between 0 and 1), representing a proportion of
        validation data, or as an integer, indicating that the misclassification rates should be estimated via
-        `k`-fold cross validation (this integer stands for the number of folds `k`), or as a
+        `k`-fold cross validation (this integer stands for the number of folds `k`, default 5), or as a
        :class:`quapy.data.base.LabelledCollection` (the split itself).
    """

-    def __init__(self, classifier: BaseEstimator, val_split=0.4):
+    def __init__(self, classifier: BaseEstimator, val_split=5):
        super().__init__(classifier, val_split)

    def _condition(self, tpr, fpr) -> float:
@@ -1302,12 +1264,12 @@ class MS(ThresholdOptimization):
    :param classifier: a sklearn's Estimator that generates a classifier
    :param val_split: indicates the proportion of data to be used as a stratified held-out validation set in which the
        misclassification rates are to be estimated.
-        This parameter can be indicated as a real value (between 0 and 1, default 0.4), representing a proportion of
+        This parameter can be indicated as a real value (between 0 and 1), representing a proportion of
        validation data, or as an integer, indicating that the misclassification rates should be estimated via
-        `k`-fold cross validation (this integer stands for the number of folds `k`), or as a
+        `k`-fold cross validation (this integer stands for the number of folds `k`, default 5), or as a
        :class:`quapy.data.base.LabelledCollection` (the split itself).
    """

-    def __init__(self, classifier: BaseEstimator, val_split=0.4):
+    def __init__(self, classifier: BaseEstimator, val_split=5):
        super().__init__(classifier, val_split)

    def _condition(self, tpr, fpr) -> float:
@@ -1339,12 +1301,12 @@ class MS2(MS):
    :param classifier: a sklearn's Estimator that generates a classifier
    :param val_split: indicates the proportion of data to be used as a stratified held-out validation set in which the
        misclassification rates are to be estimated.
-        This parameter can be indicated as a real value (between 0 and 1, default 0.4), representing a proportion of
+        This parameter can be indicated as a real value (between 0 and 1), representing a proportion of
        validation data, or as an integer, indicating that the misclassification rates should be estimated via
-        `k`-fold cross validation (this integer stands for the number of folds `k`), or as a
+        `k`-fold cross validation (this integer stands for the number of folds `k`, default 5), or as a
        :class:`quapy.data.base.LabelledCollection` (the split itself).
    """

-    def __init__(self, classifier: BaseEstimator, val_split=0.4):
+    def __init__(self, classifier: BaseEstimator, val_split=5):
        super().__init__(classifier, val_split)

    def _optimize_threshold(self, y, probabilities):
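The subclasses above differ only in the `_condition` each uses to rank candidate thresholds; a hedged paraphrase of the Forman (2006) criteria follows (scores written so that smaller is better; the committed implementations may encode them differently, and MS/MS2 instead take the median of the estimates over many thresholds rather than picking a single one):

    def t50_condition(tpr, fpr):
        return abs(tpr - 0.5)        # T50: threshold where tpr is closest to 0.5

    def max_condition(tpr, fpr):
        return -(tpr - fpr)          # MAX: maximize the tpr - fpr gap

    def x_condition(tpr, fpr):
        return abs(1 - (tpr + fpr))  # X: threshold where fpr is closest to 1 - tpr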