
adding calibration methods from the abstention package to quapy

Alejandro Moreo Fernandez 2023-01-18 19:46:19 +01:00
parent 1d4fa40f3e
commit 09abcfc935
4 changed files with 191 additions and 5 deletions

View File

@@ -34,7 +34,8 @@
 - newer versions of numpy raise a warning when accessing types (e.g., np.float). I have replaced all such instances
   with the plain python type (e.g., float).
-- new dependency "abstention" (to add to the project requirements and setup)
+- new dependency "abstention" (to add to the project requirements and setup). Calibration methods from
+  https://github.com/kundajelab/abstention added.
 
 Things to fix:
 - calibration with recalibration methods has to be fixed for exact_train_prev in EMQ (conflicts with clone, deepcopy, etc.)
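
For context, this is roughly how the abstention calibration API is invoked by the wrappers added in this commit (the data below are synthetic and purely illustrative; the call signature mirrors the wrapper code in the new file that follows):

import numpy as np
from abstention.calibration import TempScaling

# synthetic validation posteriors (n_samples x n_classes) and one-hot labels
rng = np.random.RandomState(0)
scores = rng.rand(100, 2)
posteriors = scores / scores.sum(axis=1, keepdims=True)
labels = np.eye(2)[rng.randint(0, 2, size=100)]

# a calibrator factory, applied to held-out posteriors and labels, returns a
# function that maps uncalibrated posteriors to calibrated ones
calibration_function = TempScaling(bias_positions='all')(posteriors, labels, posterior_supplied=True)
calibrated_posteriors = calibration_function(posteriors)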

View File

@@ -0,0 +1,166 @@
from copy import deepcopy
from abstention.calibration import NoBiasVectorScaling, TempScaling, VectorScaling
from sklearn.base import BaseEstimator, clone
from sklearn.model_selection import cross_val_predict, train_test_split
import numpy as np


# Wrappers of the calibration methods defined by Alexandari et al. in the paper
# <http://proceedings.mlr.press/v119/alexandari20a.html>
# requires "pip install abstention"
# see https://github.com/kundajelab/abstention


class RecalibratedClassifier:
    # marker class, used to detect (via isinstance) classifiers that embed a recalibration step
    pass


class RecalibratedClassifierBase(BaseEstimator, RecalibratedClassifier):
    """
    Applies a (re)calibration method from abstention.calibration, as defined in
    `Alexandari et al. paper <http://proceedings.mlr.press/v119/alexandari20a.html>`_.

    :param estimator: a scikit-learn probabilistic classifier
    :param calibrator: the calibration object (an instance of abstention.calibration.CalibratorFactory)
    :param val_split: indicate an integer k for performing k-fold cross-validation to obtain the posterior
        probabilities, or a float p in (0,1) to indicate that the posteriors are obtained in a stratified
        validation split containing a fraction p of the training instances (the rest is used for training).
        In any case, the classifier is retrained on the whole training set afterwards.
    :param n_jobs: indicate the number of parallel workers (only when val_split is an integer)
    :param verbose: whether or not to display information in the standard output
    """

    def __init__(self, estimator, calibrator, val_split=5, n_jobs=1, verbose=False):
        self.estimator = estimator
        self.calibrator = calibrator
        self.val_split = val_split
        self.n_jobs = n_jobs
        self.verbose = verbose

    def fit(self, X, y):
        k = self.val_split
        if isinstance(k, int):
            if k < 2:
                raise ValueError('wrong value for val_split: the number of folds must be at least 2')
            return self.fit_cv(X, y)
        elif isinstance(k, float):
            if not (0 < k < 1):
                raise ValueError('wrong value for val_split: the proportion of validation documents must be in (0,1)')
            return self.fit_tr_val(X, y)
        else:
            raise ValueError(f'wrong type for val_split: expected int or float, found {type(k)}')

    def fit_cv(self, X, y):
        # posteriors for the training instances are obtained via cross-validation;
        # the classifier is then retrained on the whole training set
        posteriors = cross_val_predict(
            self.estimator, X, y, cv=self.val_split, n_jobs=self.n_jobs, verbose=self.verbose, method="predict_proba"
        )
        self.estimator.fit(X, y)
        nclasses = len(np.unique(y))
        self.calibration_function = self.calibrator(posteriors, np.eye(nclasses)[y], posterior_supplied=True)
        return self

    def fit_tr_val(self, X, y):
        # posteriors are obtained on a held-out stratified validation split;
        # the classifier is fit on the training split only
        Xtr, Xva, ytr, yva = train_test_split(X, y, test_size=self.val_split, stratify=y)
        self.estimator.fit(Xtr, ytr)
        posteriors = self.estimator.predict_proba(Xva)
        nclasses = len(np.unique(yva))
        self.calibration_function = self.calibrator(posteriors, np.eye(nclasses)[yva], posterior_supplied=True)
        return self

    def predict(self, X):
        return self.estimator.predict(X)

    def predict_proba(self, X):
        posteriors = self.estimator.predict_proba(X)
        return self.calibration_function(posteriors)

    @property
    def classes_(self):
        return self.estimator.classes_


class NBVSCalibration(RecalibratedClassifierBase):
    """
    Applies the No-Bias Vector Scaling (NBVS) calibration method from abstention.calibration, as defined in
    `Alexandari et al. paper <http://proceedings.mlr.press/v119/alexandari20a.html>`_.

    :param estimator: a scikit-learn probabilistic classifier
    :param val_split: indicate an integer k for performing k-fold cross-validation to obtain the posterior
        probabilities, or a float p in (0,1) to indicate that the posteriors are obtained in a stratified
        validation split containing a fraction p of the training instances (the rest is used for training).
        In any case, the classifier is retrained on the whole training set afterwards.
    :param n_jobs: indicate the number of parallel workers (only when val_split is an integer)
    :param verbose: whether or not to display information in the standard output
    """

    def __init__(self, estimator, val_split=5, n_jobs=1, verbose=False):
        self.estimator = estimator
        self.calibrator = NoBiasVectorScaling(verbose=verbose)
        self.val_split = val_split
        self.n_jobs = n_jobs
        self.verbose = verbose


class BCTSCalibration(RecalibratedClassifierBase):
    """
    Applies the Bias-Corrected Temperature Scaling (BCTS) calibration method from abstention.calibration, as defined in
    `Alexandari et al. paper <http://proceedings.mlr.press/v119/alexandari20a.html>`_.

    :param estimator: a scikit-learn probabilistic classifier
    :param val_split: indicate an integer k for performing k-fold cross-validation to obtain the posterior
        probabilities, or a float p in (0,1) to indicate that the posteriors are obtained in a stratified
        validation split containing a fraction p of the training instances (the rest is used for training).
        In any case, the classifier is retrained on the whole training set afterwards.
    :param n_jobs: indicate the number of parallel workers (only when val_split is an integer)
    :param verbose: whether or not to display information in the standard output
    """

    def __init__(self, estimator, val_split=5, n_jobs=1, verbose=False):
        self.estimator = estimator
        # temperature scaling with per-class bias terms yields BCTS
        self.calibrator = TempScaling(verbose=verbose, bias_positions='all')
        self.val_split = val_split
        self.n_jobs = n_jobs
        self.verbose = verbose


class TSCalibration(RecalibratedClassifierBase):
    """
    Applies the Temperature Scaling (TS) calibration method from abstention.calibration, as defined in
    `Alexandari et al. paper <http://proceedings.mlr.press/v119/alexandari20a.html>`_.

    :param estimator: a scikit-learn probabilistic classifier
    :param val_split: indicate an integer k for performing k-fold cross-validation to obtain the posterior
        probabilities, or a float p in (0,1) to indicate that the posteriors are obtained in a stratified
        validation split containing a fraction p of the training instances (the rest is used for training).
        In any case, the classifier is retrained on the whole training set afterwards.
    :param n_jobs: indicate the number of parallel workers (only when val_split is an integer)
    :param verbose: whether or not to display information in the standard output
    """

    def __init__(self, estimator, val_split=5, n_jobs=1, verbose=False):
        self.estimator = estimator
        self.calibrator = TempScaling(verbose=verbose)
        self.val_split = val_split
        self.n_jobs = n_jobs
        self.verbose = verbose


class VSCalibration(RecalibratedClassifierBase):
    """
    Applies the Vector Scaling (VS) calibration method from abstention.calibration, as defined in
    `Alexandari et al. paper <http://proceedings.mlr.press/v119/alexandari20a.html>`_.

    :param estimator: a scikit-learn probabilistic classifier
    :param val_split: indicate an integer k for performing k-fold cross-validation to obtain the posterior
        probabilities, or a float p in (0,1) to indicate that the posteriors are obtained in a stratified
        validation split containing a fraction p of the training instances (the rest is used for training).
        In any case, the classifier is retrained on the whole training set afterwards.
    :param n_jobs: indicate the number of parallel workers (only when val_split is an integer)
    :param verbose: whether or not to display information in the standard output
    """

    def __init__(self, estimator, val_split=5, n_jobs=1, verbose=False):
        self.estimator = estimator
        self.calibrator = VectorScaling(verbose=verbose)
        self.val_split = val_split
        self.n_jobs = n_jobs
        self.verbose = verbose
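
A minimal usage sketch of the new wrappers (the module path quapy.classification.calibration is an assumption based on the import statement in the next file; the data are synthetic):

from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from quapy.classification.calibration import BCTSCalibration  # assumed module path

X, y = make_classification(n_samples=1000, random_state=0)
clf = BCTSCalibration(LogisticRegression(), val_split=5)  # 5-fold CV to fit the calibrator
clf.fit(X, y)
calibrated_posteriors = clf.predict_proba(X)  # posteriors after BCTS calibration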

View File

@@ -10,7 +10,8 @@ from sklearn.model_selection import StratifiedKFold, cross_val_predict
 from tqdm import tqdm
 import quapy as qp
 import quapy.functional as F
-from classification.calibration import RecalibratedClassifier
+from classification.calibration import RecalibratedClassifier, NBVSCalibration, BCTSCalibration, TSCalibration, \
+    VSCalibration
 from quapy.classification.svmperf import SVMperf
 from quapy.data import LabelledCollection
 from quapy.method.base import BaseQuantifier, BinaryQuantifier
@@ -138,8 +139,11 @@ class AggregativeProbabilisticQuantifier(AggregativeQuantifier):
             else:
                 key_prefix = 'base_estimator__'
             parameters = {key_prefix + k: v for k, v in parameters.items()}
+        elif isinstance(self.learner, RecalibratedClassifier):
+            parameters = {'estimator__' + k: v for k, v in parameters.items()}
         self.learner.set_params(**parameters)
+        return self
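
As a quick illustration of the parameter routing this branch enables, sklearn's nested-parameter syntax forwards 'estimator__'-prefixed parameters down to the wrapped classifier (module path assumed, as above):

from sklearn.linear_model import LogisticRegression
from quapy.classification.calibration import TSCalibration  # assumed module path

calibrated = TSCalibration(LogisticRegression())
calibrated.set_params(estimator__C=10)  # routed to the wrapped LogisticRegression
print(calibrated.estimator.C)  # -> 10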
# Helper
@@ -511,22 +515,38 @@ class EMQ(AggregativeProbabilisticQuantifier):
         or set to False for computing the training prevalence as an estimate, akin to PCC, i.e., as the expected
         value of the posterior probabilities of the training instances as suggested in
         `Alexandari et al. paper <http://proceedings.mlr.press/v119/alexandari20a.html>`_:
+    :param recalib: a string indicating the method of recalibration. Available choices include "nbvs" (No-Bias Vector
+        Scaling), "bcts" (Bias-Corrected Temperature Scaling), "ts" (Temperature Scaling), and "vs" (Vector Scaling).
+        The default value is None, indicating no recalibration.
     """
 
     MAX_ITER = 1000
     EPSILON = 1e-4
 
-    def __init__(self, learner: BaseEstimator, exact_train_prev=True):
+    def __init__(self, learner: BaseEstimator, exact_train_prev=True, recalib=None):
         self.learner = learner
         self.exact_train_prev = exact_train_prev
+        self.recalib = recalib
 
     def fit(self, data: LabelledCollection, fit_learner=True):
+        if self.recalib is not None:
+            if self.recalib == 'nbvs':
+                self.learner = NBVSCalibration(self.learner)
+            elif self.recalib == 'bcts':
+                self.learner = BCTSCalibration(self.learner)
+            elif self.recalib == 'ts':
+                self.learner = TSCalibration(self.learner)
+            elif self.recalib == 'vs':
+                self.learner = VSCalibration(self.learner)
+            else:
+                raise ValueError('invalid param argument for recalibration method; available ones are '
+                                 '"nbvs", "bcts", "ts", and "vs".')
         self.learner, _ = _training_helper(self.learner, data, fit_learner, ensure_probabilistic=True)
         if self.exact_train_prev:
             self.train_prevalence = F.prevalence_from_labels(data.labels, self.classes_)
         else:
             self.train_prevalence = qp.model_selection.cross_val_predict(
-                quantifier=PCC(clone(self.learner)),
+                quantifier=PCC(deepcopy(self.learner)),
                 data=data,
                 nfolds=3,
                 random_state=0
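
A usage sketch of the new recalib parameter (synthetic data; the module path is an assumption, the EMQ API is as shown in this diff):

import numpy as np
from sklearn.linear_model import LogisticRegression
from quapy.data import LabelledCollection
from quapy.method.aggregative import EMQ  # assumed module path

rng = np.random.RandomState(0)
train = LabelledCollection(rng.randn(500, 10), rng.randint(0, 2, size=500))

emq = EMQ(LogisticRegression(), recalib='bcts')  # posteriors recalibrated with BCTS before EM
emq.fit(train)
prevalence_estimate = emq.quantify(rng.randn(100, 10))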

View File

@@ -323,7 +323,6 @@ def error_by_drift(method_names, true_prevs, estim_prevs, tr_prevs,
     for vline in vlines:
         ax.axvline(vline, 0, 1, linestyle='--', color='k')
-    ax.set_xlim(min_x, max_x)
 
     if show_legend: