optimization optional, automatic download and unzip of corpus, calibration of Esuli
This commit is contained in:
parent dc810272b2
commit d1d4e08314
@@ -56,7 +56,16 @@ def main():
     Xtr, ytr, groups = feature_extractor.fit_transform(positive, negative)
 
     print('Fitting the Verificator')
-    av = AuthorshipVerificator(nfolds=10)
+    if args.C is None:
+        params = {'C': np.logspace(-4, +3, 8)}
+        C = 1.
+    else:
+        params = None
+        C = args.C
+
+    from sklearn.calibration import CalibratedClassifierCV
+
+    av = AuthorshipVerificator(C=C, params=params)
    av.fit(Xtr, ytr)
 
     if args.unknown:
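
The effect of the new branch, in isolation: with no --C given, the verificator receives a placeholder C=1. plus a grid of eight log-spaced values to search; with an explicit --C, the grid is disabled. A minimal sketch (the helper name resolve_C is hypothetical, not part of the commit):

import numpy as np

def resolve_C(args_C):
    # hypothetical helper mirroring the branch added above
    if args_C is None:
        return 1., {'C': np.logspace(-4, +3, 8)}  # optimize: search the grid
    return args_C, None                           # fixed C: skip optimization

print(resolve_C(None))  # (1.0, {'C': array([1.e-04, ..., 1.e+03])})
print(resolve_C(0.5))   # (0.5, None)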
@@ -73,6 +82,7 @@ def main():
         f1_scores.append(f1_from_counters(tp, fp, fn, tn))
+        counters.append((tp, fp, fn, tn))
         tee(f'F1 for {author} = {f1_scores[-1]:.3f}', log)
         print(f'TP={tp} FP={fp} FN={fn} TN={tn}')
 
     if args.loo:
         print(f'Computing macro- and micro-averages (discarded {discarded}/{len(args.authors)})')
@@ -88,6 +98,7 @@ def main():
 
+    log.close()
 
 
 def tee(msg, log):
     print(msg)
     log.write(f'{msg}\n')
@@ -111,6 +122,8 @@ if __name__ == '__main__':
                         help='path to the file of unknown paternity (default None)')
     parser.add_argument('--log', type=str, metavar='PATH', default='./results.txt',
                         help='path to the log file where to write the results (default ./results.txt)')
+    parser.add_argument('--C', type=float, metavar='C', default=None,
+                        help='set the parameter C (trade off between error and margin) or leave as None to optimize')
 
     args = parser.parse_args()
 
@@ -0,0 +1,19 @@
#!/usr/bin/env bash
set -x

corpus='../MedLatin'

if [ ! -d $corpus ]; then
    curl -0 http://hlt.isti.cnr.it/medlatin/MedLatin.zip -o ../MedLatin.zip
    unzip ../MedLatin.zip -d ../
    rm ../MedLatin.zip
fi

PY="python3 author_identification.py"
MedLatin1="../MedLatin/Corpora/MedLatin1"
MedLatin2="../MedLatin/Corpora/MedLatin2"
EP1="../MedLatin/Epistle/EpistolaXIII_1.txt"
EP2="../MedLatin/Epistle/EpistolaXIII_2.txt"

$PY $MedLatin1 ALL --unknown $EP1 --loo --log ./results_EP1.txt
$PY $MedLatin2 ALL --unknown $EP2 --loo --log ./results_EP2.txt
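
For reference, a Python equivalent of the download-and-extract guard above, under the same URL and directory layout (a sketch, not part of the commit):

import os
import urllib.request
import zipfile

corpus = '../MedLatin'
if not os.path.isdir(corpus):
    # fetch the zipped corpus, unpack it one level up, then remove the archive
    urllib.request.urlretrieve('http://hlt.isti.cnr.it/medlatin/MedLatin.zip',
                               '../MedLatin.zip')
    with zipfile.ZipFile('../MedLatin.zip') as zf:
        zf.extractall('..')
    os.remove('../MedLatin.zip')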
src/model.py (25 changed lines)
@@ -1,27 +1,28 @@
 from sklearn.metrics import make_scorer
 from sklearn.model_selection import GridSearchCV, LeaveOneOut, LeaveOneGroupOut, cross_val_score, StratifiedKFold
-from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
 from sklearn.svm import *
+from sklearn.linear_model import LogisticRegression
 from data.features import *
+from util.calibration import CalibratedClassifierCV
 from util.evaluation import f1, get_counters
 
 
 class AuthorshipVerificator:
 
     def __init__(self, nfolds=10,
-                 params={'C': np.logspace(-4, +4, 9), 'class_weight': ['balanced', None]},
+                 params={'C': np.logspace(-4, +3, 8)},
+                 C=1.,
                  author_name=None):
         self.nfolds = nfolds
         self.params = params
         self.author_name = author_name if author_name else 'this author'
-        self.classifier = LogisticRegression()
+        self.classifier = LogisticRegression(C=C, class_weight='balanced')
 
     def fit(self, X, y):
         y = np.asarray(y)
         positive_examples = y.sum()
-        if positive_examples >= self.nfolds:
+        if positive_examples >= self.nfolds and self.params is not None:
             print('optimizing {}'.format(self.classifier.__class__.__name__))
-            folds = list(StratifiedKFold(n_splits=self.nfolds).split(X, y))
+            folds = list(StratifiedKFold(n_splits=self.nfolds, shuffle=True, random_state=42).split(X, y))
             self.estimator = GridSearchCV(
                 self.classifier, param_grid=self.params, cv=folds, scoring=make_scorer(f1), n_jobs=-1
             )
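
The optimization path above, reproduced with stock scikit-learn pieces so it can run stand-alone; sklearn's f1_score stands in for the repository's util.evaluation.f1, and the data is synthetic (a sketch, not the repository's pipeline):

import numpy as np
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import make_scorer, f1_score
from sklearn.model_selection import GridSearchCV, StratifiedKFold

X, y = make_classification(n_samples=200, random_state=0)
# same reshuffled, seeded folds as the changed line above
folds = list(StratifiedKFold(n_splits=10, shuffle=True, random_state=42).split(X, y))
search = GridSearchCV(LogisticRegression(C=1., class_weight='balanced'),
                      param_grid={'C': np.logspace(-4, +3, 8)},
                      cv=folds, scoring=make_scorer(f1_score), n_jobs=-1)
search.fit(X, y)
print(search.best_params_)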
@@ -35,6 +36,9 @@ class AuthorshipVerificator:
             print(f'Best params: {self.estimator.best_params_} (cross-validation F1={f1_mean:.3f})')
             self.estimator = self.estimator.best_estimator_
 
+        self.estimator = CalibratedClassifierCV(base_estimator=self.estimator, cv=self.nfolds, ensemble=False)
+        self.estimator.fit(X, y)
+
         return self
 
     def leave_one_out(self, X, y, files, groups=None, test_lowest_index_only=True, counters=False):
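
What the two added lines buy: after the (now optional) grid search, the selected classifier is wrapped so that predict_proba returns sigmoid-calibrated posteriors, with ensemble=False fitting a single sigmoid on out-of-fold scores. A usage sketch, assuming the vendored module added below is importable as util.calibration under a compatible scikit-learn version:

from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from util.calibration import CalibratedClassifierCV  # vendored copy added by this commit

X, y = make_classification(n_samples=100, random_state=0)
base = LogisticRegression(class_weight='balanced')
calibrated = CalibratedClassifierCV(base_estimator=base, cv=10, ensemble=False)
calibrated.fit(X, y)
print(calibrated.predict_proba(X[:3]))  # calibrated posteriors, one row per sample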
@@ -50,9 +54,14 @@ class AuthorshipVerificator:
             folds = [(train, np.min(test, keepdims=True)) for train, test in folds]
 
         scores = cross_val_score(self.estimator, X, y, cv=folds, scoring=make_scorer(f1), n_jobs=-1)
-        missclassified = '\n'.join(files[scores == 0].tolist())
+        missclassified = files[scores == 0].tolist()
+        #if hasattr(self.estimator, 'predict_proba') and len(missclassified) > 0:
+        #    missclassified_prob = self.estimator.predict_proba(csr_matrix(X)[scores == 0])[:, 1]
+        #    missclassified_prob = missclassified_prob.flatten().tolist()
+        #    missclassified = [f'{file} Pr={prob:.3f}' for file, prob in zip(missclassified,missclassified_prob)]
         print('missclassified texts:')
-        print(missclassified)
+        print('\n'.join(missclassified))
 
+
         if counters and test_lowest_index_only:
             yfull_true = y[:len(folds)]
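
The leave-one-out logic keeps, for grouped folds, only the lowest test index; misclassified files are those whose single-document fold scores 0. A toy stand-alone version of that scoring loop (sklearn's f1_score replaces the repository's util.evaluation.f1, so correctly rejected negatives also score 0 here and raise undefined-metric warnings):

import numpy as np
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import make_scorer, f1_score
from sklearn.model_selection import LeaveOneOut, cross_val_score

X, y = make_classification(n_samples=30, random_state=0)
# one fold per document; np.min(te, keepdims=True) mirrors the line above
folds = [(tr, np.min(te, keepdims=True)) for tr, te in LeaveOneOut().split(X)]
scores = cross_val_score(LogisticRegression(), X, y, cv=folds,
                         scoring=make_scorer(f1_score), n_jobs=-1)
print((scores == 0).sum(), 'folds scored zero out of', len(scores))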

@@ -0,0 +1,589 @@
"""Calibration of predicted probabilities."""
|
||||
|
||||
# Author: Alexandre Gramfort <alexandre.gramfort@telecom-paristech.fr>
|
||||
# Balazs Kegl <balazs.kegl@gmail.com>
|
||||
# Jan Hendrik Metzen <jhm@informatik.uni-bremen.de>
|
||||
# Mathieu Blondel <mathieu@mblondel.org>
|
||||
#
|
||||
# License: BSD 3 clause
|
||||
|
||||
import warnings
|
||||
from inspect import signature
|
||||
|
||||
from math import log
|
||||
import numpy as np
|
||||
|
||||
from scipy.special import expit
|
||||
from scipy.special import xlogy
|
||||
from scipy.optimize import fmin_bfgs
|
||||
from sklearn.preprocessing import LabelEncoder
|
||||
|
||||
from sklearn.base import (BaseEstimator, ClassifierMixin, RegressorMixin, clone,
|
||||
MetaEstimatorMixin)
|
||||
from sklearn.preprocessing import label_binarize, LabelBinarizer
|
||||
from sklearn.utils import check_array, indexable, column_or_1d
|
||||
from sklearn.utils.validation import check_is_fitted, check_consistent_length
|
||||
from sklearn.utils.validation import _check_sample_weight
|
||||
from sklearn.isotonic import IsotonicRegression
|
||||
from sklearn.svm import LinearSVC
|
||||
from sklearn.model_selection import check_cv, cross_val_predict
|
||||
from sklearn.utils.validation import _deprecate_positional_args
|
||||
|
||||
|
||||


class CalibratedClassifierCV(BaseEstimator, ClassifierMixin,
                             MetaEstimatorMixin):
    """Probability calibration with isotonic regression or logistic regression.

    The calibration is based on the :term:`decision_function` method of the
    `base_estimator` if it exists, else on :term:`predict_proba`.

    Read more in the :ref:`User Guide <calibration>`.

    Parameters
    ----------
    base_estimator : instance BaseEstimator
        The classifier whose output needs to be calibrated to provide more
        accurate `predict_proba` outputs.

    method : 'sigmoid' or 'isotonic'
        The method to use for calibration. Can be 'sigmoid' which
        corresponds to Platt's method (i.e. a logistic regression model) or
        'isotonic' which is a non-parametric approach. It is not advised to
        use isotonic calibration with too few calibration samples
        ``(<<1000)`` since it tends to overfit.

    cv : integer, cross-validation generator, iterable or "prefit", optional
        Determines the cross-validation splitting strategy.
        Possible inputs for cv are:

        - None, to use the default 5-fold cross-validation,
        - integer, to specify the number of folds.
        - :term:`CV splitter`,
        - An iterable yielding (train, test) splits as arrays of indices.

        For integer/None inputs, if ``y`` is binary or multiclass,
        :class:`sklearn.model_selection.StratifiedKFold` is used. If ``y`` is
        neither binary nor multiclass, :class:`sklearn.model_selection.KFold`
        is used.

        Refer :ref:`User Guide <cross_validation>` for the various
        cross-validation strategies that can be used here.

        If "prefit" is passed, it is assumed that `base_estimator` has been
        fitted already and all data is used for calibration.

        .. versionchanged:: 0.22
            ``cv`` default value if None changed from 3-fold to 5-fold.

    ensemble : bool, optional
        When ``cv`` is not "prefit", it determines how the final estimator
        is fit.

        If ``ensemble`` is True (default), an estimator (clone of
        base_estimator) is fit and calibrated on each fold. The final
        estimator is an ensemble that averages the predicted probabilities
        of all such estimators.

        When ``ensemble`` is False, the cross validation generator is used to
        compute predictions (using :func:`cross_val_predict`), and the union
        of these predictions is used for training the sigmoid or isotonic
        model. The ``base_estimator`` is then fit on the whole data.

    Attributes
    ----------
    classes_ : array, shape (n_classes)
        The class labels.

    calibrated_classifiers_ : list (len() equal to cv or 1 if cv == "prefit" \
            or ensemble == False)
        When ``ensemble`` is True, a list of calibrated classifiers, one for
        each crossvalidation fold, which has been fitted on all but the
        validation fold and calibrated on the validation fold.

        When ``ensemble`` is False, or when using "prefit", the list contains
        the single calibrated classifier produced by the fit method.

    References
    ----------
    .. [1] Obtaining calibrated probability estimates from decision trees
           and naive Bayesian classifiers, B. Zadrozny & C. Elkan, ICML 2001

    .. [2] Transforming Classifier Scores into Accurate Multiclass
           Probability Estimates, B. Zadrozny & C. Elkan, (KDD 2002)

    .. [3] Probabilistic Outputs for Support Vector Machines and Comparisons to
           Regularized Likelihood Methods, J. Platt, (1999)

    .. [4] Predicting Good Probabilities with Supervised Learning,
           A. Niculescu-Mizil & R. Caruana, ICML 2005
    """
    @_deprecate_positional_args
    def __init__(self, base_estimator=None, *, method='sigmoid', cv=None,
                 ensemble=True):
        self.base_estimator = base_estimator
        self.method = method
        self.cv = cv
        self.ensemble = ensemble
    def fit(self, X, y, sample_weight=None):
        """Fit the calibrated model

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            Training data.

        y : array-like, shape (n_samples,)
            Target values.

        sample_weight : array-like of shape (n_samples,), default=None
            Sample weights. If None, then samples are equally weighted.

        Returns
        -------
        self : object
            Returns an instance of self.
        """
        X, y = indexable(X, y)
        le = LabelBinarizer().fit(y)
        self.classes_ = le.classes_

        # Check that each cross-validation fold can have at least one
        # example per class
        n_folds = self.cv if isinstance(self.cv, int) \
            else self.cv.n_folds if hasattr(self.cv, "n_folds") else None
        if n_folds and \
                np.any([np.sum(y == class_) < n_folds for class_ in
                        self.classes_]):
            raise ValueError("Requesting %d-fold cross-validation but provided"
                             " less than %d examples for at least one class."
                             % (n_folds, n_folds))

        self.calibrated_classifiers_ = []
        if self.base_estimator is None:
            # we want all classifiers that don't expose a random_state
            # to be deterministic (and we don't want to expose this one).
            base_estimator = LinearSVC(random_state=0)
        else:
            base_estimator = self.base_estimator

        if self.cv == "prefit":
            calibrated_classifier = _CalibratedClassifier(
                base_estimator, method=self.method)
            calibrated_classifier.fit(X, y, sample_weight)
            self.calibrated_classifiers_.append(calibrated_classifier)
        else:
            cv = check_cv(self.cv, y, classifier=True)
            fit_parameters = signature(base_estimator.fit).parameters
            base_estimator_supports_sw = "sample_weight" in fit_parameters

            if sample_weight is not None:
                sample_weight = _check_sample_weight(sample_weight, X)

                if not base_estimator_supports_sw:
                    estimator_name = type(base_estimator).__name__
                    warnings.warn("Since %s does not support sample_weights, "
                                  "sample weights will only be used for the "
                                  "calibration itself." % estimator_name)

            if self.ensemble:
                for train, test in cv.split(X, y):
                    this_estimator = clone(base_estimator)

                    if sample_weight is not None and \
                            base_estimator_supports_sw:
                        this_estimator.fit(X[train], y[train],
                                           sample_weight=sample_weight[train])
                    else:
                        this_estimator.fit(X[train], y[train])

                    calibrated_classifier = _CalibratedClassifier(
                        this_estimator, method=self.method,
                        classes=self.classes_)
                    sw = None if sample_weight is None else sample_weight[test]
                    calibrated_classifier.fit(X[test], y[test],
                                              sample_weight=sw)
                    self.calibrated_classifiers_.append(calibrated_classifier)
            else:
                if hasattr(base_estimator, "decision_function"):
                    base_estimator_method = "decision_function"
                elif hasattr(base_estimator, "predict_proba"):
                    base_estimator_method = "predict_proba"
                else:
                    raise RuntimeError('classifier has no decision_function '
                                       'or predict_proba method.')
                predictions = cross_val_predict(base_estimator, X, y, cv=cv,
                                                method=base_estimator_method)
                this_estimator = clone(base_estimator)
                if sample_weight is not None and base_estimator_supports_sw:
                    this_estimator.\
                        fit(X, y, sample_weight=sample_weight)
                else:
                    this_estimator.fit(X, y)
                calibrated_classifier = \
                    _CalibratedClassifier(this_estimator, method=self.method,
                                          classes=self.classes_,
                                          predictions_in_X=True)
                if hasattr(this_estimator, "decision_function"):
                    if predictions.ndim == 1:
                        predictions = predictions[:, np.newaxis]
                elif hasattr(this_estimator, "predict_proba"):
                    if len(self.classes_) == 2:
                        predictions = predictions[:, 1:]
                calibrated_classifier.fit(predictions, y, sample_weight)
                self.calibrated_classifiers_.append(calibrated_classifier)
        return self
    def predict_proba(self, X):
        """Posterior probabilities of classification

        This function returns posterior probabilities of classification
        according to each class on an array of test vectors X.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            The samples.

        Returns
        -------
        C : array, shape (n_samples, n_classes)
            The predicted probas.
        """
        check_is_fitted(self)
        X = check_array(X, accept_sparse=['csc', 'csr', 'coo'],
                        force_all_finite=False)
        # Compute the arithmetic mean of the predictions of the calibrated
        # classifiers
        mean_proba = np.zeros((X.shape[0], len(self.classes_)))
        for calibrated_classifier in self.calibrated_classifiers_:
            proba = calibrated_classifier.predict_proba(X)
            mean_proba += proba

        mean_proba /= len(self.calibrated_classifiers_)

        return mean_proba

    def predict(self, X):
        """Predict the target of new samples. The predicted class is the
        class that has the highest probability, and can thus be different
        from the prediction of the uncalibrated classifier.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            The samples.

        Returns
        -------
        C : array, shape (n_samples,)
            The predicted class.
        """
        check_is_fitted(self)
        return self.classes_[np.argmax(self.predict_proba(X), axis=1)]
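
The ensemble=False branch used by this commit hinges on cross_val_predict: every training sample receives an out-of-fold score, and a single sigmoid is then fitted on the union of those scores. The core step in isolation (a sketch with synthetic data):

from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_predict
from sklearn.svm import LinearSVC

X, y = make_classification(n_samples=100, random_state=0)
df = cross_val_predict(LinearSVC(random_state=0), X, y, cv=5,
                       method='decision_function')
print(df.shape)  # one out-of-fold decision value per training sample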

class _CalibratedClassifier:
    """Probability calibration with isotonic regression or sigmoid.

    It assumes that base_estimator has already been fit, and trains the
    calibration on the input set of the fit function. Note that this class
    should not be used as an estimator directly. Use CalibratedClassifierCV
    with cv="prefit" instead.

    Parameters
    ----------
    base_estimator : instance BaseEstimator
        The classifier whose output decision function needs to be calibrated
        to offer more accurate predict_proba outputs. No default value since
        it has to be an already fitted estimator.

    method : 'sigmoid' | 'isotonic'
        The method to use for calibration. Can be 'sigmoid' which
        corresponds to Platt's method or 'isotonic' which is a
        non-parametric approach based on isotonic regression.

    classes : array-like, shape (n_classes,), optional
        Contains unique classes used to fit the base estimator.
        if None, then classes is extracted from the given target values
        in fit().

    predictions_in_X : bool, optional
        When False (default), ``X`` contains the elements to be classified,
        and predictions are determined by applying the ``base_estimator`` to
        ``X``.
        When True, ``X`` already contains predictions.

    See also
    --------
    CalibratedClassifierCV

    References
    ----------
    .. [1] Obtaining calibrated probability estimates from decision trees
           and naive Bayesian classifiers, B. Zadrozny & C. Elkan, ICML 2001

    .. [2] Transforming Classifier Scores into Accurate Multiclass
           Probability Estimates, B. Zadrozny & C. Elkan, (KDD 2002)

    .. [3] Probabilistic Outputs for Support Vector Machines and Comparisons to
           Regularized Likelihood Methods, J. Platt, (1999)

    .. [4] Predicting Good Probabilities with Supervised Learning,
           A. Niculescu-Mizil & R. Caruana, ICML 2005
    """
    @_deprecate_positional_args
    def __init__(self, base_estimator, *, method='sigmoid', classes=None,
                 predictions_in_X=False):
        self.base_estimator = base_estimator
        self.method = method
        self.classes = classes
        self.predictions_in_X_ = predictions_in_X

    def _preproc(self, X):
        n_classes = len(self.classes_)
        if hasattr(self.base_estimator, "decision_function"):
            df = self.base_estimator.decision_function(X)
            if df.ndim == 1:
                df = df[:, np.newaxis]
        elif hasattr(self.base_estimator, "predict_proba"):
            df = self.base_estimator.predict_proba(X)
            if n_classes == 2:
                df = df[:, 1:]
        else:
            raise RuntimeError('classifier has no decision_function or '
                               'predict_proba method.')

        idx_pos_class = self.label_encoder_.\
            transform(self.base_estimator.classes_)

        return df, idx_pos_class
    def fit(self, X, y, sample_weight=None):
        """Calibrate the fitted model

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            Training data.

        y : array-like, shape (n_samples,)
            Target values.

        sample_weight : array-like of shape (n_samples,), default=None
            Sample weights. If None, then samples are equally weighted.

        Returns
        -------
        self : object
            Returns an instance of self.
        """

        self.label_encoder_ = LabelEncoder()
        if self.classes is None:
            self.label_encoder_.fit(y)
        else:
            self.label_encoder_.fit(self.classes)

        self.classes_ = self.label_encoder_.classes_
        Y = label_binarize(y, classes=self.classes_)

        if not self.predictions_in_X_:
            df, idx_pos_class = self._preproc(X)
        else:
            df = X
            idx_pos_class = \
                self.label_encoder_.transform(self.base_estimator.classes_)
        self.calibrators_ = []

        for k, this_df in zip(idx_pos_class, df.T):
            if self.method == 'isotonic':
                calibrator = IsotonicRegression(out_of_bounds='clip')
            elif self.method == 'sigmoid':
                calibrator = _SigmoidCalibration()
            else:
                raise ValueError('method should be "sigmoid" or '
                                 '"isotonic". Got %s.' % self.method)
            calibrator.fit(this_df, Y[:, k], sample_weight)
            self.calibrators_.append(calibrator)

        return self
    def predict_proba(self, X):
        """Posterior probabilities of classification

        This function returns posterior probabilities of classification
        according to each class on an array of test vectors X.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            The samples.

        Returns
        -------
        C : array, shape (n_samples, n_classes)
            The predicted probas. Can be exact zeros.
        """
        n_classes = len(self.classes_)
        proba = np.zeros((X.shape[0], n_classes))

        df, idx_pos_class = self._preproc(X)

        for k, this_df, calibrator in \
                zip(idx_pos_class, df.T, self.calibrators_):
            if n_classes == 2:
                k += 1
            proba[:, k] = calibrator.predict(this_df)

        # Normalize the probabilities
        if n_classes == 2:
            proba[:, 0] = 1. - proba[:, 1]
        else:
            proba /= np.sum(proba, axis=1)[:, np.newaxis]

        # XXX : for some reason all probas can be 0
        proba[np.isnan(proba)] = 1. / n_classes

        # Deal with cases where the predicted probability minimally exceeds 1.0
        proba[(1.0 < proba) & (proba <= 1.0 + 1e-5)] = 1.0

        return proba

def _sigmoid_calibration(df, y, sample_weight=None):
    """Probability Calibration with sigmoid method (Platt 2000)

    Parameters
    ----------
    df : ndarray, shape (n_samples,)
        The decision function or predict proba for the samples.

    y : ndarray, shape (n_samples,)
        The targets.

    sample_weight : array-like of shape (n_samples,), default=None
        Sample weights. If None, then samples are equally weighted.

    Returns
    -------
    a : float
        The slope.

    b : float
        The intercept.

    References
    ----------
    Platt, "Probabilistic Outputs for Support Vector Machines"
    """
    df = column_or_1d(df)
    y = column_or_1d(y)

    F = df  # F follows Platt's notations

    # Bayesian priors (see Platt end of section 2.2)
    prior0 = float(np.sum(y <= 0))
    prior1 = y.shape[0] - prior0
    T = np.zeros(y.shape)
    T[y > 0] = (prior1 + 1.) / (prior1 + 2.)
    T[y <= 0] = 1. / (prior0 + 2.)
    T1 = 1. - T

    def objective(AB):
        # From Platt (beginning of Section 2.2)
        P = expit(-(AB[0] * F + AB[1]))
        loss = -(xlogy(T, P) + xlogy(T1, 1. - P))
        if sample_weight is not None:
            return (sample_weight * loss).sum()
        else:
            return loss.sum()

    def grad(AB):
        # gradient of the objective function
        P = expit(-(AB[0] * F + AB[1]))
        TEP_minus_T1P = T - P
        if sample_weight is not None:
            TEP_minus_T1P *= sample_weight
        dA = np.dot(TEP_minus_T1P, F)
        dB = np.sum(TEP_minus_T1P)
        return np.array([dA, dB])

    AB0 = np.array([0., log((prior0 + 1.) / (prior1 + 1.))])
    AB_ = fmin_bfgs(objective, AB0, fprime=grad, disp=False)
    return AB_[0], AB_[1]
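
A worked Platt-scaling example using the helper above (assumes this module is importable as util.calibration; the synthetic scores are illustration only):

import numpy as np
from scipy.special import expit
from util.calibration import _sigmoid_calibration

rng = np.random.RandomState(0)
# raw classifier scores: negatives centered at -2, positives at +2
scores = np.concatenate([rng.normal(-2, 1, 50), rng.normal(+2, 1, 50)])
labels = np.concatenate([np.zeros(50), np.ones(50)])
a, b = _sigmoid_calibration(scores, labels)
probs = expit(-(a * scores + b))  # P(y=1 | score), monotone in the raw score
print(a, b, probs.min(), probs.max())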

class _SigmoidCalibration(RegressorMixin, BaseEstimator):
    """Sigmoid regression model.

    Attributes
    ----------
    a_ : float
        The slope.

    b_ : float
        The intercept.
    """
    def fit(self, X, y, sample_weight=None):
        """Fit the model using X, y as training data.

        Parameters
        ----------
        X : array-like, shape (n_samples,)
            Training data.

        y : array-like, shape (n_samples,)
            Training target.

        sample_weight : array-like of shape (n_samples,), default=None
            Sample weights. If None, then samples are equally weighted.

        Returns
        -------
        self : object
            Returns an instance of self.
        """
        X = column_or_1d(X)
        y = column_or_1d(y)
        X, y = indexable(X, y)

        self.a_, self.b_ = _sigmoid_calibration(X, y, sample_weight)
        return self

    def predict(self, T):
        """Predict new data by linear interpolation.

        Parameters
        ----------
        T : array-like, shape (n_samples,)
            Data to predict from.

        Returns
        -------
        T_ : array, shape (n_samples,)
            The predicted data.
        """
        T = column_or_1d(T)
        return expit(-(self.a_ * T + self.b_))

@_deprecate_positional_args
def calibration_curve(y_true, y_prob, *, normalize=False, n_bins=5,
                      strategy='uniform'):
    """Compute true and predicted probabilities for a calibration curve.

    The method assumes the inputs come from a binary classifier, and
    discretizes the [0, 1] interval into bins.

    Calibration curves may also be referred to as reliability diagrams.

    Read more in the :ref:`User Guide <calibration>`.

    Parameters
    ----------
    y_true : array-like of shape (n_samples,)
        True targets.

    y_prob : array-like of shape (n_samples,)
        Probabilities of the positive class.

    normalize : bool, default=False
        Whether y_prob needs to be normalized into the [0, 1] interval, i.e.
        is not a proper probability. If True, the smallest value in y_prob
        is linearly mapped onto 0 and the largest one onto 1.

    n_bins : int, default=5
        Number of bins to discretize the [0, 1] interval. A bigger number
        requires more data. Bins with no samples (i.e. without
        corresponding values in `y_prob`) will not be returned, thus the
        returned arrays may have fewer than `n_bins` values.

    strategy : {'uniform', 'quantile'}, default='uniform'
        Strategy used to define the widths of the bins.

        uniform
            The bins have identical widths.
        quantile
            The bins have the same number of samples and depend on `y_prob`.

    Returns
    -------
    prob_true : ndarray of shape (n_bins,) or smaller
        The proportion of samples whose class is the positive class, in each
        bin (fraction of positives).

    prob_pred : ndarray of shape (n_bins,) or smaller
        The mean predicted probability in each bin.

    References
    ----------
    Alexandru Niculescu-Mizil and Rich Caruana (2005) Predicting Good
    Probabilities With Supervised Learning, in Proceedings of the 22nd
    International Conference on Machine Learning (ICML).
    See section 4 (Qualitative Analysis of Predictions).
    """
    y_true = column_or_1d(y_true)
    y_prob = column_or_1d(y_prob)
    check_consistent_length(y_true, y_prob)

    if normalize:  # Normalize predicted values into interval [0, 1]
        y_prob = (y_prob - y_prob.min()) / (y_prob.max() - y_prob.min())
    elif y_prob.min() < 0 or y_prob.max() > 1:
        raise ValueError("y_prob has values outside [0, 1] and normalize is "
                         "set to False.")

    labels = np.unique(y_true)
    if len(labels) > 2:
        raise ValueError("Only binary classification is supported. "
                         "Provided labels %s." % labels)
    y_true = label_binarize(y_true, classes=labels)[:, 0]

    if strategy == 'quantile':  # Determine bin edges by distribution of data
        quantiles = np.linspace(0, 1, n_bins + 1)
        bins = np.percentile(y_prob, quantiles * 100)
        bins[-1] = bins[-1] + 1e-8
    elif strategy == 'uniform':
        bins = np.linspace(0., 1. + 1e-8, n_bins + 1)
    else:
        raise ValueError("Invalid entry to 'strategy' input. Strategy "
                         "must be either 'quantile' or 'uniform'.")

    binids = np.digitize(y_prob, bins) - 1

    bin_sums = np.bincount(binids, weights=y_prob, minlength=len(bins))
    bin_true = np.bincount(binids, weights=y_true, minlength=len(bins))
    bin_total = np.bincount(binids, minlength=len(bins))

    nonzero = bin_total != 0
    prob_true = bin_true[nonzero] / bin_total[nonzero]
    prob_pred = bin_sums[nonzero] / bin_total[nonzero]

    return prob_true, prob_pred
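
A quick reliability check with the calibration_curve helper above (a sketch; the probabilities are hand-picked for illustration, and the import assumes this module's path):

import numpy as np
from util.calibration import calibration_curve

y_true = np.array([0, 0, 0, 1, 1, 1, 1, 1])
y_prob = np.array([0.1, 0.2, 0.35, 0.4, 0.65, 0.7, 0.8, 0.9])
prob_true, prob_pred = calibration_curve(y_true, y_prob, n_bins=4)
print(prob_true)  # fraction of positives per bin
print(prob_pred)  # mean predicted probability per bin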