# Forked from moreo/QuaPy (179 lines, 6.4 KiB, Python).
import numpy as np
|
|
import quapy as qp
|
|
from sklearn import clone
|
|
from sklearn.metrics import confusion_matrix
|
|
import scipy
|
|
from scipy.sparse import issparse, csr_matrix
|
|
from data import LabelledCollection
|
|
from abc import ABC, abstractmethod
|
|
from sklearn.model_selection import cross_val_predict
|
|
|
|
|
|
class ConfusionMatrixPredictor(ABC):
    """
    Base interface for methods that predict the confusion matrix describing the performance of a classifier.

    In the binary case, this amounts to predicting the four cells of the contingency table, i.e., the
    true positives (TP), true negatives (TN), false positives (FP), and false negatives (FN), on top of
    which most evaluation metrics are defined.
    """

    @abstractmethod
    def fit(self, train: LabelledCollection):
        """
        Fit the predictor on labelled training data.

        :param train: the labelled training collection
        """
        ...

    @abstractmethod
    def predict(self, test):
        """
        Predict the confusion matrix for the given test data.

        :param test: the test data
        """
        ...
|
|
|
|
|
|
class MLCMEstimator(ConfusionMatrixPredictor):
    """
    The Maximum Likelihood Confusion Matrix Estimator is a method that relies on the IID assumption, and thus
    computes, via k-FCV (or any other technique), the counters of the confusion matrix, assuming that those
    are good estimates for the test case.

    Recognised keyword arguments (read from `kwargs`):

    - `k`: number of folds, mandatory for strategy "kfcv"
    - `n_jobs`: forwarded to `cross_val_predict` (default 1)
    - `predict`: name of the classifier method used by `cross_val_predict` (default 'predict')
    - `verbose`: if truthy, progress messages are printed
    """

    def __init__(self, classifier, strategy='kfcv', **kwargs):
        """
        :param classifier: the classifier whose confusion matrix is to be estimated
        :param strategy: the estimation strategy; only 'kfcv' is currently supported
        :param kwargs: strategy-specific options (see the class documentation)
        """
        assert strategy in ['kfcv'], 'unknown strategy'
        if strategy == 'kfcv':
            assert 'k' in kwargs, 'strategy "kfcv" requires "k" to be passed as an argument'
        self.classifier = classifier
        self.strategy = strategy
        self.kwargs = kwargs

    def sout(self, msg):
        """
        Print `msg` only when verbose mode is enabled.

        :param msg: the message to print
        """
        # check the value, not just the presence of the key, so that verbose=False silences the output
        if self.kwargs.get('verbose', False):
            print(msg)

    def fit(self, train: LabelledCollection):
        """
        Estimate the confusion matrix on the training data and store it in `self.conf_matrix`.

        :param train: the labelled training collection
        :return: self
        """
        X, y = train.Xy
        if self.strategy == 'kfcv':
            k = self.kwargs['k']
            n_jobs = self.kwargs.get('n_jobs', 1)
            predict = self.kwargs.get('predict', 'predict')
            self.sout(f'{self.__class__.__name__}: '
                      f'running cross_val_predict with k={k} n_jobs={n_jobs} predict={predict}')
            # NOTE(review): the predictions are passed to confusion_matrix as they are, so `predict` presumably
            # has to name a method that returns hard labels -- confirm before using e.g. 'predict_proba'
            predictions = cross_val_predict(self.classifier, X, y, cv=k, n_jobs=n_jobs, method=predict)
            self.conf_matrix = confusion_matrix(y, predictions, labels=train.classes_)
        return self

    def predict(self, test):
        """
        This method disregards the test set, under the assumption that it is IID wrt the training. This means
        that the confusion matrix for the test data should coincide with the one computed for training (using
        any cross validation strategy).

        :param test: test collection (ignored)
        :return: a confusion matrix in the return format of `sklearn.metrics.confusion_matrix`
        """
        return self.conf_matrix
|
|
|
|
|
|
class UpperBound(ConfusionMatrixPredictor):
    """
    Predictor that is handed the true labels of the test data, and that therefore returns the confusion
    matrix obtained by comparing those labels against the predictions of the classifier.
    """

    def __init__(self, classifier, y_test):
        """
        :param classifier: the classifier to be trained in `fit` and used in `predict`
        :param y_test: the true labels of the test data
        """
        self.classifier = classifier
        self.y_test = y_test

    def fit(self, train: LabelledCollection):
        """
        Train the classifier and remember the classes of the training collection.

        :param train: the labelled training collection
        :return: self
        """
        X, y = train.Xy
        self.classifier.fit(X, y)
        self.classes = train.classes_
        return self

    def show_true_labels(self, y_test):
        """
        Replace the true test labels used by `predict`.

        :param y_test: the true labels of the test data
        """
        self.y_test = y_test

    def predict(self, test):
        """
        Classify the test data and compare the outcome against the stored true labels.

        :param test: the test instances
        :return: a confusion matrix in the return format of `sklearn.metrics.confusion_matrix`
        """
        y_hat = self.classifier.predict(test)
        return confusion_matrix(self.y_test, y_hat, labels=self.classes)
|
|
|
|
|
|
def get_counters(y_true, y_pred):
    """
    Map every (true label, predicted label) pair of a binary problem to the index of the confusion-matrix
    cell it falls in.

    :param y_true: array-like with the true labels (1 for positive, 0 for negative)
    :param y_pred: array-like with the predicted labels (1 for positive, 0 for negative)
    :return: a tuple `(counters, class_map)`, where `counters` is an array with the same shape as `y_true`
        holding 0 (tp), 1 (fn), 2 (fp), or 3 (tn) for every position (-1 wherever a label is neither 0
        nor 1), and `class_map` is the dictionary that maps those indexes to the cell names
    """
    # accept any array-like (e.g., plain lists), for which `.shape` and elementwise `==` would not work
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    true_pos, true_neg = (y_true == 1), (y_true == 0)
    pred_pos, pred_neg = (y_pred == 1), (y_pred == 0)

    # -1 marks the positions that match none of the four cells
    counters = np.full(shape=y_true.shape, fill_value=-1)
    counters[np.logical_and(true_pos, pred_pos)] = 0
    counters[np.logical_and(true_pos, pred_neg)] = 1
    counters[np.logical_and(true_neg, pred_pos)] = 2
    counters[np.logical_and(true_neg, pred_neg)] = 3

    class_map = {
        0: 'tp',
        1: 'fn',
        2: 'fp',
        3: 'tn',
    }
    return counters, class_map
|
|
|
|
|
|
def safehstack(matrix, posteriors):
    """
    Horizontally stack `posteriors` to the right of `matrix`, coping with sparse matrices.

    :param matrix: the left-hand block; may be a scipy sparse matrix or a dense array
    :param posteriors: the right-hand block, with as many rows as `matrix`
    :return: a `csr_matrix` if `matrix` is sparse, otherwise the outcome of `np.hstack`
    """
    if not issparse(matrix):
        return np.hstack([matrix, posteriors])
    stacked = scipy.sparse.hstack([matrix, posteriors])
    return csr_matrix(stacked)
|
|
|
|
|
|
class QuantificationCMPredictor(ConfusionMatrixPredictor):
    """
    Predicts the cells of the binary confusion matrix of a classifier by means of a quantifier.

    During `fit`, every training instance is labelled with the cell (tp, fn, fp, tn) in which its
    cross-validated prediction falls, and the quantifier is trained on the instances (extended with the
    cross-validated posterior probabilities) using those cells as the classes. At prediction time, the
    quantifier's estimate for the four cells is arranged as a confusion matrix.

    Recognised keyword arguments (read from `kwargs`):

    - `k`: number of folds, mandatory for strategy "kfcv"
    - `n_jobs`: forwarded to `cross_val_predict` (default 1)
    - `verbose`: if truthy, progress messages are printed
    """

    def __init__(self, classifier, quantifier, strategy='kfcv', **kwargs):
        """
        :param classifier: the classifier whose confusion matrix is to be predicted (a clone of it is used)
        :param quantifier: the quantifier to be trained on the four confusion-matrix cells
        :param strategy: the strategy for generating training predictions; only 'kfcv' is currently supported
        :param kwargs: strategy-specific options (see the class documentation)
        """
        assert strategy in ['kfcv'], 'unknown strategy'
        if strategy == 'kfcv':
            assert 'k' in kwargs, 'strategy "kfcv" requires "k" to be passed as an argument'
        self.classifier = clone(classifier)
        self.quantifier = quantifier
        self.strategy = strategy
        self.kwargs = kwargs

    def sout(self, msg):
        """
        Print `msg` only when verbose mode is enabled.

        :param msg: the message to print
        """
        # check the value, not just the presence of the key, so that verbose=False silences the output
        if self.kwargs.get('verbose', False):
            print(msg)

    def fit(self, train: LabelledCollection):
        """
        Train the classifier on the whole training set and the quantifier on the confusion-matrix cells
        derived from the cross-validated predictions.

        :param train: the labelled training collection (binary labels 0/1 are expected by `get_counters`)
        :return: self
        """
        X, y = train.Xy
        if self.strategy == 'kfcv':
            k = self.kwargs['k']
            n_jobs = self.kwargs.get('n_jobs', 1)
            self.sout(f'{self.__class__.__name__}: '
                      f'running cross_val_predict with k={k} n_jobs={n_jobs}')
            predictions = cross_val_predict(self.classifier, X, y, cv=k, n_jobs=n_jobs, method='predict')
            posteriors = cross_val_predict(self.classifier, X, y, cv=k, n_jobs=n_jobs, method='predict_proba')

        # the final classifier, the one used at prediction time, is trained on all the training data
        self.classifier.fit(X, y)

        # the quantifier sees the original covariates extended with the posterior probabilities
        instances = safehstack(train.instances, posteriors)
        counters, class_map = get_counters(train.labels, predictions)
        q_data = LabelledCollection(instances=instances, labels=counters, classes_=[0, 1, 2, 3])
        # diagnostic output, shown only in verbose mode
        self.sout(f'counters prevalence {q_data.counts()}')
        self.quantifier.fit(q_data)
        return self

    def _estimate_counters(self, test):
        """
        Run the quantifier on the test instances extended with the classifier's posterior probabilities.

        :param test: the test instances
        :return: the output of the quantifier, with one value per cell in the order (tp, fn, fp, tn)
        """
        posteriors = self.classifier.predict_proba(test)
        instances = safehstack(test, posteriors)
        return self.quantifier.quantify(instances)

    def predict(self, test):
        """
        Predict the confusion matrix of the classifier for the test instances.

        :param test: the test instances
        :return: a confusion matrix laid out as `[[tn, fp], [fn, tp]]`, i.e., in the return format of
            `sklearn.metrics.confusion_matrix`; the cells hold the values estimated by the quantifier
            (presumably prevalence values rather than absolute counts -- TODO confirm)
        """
        tp, fn, fp, tn = self._estimate_counters(test)
        return np.asarray([[tn, fp], [fn, tp]])

    def quantify(self, test):
        """
        Estimate the share of positive instances in the test data as the sum of the estimated true positives
        and false negatives.

        :param test: the test instances
        :return: the value `tp + fn` as estimated by the quantifier
        """
        # only the cells of the truly positive instances are needed; an adjusted estimate based on
        # tpr/fpr used to be computed here but was never returned, and has been dropped
        tp, fn, _fp, _tn = self._estimate_counters(test)
        return tp + fn