QuaPy/ClassifierAccuracy/deprecated/models_binary.py

import numpy as np
import quapy as qp
from sklearn import clone
from sklearn.metrics import confusion_matrix
import scipy
from scipy.sparse import issparse, csr_matrix
from data import LabelledCollection
from abc import ABC, abstractmethod
from sklearn.model_selection import cross_val_predict


class ConfusionMatrixPredictor(ABC):
    """
    Common interface of the methods that predict the confusion matrix describing the performance of a
    classifier. In the binary case, this amounts to predicting the 4-cell contingency table made of the
    true positives (TP), true negatives (TN), false positives (FP), and false negatives (FN), i.e., the
    counters most evaluation metrics are computed from.
    """

    @abstractmethod
    def fit(self, train: LabelledCollection):
        """
        Trains the predictor; to be implemented by concrete subclasses.

        :param train: the labelled collection to learn from
        """

    @abstractmethod
    def predict(self, test):
        """
        Predicts the confusion matrix for a test set; to be implemented by concrete subclasses.

        :param test: the test data
        """


class MLCMEstimator(ConfusionMatrixPredictor):
    """
    The Maximum Likelihood Confusion Matrix Estimator is a method that relies on the IID assumption, and thus
    computes, via k-FCV (or any other technique) the counters of the confusion matrix, assuming that those are
    good estimates for the test case.

    :param classifier: the classifier whose confusion matrix is to be estimated; it is handed over to
        `cross_val_predict` during `fit`
    :param strategy: the estimation strategy; only 'kfcv' is currently supported
    :param kwargs: strategy-specific options. For 'kfcv': `k` (mandatory, passed as `cv` to
        `cross_val_predict`), `n_jobs` (default 1), `predict` (name of the classifier method used to issue
        predictions, default 'predict'), and `verbose` (messages are printed only if it is truthy)
    """
    def __init__(self, classifier, strategy='kfcv', **kwargs):
        assert strategy in ['kfcv'], 'unknown strategy'
        if strategy == 'kfcv':
            assert 'k' in kwargs, 'strategy "kfcv" requires "k" to be passed as an argument'
        self.classifier = classifier
        self.strategy = strategy
        self.kwargs = kwargs

    def sout(self, msg):
        """
        Prints a message only if verbosity has been requested.

        :param msg: the message to print
        """
        # check the value of the flag, not its mere presence, so that verbose=False is actually silent
        if self.kwargs.get('verbose', False):
            print(msg)

    def fit(self, train: LabelledCollection):
        """
        Computes the confusion matrix on the training data by means of cross-validated predictions.

        :param train: the labelled collection from which the confusion matrix is estimated
        :return: self
        """
        X, y = train.Xy
        if self.strategy == 'kfcv':
            k = self.kwargs['k']
            n_jobs = self.kwargs.get('n_jobs', 1)
            predict = self.kwargs.get('predict', 'predict')
            self.sout(f'{self.__class__.__name__}: '
                      f'running cross_val_predict with k={k} n_jobs={n_jobs} predict={predict}')
            predictions = cross_val_predict(self.classifier, X, y, cv=k, n_jobs=n_jobs, method=predict)
            self.conf_matrix = confusion_matrix(y, predictions, labels=train.classes_)
        return self

    def predict(self, test):
        """
        This method disregards the test set, under the assumption that it is IID wrt the training. This means that
        the confusion matrix for the test data should coincide with the one computed for training (using any cross
        validation strategy).

        :param test: test collection (ignored)
        :return: a confusion matrix in the return format of `sklearn.metrics.confusion_matrix`
        """
        return self.conf_matrix


class UpperBound(ConfusionMatrixPredictor):
    """
    Predictor that is given the true labels of the test set, and that therefore returns the confusion
    matrix computed from the classifier predictions and those true labels.

    :param classifier: the classifier to be trained in `fit` and used in `predict`
    :param y_test: the true labels of the test instances
    """
    def __init__(self, classifier, y_test):
        self.classifier, self.y_test = classifier, y_test

    def fit(self, train: LabelledCollection):
        """
        Trains the classifier and remembers the classes of the training collection.

        :param train: the labelled collection used to train the classifier
        :return: self
        """
        instances, labels = train.Xy
        self.classifier.fit(instances, labels)
        self.classes = train.classes_
        return self

    def show_true_labels(self, y_test):
        """
        Replaces the true labels against which predictions are compared.

        :param y_test: the true labels of the test instances
        """
        self.y_test = y_test

    def predict(self, test):
        """
        Classifies the test instances and confronts the predictions with the stored true labels.

        :param test: the test instances
        :return: a confusion matrix in the return format of `sklearn.metrics.confusion_matrix`
        """
        y_pred = self.classifier.predict(test)
        return confusion_matrix(y_true=self.y_test, y_pred=y_pred, labels=self.classes)


def get_counters(y_true, y_pred):
    """
    Encodes, for every instance, the cell of the binary contingency table it falls in.

    :param y_true: array with the true labels
    :param y_pred: array with the predicted labels
    :return: a tuple `(counters, class_map)`, in which `counters` is an array of the same shape of `y_true`
        holding the code of the cell (or -1 if the pair of labels does not match any cell), and `class_map`
        is a dictionary mapping each code to the name of its cell ('tp', 'fn', 'fp', 'tn')
    """
    # (code, cell name, true label, predicted label)
    cells = (
        (0, 'tp', 1, 1),
        (1, 'fn', 1, 0),
        (2, 'fp', 0, 1),
        (3, 'tn', 0, 0),
    )
    counters = np.full(shape=y_true.shape, fill_value=-1)
    class_map = {}
    for code, cell_name, true_label, pred_label in cells:
        in_cell = np.logical_and(y_true == true_label, y_pred == pred_label)
        counters[in_cell] = code
        class_map[code] = cell_name
    return counters, class_map


def safehstack(matrix, posteriors):
    """
    Stacks `posteriors` horizontally (i.e., as additional columns) to the right of `matrix`.

    :param matrix: the matrix of instances, either sparse or dense
    :param posteriors: the columns to append
    :return: a `csr_matrix` if `matrix` is sparse, or the output of `np.hstack` otherwise
    """
    blocks = [matrix, posteriors]
    if not issparse(matrix):
        return np.hstack(blocks)
    # the output of the sparse stack is converted into CSR format
    return csr_matrix(scipy.sparse.hstack(blocks))


class QuantificationCMPredictor(ConfusionMatrixPredictor):
    """
    Confusion matrix predictor that relies on a quantifier. During `fit`, every training instance is
    assigned (via cross-validated predictions) to one of the four cells of the contingency table
    (tp, fn, fp, tn), and the quantifier is trained to quantify those four "classes" on instances that are
    extended with the posterior probabilities of the classifier. During `predict`, the output of the
    quantifier for the (extended) test instances is arranged as a confusion matrix.

    NOTE(review): the cell codes come from `get_counters`, which only recognizes the labels 0 and 1; the
    training labels are thus assumed to be binary -- confirm with callers.

    :param classifier: the classifier whose confusion matrix is to be predicted (a clone of it is used)
    :param quantifier: the quantifier to be trained on the 4-cell problem
    :param strategy: the strategy used to generate the training predictions; only 'kfcv' is currently
        supported
    :param kwargs: strategy-specific options. For 'kfcv': `k` (mandatory, passed as `cv` to
        `cross_val_predict`), `n_jobs` (default 1), and `verbose` (messages are printed only if it is truthy)
    """
    def __init__(self, classifier, quantifier, strategy='kfcv', **kwargs):
        assert strategy in ['kfcv'], 'unknown strategy'
        if strategy == 'kfcv':
            assert 'k' in kwargs, 'strategy "kfcv" requires "k" to be passed as an argument'
        self.classifier = clone(classifier)
        self.quantifier = quantifier
        self.strategy = strategy
        self.kwargs = kwargs

    def sout(self, msg):
        """
        Prints a message only if verbosity has been requested.

        :param msg: the message to print
        """
        # check the value of the flag, not its mere presence, so that verbose=False is actually silent
        if self.kwargs.get('verbose', False):
            print(msg)

    def fit(self, train: LabelledCollection):
        """
        Generates cross-validated predictions and posteriors for the training instances, trains the
        classifier on the whole training set, and trains the quantifier on the 4-cell problem.

        :param train: the labelled collection to learn from
        :return: self
        """
        X, y = train.Xy
        if self.strategy == 'kfcv':
            k = self.kwargs['k']
            n_jobs = self.kwargs.get('n_jobs', 1)
            self.sout(f'{self.__class__.__name__}: '
                      f'running cross_val_predict with k={k} n_jobs={n_jobs}')
            predictions = cross_val_predict(self.classifier, X, y, cv=k, n_jobs=n_jobs, method='predict')
            posteriors = cross_val_predict(self.classifier, X, y, cv=k, n_jobs=n_jobs, method='predict_proba')
            # the classifier used at prediction time is trained on the entire training set
            self.classifier.fit(X, y)
            instances = safehstack(train.instances, posteriors)
            counters, _ = get_counters(train.labels, predictions)
            q_data = LabelledCollection(instances=instances, labels=counters, classes_=[0,1,2,3])
            # diagnostic message; only shown in verbose mode, as any other message of this class
            self.sout(f'counters prevalence {q_data.counts()}')
            self.quantifier.fit(q_data)
        return self

    def _quantify_counters(self, test):
        """
        Extends the test instances with the classifier posteriors and returns the output of the quantifier.

        :param test: the test instances
        :return: the output of `self.quantifier.quantify`, in the order (tp, fn, fp, tn)
        """
        posteriors = self.classifier.predict_proba(test)
        instances = safehstack(test, posteriors)
        return self.quantifier.quantify(instances)

    def predict(self, test):
        """
        Predicts the confusion matrix for the test instances out of the values estimated by the quantifier.

        :param test: the test instances
        :return: a 2x2 array `[[tn, fp], [fn, tp]]`, i.e., with the cell layout that
            `sklearn.metrics.confusion_matrix` uses for labels [0, 1]; the cells hold the values returned
            by the quantifier (presumably prevalence values rather than absolute counts -- to be
            confirmed against the quantifier in use)
        """
        tp, fn, fp, tn = self._quantify_counters(test)
        return np.asarray([[tn, fp], [fn, tp]])

    def quantify(self, test):
        """
        Estimates the share of positive instances in the test set as the sum of the estimates for the
        true positives and the false negatives.

        :param test: the test instances
        :return: the value tp+fn
        """
        tp, fn, _, _ = self._quantify_counters(test)
        return tp + fn