From 22601a1952ee594999b604ec15ae8e8a3d515e61 Mon Sep 17 00:00:00 2001
From: Lorenzo Volpi
Date: Wed, 22 Nov 2023 19:23:40 +0100
Subject: [PATCH] collapse_false added, confidence refactored

---
 quacc/method/base.py       | 101 +++++++++++++++++++------------------
 quacc/method/confidence.py |  91 +++++++++++++++++++++++++++++++++
 2 files changed, 143 insertions(+), 49 deletions(-)
 create mode 100644 quacc/method/confidence.py

diff --git a/quacc/method/base.py b/quacc/method/base.py
index 25f21ae..dcc19f8 100644
--- a/quacc/method/base.py
+++ b/quacc/method/base.py
@@ -8,6 +8,7 @@ from quapy.data import LabelledCollection
 from quapy.method.aggregative import BaseQuantifier
 from sklearn.base import BaseEstimator
 
+import quacc.method.confidence as conf
 from quacc.data import ExtendedCollection, ExtendedData, ExtensionPolicy
 
 
@@ -16,10 +17,11 @@ class BaseAccuracyEstimator(BaseQuantifier):
         self,
         classifier: BaseEstimator,
         quantifier: BaseQuantifier,
+        collapse_false=False,
     ):
         self.__check_classifier(classifier)
         self.quantifier = quantifier
-        self.extpol = ExtensionPolicy()
+        self.extpol = ExtensionPolicy(collapse_false=collapse_false)
 
     def __check_classifier(self, classifier):
         if not hasattr(classifier, "predict_proba"):
@@ -36,10 +38,8 @@ class BaseAccuracyEstimator(BaseQuantifier):
             coll, pred_proba=pred_proba, extpol=self.extpol
         )
 
-    def _extend_instances(self, instances: np.ndarray | sp.csr_matrix, pred_proba=None):
-        if pred_proba is None:
-            pred_proba = self.classifier.predict_proba(instances)
-
+    def _extend_instances(self, instances: np.ndarray | sp.csr_matrix):
+        pred_proba = self.classifier.predict_proba(instances)
         return ExtendedData(instances, pred_proba=pred_proba, extpol=self.extpol)
 
     @abstractmethod
@@ -50,16 +50,26 @@ class BaseAccuracyEstimator(BaseQuantifier):
     def estimate(self, instances, ext=False) -> np.ndarray: ...
 
+    @property
+    def collapse_false(self):
+        return self.extpol.collapse_false
+
 
 class ConfidenceBasedAccuracyEstimator(BaseAccuracyEstimator):
     def __init__(
         self,
         classifier: BaseEstimator,
         quantifier: BaseQuantifier,
+        collapse_false=False,
         confidence=None,
     ):
-        super().__init__(classifier, quantifier)
+        super().__init__(
+            classifier=classifier,
+            quantifier=quantifier,
+            collapse_false=collapse_false,
+        )
         self.__check_confidence(confidence)
+        self.calibrator = None
 
     def __check_confidence(self, confidence):
         if isinstance(confidence, str):
@@ -69,46 +79,37 @@ class ConfidenceBasedAccuracyEstimator(BaseAccuracyEstimator):
         else:
             self.confidence = None
 
-    def __get_confidence(self):
-        def max_conf(probas):
-            _mc = np.max(probas, axis=-1)
-            _min = 1.0 / probas.shape[1]
-            _norm_mc = (_mc - _min) / (1.0 - _min)
-            return _norm_mc
+    def _fit_confidence(self, X, y, probas):
+        self.confidence_metrics = conf.get_metrics(self.confidence)
+        if self.confidence_metrics is None:
+            return
 
-        def entropy(probas):
-            _ent = np.sum(np.multiply(probas, np.log(probas + 1e-20)), axis=1)
-            return _ent
+        for m in self.confidence_metrics:
+            m.fit(X, y, probas)
 
-        if self.confidence is None:
-            return []
+    def _get_pred_ext(self, pred_proba: np.ndarray):
+        return pred_proba
 
-        __confs = {
-            "max_conf": max_conf,
-            "entropy": entropy,
-        }
-        return [__confs.get(c, None) for c in self.confidence]
+    def __get_ext(
+        self, X: np.ndarray | sp.csr_matrix, pred_proba: np.ndarray
+    ) -> np.ndarray:
+        if self.confidence_metrics is None or len(self.confidence_metrics) == 0:
+            return pred_proba
 
-    def __get_ext(self, pred_proba: np.ndarray) -> np.ndarray:
-        __confidence = self.__get_confidence()
-
-        if __confidence is None or len(__confidence) == 0:
-            return None
-
-        return np.concatenate(
-            [
-                _f_conf(pred_proba).reshape((len(pred_proba), 1))
-                for _f_conf in __confidence
-                if _f_conf is not None
-            ],
+        _conf_ext = np.concatenate(
+            [m.conf(X, pred_proba) for m in self.confidence_metrics],
             axis=1,
         )
 
+        _pred_ext = self._get_pred_ext(pred_proba)
+
+        return np.concatenate([_conf_ext, _pred_ext], axis=1)
+
     def extend(self, coll: LabelledCollection, pred_proba=None) -> ExtendedCollection:
         if pred_proba is None:
             pred_proba = self.classifier.predict_proba(coll.X)
-        _ext = self.__get_ext(pred_proba)
+        _ext = self.__get_ext(coll.X, pred_proba)
         return ExtendedCollection.from_lc(
             coll, pred_proba=pred_proba, ext=_ext, extpol=self.extpol
         )
@@ -116,12 +117,9 @@ class ConfidenceBasedAccuracyEstimator(BaseAccuracyEstimator):
     def _extend_instances(
         self,
         instances: np.ndarray | sp.csr_matrix,
-        pred_proba=None,
     ) -> ExtendedData:
-        if pred_proba is None:
-            pred_proba = self.classifier.predict_proba(instances)
-
-        _ext = self.__get_ext(pred_proba)
+        pred_proba = self.classifier.predict_proba(instances)
+        _ext = self.__get_ext(instances, pred_proba)
         return ExtendedData(
             instances, pred_proba=pred_proba, ext=_ext, extpol=self.extpol
         )
@@ -139,12 +137,17 @@ class MultiClassAccuracyEstimator(ConfidenceBasedAccuracyEstimator):
             classifier=classifier,
             quantifier=quantifier,
             confidence=confidence,
+            collapse_false=collapse_false,
         )
         self.e_train = None
-        self.extpol = ExtensionPolicy(collapse_false=collapse_false)
+
+    def _get_pred_ext(self, pred_proba: np.ndarray):
+        return np.argmax(pred_proba, axis=1, keepdims=True)
 
     def fit(self, train: LabelledCollection):
-        self.e_train = self.extend(train)
+        pred_proba = self.classifier.predict_proba(train.X)
+        self._fit_confidence(train.X, train.y, pred_proba)
+        self.e_train = self.extend(train, pred_proba=pred_proba)
         self.quantifier.fit(self.e_train)
@@ -173,10 +176,6 @@ class MultiClassAccuracyEstimator(ConfidenceBasedAccuracyEstimator):
             estim_prev = np.insert(estim_prev, _cls, [0.0], axis=0)
 
         return estim_prev
 
-    @property
-    def collapse_false(self):
-        return self.extpol.collapse_false
-
 
 class BinaryQuantifierAccuracyEstimator(ConfidenceBasedAccuracyEstimator):
     def __init__(
         self,
         classifier: BaseEstimator,
         quantifier: BaseAccuracyEstimator,
         confidence: str = None,
+        collapse_false=False,
     ):
         super().__init__(
             classifier=classifier,
@@ -193,7 +193,9 @@ class BinaryQuantifierAccuracyEstimator(ConfidenceBasedAccuracyEstimator):
         self.quantifiers = []
 
     def fit(self, train: LabelledCollection | ExtendedCollection):
-        self.e_train = self.extend(train)
+        pred_proba = self.classifier.predict_proba(train.X)
+        self._fit_confidence(train.X, train.y, pred_proba)
+        self.e_train = self.extend(train, pred_proba=pred_proba)
         self.n_classes = self.e_train.n_classes
         e_trains = self.e_train.split_by_pred()
 
@@ -217,7 +219,8 @@ class BinaryQuantifierAccuracyEstimator(ConfidenceBasedAccuracyEstimator):
         norms = [s_i.shape[0] / len(e_inst) for s_i in s_inst]
         estim_prevs = self._quantify_helper(s_inst, norms)
 
-        estim_prev = np.array([prev_row for prev_row in zip(*estim_prevs)]).flatten()
+        # estim_prev = np.array([prev_row for prev_row in zip(*estim_prevs)]).flatten()
+        estim_prev = np.concatenate(estim_prevs.T)
         return estim_prev
 
     def _quantify_helper(
@@ -232,7 +235,7 @@ class BinaryQuantifierAccuracyEstimator(ConfidenceBasedAccuracyEstimator):
             else:
                 estim_prevs.append(np.asarray([0.0, 0.0]))
 
-        return estim_prevs
+        return np.array(estim_prevs)
 
 
 BAE = BaseAccuracyEstimator
diff --git a/quacc/method/confidence.py b/quacc/method/confidence.py
new file mode 100644
index 0000000..17076d5
--- /dev/null
+++ b/quacc/method/confidence.py
@@ -0,0 +1,91 @@
+from typing import List
+
+import numpy as np
+import scipy.sparse as sp
+from sklearn.linear_model import LinearRegression
+
+import baselines.atc as atc
+
+__confs = {}
+
+
+def metric(name):
+    def wrapper(cl):
+        __confs[name] = cl
+        return cl
+
+    return wrapper
+
+
+class ConfidenceMetric:
+    def fit(self, X, y, probas):
+        pass
+
+    def conf(self, X, probas):
+        return probas
+
+
+@metric("max_conf")
+class MaxConf(ConfidenceMetric):
+    def conf(self, X, probas):
+        _mc = np.max(probas, axis=1, keepdims=True)
+        return _mc
+
+
+@metric("entropy")
+class Entropy(ConfidenceMetric):
+    def conf(self, X, probas):
+        _ent = np.sum(
+            np.multiply(probas, np.log(probas + 1e-20)), axis=1, keepdims=True
+        )
+        return _ent
+
+
+@metric("isoft")
+class InverseSoftmax(ConfidenceMetric):
+    def conf(self, X, probas):
+        _probas = probas / np.sum(probas, axis=1, keepdims=True)
+        _probas = np.log(_probas) - np.mean(np.log(_probas), axis=1, keepdims=True)
+        return np.max(_probas, axis=1, keepdims=True)
+
+
+@metric("threshold")
+class Threshold(ConfidenceMetric):
+    def get_scores(self, probas, keepdims=False):
+        return np.max(probas, axis=1, keepdims=keepdims)
+
+    def fit(self, X, y, probas):
+        scores = self.get_scores(probas)
+        _, self.threshold = atc.find_ATC_threshold(scores, y)
+
+    def conf(self, X, probas):
+        scores = self.get_scores(probas, keepdims=True)
+        _exp = scores - self.threshold
+        return _exp
+
+
+@metric("linreg")
+class LinReg(ConfidenceMetric):
+    def extend(self, X, probas):
+        if sp.issparse(X):
+            return sp.hstack([X, probas])
+        else:
+            return np.concatenate([X, probas], axis=1)
+
+    def fit(self, X, y, probas):
+        reg_X = self.extend(X, probas)
+        reg_y = probas[np.arange(probas.shape[0]), y]
+        self.reg = LinearRegression()
+        self.reg.fit(reg_X, reg_y)
+
+    def conf(self, X, probas):
+        reg_X = self.extend(X, probas)
+        return self.reg.predict(reg_X)[:, np.newaxis]
+
+
+def get_metrics(names: List[str]):
+    if names is None:
+        return None
+
+    __fnames = [n for n in names if n in __confs]
+    return [__confs[m]() for m in __fnames]
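
Usage note (not part of the patch): the sketch below exercises the new confidence-metric registry in quacc/method/confidence.py directly, to illustrate the fit(X, y, probas) / conf(X, probas) contract that ConfidenceBasedAccuracyEstimator relies on when building the extended representation. The random input data is hypothetical, and the sketch assumes quacc and its baselines.atc dependency are importable.

import numpy as np

import quacc.method.confidence as conf

rng = np.random.default_rng(0)
X = rng.normal(size=(8, 4))                 # hypothetical feature matrix
probas = rng.dirichlet(np.ones(3), size=8)  # hypothetical posterior probabilities
y = probas.argmax(axis=1)                   # hypothetical labels

# get_metrics filters out unknown names and returns None when names is None.
metrics = conf.get_metrics(["max_conf", "entropy", "not_a_metric"])

for m in metrics:
    m.fit(X, y, probas)          # a no-op for MaxConf and Entropy
    scores = m.conf(X, probas)   # column vector of shape (n_samples, 1)
    print(type(m).__name__, scores.shape)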
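
A second sketch, also not part of the patch: because metrics register themselves in the module-level __confs dict through the metric decorator, a new metric only needs the decorator and a conf method to become resolvable by name through get_metrics. The name "sq_norm" and the SquaredNorm class are hypothetical examples introduced here for illustration.

import numpy as np

from quacc.method.confidence import ConfidenceMetric, get_metrics, metric


@metric("sq_norm")
class SquaredNorm(ConfidenceMetric):
    # Sum of squared posteriors: 1.0 for a one-hot posterior, 1/n_classes when uniform.
    def conf(self, X, probas):
        return np.sum(probas**2, axis=1, keepdims=True)


# The custom name is now resolved alongside the built-in ones.
print([type(m).__name__ for m in get_metrics(["sq_norm", "max_conf"])])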