refactored for multiclass
This commit is contained in:
parent
5d0ecfda39
commit
d783989ebc
194
quacc/data.py
194
quacc/data.py
|
@ -20,34 +20,91 @@ from quapy.data import LabelledCollection
|
|||
#
|
||||
|
||||
|
||||
def _split_index_by_pred(pred_proba: np.ndarray) -> List[np.ndarray]:
|
||||
_pred_label = np.argmax(pred_proba, axis=1)
|
||||
return [(_pred_label == cl).nonzero()[0] for cl in np.arange(pred_proba.shape[1])]
|
||||
|
||||
|
||||
class ExtensionPolicy:
|
||||
def __init__(self, collapse_false=False):
|
||||
def __init__(self, collapse_false=False, group_false=False, dense=False):
|
||||
self.collapse_false = collapse_false
|
||||
self.group_false = group_false
|
||||
self.dense = dense
|
||||
|
||||
def qclasses(self, nbcl):
|
||||
if self.collapse_false:
|
||||
return np.arange(nbcl + 1)
|
||||
else:
|
||||
return np.arange(nbcl**2)
|
||||
elif self.group_false:
|
||||
return np.arange(nbcl * 2)
|
||||
|
||||
return np.arange(nbcl**2)
|
||||
|
||||
def eclasses(self, nbcl):
|
||||
return np.arange(nbcl**2)
|
||||
|
||||
def tfp_classes(self, nbcl):
|
||||
if self.group_false:
|
||||
return np.arange(2)
|
||||
else:
|
||||
return np.arange(nbcl)
|
||||
|
||||
def matrix_idx(self, nbcl):
|
||||
if self.collapse_false:
|
||||
_idxs = np.array([[i, i] for i in range(nbcl)] + [[0, 1]]).T
|
||||
return tuple(_idxs)
|
||||
elif self.group_false:
|
||||
diag_idxs = np.diag_indices(nbcl)
|
||||
sub_diag_idxs = tuple(
|
||||
np.array([((i + 1) % nbcl, i) for i in range(nbcl)]).T
|
||||
)
|
||||
return tuple(np.concatenate(axis) for axis in zip(diag_idxs, sub_diag_idxs))
|
||||
# def mask_fn(m, k):
|
||||
# n = m.shape[0]
|
||||
# d = np.diag(np.tile(1, n))
|
||||
# d[tuple(zip(*[(i, (i + 1) % n) for i in range(n)]))] = 1
|
||||
# return d
|
||||
|
||||
# _mi = np.mask_indices(nbcl, mask_func=mask_fn)
|
||||
# print(_mi)
|
||||
# return _mi
|
||||
else:
|
||||
_idxs = np.indices((nbcl, nbcl))
|
||||
return _idxs[0].flatten(), _idxs[1].flatten()
|
||||
|
||||
def ext_lbl(self, nbcl):
|
||||
if self.collapse_false:
|
||||
return np.vectorize(
|
||||
lambda t, p: t if t == p else nbcl, signature="(),()->()"
|
||||
)
|
||||
|
||||
def cf_fun(t, p):
|
||||
return t if t == p else nbcl
|
||||
|
||||
return np.vectorize(cf_fun, signature="(),()->()")
|
||||
|
||||
elif self.group_false:
|
||||
|
||||
def gf_fun(t, p):
|
||||
# if t < nbcl - 1:
|
||||
# return t * 2 if t == p else (t * 2) + 1
|
||||
# else:
|
||||
# return t * 2 if t != p else (t * 2) + 1
|
||||
return p if t == p else nbcl + p
|
||||
|
||||
return np.vectorize(gf_fun, signature="(),()->()")
|
||||
|
||||
else:
|
||||
return np.vectorize(lambda t, p: t * nbcl + p, signature="(),()->()")
|
||||
|
||||
def default_fn(t, p):
|
||||
return t * nbcl + p
|
||||
|
||||
return np.vectorize(default_fn, signature="(),()->()")
|
||||
|
||||
def true_lbl_from_pred(self, nbcl):
|
||||
if self.group_false:
|
||||
return np.vectorize(lambda t, p: 0 if t == p else 1, signature="(),()->()")
|
||||
else:
|
||||
return np.vectorize(lambda t, p: t, signature="(),()->()")
|
||||
|
||||
def can_f1(self, nbcl):
|
||||
return nbcl == 2 or (not self.collapse_false and not self.group_false)
|
||||
|
||||
|
||||
class ExtendedData:
|
||||
|
@ -75,10 +132,13 @@ class ExtendedData:
|
|||
to_append = pred_proba
|
||||
|
||||
if isinstance(instances, sp.csr_matrix):
|
||||
_to_append = sp.csr_matrix(to_append)
|
||||
n_x = sp.hstack([instances, _to_append])
|
||||
if self.extpol.dense:
|
||||
n_x = to_append
|
||||
else:
|
||||
n_x = sp.hstack([instances, sp.csr_matrix(to_append)], format="csr")
|
||||
elif isinstance(instances, np.ndarray):
|
||||
n_x = np.concatenate((instances, to_append), axis=1)
|
||||
_concat = [instances, to_append] if not self.extpol.dense else [to_append]
|
||||
n_x = np.concatenate(_concat, axis=1)
|
||||
else:
|
||||
raise ValueError("Unsupported matrix format")
|
||||
|
||||
|
@ -88,30 +148,25 @@ class ExtendedData:
|
|||
def X(self):
|
||||
return self.instances
|
||||
|
||||
def __split_index_by_pred(self) -> List[np.ndarray]:
|
||||
_pred_label = np.argmax(self.pred_proba_, axis=1)
|
||||
@property
|
||||
def nbcl(self):
|
||||
return self.pred_proba_.shape[1]
|
||||
|
||||
return [
|
||||
(_pred_label == cl).nonzero()[0]
|
||||
for cl in np.arange(self.pred_proba_.shape[1])
|
||||
]
|
||||
|
||||
def split_by_pred(self, return_indexes=False):
|
||||
def split_by_pred(self, _indexes: List[np.ndarray] | None = None):
|
||||
def _empty_matrix():
|
||||
if isinstance(self.instances, np.ndarray):
|
||||
return np.asarray([], dtype=int)
|
||||
elif isinstance(self.instances, sp.csr_matrix):
|
||||
return sp.csr_matrix(np.empty((0, 0), dtype=int))
|
||||
|
||||
_indexes = self.__split_index_by_pred()
|
||||
if _indexes is None:
|
||||
_indexes = _split_index_by_pred(self.pred_proba_)
|
||||
|
||||
_instances = [
|
||||
self.instances[ind] if ind.shape[0] > 0 else _empty_matrix()
|
||||
for ind in _indexes
|
||||
]
|
||||
|
||||
if return_indexes:
|
||||
return _instances, _indexes
|
||||
|
||||
return _instances
|
||||
|
||||
def __len__(self):
|
||||
|
@ -142,41 +197,96 @@ class ExtendedLabels:
|
|||
def __getitem__(self, idx):
|
||||
return ExtendedLabels(self.true[idx], self.pred[idx], self.nbcl)
|
||||
|
||||
def split_by_pred(self, _indexes: List[np.ndarray]):
|
||||
_labels = []
|
||||
for cl, ind in enumerate(_indexes):
|
||||
_true, _pred = self.true[ind], self.pred[ind]
|
||||
assert (
|
||||
_pred.shape[0] == 0 or (_pred == _pred[0]).all()
|
||||
), "index is selecting non uniform class"
|
||||
_tfp = self.extpol.true_lbl_from_pred(self.nbcl)(_true, _pred)
|
||||
_labels.append(_tfp)
|
||||
|
||||
return _labels, self.extpol.tfp_classes(self.nbcl)
|
||||
|
||||
|
||||
class ExtendedPrev:
|
||||
def __init__(
|
||||
self,
|
||||
flat: np.ndarray,
|
||||
nbcl: int,
|
||||
q_classes: list,
|
||||
extpol: ExtensionPolicy,
|
||||
extpol: ExtensionPolicy = None,
|
||||
):
|
||||
self.flat = flat
|
||||
self.nbcl = nbcl
|
||||
self.extpol = ExtensionPolicy() if extpol is None else extpol
|
||||
self.__check_q_classes(q_classes)
|
||||
self._matrix = self.__build_matrix()
|
||||
|
||||
def __check_q_classes(self, q_classes):
|
||||
q_classes = np.array(q_classes)
|
||||
_flat = np.zeros(self.extpol.qclasses(self.nbcl).shape)
|
||||
_flat[q_classes] = self.flat
|
||||
self.flat = _flat
|
||||
# self._matrix = self.__build_matrix()
|
||||
|
||||
def __build_matrix(self):
|
||||
_matrix = np.zeros((self.nbcl, self.nbcl))
|
||||
_matrix[self.extpol.matrix_idx(self.nbcl)] = self.flat
|
||||
return _matrix
|
||||
|
||||
def can_f1(self):
|
||||
return self.extpol.can_f1(self.nbcl)
|
||||
|
||||
@property
|
||||
def A(self):
|
||||
return self._matrix
|
||||
# return self._matrix
|
||||
return self.__build_matrix()
|
||||
|
||||
@property
|
||||
def classes(self):
|
||||
return self.extpol.qclasses(self.nbcl)
|
||||
|
||||
|
||||
class ExtMulPrev(ExtendedPrev):
|
||||
def __init__(
|
||||
self,
|
||||
flat: np.ndarray,
|
||||
nbcl: int,
|
||||
q_classes: list = None,
|
||||
extpol: ExtensionPolicy = None,
|
||||
):
|
||||
super().__init__(flat, nbcl, extpol=extpol)
|
||||
self.flat = self.__check_q_classes(q_classes, flat)
|
||||
|
||||
def __check_q_classes(self, q_classes, flat):
|
||||
if q_classes is None:
|
||||
return flat
|
||||
q_classes = np.array(q_classes)
|
||||
_flat = np.zeros(self.extpol.qclasses(self.nbcl).shape)
|
||||
_flat[q_classes] = flat
|
||||
return _flat
|
||||
|
||||
|
||||
class ExtBinPrev(ExtendedPrev):
|
||||
def __init__(
|
||||
self,
|
||||
flat: List[np.ndarray],
|
||||
nbcl: int,
|
||||
q_classes: List[List[int]] = None,
|
||||
extpol: ExtensionPolicy = None,
|
||||
):
|
||||
super().__init__(flat, nbcl, extpol=extpol)
|
||||
flat = self.__check_q_classes(q_classes, flat)
|
||||
self.flat = self.__build_flat(flat)
|
||||
|
||||
def __check_q_classes(self, q_classes, flat):
|
||||
if q_classes is None:
|
||||
return flat
|
||||
_flat = []
|
||||
for fl, qc in zip(flat, q_classes):
|
||||
qc = np.array(qc)
|
||||
_fl = np.zeros(self.extpol.tfp_classes(self.nbcl).shape)
|
||||
_fl[qc] = fl
|
||||
_flat.append(_fl)
|
||||
return np.array(_flat)
|
||||
|
||||
def __build_flat(self, flat):
|
||||
return np.concatenate(flat.T)
|
||||
|
||||
|
||||
class ExtendedCollection(LabelledCollection):
|
||||
def __init__(
|
||||
self,
|
||||
|
@ -233,19 +343,17 @@ class ExtendedCollection(LabelledCollection):
|
|||
def n_classes(self):
|
||||
return len(self.e_labels_.classes)
|
||||
|
||||
def counts(self):
|
||||
_counts = super().counts()
|
||||
if self.extpol.collapse_false:
|
||||
_counts = np.insert(_counts, 2, 0)
|
||||
|
||||
return _counts
|
||||
def e_prevalence(self) -> ExtendedPrev:
|
||||
_prev = self.prevalence()
|
||||
return ExtendedPrev(_prev, self.n_base_classes, extpol=self.extpol)
|
||||
|
||||
def split_by_pred(self):
|
||||
_ncl = self.pred_proba.shape[1]
|
||||
_instances, _indexes = self.e_data_.split_by_pred(return_indexes=True)
|
||||
_labels = [self.ey[ind] for ind in _indexes]
|
||||
_indexes = _split_index_by_pred(self.pred_proba)
|
||||
_instances = self.e_data_.split_by_pred(_indexes)
|
||||
# _labels = [self.ey[ind] for ind in _indexes]
|
||||
_labels, _cls = self.e_labels_.split_by_pred(_indexes)
|
||||
return [
|
||||
LabelledCollection(inst, lbl.true, classes=range(0, _ncl))
|
||||
LabelledCollection(inst, lbl, classes=_cls)
|
||||
for inst, lbl in zip(_instances, _labels)
|
||||
]
|
||||
|
||||
|
|
160
quacc/dataset.py
160
quacc/dataset.py
|
@ -1,3 +1,4 @@
|
|||
import itertools
|
||||
import math
|
||||
import os
|
||||
import pickle
|
||||
|
@ -119,35 +120,23 @@ class DatasetSample:
|
|||
return {"train": self.train_prev, "validation": self.validation_prev}
|
||||
|
||||
|
||||
class Dataset:
|
||||
def __init__(self, name, n_prevalences=9, prevs=None, target=None):
|
||||
self._name = name
|
||||
self._target = target
|
||||
|
||||
self.prevs = None
|
||||
self.n_prevs = n_prevalences
|
||||
if prevs is not None:
|
||||
prevs = np.unique([p for p in prevs if p > 0.0 and p < 1.0])
|
||||
if prevs.shape[0] > 0:
|
||||
self.prevs = np.sort(prevs)
|
||||
self.n_prevs = self.prevs.shape[0]
|
||||
|
||||
def __spambase(self):
|
||||
class DatasetProvider:
|
||||
def __spambase(self, **kwargs):
|
||||
return qp.datasets.fetch_UCIDataset("spambase", verbose=False).train_test
|
||||
|
||||
# provare min_df=5
|
||||
def __imdb(self):
|
||||
def __imdb(self, **kwargs):
|
||||
return qp.datasets.fetch_reviews("imdb", tfidf=True, min_df=3).train_test
|
||||
|
||||
def __rcv1(self):
|
||||
def __rcv1(self, target, **kwargs):
|
||||
n_train = 23149
|
||||
available_targets = ["CCAT", "GCAT", "MCAT"]
|
||||
|
||||
if self._target is None or self._target not in available_targets:
|
||||
raise ValueError(f"Invalid target {self._target}")
|
||||
if target is None or target not in available_targets:
|
||||
raise ValueError(f"Invalid target {target}")
|
||||
|
||||
dataset = fetch_rcv1()
|
||||
target_index = np.where(dataset.target_names == self._target)[0]
|
||||
target_index = np.where(dataset.target_names == target)[0]
|
||||
all_train_d = dataset.data[:n_train, :]
|
||||
test_d = dataset.data[n_train:, :]
|
||||
labels = dataset.target[:, target_index].toarray().flatten()
|
||||
|
@ -157,14 +146,14 @@ class Dataset:
|
|||
|
||||
return all_train, test
|
||||
|
||||
def __cifar10(self):
|
||||
def __cifar10(self, target, **kwargs):
|
||||
dataset = fetch_cifar10()
|
||||
available_targets: list = dataset.label_names
|
||||
|
||||
if self._target is None or self._target not in available_targets:
|
||||
raise ValueError(f"Invalid target {self._target}")
|
||||
if target is None or self._target not in available_targets:
|
||||
raise ValueError(f"Invalid target {target}")
|
||||
|
||||
target_index = available_targets.index(self._target)
|
||||
target_index = available_targets.index(target)
|
||||
all_train_d = dataset.train.data
|
||||
all_train_l = (dataset.train.labels == target_index).astype(int)
|
||||
test_d = dataset.test.data
|
||||
|
@ -174,14 +163,14 @@ class Dataset:
|
|||
|
||||
return all_train, test
|
||||
|
||||
def __cifar100(self):
|
||||
def __cifar100(self, target, **kwargs):
|
||||
dataset = fetch_cifar100()
|
||||
available_targets: list = dataset.coarse_label_names
|
||||
|
||||
if self._target is None or self._target not in available_targets:
|
||||
raise ValueError(f"Invalid target {self._target}")
|
||||
if target is None or target not in available_targets:
|
||||
raise ValueError(f"Invalid target {target}")
|
||||
|
||||
target_index = available_targets.index(self._target)
|
||||
target_index = available_targets.index(target)
|
||||
all_train_d = dataset.train.data
|
||||
all_train_l = (dataset.train.coarse_labels == target_index).astype(int)
|
||||
test_d = dataset.test.data
|
||||
|
@ -191,68 +180,123 @@ class Dataset:
|
|||
|
||||
return all_train, test
|
||||
|
||||
def __train_test(self) -> Tuple[LabelledCollection, LabelledCollection]:
|
||||
def __twitter_gasp(self, **kwargs):
|
||||
return qp.datasets.fetch_twitter("gasp", min_df=3).train_test
|
||||
|
||||
def alltrain_test(
|
||||
self, name: str, target: str | None
|
||||
) -> Tuple[LabelledCollection, LabelledCollection]:
|
||||
all_train, test = {
|
||||
"spambase": self.__spambase,
|
||||
"imdb": self.__imdb,
|
||||
"rcv1": self.__rcv1,
|
||||
"cifar10": self.__cifar10,
|
||||
"cifar100": self.__cifar100,
|
||||
}[self._name]()
|
||||
"twitter_gasp": self.__twitter_gasp,
|
||||
}[name](target=target)
|
||||
|
||||
return all_train, test
|
||||
|
||||
def get_raw(self) -> DatasetSample:
|
||||
all_train, test = self.__train_test()
|
||||
|
||||
train, val = all_train.split_stratified(
|
||||
train_prop=TRAIN_VAL_PROP, random_state=env._R_SEED
|
||||
)
|
||||
class Dataset(DatasetProvider):
|
||||
def __init__(self, name, n_prevalences=9, prevs=None, target=None):
|
||||
self._name = name
|
||||
self._target = target
|
||||
|
||||
return DatasetSample(train, val, test)
|
||||
self.all_train, self.test = self.alltrain_test(self._name, self._target)
|
||||
self.__resample_all_train()
|
||||
|
||||
def get(self) -> List[DatasetSample]:
|
||||
all_train, test = self.__train_test()
|
||||
self.prevs = None
|
||||
self._n_prevs = n_prevalences
|
||||
self.__check_prevs(prevs)
|
||||
self.prevs = self.__build_prevs()
|
||||
|
||||
# resample all_train set to have (0.5, 0.5) prevalence
|
||||
at_positives = np.sum(all_train.y)
|
||||
all_train = all_train.sampling(
|
||||
min(at_positives, len(all_train) - at_positives) * 2,
|
||||
0.5,
|
||||
def __resample_all_train(self):
|
||||
tr_counts, tr_ncl = self.all_train.counts(), self.all_train.n_classes
|
||||
_resample_prevs = np.full((tr_ncl,), fill_value=1.0 / tr_ncl)
|
||||
self.all_train = self.all_train.sampling(
|
||||
np.min(tr_counts) * tr_ncl,
|
||||
*_resample_prevs.tolist(),
|
||||
random_state=env._R_SEED,
|
||||
)
|
||||
|
||||
# sample prevalences
|
||||
def __check_prevs(self, prevs):
|
||||
try:
|
||||
iter(prevs)
|
||||
except TypeError:
|
||||
return
|
||||
|
||||
if prevs is None or len(prevs) == 0:
|
||||
return
|
||||
|
||||
def is_float_iterable(obj):
|
||||
try:
|
||||
it = iter(obj)
|
||||
return all([isinstance(o, float) for o in it])
|
||||
except TypeError:
|
||||
return False
|
||||
|
||||
if not all([is_float_iterable(p) for p in prevs]):
|
||||
return
|
||||
|
||||
if not all([len(p) == self.all_train.n_classes for p in prevs]):
|
||||
return
|
||||
|
||||
if not all([sum(p) == 1.0 for p in prevs]):
|
||||
return
|
||||
|
||||
self.prevs = np.unique(prevs, axis=0)
|
||||
|
||||
def __build_prevs(self):
|
||||
if self.prevs is not None:
|
||||
prevs = self.prevs
|
||||
else:
|
||||
prevs = np.linspace(0.0, 1.0, num=self.n_prevs + 1, endpoint=False)[1:]
|
||||
return self.prevs
|
||||
|
||||
at_size = min(math.floor(len(all_train) * 0.5 / p) for p in prevs)
|
||||
datasets = []
|
||||
for p in 1.0 - prevs:
|
||||
all_train_sampled = all_train.sampling(at_size, p, random_state=env._R_SEED)
|
||||
train, validation = all_train_sampled.split_stratified(
|
||||
train_prop=TRAIN_VAL_PROP, random_state=env._R_SEED
|
||||
)
|
||||
datasets.append(DatasetSample(train, validation, test))
|
||||
dim = self.all_train.n_classes
|
||||
lspace = np.linspace(0.0, 1.0, num=self._n_prevs + 1, endpoint=False)[1:]
|
||||
mesh = np.array(np.meshgrid(*[lspace for _ in range(dim)])).T.reshape(-1, dim)
|
||||
mesh = mesh[np.where(mesh.sum(axis=1) == 1.0)]
|
||||
return np.around(np.unique(mesh, axis=0), decimals=4)
|
||||
|
||||
return datasets
|
||||
def __build_sample(
|
||||
self,
|
||||
p: np.ndarray,
|
||||
at_size: int,
|
||||
):
|
||||
all_train_sampled = self.all_train.sampling(
|
||||
at_size, *(p[:-1]), random_state=env._R_SEED
|
||||
)
|
||||
train, validation = all_train_sampled.split_stratified(
|
||||
train_prop=TRAIN_VAL_PROP, random_state=env._R_SEED
|
||||
)
|
||||
return DatasetSample(train, validation, self.test)
|
||||
|
||||
def get(self) -> List[DatasetSample]:
|
||||
at_size = min(
|
||||
math.floor(len(self.all_train) * (1.0 / self.all_train.n_classes) / p)
|
||||
for _prev in self.prevs
|
||||
for p in _prev
|
||||
)
|
||||
|
||||
return [self.__build_sample(p, at_size) for p in self.prevs]
|
||||
|
||||
def __call__(self):
|
||||
return self.get()
|
||||
|
||||
@property
|
||||
def name(self):
|
||||
match (self._name, self.n_prevs):
|
||||
match (self._name, self._n_prevs):
|
||||
case (("rcv1" | "cifar10" | "cifar100"), 9):
|
||||
return f"{self._name}_{self._target}"
|
||||
case (("rcv1" | "cifar10" | "cifar100"), _):
|
||||
return f"{self._name}_{self._target}_{self.n_prevs}prevs"
|
||||
return f"{self._name}_{self._target}_{self._n_prevs}prevs"
|
||||
case (_, 9):
|
||||
return f"{self._name}"
|
||||
case (_, _):
|
||||
return f"{self._name}_{self.n_prevs}prevs"
|
||||
return f"{self._name}_{self._n_prevs}prevs"
|
||||
|
||||
@property
|
||||
def nprevs(self):
|
||||
return self.prevs.shape[0]
|
||||
|
||||
|
||||
# >>> fetch_rcv1().target_names
|
||||
|
|
|
@ -1,4 +1,10 @@
|
|||
from functools import wraps
|
||||
from typing import List
|
||||
|
||||
import numpy as np
|
||||
import quapy as qp
|
||||
|
||||
from quacc.data import ExtendedPrev
|
||||
|
||||
|
||||
def from_name(err_name):
|
||||
|
@ -21,30 +27,54 @@ def from_name(err_name):
|
|||
# return 2 * (precision * recall) / (precision + recall)
|
||||
|
||||
|
||||
def f1(prev):
|
||||
den = (2 * prev[3]) + prev[1] + prev[2]
|
||||
if den == 0:
|
||||
return 0.0
|
||||
def nae(prevs: np.ndarray, prevs_hat: np.ndarray) -> np.ndarray:
|
||||
_ae = qp.error.ae(prevs, prevs_hat)
|
||||
# _zae = (2.0 * (1.0 - prevs.min())) / prevs.shape[1]
|
||||
_zae = 2.0 / prevs.shape[1]
|
||||
return _ae / _zae
|
||||
|
||||
|
||||
def f1(prev: np.ndarray | ExtendedPrev) -> float:
|
||||
if isinstance(prev, ExtendedPrev):
|
||||
prev = prev.A
|
||||
|
||||
def _score(idx):
|
||||
_tp = prev[idx, idx]
|
||||
_fn = prev[idx, :].sum() - _tp
|
||||
_fp = prev[:, idx].sum() - _tp
|
||||
_den = 2.0 * _tp + _fp + _fn
|
||||
return 0.0 if _den == 0.0 else (2.0 * _tp) / _den
|
||||
|
||||
if prev.shape[0] == 2:
|
||||
return _score(1)
|
||||
else:
|
||||
return (2 * prev[3]) / den
|
||||
_idxs = np.arange(prev.shape[0])
|
||||
return np.array([_score(idx) for idx in _idxs]).mean()
|
||||
|
||||
|
||||
def f1e(prev):
|
||||
return 1 - f1(prev)
|
||||
|
||||
|
||||
def acc(prev: np.ndarray) -> float:
|
||||
return (prev[0] + prev[3]) / np.sum(prev)
|
||||
def acc(prev: np.ndarray | ExtendedPrev) -> float:
|
||||
if isinstance(prev, ExtendedPrev):
|
||||
prev = prev.A
|
||||
return np.diag(prev).sum() / prev.sum()
|
||||
|
||||
|
||||
def accd(true_prevs: np.ndarray, estim_prevs: np.ndarray) -> np.ndarray:
|
||||
vacc = np.vectorize(acc, signature="(m)->()")
|
||||
a_tp = vacc(true_prevs)
|
||||
a_ep = vacc(estim_prevs)
|
||||
def accd(
|
||||
true_prevs: List[np.ndarray | ExtendedPrev],
|
||||
estim_prevs: List[np.ndarray | ExtendedPrev],
|
||||
) -> np.ndarray:
|
||||
a_tp = np.array([acc(tp) for tp in true_prevs])
|
||||
a_ep = np.array([acc(ep) for ep in estim_prevs])
|
||||
return np.abs(a_tp - a_ep)
|
||||
|
||||
|
||||
def maccd(true_prevs: np.ndarray, estim_prevs: np.ndarray) -> float:
|
||||
def maccd(
|
||||
true_prevs: List[np.ndarray | ExtendedPrev],
|
||||
estim_prevs: List[np.ndarray | ExtendedPrev],
|
||||
) -> float:
|
||||
return accd(true_prevs, estim_prevs).mean()
|
||||
|
||||
|
||||
|
|
|
@ -1,34 +0,0 @@
|
|||
from typing import Callable, Union
|
||||
|
||||
import numpy as np
|
||||
from quapy.protocol import AbstractProtocol, OnLabelledCollectionProtocol
|
||||
|
||||
import quacc as qc
|
||||
|
||||
from ..method.base import BaseAccuracyEstimator
|
||||
|
||||
|
||||
def evaluate(
|
||||
estimator: BaseAccuracyEstimator,
|
||||
protocol: AbstractProtocol,
|
||||
error_metric: Union[Callable | str],
|
||||
) -> float:
|
||||
if isinstance(error_metric, str):
|
||||
error_metric = qc.error.from_name(error_metric)
|
||||
|
||||
collator_bck_ = protocol.collator
|
||||
protocol.collator = OnLabelledCollectionProtocol.get_collator("labelled_collection")
|
||||
|
||||
estim_prevs, true_prevs = [], []
|
||||
for sample in protocol():
|
||||
e_sample = estimator.extend(sample)
|
||||
estim_prev = estimator.estimate(e_sample.eX)
|
||||
estim_prevs.append(estim_prev)
|
||||
true_prevs.append(e_sample.prevalence())
|
||||
|
||||
protocol.collator = collator_bck_
|
||||
|
||||
true_prevs = np.array(true_prevs)
|
||||
estim_prevs = np.array(estim_prevs)
|
||||
|
||||
return error_metric(true_prevs, estim_prevs)
|
|
@ -43,6 +43,7 @@ def kfcv(
|
|||
predict_method="predict",
|
||||
):
|
||||
c_model_predict = getattr(c_model, predict_method)
|
||||
f1_average = "binary" if validation.n_classes == 2 else "macro"
|
||||
|
||||
scoring = ["accuracy", "f1_macro"]
|
||||
scores = cross_validate(c_model, validation.X, validation.y, scoring=scoring)
|
||||
|
@ -53,7 +54,9 @@ def kfcv(
|
|||
for test in protocol():
|
||||
test_preds = c_model_predict(test.X)
|
||||
meta_acc = abs(acc_score - metrics.accuracy_score(test.y, test_preds))
|
||||
meta_f1 = abs(f1_score - metrics.f1_score(test.y, test_preds))
|
||||
meta_f1 = abs(
|
||||
f1_score - metrics.f1_score(test.y, test_preds, average=f1_average)
|
||||
)
|
||||
report.append_row(
|
||||
test.prevalence(),
|
||||
acc_score=acc_score,
|
||||
|
@ -72,13 +75,15 @@ def ref(
|
|||
protocol: AbstractStochasticSeededProtocol,
|
||||
):
|
||||
c_model_predict = getattr(c_model, "predict")
|
||||
f1_average = "binary" if validation.n_classes == 2 else "macro"
|
||||
|
||||
report = EvaluationReport(name="ref")
|
||||
for test in protocol():
|
||||
test_preds = c_model_predict(test.X)
|
||||
report.append_row(
|
||||
test.prevalence(),
|
||||
acc_score=metrics.accuracy_score(test.y, test_preds),
|
||||
f1_score=metrics.f1_score(test.y, test_preds),
|
||||
f1_score=metrics.f1_score(test.y, test_preds, average=f1_average),
|
||||
)
|
||||
|
||||
return report
|
||||
|
@ -93,6 +98,7 @@ def atc_mc(
|
|||
):
|
||||
"""garg"""
|
||||
c_model_predict = getattr(c_model, predict_method)
|
||||
f1_average = "binary" if validation.n_classes == 2 else "macro"
|
||||
|
||||
## Load ID validation data probs and labels
|
||||
val_probs, val_labels = c_model_predict(validation.X), validation.y
|
||||
|
@ -110,8 +116,12 @@ def atc_mc(
|
|||
test_scores = atc.get_max_conf(test_probs)
|
||||
atc_accuracy = atc.get_ATC_acc(atc_thres, test_scores)
|
||||
meta_acc = abs(atc_accuracy - metrics.accuracy_score(test.y, test_preds))
|
||||
f1_score = atc.get_ATC_f1(atc_thres, test_scores, test_probs)
|
||||
meta_f1 = abs(f1_score - metrics.f1_score(test.y, test_preds))
|
||||
f1_score = atc.get_ATC_f1(
|
||||
atc_thres, test_scores, test_probs, average=f1_average
|
||||
)
|
||||
meta_f1 = abs(
|
||||
f1_score - metrics.f1_score(test.y, test_preds, average=f1_average)
|
||||
)
|
||||
report.append_row(
|
||||
test.prevalence(),
|
||||
acc=meta_acc,
|
||||
|
@ -132,6 +142,7 @@ def atc_ne(
|
|||
):
|
||||
"""garg"""
|
||||
c_model_predict = getattr(c_model, predict_method)
|
||||
f1_average = "binary" if validation.n_classes == 2 else "macro"
|
||||
|
||||
## Load ID validation data probs and labels
|
||||
val_probs, val_labels = c_model_predict(validation.X), validation.y
|
||||
|
@ -149,8 +160,12 @@ def atc_ne(
|
|||
test_scores = atc.get_entropy(test_probs)
|
||||
atc_accuracy = atc.get_ATC_acc(atc_thres, test_scores)
|
||||
meta_acc = abs(atc_accuracy - metrics.accuracy_score(test.y, test_preds))
|
||||
f1_score = atc.get_ATC_f1(atc_thres, test_scores, test_probs)
|
||||
meta_f1 = abs(f1_score - metrics.f1_score(test.y, test_preds))
|
||||
f1_score = atc.get_ATC_f1(
|
||||
atc_thres, test_scores, test_probs, average=f1_average
|
||||
)
|
||||
meta_f1 = abs(
|
||||
f1_score - metrics.f1_score(test.y, test_preds, average=f1_average)
|
||||
)
|
||||
report.append_row(
|
||||
test.prevalence(),
|
||||
acc=meta_acc,
|
||||
|
@ -170,11 +185,14 @@ def doc(
|
|||
predict_method="predict_proba",
|
||||
):
|
||||
c_model_predict = getattr(c_model, predict_method)
|
||||
f1_average = "binary" if validation.n_classes == 2 else "macro"
|
||||
|
||||
val1, val2 = validation.split_stratified(train_prop=0.5, random_state=env._R_SEED)
|
||||
val1_probs = c_model_predict(val1.X)
|
||||
val1_mc = np.max(val1_probs, axis=-1)
|
||||
val1_preds = np.argmax(val1_probs, axis=-1)
|
||||
val1_acc = metrics.accuracy_score(val1.y, val1_preds)
|
||||
val1_f1 = metrics.f1_score(val1.y, val1_preds, average=f1_average)
|
||||
val2_protocol = APP(
|
||||
val2,
|
||||
n_prevalences=21,
|
||||
|
@ -193,26 +211,44 @@ def doc(
|
|||
val2_prot_y.append(v2.y)
|
||||
|
||||
val_scores = np.array([doclib.get_doc(val1_mc, v2_mc) for v2_mc in val2_prot_mc])
|
||||
val_targets = np.array(
|
||||
val_targets_acc = np.array(
|
||||
[
|
||||
val1_acc - metrics.accuracy_score(v2_y, v2_preds)
|
||||
for v2_y, v2_preds in zip(val2_prot_y, val2_prot_preds)
|
||||
]
|
||||
)
|
||||
reg = LinearRegression().fit(
|
||||
val_scores.reshape((val_scores.shape[0], 1)), val_targets
|
||||
reg_acc = LinearRegression().fit(val_scores[:, np.newaxis], val_targets_acc)
|
||||
val_targets_f1 = np.array(
|
||||
[
|
||||
val1_f1 - metrics.f1_score(v2_y, v2_preds, average=f1_average)
|
||||
for v2_y, v2_preds in zip(val2_prot_y, val2_prot_preds)
|
||||
]
|
||||
)
|
||||
reg_f1 = LinearRegression().fit(val_scores[:, np.newaxis], val_targets_f1)
|
||||
|
||||
report = EvaluationReport(name="doc")
|
||||
for test in protocol():
|
||||
test_probs = c_model_predict(test.X)
|
||||
test_preds = np.argmax(test_probs, axis=-1)
|
||||
test_mc = np.max(test_probs, axis=-1)
|
||||
score = (
|
||||
val1_acc - reg.predict(np.array([[doclib.get_doc(val1_mc, test_mc)]]))[0]
|
||||
acc_score = (
|
||||
val1_acc
|
||||
- reg_acc.predict(np.array([[doclib.get_doc(val1_mc, test_mc)]]))[0]
|
||||
)
|
||||
f1_score = (
|
||||
val1_f1 - reg_f1.predict(np.array([[doclib.get_doc(val1_mc, test_mc)]]))[0]
|
||||
)
|
||||
meta_acc = abs(acc_score - metrics.accuracy_score(test.y, test_preds))
|
||||
meta_f1 = abs(
|
||||
f1_score - metrics.f1_score(test.y, test_preds, average=f1_average)
|
||||
)
|
||||
report.append_row(
|
||||
test.prevalence(),
|
||||
acc=meta_acc,
|
||||
acc_score=acc_score,
|
||||
f1=meta_f1,
|
||||
f1_score=f1_score,
|
||||
)
|
||||
meta_acc = abs(score - metrics.accuracy_score(test.y, test_preds))
|
||||
report.append_row(test.prevalence(), acc=meta_acc, acc_score=score)
|
||||
|
||||
return report
|
||||
|
||||
|
|
|
@ -2,6 +2,7 @@ import os
|
|||
import time
|
||||
from traceback import print_exception as traceback
|
||||
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
import quapy as qp
|
||||
from joblib import Parallel, delayed
|
||||
|
@ -76,7 +77,8 @@ def evaluate_comparison(dataset: Dataset, estimators=None) -> DatasetReport:
|
|||
log.info(f"dataset {dataset.name} [pool size: {__pool_size}]")
|
||||
for d in dataset():
|
||||
log.info(
|
||||
f"Dataset sample {d.train_prev[1]:.2f} of dataset {dataset.name} started"
|
||||
f"Dataset sample {np.around(d.train_prev, decimals=2)} "
|
||||
f"of dataset {dataset.name} started"
|
||||
)
|
||||
par_tasks, seq_tasks = split_tasks(
|
||||
CE.func[estimators],
|
||||
|
@ -93,7 +95,8 @@ def evaluate_comparison(dataset: Dataset, estimators=None) -> DatasetReport:
|
|||
|
||||
g_time = time.time() - tstart
|
||||
log.info(
|
||||
f"Dataset sample {d.train_prev[1]:.2f} of dataset {dataset.name} finished "
|
||||
f"Dataset sample {np.around(d.train_prev, decimals=2)} "
|
||||
f"of dataset {dataset.name} finished "
|
||||
f"[took {g_time:.4f}s]"
|
||||
)
|
||||
|
||||
|
@ -108,7 +111,8 @@ def evaluate_comparison(dataset: Dataset, estimators=None) -> DatasetReport:
|
|||
|
||||
except Exception as e:
|
||||
log.warning(
|
||||
f"Dataset sample {d.train_prev[1]:.2f} of dataset {dataset.name} failed. "
|
||||
f"Dataset sample {np.around(d.train_prev, decimals=2)} "
|
||||
f"of dataset {dataset.name} failed. "
|
||||
f"Exception: {e}"
|
||||
)
|
||||
traceback(e)
|
||||
|
|
|
@ -0,0 +1,32 @@
|
|||
from typing import Callable, Union
|
||||
|
||||
from quapy.protocol import AbstractProtocol, OnLabelledCollectionProtocol
|
||||
|
||||
import quacc as qc
|
||||
from quacc.method.base import BaseAccuracyEstimator
|
||||
|
||||
|
||||
def evaluate(
|
||||
estimator: BaseAccuracyEstimator,
|
||||
protocol: AbstractProtocol,
|
||||
error_metric: Union[Callable | str],
|
||||
) -> float:
|
||||
if isinstance(error_metric, str):
|
||||
error_metric = qc.error.from_name(error_metric)
|
||||
|
||||
collator_bck_ = protocol.collator
|
||||
protocol.collator = OnLabelledCollectionProtocol.get_collator("labelled_collection")
|
||||
|
||||
estim_prevs, true_prevs = [], []
|
||||
for sample in protocol():
|
||||
e_sample = estimator.extend(sample)
|
||||
estim_prev = estimator.estimate(e_sample.eX)
|
||||
estim_prevs.append(estim_prev)
|
||||
true_prevs.append(e_sample.e_prevalence())
|
||||
|
||||
protocol.collator = collator_bck_
|
||||
|
||||
# true_prevs = np.array(true_prevs)
|
||||
# estim_prevs = np.array(estim_prevs)
|
||||
|
||||
return error_metric(true_prevs, estim_prevs)
|
|
@ -1,11 +1,12 @@
|
|||
from dataclasses import dataclass
|
||||
from typing import List
|
||||
from typing import Callable, List, Union
|
||||
|
||||
import numpy as np
|
||||
from matplotlib.pylab import rand
|
||||
from quapy.method.aggregative import PACC, SLD, BaseQuantifier
|
||||
from quapy.protocol import UPP, AbstractProtocol
|
||||
from quapy.protocol import UPP, AbstractProtocol, OnLabelledCollectionProtocol
|
||||
from sklearn.linear_model import LogisticRegression
|
||||
from sklearn.svm import LinearSVC
|
||||
from sklearn.svm import SVC, LinearSVC
|
||||
|
||||
import quacc as qc
|
||||
from quacc.environment import env
|
||||
|
@ -13,34 +14,51 @@ from quacc.evaluation.report import EvaluationReport
|
|||
from quacc.method.base import BQAE, MCAE, BaseAccuracyEstimator
|
||||
from quacc.method.model_selection import (
|
||||
GridSearchAE,
|
||||
HalvingSearchAE,
|
||||
RandomizedSearchAE,
|
||||
SpiderSearchAE,
|
||||
)
|
||||
from quacc.quantification import KDEy
|
||||
|
||||
_param_grid = {
|
||||
"sld": {
|
||||
"q__classifier__C": np.logspace(-3, 3, 7),
|
||||
"q__classifier__class_weight": [None, "balanced"],
|
||||
"q__recalib": [None, "bcts"],
|
||||
# "q__recalib": [None],
|
||||
"confidence": [None, ["isoft"], ["max_conf", "entropy"]],
|
||||
},
|
||||
"pacc": {
|
||||
"q__classifier__C": np.logspace(-3, 3, 7),
|
||||
"q__classifier__class_weight": [None, "balanced"],
|
||||
"confidence": [None, ["isoft"], ["max_conf", "entropy"]],
|
||||
},
|
||||
"kde": {
|
||||
"q__classifier__C": np.logspace(-3, 3, 7),
|
||||
"q__classifier__class_weight": [None, "balanced"],
|
||||
# "q__classifier__class_weight": [None],
|
||||
"q__bandwidth": np.linspace(0.01, 0.2, 20),
|
||||
"confidence": [None, ["isoft"]],
|
||||
# "confidence": [None],
|
||||
},
|
||||
}
|
||||
|
||||
def _param_grid(method, X_fit: np.ndarray):
|
||||
match method:
|
||||
case "sld_lr":
|
||||
return {
|
||||
"q__classifier__C": np.logspace(-3, 3, 7),
|
||||
"q__classifier__class_weight": [None, "balanced"],
|
||||
"q__recalib": [None, "bcts"],
|
||||
"confidence": [None, ["isoft"], ["max_conf", "entropy"]],
|
||||
}
|
||||
case "sld_rbf":
|
||||
_scale = 1.0 / (X_fit.shape[1] * X_fit.var())
|
||||
return {
|
||||
"q__classifier__C": np.logspace(-3, 3, 7),
|
||||
"q__classifier__class_weight": [None, "balanced"],
|
||||
"q__classifier__gamma": _scale * np.logspace(-2, 2, 5),
|
||||
"q__recalib": [None, "bcts"],
|
||||
"confidence": [None, ["isoft"], ["max_conf", "entropy"]],
|
||||
}
|
||||
case "pacc":
|
||||
return {
|
||||
"q__classifier__C": np.logspace(-3, 3, 7),
|
||||
"q__classifier__class_weight": [None, "balanced"],
|
||||
"confidence": [None, ["isoft"], ["max_conf", "entropy"]],
|
||||
}
|
||||
case "kde_lr":
|
||||
return {
|
||||
"q__classifier__C": np.logspace(-3, 3, 7),
|
||||
"q__classifier__class_weight": [None, "balanced"],
|
||||
"q__bandwidth": np.linspace(0.01, 0.2, 20),
|
||||
"confidence": [None, ["isoft"]],
|
||||
}
|
||||
case "kde_rbf":
|
||||
_scale = 1.0 / (X_fit.shape[1] * X_fit.var())
|
||||
return {
|
||||
"q__classifier__C": np.logspace(-3, 3, 7),
|
||||
"q__classifier__class_weight": [None, "balanced"],
|
||||
"q__classifier__gamma": _scale * np.logspace(-2, 2, 5),
|
||||
"q__bandwidth": np.linspace(0.01, 0.2, 20),
|
||||
"confidence": [None, ["isoft"]],
|
||||
}
|
||||
|
||||
|
||||
def evaluation_report(
|
||||
|
@ -52,15 +70,19 @@ def evaluation_report(
|
|||
try:
|
||||
e_sample = estimator.extend(sample)
|
||||
estim_prev = estimator.estimate(e_sample.eX)
|
||||
true_prev = e_sample.e_prevalence()
|
||||
acc_score = qc.error.acc(estim_prev)
|
||||
f1_score = qc.error.f1(estim_prev)
|
||||
report.append_row(
|
||||
sample.prevalence(),
|
||||
row = dict(
|
||||
acc_score=acc_score,
|
||||
acc=abs(qc.error.acc(e_sample.prevalence()) - acc_score),
|
||||
f1_score=f1_score,
|
||||
f1=abs(qc.error.f1(e_sample.prevalence()) - f1_score),
|
||||
acc=abs(qc.error.acc(true_prev) - acc_score),
|
||||
)
|
||||
if estim_prev.can_f1():
|
||||
f1_score = qc.error.f1(estim_prev)
|
||||
row = row | dict(
|
||||
f1_score=f1_score,
|
||||
f1=abs(qc.error.f1(true_prev) - f1_score),
|
||||
)
|
||||
report.append_row(sample.prevalence(), **row)
|
||||
except Exception as e:
|
||||
print(f"sample prediction failed for method {method_name}: {e}")
|
||||
report.append_row(
|
||||
|
@ -80,7 +102,9 @@ class EvaluationMethod:
|
|||
q: BaseQuantifier
|
||||
est_n: str
|
||||
conf: List[str] | str = None
|
||||
cf: bool = False
|
||||
cf: bool = False # collapse_false
|
||||
gf: bool = False # group_false
|
||||
d: bool = False # dense
|
||||
|
||||
def get_est(self, c_model):
|
||||
match self.est_n:
|
||||
|
@ -90,9 +114,17 @@ class EvaluationMethod:
|
|||
self.q,
|
||||
confidence=self.conf,
|
||||
collapse_false=self.cf,
|
||||
group_false=self.gf,
|
||||
dense=self.d,
|
||||
)
|
||||
case "bin":
|
||||
return BQAE(c_model, self.q, confidence=self.conf)
|
||||
return BQAE(
|
||||
c_model,
|
||||
self.q,
|
||||
confidence=self.conf,
|
||||
group_false=self.gf,
|
||||
dense=self.d,
|
||||
)
|
||||
|
||||
def __call__(self, c_model, validation, protocol) -> EvaluationReport:
|
||||
est = self.get_est(c_model).fit(validation)
|
||||
|
@ -109,22 +141,26 @@ class EvaluationMethodGridSearch(EvaluationMethod):
|
|||
def get_search(self):
|
||||
match self.search:
|
||||
case "grid":
|
||||
return GridSearchAE
|
||||
case "spider":
|
||||
return SpiderSearchAE
|
||||
return (GridSearchAE, {})
|
||||
case "spider" | "spider2":
|
||||
return (SpiderSearchAE, dict(best_width=2))
|
||||
case "spider3":
|
||||
return (SpiderSearchAE, dict(best_width=3))
|
||||
case _:
|
||||
return GridSearchAE
|
||||
|
||||
def __call__(self, c_model, validation, protocol) -> EvaluationReport:
|
||||
v_train, v_val = validation.split_stratified(0.6, random_state=env._R_SEED)
|
||||
__grid = _param_grid.get(self.pg, {})
|
||||
_search_class = self.get_search()
|
||||
_model = self.get_est(c_model)
|
||||
_grid = _param_grid(self.pg, X_fit=_model.extend(v_train, prefit=True).X)
|
||||
_search_class, _search_params = self.get_search()
|
||||
est = _search_class(
|
||||
model=self.get_est(c_model),
|
||||
param_grid=__grid,
|
||||
model=_model,
|
||||
param_grid=_grid,
|
||||
refit=False,
|
||||
protocol=UPP(v_val, repeats=100),
|
||||
verbose=False,
|
||||
**_search_params,
|
||||
).fit(v_train)
|
||||
return evaluation_report(
|
||||
estimator=est,
|
||||
|
@ -141,10 +177,18 @@ def __sld_lr():
|
|||
return SLD(LogisticRegression())
|
||||
|
||||
|
||||
def __sld_rbf():
|
||||
return SLD(SVC(kernel="rbf", probability=True))
|
||||
|
||||
|
||||
def __kde_lr():
|
||||
return KDEy(LogisticRegression(), random_state=env._R_SEED)
|
||||
|
||||
|
||||
def __kde_rbf():
|
||||
return KDEy(SVC(kernel="rbf", probability=True), random_state=env._R_SEED)
|
||||
|
||||
|
||||
def __sld_lsvc():
|
||||
return SLD(LinearSVC())
|
||||
|
||||
|
@ -154,57 +198,212 @@ def __pacc_lr():
|
|||
|
||||
|
||||
# fmt: off
|
||||
__methods_set = [
|
||||
# base sld
|
||||
M("bin_sld", __sld_lr(), "bin" ),
|
||||
M("mul_sld", __sld_lr(), "mul" ),
|
||||
M("m3w_sld", __sld_lr(), "mul", cf=True),
|
||||
# max_conf + entropy sld
|
||||
M("binc_sld", __sld_lr(), "bin", conf=["max_conf", "entropy"] ),
|
||||
M("mulc_sld", __sld_lr(), "mul", conf=["max_conf", "entropy"] ),
|
||||
M("m3wc_sld", __sld_lr(), "mul", conf=["max_conf", "entropy"], cf=True),
|
||||
# max_conf sld
|
||||
M("binmc_sld", __sld_lr(), "bin", conf="max_conf", ),
|
||||
M("mulmc_sld", __sld_lr(), "mul", conf="max_conf", ),
|
||||
M("m3wmc_sld", __sld_lr(), "mul", conf="max_conf", cf=True),
|
||||
# entropy sld
|
||||
M("binne_sld", __sld_lr(), "bin", conf="entropy", ),
|
||||
M("mulne_sld", __sld_lr(), "mul", conf="entropy", ),
|
||||
M("m3wne_sld", __sld_lr(), "mul", conf="entropy", cf=True),
|
||||
# inverse softmax sld
|
||||
M("binis_sld", __sld_lr(), "bin", conf="isoft", ),
|
||||
M("mulis_sld", __sld_lr(), "mul", conf="isoft", ),
|
||||
M("m3wis_sld", __sld_lr(), "mul", conf="isoft", cf=True),
|
||||
# gs sld
|
||||
G("bin_sld_gs", __sld_lr(), "bin", pg="sld" ),
|
||||
G("mul_sld_gs", __sld_lr(), "mul", pg="sld" ),
|
||||
G("m3w_sld_gs", __sld_lr(), "mul", pg="sld", cf=True),
|
||||
|
||||
# base kde
|
||||
M("bin_kde", __kde_lr(), "bin" ),
|
||||
M("mul_kde", __kde_lr(), "mul" ),
|
||||
M("m3w_kde", __kde_lr(), "mul", cf=True),
|
||||
# max_conf + entropy kde
|
||||
M("binc_kde", __kde_lr(), "bin", conf=["max_conf", "entropy"] ),
|
||||
M("mulc_kde", __kde_lr(), "mul", conf=["max_conf", "entropy"] ),
|
||||
M("m3wc_kde", __kde_lr(), "mul", conf=["max_conf", "entropy"], cf=True),
|
||||
# max_conf kde
|
||||
M("binmc_kde", __kde_lr(), "bin", conf="max_conf", ),
|
||||
M("mulmc_kde", __kde_lr(), "mul", conf="max_conf", ),
|
||||
M("m3wmc_kde", __kde_lr(), "mul", conf="max_conf", cf=True),
|
||||
# entropy kde
|
||||
M("binne_kde", __kde_lr(), "bin", conf="entropy", ),
|
||||
M("mulne_kde", __kde_lr(), "mul", conf="entropy", ),
|
||||
M("m3wne_kde", __kde_lr(), "mul", conf="entropy", cf=True),
|
||||
# inverse softmax kde
|
||||
M("binis_kde", __kde_lr(), "bin", conf="isoft", ),
|
||||
M("mulis_kde", __kde_lr(), "mul", conf="isoft", ),
|
||||
M("m3wis_kde", __kde_lr(), "mul", conf="isoft", cf=True),
|
||||
# gs kde
|
||||
G("bin_kde_gs", __kde_lr(), "bin", pg="kde", search="spider" ),
|
||||
G("mul_kde_gs", __kde_lr(), "mul", pg="kde", search="spider" ),
|
||||
G("m3w_kde_gs", __kde_lr(), "mul", pg="kde", search="spider", cf=True),
|
||||
__sld_lr_set = [
|
||||
M("bin_sld_lr", __sld_lr(), "bin" ),
|
||||
M("bgf_sld_lr", __sld_lr(), "bin", gf=True),
|
||||
M("mul_sld_lr", __sld_lr(), "mul" ),
|
||||
M("m3w_sld_lr", __sld_lr(), "mul", cf=True),
|
||||
M("mgf_sld_lr", __sld_lr(), "mul", gf=True),
|
||||
# max_conf + entropy sld
|
||||
M("bin_sld_lr_c", __sld_lr(), "bin", conf=["max_conf", "entropy"] ),
|
||||
M("bgf_sld_lr_c", __sld_lr(), "bin", conf=["max_conf", "entropy"], gf=True),
|
||||
M("mul_sld_lr_c", __sld_lr(), "mul", conf=["max_conf", "entropy"] ),
|
||||
M("m3w_sld_lr_c", __sld_lr(), "mul", conf=["max_conf", "entropy"], cf=True),
|
||||
M("mgf_sld_lr_c", __sld_lr(), "mul", conf=["max_conf", "entropy"], gf=True),
|
||||
# max_conf sld
|
||||
M("bin_sld_lr_mc", __sld_lr(), "bin", conf="max_conf", ),
|
||||
M("bgf_sld_lr_mc", __sld_lr(), "bin", conf="max_conf", gf=True),
|
||||
M("mul_sld_lr_mc", __sld_lr(), "mul", conf="max_conf", ),
|
||||
M("m3w_sld_lr_mc", __sld_lr(), "mul", conf="max_conf", cf=True),
|
||||
M("mgf_sld_lr_mc", __sld_lr(), "mul", conf="max_conf", gf=True),
|
||||
# entropy sld
|
||||
M("bin_sld_lr_ne", __sld_lr(), "bin", conf="entropy", ),
|
||||
M("bgf_sld_lr_ne", __sld_lr(), "bin", conf="entropy", gf=True),
|
||||
M("mul_sld_lr_ne", __sld_lr(), "mul", conf="entropy", ),
|
||||
M("m3w_sld_lr_ne", __sld_lr(), "mul", conf="entropy", cf=True),
|
||||
M("mgf_sld_lr_ne", __sld_lr(), "mul", conf="entropy", gf=True),
|
||||
# inverse softmax sld
|
||||
M("bin_sld_lr_is", __sld_lr(), "bin", conf="isoft", ),
|
||||
M("bgf_sld_lr_is", __sld_lr(), "bin", conf="isoft", gf=True),
|
||||
M("mul_sld_lr_is", __sld_lr(), "mul", conf="isoft", ),
|
||||
M("m3w_sld_lr_is", __sld_lr(), "mul", conf="isoft", cf=True),
|
||||
M("mgf_sld_lr_is", __sld_lr(), "mul", conf="isoft", gf=True),
|
||||
# gs sld
|
||||
G("bin_sld_lr_gs", __sld_lr(), "bin", pg="sld_lr" ),
|
||||
G("bgf_sld_lr_gs", __sld_lr(), "bin", pg="sld_lr", gf=True),
|
||||
G("mul_sld_lr_gs", __sld_lr(), "mul", pg="sld_lr" ),
|
||||
G("m3w_sld_lr_gs", __sld_lr(), "mul", pg="sld_lr", cf=True),
|
||||
G("mgf_sld_lr_gs", __sld_lr(), "mul", pg="sld_lr", gf=True),
|
||||
]
|
||||
|
||||
__dense_sld_lr_set = [
|
||||
M("d_bin_sld_lr", __sld_lr(), "bin", d=True, ),
|
||||
M("d_bgf_sld_lr", __sld_lr(), "bin", d=True, gf=True),
|
||||
M("d_mul_sld_lr", __sld_lr(), "mul", d=True, ),
|
||||
M("d_m3w_sld_lr", __sld_lr(), "mul", d=True, cf=True),
|
||||
M("d_mgf_sld_lr", __sld_lr(), "mul", d=True, gf=True),
|
||||
# max_conf + entropy sld
|
||||
M("d_bin_sld_lr_c", __sld_lr(), "bin", d=True, conf=["max_conf", "entropy"] ),
|
||||
M("d_bgf_sld_lr_c", __sld_lr(), "bin", d=True, conf=["max_conf", "entropy"], gf=True),
|
||||
M("d_mul_sld_lr_c", __sld_lr(), "mul", d=True, conf=["max_conf", "entropy"] ),
|
||||
M("d_m3w_sld_lr_c", __sld_lr(), "mul", d=True, conf=["max_conf", "entropy"], cf=True),
|
||||
M("d_mgf_sld_lr_c", __sld_lr(), "mul", d=True, conf=["max_conf", "entropy"], gf=True),
|
||||
# max_conf sld
|
||||
M("d_bin_sld_lr_mc", __sld_lr(), "bin", d=True, conf="max_conf", ),
|
||||
M("d_bgf_sld_lr_mc", __sld_lr(), "bin", d=True, conf="max_conf", gf=True),
|
||||
M("d_mul_sld_lr_mc", __sld_lr(), "mul", d=True, conf="max_conf", ),
|
||||
M("d_m3w_sld_lr_mc", __sld_lr(), "mul", d=True, conf="max_conf", cf=True),
|
||||
M("d_mgf_sld_lr_mc", __sld_lr(), "mul", d=True, conf="max_conf", gf=True),
|
||||
# entropy sld
|
||||
M("d_bin_sld_lr_ne", __sld_lr(), "bin", d=True, conf="entropy", ),
|
||||
M("d_bgf_sld_lr_ne", __sld_lr(), "bin", d=True, conf="entropy", gf=True),
|
||||
M("d_mul_sld_lr_ne", __sld_lr(), "mul", d=True, conf="entropy", ),
|
||||
M("d_m3w_sld_lr_ne", __sld_lr(), "mul", d=True, conf="entropy", cf=True),
|
||||
M("d_mgf_sld_lr_ne", __sld_lr(), "mul", d=True, conf="entropy", gf=True),
|
||||
# inverse softmax sld
|
||||
M("d_bin_sld_lr_is", __sld_lr(), "bin", d=True, conf="isoft", ),
|
||||
M("d_bgf_sld_lr_is", __sld_lr(), "bin", d=True, conf="isoft", gf=True),
|
||||
M("d_mul_sld_lr_is", __sld_lr(), "mul", d=True, conf="isoft", ),
|
||||
M("d_m3w_sld_lr_is", __sld_lr(), "mul", d=True, conf="isoft", cf=True),
|
||||
M("d_mgf_sld_lr_is", __sld_lr(), "mul", d=True, conf="isoft", gf=True),
|
||||
# gs sld
|
||||
G("d_bin_sld_lr_gs", __sld_lr(), "bin", d=True, pg="sld_lr" ),
|
||||
G("d_bgf_sld_lr_gs", __sld_lr(), "bin", d=True, pg="sld_lr", gf=True),
|
||||
G("d_mul_sld_lr_gs", __sld_lr(), "mul", d=True, pg="sld_lr" ),
|
||||
G("d_m3w_sld_lr_gs", __sld_lr(), "mul", d=True, pg="sld_lr", cf=True),
|
||||
G("d_mgf_sld_lr_gs", __sld_lr(), "mul", d=True, pg="sld_lr", gf=True),
|
||||
]
|
||||
|
||||
__dense_sld_rbf_set = [
|
||||
M("d_bin_sld_rbf", __sld_rbf(), "bin", d=True, ),
|
||||
M("d_bgf_sld_rbf", __sld_rbf(), "bin", d=True, gf=True),
|
||||
M("d_mul_sld_rbf", __sld_rbf(), "mul", d=True, ),
|
||||
M("d_m3w_sld_rbf", __sld_rbf(), "mul", d=True, cf=True),
|
||||
M("d_mgf_sld_rbf", __sld_rbf(), "mul", d=True, gf=True),
|
||||
# max_conf + entropy sld
|
||||
M("d_bin_sld_rbf_c", __sld_rbf(), "bin", d=True, conf=["max_conf", "entropy"] ),
|
||||
M("d_bgf_sld_rbf_c", __sld_rbf(), "bin", d=True, conf=["max_conf", "entropy"], gf=True),
|
||||
M("d_mul_sld_rbf_c", __sld_rbf(), "mul", d=True, conf=["max_conf", "entropy"] ),
|
||||
M("d_m3w_sld_rbf_c", __sld_rbf(), "mul", d=True, conf=["max_conf", "entropy"], cf=True),
|
||||
M("d_mgf_sld_rbf_c", __sld_rbf(), "mul", d=True, conf=["max_conf", "entropy"], gf=True),
|
||||
# max_conf sld
|
||||
M("d_bin_sld_rbf_mc", __sld_rbf(), "bin", d=True, conf="max_conf", ),
|
||||
M("d_bgf_sld_rbf_mc", __sld_rbf(), "bin", d=True, conf="max_conf", gf=True),
|
||||
M("d_mul_sld_rbf_mc", __sld_rbf(), "mul", d=True, conf="max_conf", ),
|
||||
M("d_m3w_sld_rbf_mc", __sld_rbf(), "mul", d=True, conf="max_conf", cf=True),
|
||||
M("d_mgf_sld_rbf_mc", __sld_rbf(), "mul", d=True, conf="max_conf", gf=True),
|
||||
# entropy sld
|
||||
M("d_bin_sld_rbf_ne", __sld_rbf(), "bin", d=True, conf="entropy", ),
|
||||
M("d_bgf_sld_rbf_ne", __sld_rbf(), "bin", d=True, conf="entropy", gf=True),
|
||||
M("d_mul_sld_rbf_ne", __sld_rbf(), "mul", d=True, conf="entropy", ),
|
||||
M("d_m3w_sld_rbf_ne", __sld_rbf(), "mul", d=True, conf="entropy", cf=True),
|
||||
M("d_mgf_sld_rbf_ne", __sld_rbf(), "mul", d=True, conf="entropy", gf=True),
|
||||
# inverse softmax sld
|
||||
M("d_bin_sld_rbf_is", __sld_rbf(), "bin", d=True, conf="isoft", ),
|
||||
M("d_bgf_sld_rbf_is", __sld_rbf(), "bin", d=True, conf="isoft", gf=True),
|
||||
M("d_mul_sld_rbf_is", __sld_rbf(), "mul", d=True, conf="isoft", ),
|
||||
M("d_m3w_sld_rbf_is", __sld_rbf(), "mul", d=True, conf="isoft", cf=True),
|
||||
M("d_mgf_sld_rbf_is", __sld_rbf(), "mul", d=True, conf="isoft", gf=True),
|
||||
# gs sld
|
||||
G("d_bin_sld_rbf_gs", __sld_rbf(), "bin", d=True, pg="sld_rbf", search="spider", ),
|
||||
G("d_bgf_sld_rbf_gs", __sld_rbf(), "bin", d=True, pg="sld_rbf", search="spider", gf=True),
|
||||
G("d_mul_sld_rbf_gs", __sld_rbf(), "mul", d=True, pg="sld_rbf", search="spider", ),
|
||||
G("d_m3w_sld_rbf_gs", __sld_rbf(), "mul", d=True, pg="sld_rbf", search="spider", cf=True),
|
||||
G("d_mgf_sld_rbf_gs", __sld_rbf(), "mul", d=True, pg="sld_rbf", search="spider", gf=True),
|
||||
]
|
||||
|
||||
__kde_lr_set = [
|
||||
# base kde
|
||||
M("bin_kde_lr", __kde_lr(), "bin" ),
|
||||
M("mul_kde_lr", __kde_lr(), "mul" ),
|
||||
M("m3w_kde_lr", __kde_lr(), "mul", cf=True),
|
||||
# max_conf + entropy kde
|
||||
M("bin_kde_lr_c", __kde_lr(), "bin", conf=["max_conf", "entropy"] ),
|
||||
M("mul_kde_lr_c", __kde_lr(), "mul", conf=["max_conf", "entropy"] ),
|
||||
M("m3w_kde_lr_c", __kde_lr(), "mul", conf=["max_conf", "entropy"], cf=True),
|
||||
# max_conf kde
|
||||
M("bin_kde_lr_mc", __kde_lr(), "bin", conf="max_conf", ),
|
||||
M("mul_kde_lr_mc", __kde_lr(), "mul", conf="max_conf", ),
|
||||
M("m3w_kde_lr_mc", __kde_lr(), "mul", conf="max_conf", cf=True),
|
||||
# entropy kde
|
||||
M("bin_kde_lr_ne", __kde_lr(), "bin", conf="entropy", ),
|
||||
M("mul_kde_lr_ne", __kde_lr(), "mul", conf="entropy", ),
|
||||
M("m3w_kde_lr_ne", __kde_lr(), "mul", conf="entropy", cf=True),
|
||||
# inverse softmax kde
|
||||
M("bin_kde_lr_is", __kde_lr(), "bin", conf="isoft", ),
|
||||
M("mul_kde_lr_is", __kde_lr(), "mul", conf="isoft", ),
|
||||
M("m3w_kde_lr_is", __kde_lr(), "mul", conf="isoft", cf=True),
|
||||
# gs kde
|
||||
G("bin_kde_lr_gs", __kde_lr(), "bin", pg="kde_lr", search="spider" ),
|
||||
G("mul_kde_lr_gs", __kde_lr(), "mul", pg="kde_lr", search="spider" ),
|
||||
G("m3w_kde_lr_gs", __kde_lr(), "mul", pg="kde_lr", search="spider", cf=True),
|
||||
]
|
||||
|
||||
__dense_kde_lr_set = [
|
||||
# base kde
|
||||
M("d_bin_kde_lr", __kde_lr(), "bin", d=True, ),
|
||||
M("d_mul_kde_lr", __kde_lr(), "mul", d=True, ),
|
||||
M("d_m3w_kde_lr", __kde_lr(), "mul", d=True, cf=True),
|
||||
# max_conf + entropy kde
|
||||
M("d_bin_kde_lr_c", __kde_lr(), "bin", d=True, conf=["max_conf", "entropy"] ),
|
||||
M("d_mul_kde_lr_c", __kde_lr(), "mul", d=True, conf=["max_conf", "entropy"] ),
|
||||
M("d_m3w_kde_lr_c", __kde_lr(), "mul", d=True, conf=["max_conf", "entropy"], cf=True),
|
||||
# max_conf kde
|
||||
M("d_bin_kde_lr_mc", __kde_lr(), "bin", d=True, conf="max_conf", ),
|
||||
M("d_mul_kde_lr_mc", __kde_lr(), "mul", d=True, conf="max_conf", ),
|
||||
M("d_m3w_kde_lr_mc", __kde_lr(), "mul", d=True, conf="max_conf", cf=True),
|
||||
# entropy kde
|
||||
M("d_bin_kde_lr_ne", __kde_lr(), "bin", d=True, conf="entropy", ),
|
||||
M("d_mul_kde_lr_ne", __kde_lr(), "mul", d=True, conf="entropy", ),
|
||||
M("d_m3w_kde_lr_ne", __kde_lr(), "mul", d=True, conf="entropy", cf=True),
|
||||
# inverse softmax kde d=True,
|
||||
M("d_bin_kde_lr_is", __kde_lr(), "bin", d=True, conf="isoft", ),
|
||||
M("d_mul_kde_lr_is", __kde_lr(), "mul", d=True, conf="isoft", ),
|
||||
M("d_m3w_kde_lr_is", __kde_lr(), "mul", d=True, conf="isoft", cf=True),
|
||||
# gs kde
|
||||
G("d_bin_kde_lr_gs", __kde_lr(), "bin", d=True, pg="kde_lr", search="spider" ),
|
||||
G("d_mul_kde_lr_gs", __kde_lr(), "mul", d=True, pg="kde_lr", search="spider" ),
|
||||
G("d_m3w_kde_lr_gs", __kde_lr(), "mul", d=True, pg="kde_lr", search="spider", cf=True),
|
||||
]
|
||||
|
||||
__dense_kde_rbf_set = [
|
||||
# base kde
|
||||
M("d_bin_kde_rbf", __kde_rbf(), "bin", d=True, ),
|
||||
M("d_mul_kde_rbf", __kde_rbf(), "mul", d=True, ),
|
||||
M("d_m3w_kde_rbf", __kde_rbf(), "mul", d=True, cf=True),
|
||||
# max_conf + entropy kde
|
||||
M("d_bin_kde_rbf_c", __kde_rbf(), "bin", d=True, conf=["max_conf", "entropy"] ),
|
||||
M("d_mul_kde_rbf_c", __kde_rbf(), "mul", d=True, conf=["max_conf", "entropy"] ),
|
||||
M("d_m3w_kde_rbf_c", __kde_rbf(), "mul", d=True, conf=["max_conf", "entropy"], cf=True),
|
||||
# max_conf kde
|
||||
M("d_bin_kde_rbf_mc", __kde_rbf(), "bin", d=True, conf="max_conf", ),
|
||||
M("d_mul_kde_rbf_mc", __kde_rbf(), "mul", d=True, conf="max_conf", ),
|
||||
M("d_m3w_kde_rbf_mc", __kde_rbf(), "mul", d=True, conf="max_conf", cf=True),
|
||||
# entropy kde
|
||||
M("d_bin_kde_rbf_ne", __kde_rbf(), "bin", d=True, conf="entropy", ),
|
||||
M("d_mul_kde_rbf_ne", __kde_rbf(), "mul", d=True, conf="entropy", ),
|
||||
M("d_m3w_kde_rbf_ne", __kde_rbf(), "mul", d=True, conf="entropy", cf=True),
|
||||
# inverse softmax kde
|
||||
M("d_bin_kde_rbf_is", __kde_rbf(), "bin", d=True, conf="isoft", ),
|
||||
M("d_mul_kde_rbf_is", __kde_rbf(), "mul", d=True, conf="isoft", ),
|
||||
M("d_m3w_kde_rbf_is", __kde_rbf(), "mul", d=True, conf="isoft", cf=True),
|
||||
# gs kde
|
||||
G("d_bin_kde_rbf_gs", __kde_rbf(), "bin", d=True, pg="kde_rbf", search="spider" ),
|
||||
G("d_mul_kde_rbf_gs", __kde_rbf(), "mul", d=True, pg="kde_rbf", search="spider" ),
|
||||
G("d_m3w_kde_rbf_gs", __kde_rbf(), "mul", d=True, pg="kde_rbf", search="spider", cf=True),
|
||||
]
|
||||
|
||||
|
||||
# fmt: on
|
||||
|
||||
__methods_set = (
|
||||
__sld_lr_set
|
||||
+ __dense_sld_lr_set
|
||||
+ __dense_sld_rbf_set
|
||||
+ __kde_lr_set
|
||||
+ __dense_kde_lr_set
|
||||
+ __dense_kde_rbf_set
|
||||
)
|
||||
|
||||
_methods = {m.name: m for m in __methods_set}
|
||||
|
|
|
@ -1,11 +1,14 @@
|
|||
import json
|
||||
import pickle
|
||||
from collections import defaultdict
|
||||
from itertools import chain
|
||||
from pathlib import Path
|
||||
from typing import List, Tuple
|
||||
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
|
||||
import quacc as qc
|
||||
import quacc.plot as plot
|
||||
from quacc.utils import fmt_line_md
|
||||
|
||||
|
@ -22,14 +25,24 @@ def _get_estimators(estimators: List[str], cols: np.ndarray):
|
|||
return estimators[np.isin(estimators, cols)]
|
||||
|
||||
|
||||
def _get_shift(index: np.ndarray, train_prev: np.ndarray):
|
||||
index = np.array([np.array(tp) for tp in index])
|
||||
train_prevs = np.tile(train_prev, (index.shape[0], 1))
|
||||
# assert index.shape[1] == train_prev.shape[0], "Mismatch in prevalence shape"
|
||||
# _shift = np.abs(index - train_prev)[:, 1:].sum(axis=1)
|
||||
_shift = qc.error.nae(index, train_prevs)
|
||||
return np.around(_shift, decimals=2)
|
||||
|
||||
|
||||
class EvaluationReport:
|
||||
def __init__(self, name=None):
|
||||
self.data: pd.DataFrame | None = None
|
||||
self.fit_score = None
|
||||
self.name = name if name is not None else "default"
|
||||
self.time = 0.0
|
||||
|
||||
def append_row(self, basep: np.ndarray | Tuple, **row):
|
||||
bp = basep[1]
|
||||
# bp = basep[1]
|
||||
bp = tuple(basep)
|
||||
_keys, _values = zip(*row.items())
|
||||
# _keys = list(row.keys())
|
||||
# _values = list(row.values())
|
||||
|
@ -89,7 +102,7 @@ class CompReport:
|
|||
)
|
||||
.swaplevel(0, 1, axis=1)
|
||||
.sort_index(axis=1, level=0, sort_remaining=False)
|
||||
.sort_index(axis=0, level=0)
|
||||
.sort_index(axis=0, level=0, ascending=False, sort_remaining=False)
|
||||
)
|
||||
|
||||
if times is None:
|
||||
|
@ -97,17 +110,13 @@ class CompReport:
|
|||
else:
|
||||
self.times = times
|
||||
|
||||
self.times["tot"] = g_time
|
||||
self.times["tot"] = g_time if g_time is not None else 0.0
|
||||
self.train_prev = train_prev
|
||||
self.valid_prev = valid_prev
|
||||
|
||||
@property
|
||||
def prevs(self) -> np.ndarray:
|
||||
return np.sort(self._data.index.unique(0))
|
||||
|
||||
@property
|
||||
def np_prevs(self) -> np.ndarray:
|
||||
return np.around([(1.0 - p, p) for p in self.prevs], decimals=2)
|
||||
return self.data().index.unique(0)
|
||||
|
||||
def join(self, other, how="update", estimators=None):
|
||||
if how not in ["update"]:
|
||||
|
@ -160,16 +169,14 @@ class CompReport:
|
|||
def shift_data(
|
||||
self, metric: str = None, estimators: List[str] = None
|
||||
) -> pd.DataFrame:
|
||||
shift_idx_0 = np.around(
|
||||
np.abs(
|
||||
self._data.index.get_level_values(0).to_numpy() - self.train_prev[1]
|
||||
),
|
||||
decimals=2,
|
||||
shift_idx_0 = _get_shift(
|
||||
self._data.index.get_level_values(0).to_numpy(),
|
||||
self.train_prev,
|
||||
)
|
||||
|
||||
shift_idx_1 = np.empty(shape=shift_idx_0.shape, dtype="<i4")
|
||||
shift_idx_1 = np.zeros(shape=shift_idx_0.shape[0], dtype="<i4")
|
||||
for _id in np.unique(shift_idx_0):
|
||||
_wh = np.where(shift_idx_0 == _id)[0]
|
||||
_wh = (shift_idx_0 == _id).nonzero()[0]
|
||||
shift_idx_1[_wh] = np.arange(_wh.shape[0], dtype="<i4")
|
||||
|
||||
shift_data = self._data.copy()
|
||||
|
@ -191,26 +198,28 @@ class CompReport:
|
|||
self, metric: str = None, estimators: List[str] = None
|
||||
) -> pd.DataFrame:
|
||||
f_dict = self.data(metric=metric, estimators=estimators)
|
||||
return f_dict.groupby(level=0).mean()
|
||||
return f_dict.groupby(level=0, sort=False).mean()
|
||||
|
||||
def stdev_by_prevs(
|
||||
self, metric: str = None, estimators: List[str] = None
|
||||
) -> pd.DataFrame:
|
||||
f_dict = self.data(metric=metric, estimators=estimators)
|
||||
return f_dict.groupby(level=0).std()
|
||||
return f_dict.groupby(level=0, sort=False).std()
|
||||
|
||||
def table(self, metric: str = None, estimators: List[str] = None) -> pd.DataFrame:
|
||||
def train_table(
|
||||
self, metric: str = None, estimators: List[str] = None
|
||||
) -> pd.DataFrame:
|
||||
f_data = self.data(metric=metric, estimators=estimators)
|
||||
avg_p = f_data.groupby(level=0).mean()
|
||||
avg_p.loc["avg", :] = f_data.mean()
|
||||
avg_p = f_data.groupby(level=0, sort=False).mean()
|
||||
avg_p.loc["mean", :] = f_data.mean()
|
||||
return avg_p
|
||||
|
||||
def shift_table(
|
||||
self, metric: str = None, estimators: List[str] = None
|
||||
) -> pd.DataFrame:
|
||||
f_data = self.shift_data(metric=metric, estimators=estimators)
|
||||
avg_p = f_data.groupby(level=0).mean()
|
||||
avg_p.loc["avg", :] = f_data.mean()
|
||||
avg_p = f_data.groupby(level=0, sort=False).mean()
|
||||
avg_p.loc["mean", :] = f_data.mean()
|
||||
return avg_p
|
||||
|
||||
def get_plots(
|
||||
|
@ -229,7 +238,7 @@ class CompReport:
|
|||
return None
|
||||
|
||||
return plot.plot_delta(
|
||||
base_prevs=self.np_prevs,
|
||||
base_prevs=self.prevs,
|
||||
columns=avg_data.columns.to_numpy(),
|
||||
data=avg_data.T.to_numpy(),
|
||||
metric=metric,
|
||||
|
@ -246,7 +255,7 @@ class CompReport:
|
|||
|
||||
st_data = self.stdev_by_prevs(metric=metric, estimators=estimators)
|
||||
return plot.plot_delta(
|
||||
base_prevs=self.np_prevs,
|
||||
base_prevs=self.prevs,
|
||||
columns=avg_data.columns.to_numpy(),
|
||||
data=avg_data.T.to_numpy(),
|
||||
metric=metric,
|
||||
|
@ -280,12 +289,13 @@ class CompReport:
|
|||
if _shift_data.empty is True:
|
||||
return None
|
||||
|
||||
shift_avg = _shift_data.groupby(level=0).mean()
|
||||
shift_counts = _shift_data.groupby(level=0).count()
|
||||
shift_prevs = np.around(
|
||||
[(1.0 - p, p) for p in np.sort(shift_avg.index.unique(0))],
|
||||
decimals=2,
|
||||
)
|
||||
shift_avg = _shift_data.groupby(level=0, sort=False).mean()
|
||||
shift_counts = _shift_data.groupby(level=0, sort=False).count()
|
||||
shift_prevs = shift_avg.index.unique(0)
|
||||
# shift_prevs = np.around(
|
||||
# [(1.0 - p, p) for p in np.sort(shift_avg.index.unique(0))],
|
||||
# decimals=2,
|
||||
# )
|
||||
return plot.plot_shift(
|
||||
shift_prevs=shift_prevs,
|
||||
columns=shift_avg.columns.to_numpy(),
|
||||
|
@ -317,7 +327,10 @@ class CompReport:
|
|||
res += "\n"
|
||||
if "train_table" in modes:
|
||||
res += "### table\n"
|
||||
res += self.table(metric=metric, estimators=estimators).to_html() + "\n\n"
|
||||
res += (
|
||||
self.train_table(metric=metric, estimators=estimators).to_html()
|
||||
+ "\n\n"
|
||||
)
|
||||
if "shift_table" in modes:
|
||||
res += "### shift table\n"
|
||||
res += (
|
||||
|
@ -369,7 +382,7 @@ class DatasetReport:
|
|||
|
||||
def data(self, metric: str = None, estimators: List[str] = None) -> pd.DataFrame:
|
||||
def _cr_train_prev(cr: CompReport):
|
||||
return cr.train_prev[1]
|
||||
return tuple(np.around(cr.train_prev, decimals=2))
|
||||
|
||||
def _cr_data(cr: CompReport):
|
||||
return cr.data(metric, estimators)
|
||||
|
@ -381,11 +394,27 @@ class DatasetReport:
|
|||
)
|
||||
_crs_train, _crs_data = zip(*_crs_sorted)
|
||||
|
||||
_data = pd.concat(_crs_data, axis=0, keys=np.around(_crs_train, decimals=2))
|
||||
_data = _data.sort_index(axis=0, level=0)
|
||||
_data: pd.DataFrame = pd.concat(
|
||||
_crs_data,
|
||||
axis=0,
|
||||
keys=_crs_train,
|
||||
)
|
||||
|
||||
# The MultiIndex is recreated to make the outer-most level a tuple and not a
|
||||
# sequence of values
|
||||
_len_tr_idx = len(_crs_train[0])
|
||||
_idx = _data.index.to_list()
|
||||
_idx = pd.MultiIndex.from_tuples(
|
||||
[tuple([midx[:_len_tr_idx]] + list(midx[_len_tr_idx:])) for midx in _idx]
|
||||
)
|
||||
_data.index = _idx
|
||||
|
||||
_data = _data.sort_index(axis=0, level=0, ascending=False, sort_remaining=False)
|
||||
return _data
|
||||
|
||||
def shift_data(self, metric: str = None, estimators: str = None) -> pd.DataFrame:
|
||||
def shift_data(
|
||||
self, metric: str = None, estimators: List[str] = None
|
||||
) -> pd.DataFrame:
|
||||
_shift_data: pd.DataFrame = pd.concat(
|
||||
sorted(
|
||||
[cr.shift_data(metric, estimators) for cr in self.crs],
|
||||
|
@ -423,6 +452,30 @@ class DatasetReport:
|
|||
self.add(cr)
|
||||
return self
|
||||
|
||||
def train_table(
|
||||
self, metric: str = None, estimators: List[str] = None
|
||||
) -> pd.DataFrame:
|
||||
f_data = self.data(metric=metric, estimators=estimators)
|
||||
avg_p = f_data.groupby(level=1, sort=False).mean()
|
||||
avg_p.loc["mean", :] = f_data.mean()
|
||||
return avg_p
|
||||
|
||||
def test_table(
|
||||
self, metric: str = None, estimators: List[str] = None
|
||||
) -> pd.DataFrame:
|
||||
f_data = self.data(metric=metric, estimators=estimators)
|
||||
avg_p = f_data.groupby(level=0, sort=False).mean()
|
||||
avg_p.loc["mean", :] = f_data.mean()
|
||||
return avg_p
|
||||
|
||||
def shift_table(
|
||||
self, metric: str = None, estimators: List[str] = None
|
||||
) -> pd.DataFrame:
|
||||
f_data = self.shift_data(metric=metric, estimators=estimators)
|
||||
avg_p = f_data.groupby(level=0, sort=False).mean()
|
||||
avg_p.loc["mean", :] = f_data.mean()
|
||||
return avg_p
|
||||
|
||||
def get_plots(
|
||||
self,
|
||||
data=None,
|
||||
|
@ -436,14 +489,15 @@ class DatasetReport:
|
|||
):
|
||||
if mode == "delta_train":
|
||||
_data = self.data(metric, estimators) if data is None else data
|
||||
avg_on_train = _data.groupby(level=1).mean()
|
||||
avg_on_train = _data.groupby(level=1, sort=False).mean()
|
||||
if avg_on_train.empty:
|
||||
return None
|
||||
prevs_on_train = np.sort(avg_on_train.index.unique(0))
|
||||
prevs_on_train = avg_on_train.index.unique(0)
|
||||
return plot.plot_delta(
|
||||
base_prevs=np.around(
|
||||
[(1.0 - p, p) for p in prevs_on_train], decimals=2
|
||||
),
|
||||
# base_prevs=np.around(
|
||||
# [(1.0 - p, p) for p in prevs_on_train], decimals=2
|
||||
# ),
|
||||
base_prevs=prevs_on_train,
|
||||
columns=avg_on_train.columns.to_numpy(),
|
||||
data=avg_on_train.T.to_numpy(),
|
||||
metric=metric,
|
||||
|
@ -456,15 +510,16 @@ class DatasetReport:
|
|||
)
|
||||
elif mode == "stdev_train":
|
||||
_data = self.data(metric, estimators) if data is None else data
|
||||
avg_on_train = _data.groupby(level=1).mean()
|
||||
avg_on_train = _data.groupby(level=1, sort=False).mean()
|
||||
if avg_on_train.empty:
|
||||
return None
|
||||
prevs_on_train = np.sort(avg_on_train.index.unique(0))
|
||||
stdev_on_train = _data.groupby(level=1).std()
|
||||
prevs_on_train = avg_on_train.index.unique(0)
|
||||
stdev_on_train = _data.groupby(level=1, sort=False).std()
|
||||
return plot.plot_delta(
|
||||
base_prevs=np.around(
|
||||
[(1.0 - p, p) for p in prevs_on_train], decimals=2
|
||||
),
|
||||
# base_prevs=np.around(
|
||||
# [(1.0 - p, p) for p in prevs_on_train], decimals=2
|
||||
# ),
|
||||
base_prevs=prevs_on_train,
|
||||
columns=avg_on_train.columns.to_numpy(),
|
||||
data=avg_on_train.T.to_numpy(),
|
||||
metric=metric,
|
||||
|
@ -478,12 +533,13 @@ class DatasetReport:
|
|||
)
|
||||
elif mode == "delta_test":
|
||||
_data = self.data(metric, estimators) if data is None else data
|
||||
avg_on_test = _data.groupby(level=0).mean()
|
||||
avg_on_test = _data.groupby(level=0, sort=False).mean()
|
||||
if avg_on_test.empty:
|
||||
return None
|
||||
prevs_on_test = np.sort(avg_on_test.index.unique(0))
|
||||
prevs_on_test = avg_on_test.index.unique(0)
|
||||
return plot.plot_delta(
|
||||
base_prevs=np.around([(1.0 - p, p) for p in prevs_on_test], decimals=2),
|
||||
# base_prevs=np.around([(1.0 - p, p) for p in prevs_on_test], decimals=2),
|
||||
base_prevs=prevs_on_test,
|
||||
columns=avg_on_test.columns.to_numpy(),
|
||||
data=avg_on_test.T.to_numpy(),
|
||||
metric=metric,
|
||||
|
@ -496,13 +552,14 @@ class DatasetReport:
|
|||
)
|
||||
elif mode == "stdev_test":
|
||||
_data = self.data(metric, estimators) if data is None else data
|
||||
avg_on_test = _data.groupby(level=0).mean()
|
||||
avg_on_test = _data.groupby(level=0, sort=False).mean()
|
||||
if avg_on_test.empty:
|
||||
return None
|
||||
prevs_on_test = np.sort(avg_on_test.index.unique(0))
|
||||
stdev_on_test = _data.groupby(level=0).std()
|
||||
prevs_on_test = avg_on_test.index.unique(0)
|
||||
stdev_on_test = _data.groupby(level=0, sort=False).std()
|
||||
return plot.plot_delta(
|
||||
base_prevs=np.around([(1.0 - p, p) for p in prevs_on_test], decimals=2),
|
||||
# base_prevs=np.around([(1.0 - p, p) for p in prevs_on_test], decimals=2),
|
||||
base_prevs=prevs_on_test,
|
||||
columns=avg_on_test.columns.to_numpy(),
|
||||
data=avg_on_test.T.to_numpy(),
|
||||
metric=metric,
|
||||
|
@ -516,13 +573,14 @@ class DatasetReport:
|
|||
)
|
||||
elif mode == "shift":
|
||||
_shift_data = self.shift_data(metric, estimators) if data is None else data
|
||||
avg_shift = _shift_data.groupby(level=0).mean()
|
||||
avg_shift = _shift_data.groupby(level=0, sort=False).mean()
|
||||
if avg_shift.empty:
|
||||
return None
|
||||
count_shift = _shift_data.groupby(level=0).count()
|
||||
prevs_shift = np.sort(avg_shift.index.unique(0))
|
||||
count_shift = _shift_data.groupby(level=0, sort=False).count()
|
||||
prevs_shift = avg_shift.index.unique(0)
|
||||
return plot.plot_shift(
|
||||
shift_prevs=np.around([(1.0 - p, p) for p in prevs_shift], decimals=2),
|
||||
# shift_prevs=np.around([(1.0 - p, p) for p in prevs_shift], decimals=2),
|
||||
shift_prevs=prevs_shift,
|
||||
columns=avg_shift.columns.to_numpy(),
|
||||
data=avg_shift.T.to_numpy(),
|
||||
metric=metric,
|
||||
|
@ -551,7 +609,14 @@ class DatasetReport:
|
|||
and str(round(cr.train_prev[1] * 100)) not in cr_prevs
|
||||
):
|
||||
continue
|
||||
res += f"{cr.to_md(conf, metric=metric, estimators=estimators, modes=cr_modes, plot_path=plot_path)}\n\n"
|
||||
_md = cr.to_md(
|
||||
conf,
|
||||
metric=metric,
|
||||
estimators=estimators,
|
||||
modes=cr_modes,
|
||||
plot_path=plot_path,
|
||||
)
|
||||
res += f"{_md}\n\n"
|
||||
|
||||
_data = self.data(metric=metric, estimators=estimators)
|
||||
_shift_data = self.shift_data(metric=metric, estimators=estimators)
|
||||
|
@ -562,7 +627,7 @@ class DatasetReport:
|
|||
res += "### avg on train\n"
|
||||
|
||||
if "train_table" in dr_modes:
|
||||
avg_on_train_tbl = _data.groupby(level=1).mean()
|
||||
avg_on_train_tbl = _data.groupby(level=1, sort=False).mean()
|
||||
avg_on_train_tbl.loc["avg", :] = _data.mean()
|
||||
res += avg_on_train_tbl.to_html() + "\n\n"
|
||||
|
||||
|
@ -576,7 +641,8 @@ class DatasetReport:
|
|||
base_path=plot_path,
|
||||
save_fig=True,
|
||||
)
|
||||
res += f"![plot_delta]({delta_op.relative_to(delta_op.parents[1]).as_posix()})\n"
|
||||
_op = delta_op.relative_to(delta_op.parents[1]).as_posix()
|
||||
res += f"![plot_delta]({_op})\n"
|
||||
|
||||
if "stdev_train" in dr_modes:
|
||||
_, delta_stdev_op = self.get_plots(
|
||||
|
@ -588,13 +654,14 @@ class DatasetReport:
|
|||
base_path=plot_path,
|
||||
save_fig=True,
|
||||
)
|
||||
res += f"![plot_delta_stdev]({delta_stdev_op.relative_to(delta_stdev_op.parents[1]).as_posix()})\n"
|
||||
_op = delta_stdev_op.relative_to(delta_stdev_op.parents[1]).as_posix()
|
||||
res += f"![plot_delta_stdev]({_op})\n"
|
||||
|
||||
######################## avg on test ########################
|
||||
res += "### avg on test\n"
|
||||
|
||||
if "test_table" in dr_modes:
|
||||
avg_on_test_tbl = _data.groupby(level=0).mean()
|
||||
avg_on_test_tbl = _data.groupby(level=0, sort=False).mean()
|
||||
avg_on_test_tbl.loc["avg", :] = _data.mean()
|
||||
res += avg_on_test_tbl.to_html() + "\n\n"
|
||||
|
||||
|
@ -608,7 +675,8 @@ class DatasetReport:
|
|||
base_path=plot_path,
|
||||
save_fig=True,
|
||||
)
|
||||
res += f"![plot_delta]({delta_op.relative_to(delta_op.parents[1]).as_posix()})\n"
|
||||
_op = delta_op.relative_to(delta_op.parents[1]).as_posix()
|
||||
res += f"![plot_delta]({_op})\n"
|
||||
|
||||
if "stdev_test" in dr_modes:
|
||||
_, delta_stdev_op = self.get_plots(
|
||||
|
@ -620,13 +688,14 @@ class DatasetReport:
|
|||
base_path=plot_path,
|
||||
save_fig=True,
|
||||
)
|
||||
res += f"![plot_delta_stdev]({delta_stdev_op.relative_to(delta_stdev_op.parents[1]).as_posix()})\n"
|
||||
_op = delta_stdev_op.relative_to(delta_stdev_op.parents[1]).as_posix()
|
||||
res += f"![plot_delta_stdev]({_op})\n"
|
||||
|
||||
######################## avg shift ########################
|
||||
res += "### avg dataset shift\n"
|
||||
|
||||
if "shift_table" in dr_modes:
|
||||
shift_on_train_tbl = _shift_data.groupby(level=0).mean()
|
||||
shift_on_train_tbl = _shift_data.groupby(level=0, sort=False).mean()
|
||||
shift_on_train_tbl.loc["avg", :] = _shift_data.mean()
|
||||
res += shift_on_train_tbl.to_html() + "\n\n"
|
||||
|
||||
|
@ -640,7 +709,8 @@ class DatasetReport:
|
|||
base_path=plot_path,
|
||||
save_fig=True,
|
||||
)
|
||||
res += f"![plot_shift]({shift_op.relative_to(shift_op.parents[1]).as_posix()})\n"
|
||||
_op = shift_op.relative_to(shift_op.parents[1]).as_posix()
|
||||
res += f"![plot_shift]({_op})\n"
|
||||
|
||||
return res
|
||||
|
||||
|
@ -669,7 +739,10 @@ class DatasetReportInfo:
|
|||
self.dr = dr
|
||||
self.name = str(path.parent)
|
||||
_data = dr.data()
|
||||
self.columns = list(_data.columns.unique(1))
|
||||
self.columns = defaultdict(list)
|
||||
for metric, estim in _data.columns:
|
||||
self.columns[estim].append(metric)
|
||||
# self.columns = list(_data.columns.unique(1))
|
||||
self.train_prevs = len(self.dr.crs)
|
||||
self.test_prevs = len(_data.index.unique(1))
|
||||
self.repeats = len(_data.index.unique(2))
|
||||
|
|
|
@ -9,7 +9,14 @@ from quapy.method.aggregative import BaseQuantifier
|
|||
from sklearn.base import BaseEstimator
|
||||
|
||||
import quacc.method.confidence as conf
|
||||
from quacc.data import ExtendedCollection, ExtendedData, ExtensionPolicy
|
||||
from quacc.data import (
|
||||
ExtBinPrev,
|
||||
ExtendedCollection,
|
||||
ExtendedData,
|
||||
ExtendedPrev,
|
||||
ExtensionPolicy,
|
||||
ExtMulPrev,
|
||||
)
|
||||
|
||||
|
||||
class BaseAccuracyEstimator(BaseQuantifier):
|
||||
|
@ -17,10 +24,11 @@ class BaseAccuracyEstimator(BaseQuantifier):
|
|||
self,
|
||||
classifier: BaseEstimator,
|
||||
quantifier: BaseQuantifier,
|
||||
dense=False,
|
||||
):
|
||||
self.__check_classifier(classifier)
|
||||
self.quantifier = quantifier
|
||||
self.extpol = ExtensionPolicy()
|
||||
self.extpol = ExtensionPolicy(dense=dense)
|
||||
|
||||
def __check_classifier(self, classifier):
|
||||
if not hasattr(classifier, "predict_proba"):
|
||||
|
@ -46,9 +54,13 @@ class BaseAccuracyEstimator(BaseQuantifier):
|
|||
...
|
||||
|
||||
@abstractmethod
|
||||
def estimate(self, instances, ext=False) -> np.ndarray:
|
||||
def estimate(self, instances, ext=False) -> ExtendedPrev:
|
||||
...
|
||||
|
||||
@property
|
||||
def dense(self):
|
||||
return self.extpol.dense
|
||||
|
||||
|
||||
class ConfidenceBasedAccuracyEstimator(BaseAccuracyEstimator):
|
||||
def __init__(
|
||||
|
@ -98,10 +110,21 @@ class ConfidenceBasedAccuracyEstimator(BaseAccuracyEstimator):
|
|||
|
||||
return np.concatenate([_conf_ext, _pred_ext], axis=1)
|
||||
|
||||
def extend(self, coll: LabelledCollection, pred_proba=None) -> ExtendedCollection:
|
||||
def extend(
|
||||
self, coll: LabelledCollection, pred_proba=None, prefit=False
|
||||
) -> ExtendedCollection:
|
||||
if pred_proba is None:
|
||||
pred_proba = self.classifier.predict_proba(coll.X)
|
||||
|
||||
if prefit:
|
||||
self._fit_confidence(coll.X, coll.y, pred_proba)
|
||||
else:
|
||||
if not hasattr(self, "confidence_metrics"):
|
||||
raise AttributeError(
|
||||
"Confidence metrics are not fit and cannot be computed."
|
||||
"Consider setting prefit to True."
|
||||
)
|
||||
|
||||
_ext = self.__get_ext(coll.X, pred_proba)
|
||||
return ExtendedCollection.from_lc(
|
||||
coll, pred_proba=pred_proba, ext=_ext, extpol=self.extpol
|
||||
|
@ -125,17 +148,23 @@ class MultiClassAccuracyEstimator(ConfidenceBasedAccuracyEstimator):
|
|||
quantifier: BaseQuantifier,
|
||||
confidence: str = None,
|
||||
collapse_false=False,
|
||||
group_false=False,
|
||||
dense=False,
|
||||
):
|
||||
super().__init__(
|
||||
classifier=classifier,
|
||||
quantifier=quantifier,
|
||||
confidence=confidence,
|
||||
)
|
||||
self.extpol = ExtensionPolicy(collapse_false=collapse_false)
|
||||
self.extpol = ExtensionPolicy(
|
||||
collapse_false=collapse_false,
|
||||
group_false=group_false,
|
||||
dense=dense,
|
||||
)
|
||||
self.e_train = None
|
||||
|
||||
def _get_pred_ext(self, pred_proba: np.ndarray):
|
||||
return np.argmax(pred_proba, axis=1, keepdims=True)
|
||||
# def _get_pred_ext(self, pred_proba: np.ndarray):
|
||||
# return np.argmax(pred_proba, axis=1, keepdims=True)
|
||||
|
||||
def fit(self, train: LabelledCollection):
|
||||
pred_proba = self.classifier.predict_proba(train.X)
|
||||
|
@ -148,31 +177,27 @@ class MultiClassAccuracyEstimator(ConfidenceBasedAccuracyEstimator):
|
|||
|
||||
def estimate(
|
||||
self, instances: ExtendedData | np.ndarray | sp.csr_matrix
|
||||
) -> np.ndarray:
|
||||
) -> ExtendedPrev:
|
||||
e_inst = instances
|
||||
if not isinstance(e_inst, ExtendedData):
|
||||
e_inst = self._extend_instances(instances)
|
||||
|
||||
estim_prev = self.quantifier.quantify(e_inst.X)
|
||||
estim_prev = self._check_prevalence_classes(
|
||||
estim_prev, self.quantifier.classes_
|
||||
return ExtMulPrev(
|
||||
estim_prev,
|
||||
e_inst.nbcl,
|
||||
q_classes=self.quantifier.classes_,
|
||||
extpol=self.extpol,
|
||||
)
|
||||
if self.extpol.collapse_false:
|
||||
estim_prev = np.insert(estim_prev, 2, 0.0)
|
||||
|
||||
return estim_prev
|
||||
|
||||
def _check_prevalence_classes(self, estim_prev, estim_classes) -> np.ndarray:
|
||||
true_classes = self.e_train.classes_
|
||||
for _cls in true_classes:
|
||||
if _cls not in estim_classes:
|
||||
estim_prev = np.insert(estim_prev, _cls, [0.0], axis=0)
|
||||
return estim_prev
|
||||
|
||||
@property
|
||||
def collapse_false(self):
|
||||
return self.extpol.collapse_false
|
||||
|
||||
@property
|
||||
def group_false(self):
|
||||
return self.extpol.group_false
|
||||
|
||||
|
||||
class BinaryQuantifierAccuracyEstimator(ConfidenceBasedAccuracyEstimator):
|
||||
def __init__(
|
||||
|
@ -180,6 +205,8 @@ class BinaryQuantifierAccuracyEstimator(ConfidenceBasedAccuracyEstimator):
|
|||
classifier: BaseEstimator,
|
||||
quantifier: BaseAccuracyEstimator,
|
||||
confidence: str = None,
|
||||
group_false: bool = False,
|
||||
dense: bool = False,
|
||||
):
|
||||
super().__init__(
|
||||
classifier=classifier,
|
||||
|
@ -187,6 +214,10 @@ class BinaryQuantifierAccuracyEstimator(ConfidenceBasedAccuracyEstimator):
|
|||
confidence=confidence,
|
||||
)
|
||||
self.quantifiers = []
|
||||
self.extpol = ExtensionPolicy(
|
||||
group_false=group_false,
|
||||
dense=dense,
|
||||
)
|
||||
|
||||
def fit(self, train: LabelledCollection | ExtendedCollection):
|
||||
pred_proba = self.classifier.predict_proba(train.X)
|
||||
|
@ -215,9 +246,14 @@ class BinaryQuantifierAccuracyEstimator(ConfidenceBasedAccuracyEstimator):
|
|||
norms = [s_i.shape[0] / len(e_inst) for s_i in s_inst]
|
||||
estim_prevs = self._quantify_helper(s_inst, norms)
|
||||
|
||||
# estim_prev = np.array([prev_row for prev_row in zip(*estim_prevs)]).flatten()
|
||||
estim_prev = np.concatenate(estim_prevs.T)
|
||||
return estim_prev
|
||||
# estim_prev = np.concatenate(estim_prevs.T)
|
||||
# return ExtendedPrev(estim_prev, e_inst.nbcl, extpol=self.extpol)
|
||||
return ExtBinPrev(
|
||||
estim_prevs,
|
||||
e_inst.nbcl,
|
||||
q_classes=[quant.classes_ for quant in self.quantifiers],
|
||||
extpol=self.extpol,
|
||||
)
|
||||
|
||||
def _quantify_helper(
|
||||
self,
|
||||
|
@ -229,9 +265,14 @@ class BinaryQuantifierAccuracyEstimator(ConfidenceBasedAccuracyEstimator):
|
|||
if inst.shape[0] > 0:
|
||||
estim_prevs.append(quant.quantify(inst) * norm)
|
||||
else:
|
||||
estim_prevs.append(np.asarray([0.0, 0.0]))
|
||||
estim_prevs.append(np.zeros((len(quant.classes_),)))
|
||||
|
||||
return np.array(estim_prevs)
|
||||
# return np.array(estim_prevs)
|
||||
return estim_prevs
|
||||
|
||||
@property
|
||||
def group_false(self):
|
||||
return self.extpol.group_false
|
||||
|
||||
|
||||
BAE = BaseAccuracyEstimator
|
||||
|
|
|
@ -16,7 +16,7 @@ from quapy.protocol import (
|
|||
import quacc as qc
|
||||
import quacc.error
|
||||
from quacc.data import ExtendedCollection
|
||||
from quacc.evaluation import evaluate
|
||||
from quacc.evaluation.evaluate import evaluate
|
||||
from quacc.logger import logger
|
||||
from quacc.method.base import (
|
||||
BaseAccuracyEstimator,
|
||||
|
@ -194,14 +194,16 @@ class GridSearchAE(BaseAccuracyEstimator):
|
|||
f"\tException: {e}",
|
||||
level=1,
|
||||
)
|
||||
raise e
|
||||
# raise e
|
||||
score = None
|
||||
|
||||
return params, score, model
|
||||
|
||||
def extend(self, coll: LabelledCollection, pred_proba=None) -> ExtendedCollection:
|
||||
def extend(
|
||||
self, coll: LabelledCollection, pred_proba=None, prefit=False
|
||||
) -> ExtendedCollection:
|
||||
assert hasattr(self, "best_model_"), "quantify called before fit"
|
||||
return self.best_model().extend(coll, pred_proba=pred_proba)
|
||||
return self.best_model().extend(coll, pred_proba=pred_proba, prefit=prefit)
|
||||
|
||||
def estimate(self, instances):
|
||||
"""Estimate class prevalence values using the best model found after calling the :meth:`fit` method.
|
||||
|
@ -392,6 +394,19 @@ class SpiderSearchAE(GridSearchAE):
|
|||
[(params, training) for params in _hyper],
|
||||
parallel=parallel,
|
||||
)
|
||||
|
||||
# if all scores are None, select a new random batch
|
||||
if all([s[1] is None for s in _iter_scores]):
|
||||
rand_index = np.arange(len(_hyper_remaining))
|
||||
np.random.shuffle(rand_index)
|
||||
rand_index = rand_index[:batch_size]
|
||||
remaining_index = np.setdiff1d(
|
||||
np.arange(len(_hyper_remaining)), rand_index
|
||||
)
|
||||
_hyper = _hyper_remaining[rand_index]
|
||||
_hyper_remaining = _hyper_remaining[remaining_index]
|
||||
continue
|
||||
|
||||
_sorted_idx = np.argsort(
|
||||
[1.0 if s is None else s for _, s, _ in _iter_scores]
|
||||
)
|
||||
|
|
|
@ -1,9 +1,11 @@
|
|||
from pathlib import Path
|
||||
from re import X
|
||||
|
||||
import matplotlib
|
||||
import matplotlib.pyplot as plt
|
||||
import numpy as np
|
||||
from cycler import cycler
|
||||
from sklearn import base
|
||||
|
||||
from quacc import utils
|
||||
from quacc.plot.base import BasePlot
|
||||
|
@ -48,10 +50,15 @@ class MplPlot(BasePlot):
|
|||
cm = plt.get_cmap("tab20")
|
||||
cy = cycler(color=[cm(i) for i in range(NUM_COLORS)])
|
||||
|
||||
base_prevs = base_prevs[:, pos_class]
|
||||
# base_prevs = base_prevs[:, pos_class]
|
||||
if isinstance(base_prevs[0], float):
|
||||
base_prevs = np.around([(1 - bp, bp) for bp in base_prevs], decimals=4)
|
||||
str_base_prevs = [str(tuple(bp)) for bp in base_prevs]
|
||||
# xticks = [str(bp) for bp in base_prevs]
|
||||
xticks = np.arange(len(base_prevs))
|
||||
for method, deltas, _cy in zip(columns, data, cy):
|
||||
ax.plot(
|
||||
base_prevs,
|
||||
xticks,
|
||||
deltas,
|
||||
label=method,
|
||||
color=_cy["color"],
|
||||
|
@ -67,7 +74,7 @@ class MplPlot(BasePlot):
|
|||
np.where(deltas != np.nan)[0],
|
||||
np.where(stdev != np.nan)[0],
|
||||
)
|
||||
_bps, _ds, _st = base_prevs[nn_idx], deltas[nn_idx], stdev[nn_idx]
|
||||
_bps, _ds, _st = xticks[nn_idx], deltas[nn_idx], stdev[nn_idx]
|
||||
ax.fill_between(
|
||||
_bps,
|
||||
_ds - _st,
|
||||
|
@ -76,6 +83,15 @@ class MplPlot(BasePlot):
|
|||
alpha=0.25,
|
||||
)
|
||||
|
||||
def format_fn(tick_val, tick_pos):
|
||||
if int(tick_val) in xticks:
|
||||
return str_base_prevs[int(tick_val)]
|
||||
|
||||
return ""
|
||||
|
||||
ax.xaxis.set_major_locator(plt.MaxNLocator(nbins=6, integer=True, prune="both"))
|
||||
ax.xaxis.set_major_formatter(format_fn)
|
||||
|
||||
ax.set(
|
||||
xlabel=f"{x_label} prevalence",
|
||||
ylabel=y_label,
|
||||
|
@ -187,7 +203,7 @@ class MplPlot(BasePlot):
|
|||
cm = plt.get_cmap("tab20")
|
||||
cy = cycler(color=[cm(i) for i in range(NUM_COLORS)])
|
||||
|
||||
shift_prevs = shift_prevs[:, pos_class]
|
||||
# shift_prevs = shift_prevs[:, pos_class]
|
||||
for method, shifts, _cy in zip(columns, data, cy):
|
||||
ax.plot(
|
||||
shift_prevs,
|
||||
|
|
|
@ -69,7 +69,9 @@ class PlotlyPlot(BasePlot):
|
|||
legend=True,
|
||||
) -> go.Figure:
|
||||
fig = go.Figure()
|
||||
x = base_prevs[:, pos_class]
|
||||
if isinstance(base_prevs[0], float):
|
||||
base_prevs = np.around([(1 - bp, bp) for bp in base_prevs], decimals=4)
|
||||
x = [str(tuple(bp)) for bp in base_prevs]
|
||||
line_colors = self.get_colors(len(columns))
|
||||
for name, delta in zip(columns, data):
|
||||
color = next(line_colors)
|
||||
|
@ -177,7 +179,8 @@ class PlotlyPlot(BasePlot):
|
|||
legend=True,
|
||||
) -> go.Figure:
|
||||
fig = go.Figure()
|
||||
x = shift_prevs[:, pos_class]
|
||||
# x = shift_prevs[:, pos_class]
|
||||
x = shift_prevs
|
||||
line_colors = self.get_colors(len(columns))
|
||||
for name, delta in zip(columns, data):
|
||||
col_idx = (columns == name).nonzero()[0][0]
|
||||
|
|
Loading…
Reference in New Issue