from collections import defaultdict
import itertools

import numpy as np
def artificial_prevalence_sampling(dimensions, n_prevalences=21, repeat=1, return_constrained_dim=False):
    # grid of equidistant prevalence values for each of the (dimensions-1) free dimensions
    s = np.linspace(0., 1., n_prevalences, endpoint=True)
    s = [s] * (dimensions - 1)
    # keep only the combinations lying within the probability simplex; note the strict float
    # comparison, which can drop combinations on the simplex boundary due to rounding
    prevs = [p for p in itertools.product(*s, repeat=1) if sum(p) <= 1]
    if return_constrained_dim:
        # complete each combination with the constrained dimension (1 minus the sum of the rest)
        prevs = [p + (1 - sum(p),) for p in prevs]
    prevs = np.asarray(prevs).reshape(len(prevs), -1)
    if repeat > 1:
        prevs = np.repeat(prevs, repeat, axis=0)
    return prevs
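# Illustrative usage (not part of the module): with 3 classes and a coarse grid of
# 3 prevalence values per dimension, the whole simplex is enumerated as follows.
# >>> artificial_prevalence_sampling(dimensions=3, n_prevalences=3, return_constrained_dim=True)
# array([[0. , 0. , 1. ],
#        [0. , 0.5, 0.5],
#        [0. , 1. , 0. ],
#        [0.5, 0. , 0.5],
#        [0.5, 0.5, 0. ],
#        [1. , 0. , 0. ]])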
def prevalence_linspace(n_prevalences=21, repeat=1, smooth_limits_epsilon=0.01):
    """
    Produces an array of uniformly separated prevalence values. By default, it produces 21 prevalences,
    with step 0.05 and with the limits smoothed, i.e.:
    [0.01, 0.05, 0.10, 0.15, ..., 0.90, 0.95, 0.99]

    :param n_prevalences: the number of prevalence values to sample from the [0,1] interval (default 21)
    :param repeat: number of times each prevalence is to be repeated (defaults to 1)
    :param smooth_limits_epsilon: the quantity to add to the limit 0 and subtract from the limit 1
    :return: an array of uniformly separated prevalence values
    """
    p = np.linspace(0., 1., num=n_prevalences, endpoint=True)
    p[0] += smooth_limits_epsilon
    p[-1] -= smooth_limits_epsilon
    if p[0] > p[1]:
        raise ValueError('the smoothing in the limits is greater than the prevalence step')
    if repeat > 1:
        p = np.repeat(p, repeat)
    return p
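# Illustrative usage (not part of the module): the limits are pulled inside (0,1) and,
# with repeat=2, each value appears twice in a row.
# >>> prevalence_linspace(n_prevalences=3, repeat=2, smooth_limits_epsilon=0.01)
# array([0.01, 0.01, 0.5 , 0.5 , 0.99, 0.99])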
def prevalence_from_labels(labels, n_classes):
    # count the occurrences of each label; classes not present in `labels` get a zero count
    unique, counts = np.unique(labels, return_counts=True)
    by_class = defaultdict(lambda: 0, dict(zip(unique, counts)))
    prevalences = np.asarray([by_class[ci] for ci in range(n_classes)], dtype=float)
    prevalences /= prevalences.sum()
    return prevalences
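# Illustrative usage (not part of the module): classes absent from `labels` receive
# prevalence 0 thanks to the defaultdict.
# >>> prevalence_from_labels(np.array([0, 0, 1, 3]), n_classes=4)
# array([0.5 , 0.25, 0.  , 0.25])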
def prevalence_from_probabilities(posteriors, binarize: bool = False):
    if binarize:
        # classify & count: turn posteriors into crisp predictions via argmax
        predictions = np.argmax(posteriors, axis=-1)
        return prevalence_from_labels(predictions, n_classes=posteriors.shape[1])
    else:
        # probabilistic classify & count: average the posterior probabilities
        prevalences = posteriors.mean(axis=0)
        prevalences /= prevalences.sum()
        return prevalences
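# Illustrative usage (not part of the module): the default estimate averages the
# posteriors column-wise, while binarize=True counts the argmax predictions instead.
# >>> P = np.array([[0.9, 0.1], [0.6, 0.4], [0.2, 0.8]])
# >>> prevalence_from_probabilities(P)
# array([0.56666667, 0.43333333])
# >>> prevalence_from_probabilities(P, binarize=True)
# array([0.66666667, 0.33333333])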
def strprev(prevalences, prec=3):
    # pretty-prints a prevalence vector with `prec` decimal digits per value
    return '[' + ', '.join([f'{p:.{prec}f}' for p in prevalences]) + ']'
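# Illustrative usage (not part of the module):
# >>> strprev([1/3, 2/3])
# '[0.333, 0.667]'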
def adjusted_quantification(prevalence_estim, tpr, fpr, clip=True):
    den = tpr - fpr
    if den == 0:
        den += 1e-8  # avoid division by zero when the classifier is non-informative (tpr == fpr)
    adjusted = (prevalence_estim - fpr) / den
    if clip:
        adjusted = np.clip(adjusted, 0., 1.)
    return adjusted
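# Illustrative usage (not part of the module): this applies the standard adjusted-count
# correction p = (p_estim - fpr) / (tpr - fpr), clipping estimates that fall outside [0,1].
# For instance, adjusted_quantification(0.375, tpr=0.75, fpr=0.25) gives 0.25, whereas
# adjusted_quantification(0.1, tpr=0.75, fpr=0.25) would be negative and is clipped to 0.0.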
def normalize_prevalence(prevalences):
    assert prevalences.ndim == 1, 'unexpected shape'
    accum = prevalences.sum()
    if accum > 0:
        return prevalences / accum
    else:
        # if all classifiers are trivial rejectors, fall back to the uniform distribution
        return np.ones_like(prevalences) / prevalences.size
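# Illustrative usage (not part of the module): a zero vector (all classifiers reject)
# falls back to the uniform distribution.
# >>> normalize_prevalence(np.array([2., 1., 1.]))
# array([0.5 , 0.25, 0.25])
# >>> normalize_prevalence(np.zeros(3))
# array([0.33333333, 0.33333333, 0.33333333])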