Compare commits
88 Commits
@@ -0,0 +1,3 @@
[submodule "result_table"]
path = result_table
url = gitea@gitea-s2i2s.isti.cnr.it:moreo/result_table.git

@@ -0,0 +1,40 @@
- Things to try:
  - Why not optimize the calibration of the classifier, instead of the classifier as a component of the quantifier?
  - init chain helps? [seems irrelevant in MAPLS...]
  - Aitchison kernel is better?
  - other classifiers?
  - optimize classifier?
  - use all datasets?
  - improve KDE on wine-quality?
  - Add other methods that natively provide uncertainty quantification?
    - Ratio estimator
    - Card & Smith
- MPIW (Mean Prediction Interval Width): the average of the interval amplitudes (without aggregating coverage in any way)
- Implement the Interval Score or Winkler Score (see the sketch after this list)
- analyze across shift
- add Bayesian EM:
  - https://github.com/ChangkunYe/MAPLS/blob/main/label_shift/mapls.py
  - take this opportunity to add RLLS:
    https://github.com/Angie-Liu/labelshift
    https://github.com/ChangkunYe/MAPLS/blob/main/label_shift/rlls.py
- add CIFAR10 and MNIST? Maybe also consider previously tested types of shift (tweak-one-out, etc.) from the RLLS paper?
  - https://github.com/Angie-Liu/labelshift/tree/master
  - https://github.com/Angie-Liu/labelshift/blob/master/cifar10_for_labelshift.py
  - Note: MNIST can be downloaded from https://archive.ics.uci.edu/dataset/683/mnist+database+of+handwritten+digits
  - There seem to be some pretrained models in:
    https://github.com/geifmany/cifar-vgg
    https://github.com/EN10/KerasMNIST
    https://github.com/tohinz/SVHN-Classifier
- consider prior knowledge in the experiments:
  - One scenario in which our prior is uninformative (i.e., uniform)
  - One scenario in which our prior is wrong (e.g., alpha-prior = (3,2,1), protocol-prior = (1,1,5))
  - One scenario in which our prior is very good (e.g., alpha-prior = (3,2,1), protocol-prior = (3,2,1))
  - Do all my baseline methods come with the option to inform a prior?
- consider different bandwidths within the Bayesian approach?
- how to improve the coverage (or how to increase the amplitude)?
  - Added temperature calibration; it improves things.
  - Is temperature calibration actually not equivalent to using a larger bandwidth in the kernels?
- consider W as a measure of quantification error (the current one, e.g., w-CI, is the Winkler...)
- optimize also C and class_weight? [I don't think so, but it could be done easily now]

- remove wikis from repo
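A minimal sketch of the two interval metrics mentioned above (MPIW and the Interval/Winkler Score), assuming per-class confidence intervals are given as arrays of lower and upper bounds and that alpha is the nominal miscoverage level; the function names are illustrative and not part of the codebase:

import numpy as np

def mpiw(lows, highs):
    # Mean Prediction Interval Width: the average amplitude of the intervals,
    # regardless of whether the true values are covered
    return np.mean(highs - lows)

def interval_score(lows, highs, true_vals, alpha=0.05):
    # Winkler/interval score: interval width plus a 2/alpha penalty proportional
    # to how far the true value falls outside the interval
    width = highs - lows
    below = (2.0 / alpha) * np.clip(lows - true_vals, 0, None)
    above = (2.0 / alpha) * np.clip(true_vals - highs, 0, None)
    return np.mean(width + below + above)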
@@ -0,0 +1,351 @@
from sklearn.base import BaseEstimator
import numpy as np
from sklearn.decomposition import PCA

from BayesianKDEy.commons import ILRtransformation, in_simplex
from quapy.method._kdey import KDEBase
from quapy.method.confidence import WithConfidenceABC, ConfidenceRegionABC
from quapy.functional import CLRtransformation
from quapy.method.aggregative import AggregativeSoftQuantifier
from tqdm import tqdm
import quapy.functional as F
import emcee
from collections.abc import Iterable
from numbers import Number

import jax
import jax.numpy as jnp
from jax.scipy.special import logsumexp
import numpyro
import numpyro.distributions as dist
from numpyro.infer import MCMC, NUTS


class BayesianKDEy(AggregativeSoftQuantifier, KDEBase, WithConfidenceABC):
    """
    Bayesian version of KDEy.

    :param classifier: a scikit-learn BaseEstimator, or None, in which case the classifier is taken to be
        the one indicated in `qp.environ['DEFAULT_CLS']`
    :param val_split: specifies the data used for generating classifier predictions. This specification
        can be made as a float in (0, 1), indicating the proportion of a stratified held-out validation set to
        be extracted from the training set; or as an integer (default 5), indicating that the predictions
        are to be generated in a `k`-fold cross-validation manner (with this integer indicating the value
        for `k`); or as a tuple `(X,y)` defining the specific set of data to use for validation. Set it to
        None when the method does not require any validation data, in order to avoid wasting a portion of
        the training data.
    :param num_warmup: number of warmup iterations for the MCMC sampler (default 500)
    :param num_samples: number of samples to draw from the posterior (default 1000)
    :param mcmc_seed: random seed for the MCMC sampler (default 0)
    :param confidence_level: float in [0,1] to construct a confidence region around the point estimate (default 0.95)
    :param region: string, set to `intervals` for constructing confidence intervals (default), or to
        `ellipse` for constructing an ellipse in the probability simplex, or to `ellipse-clr` for
        constructing an ellipse in the Centered Log-Ratio (CLR) unconstrained space.
    :param prior: an array-like with the alpha parameters of a Dirichlet prior, or the string 'uniform'
        for a uniform, uninformative prior (default)
    :param verbose: bool, whether to display a progress bar
    """
    def __init__(self,
                 classifier: BaseEstimator=None,
                 fit_classifier=True,
                 val_split: int = 5,
                 kernel='gaussian',
                 bandwidth=0.1,
                 num_warmup: int = 500,
                 num_samples: int = 1_000,
                 mcmc_seed: int = 0,
                 confidence_level: float = 0.95,
                 region: str = 'intervals',
                 explore='simplex',
                 step_size=0.05,
                 temperature=1.,
                 engine='numpyro',
                 prior='uniform',
                 reduce=None,
                 verbose: bool = False,
                 **kwargs):

        if num_warmup <= 0:
            raise ValueError(f'parameter {num_warmup=} must be a positive integer')
        if num_samples <= 0:
            raise ValueError(f'parameter {num_samples=} must be a positive integer')
        assert explore in ['simplex', 'clr', 'ilr'], \
            f'unexpected value for param {explore=}; valid ones are "simplex", "clr", and "ilr"'
        assert ((isinstance(prior, str) and prior == 'uniform') or
                (isinstance(prior, Iterable) and all(isinstance(v, Number) for v in prior))), \
            f'wrong type for {prior=}; expected "uniform" or an array-like of real values'
        # assert temperature>0., f'temperature must be >0'
        assert engine in ['rw-mh', 'emcee', 'numpyro']

        super().__init__(classifier, fit_classifier, val_split)
        assert all(k.startswith('classifier__') for k in kwargs.keys()), 'unexpected kwargs; must start with "classifier__"'
        self.classifier.set_params(**{k.replace('classifier__', ''): v for k, v in kwargs.items()})  # <- improve

        self.bandwidth = KDEBase._check_bandwidth(bandwidth, kernel)
        self.kernel = self._check_kernel(kernel)
        self.num_warmup = num_warmup
        self.num_samples = num_samples
        self.mcmc_seed = mcmc_seed
        self.confidence_level = confidence_level
        self.region = region
        self.explore = explore
        self.step_size = step_size
        self.temperature = temperature
        self.engine = engine
        self.prior = prior
        self.reduce = reduce
        self.verbose = verbose

    def aggregation_fit(self, classif_predictions, labels):
        if self.reduce is not None:
            self.pca = PCA(n_components=self.reduce)
            classif_predictions = self.pca.fit_transform(classif_predictions)
        self.mix_densities = self.get_mixture_components(classif_predictions, labels, self.classes_, self.bandwidth, self.kernel)
        return self

    def aggregate(self, classif_predictions: np.ndarray):
        if hasattr(self, 'pca'):
            classif_predictions = self.pca.transform(classif_predictions)

        if self.engine == 'rw-mh':
            if self.prior != 'uniform':
                raise RuntimeError('prior is not yet implemented in rw-mh')
            self.prevalence_samples = self._bayesian_kde(classif_predictions, init=None, verbose=self.verbose)
        elif self.engine == 'emcee':
            if self.prior != 'uniform':
                raise RuntimeError('prior is not yet implemented in emcee')
            self.prevalence_samples = self._bayesian_emcee(classif_predictions)
        elif self.engine == 'numpyro':
            self.ilr = ILRtransformation(jax_mode=True)
            self.prevalence_samples = self._bayesian_numpyro(classif_predictions)
        return self.prevalence_samples.mean(axis=0)

    def predict_conf(self, instances, confidence_level=None) -> (np.ndarray, ConfidenceRegionABC):
        if confidence_level is None:
            confidence_level = self.confidence_level
        classif_predictions = self.classify(instances)
        point_estimate = self.aggregate(classif_predictions)
        samples = self.prevalence_samples  # available after calling the "aggregate" function
        region = WithConfidenceABC.construct_region(samples, confidence_level=confidence_level, method=self.region)
        return point_estimate, region

    def _bayesian_kde(self, X_probs, init=None, verbose=False):
        """
        Bayes:
            P(prev|data) = P(data|prev) P(prev) / P(data)
        i.e.,
            posterior = likelihood * prior / evidence
        We assume the likelihood to be:
            prev @ [kde_i_likelihood(data) 1..i..n]
        and the prior to be uniform in the simplex.
        """

        rng = np.random.default_rng(self.mcmc_seed)
        kdes = self.mix_densities
        test_densities = np.asarray([self.pdf(kde_i, X_probs, self.kernel) for kde_i in kdes])

        def log_likelihood(prev, epsilon=1e-10):
            test_likelihoods = prev @ test_densities
            test_loglikelihood = np.log(test_likelihoods + epsilon)
            return (1./self.temperature) * np.sum(test_loglikelihood)

        # def log_prior(prev):
        #     todo: adapt to arbitrary prior knowledge (e.g., something around the training prevalence)
        #     return 1/np.sum((prev-init)**2)  # it is not 1, but we assume a uniform prior, so it is a useless constant anyway

        # def log_prior(prev, alpha_scale=1000):
        #     alpha = np.array(init) * alpha_scale
        #     return dirichlet.logpdf(prev, alpha)

        def log_prior(prev):
            return 0

        def sample_neighbour(prev, step_size):
            # random-walk Metropolis-Hastings proposal
            d = len(prev)
            neighbour = None
            if self.explore == 'simplex':
                dir_noise = rng.normal(scale=step_size/np.sqrt(d), size=d)
                neighbour = F.normalize_prevalence(prev + dir_noise, method='mapsimplex')
            elif self.explore == 'clr':
                clr = CLRtransformation()
                clr_point = clr(prev)
                dir_noise = rng.normal(scale=step_size, size=d)
                clr_neighbour = clr_point + dir_noise
                neighbour = clr.inverse(clr_neighbour)
                assert in_simplex(neighbour), 'wrong CLR transformation'
            elif self.explore == 'ilr':
                ilr = ILRtransformation()
                ilr_point = ilr(prev)
                dir_noise = rng.normal(scale=step_size, size=d-1)
                ilr_neighbour = ilr_point + dir_noise
                neighbour = ilr.inverse(ilr_neighbour)
                assert in_simplex(neighbour), 'wrong ILR transformation'
            return neighbour

        n_classes = X_probs.shape[1]
        current_prev = F.uniform_prevalence(n_classes) if init is None else init
        current_likelihood = log_likelihood(current_prev) + log_prior(current_prev)

        # Metropolis-Hastings with adaptive step size
        step_size = self.step_size
        target_acceptance = 0.3
        adapt_rate = 0.05
        acceptance_history = []

        samples = []
        total_steps = self.num_samples + self.num_warmup
        for i in tqdm(range(total_steps), total=total_steps, disable=not verbose):
            proposed_prev = sample_neighbour(current_prev, step_size)

            # log acceptance ratio
            proposed_likelihood = log_likelihood(proposed_prev) + log_prior(proposed_prev)
            acceptance = proposed_likelihood - current_likelihood

            # decide acceptance
            accepted = np.log(rng.random()) < acceptance
            if accepted:
                current_prev = proposed_prev
                current_likelihood = proposed_likelihood

            samples.append(current_prev)
            acceptance_history.append(1. if accepted else 0.)

            # if i < self.num_warmup and i%10==0 and len(acceptance_history)>=100:
            if i % 10 == 0 and len(acceptance_history) >= 100:
                recent_accept_rate = np.mean(acceptance_history[-100:])
                step_size *= np.exp(adapt_rate * (recent_accept_rate - target_acceptance))
                # step_size = float(np.clip(step_size, min_step, max_step))
                if i % 100 == 0:
                    print(f'acceptance-rate={recent_accept_rate*100:.3f}%, step-size={step_size:.5f}')

        # remove the initial "warmup" iterations
        samples = np.asarray(samples[self.num_warmup:])
        return samples

    def _bayesian_emcee(self, X_probs):
        ndim = X_probs.shape[1]
        nwalkers = 32
        if nwalkers < (ndim*2):
            nwalkers = ndim * 2 + 1

        f = CLRtransformation()

        def log_likelihood(unconstrained, test_densities, epsilon=1e-10):
            prev = f.inverse(unconstrained)
            test_likelihoods = prev @ test_densities
            test_loglikelihood = np.log(test_likelihoods + epsilon)
            return np.sum(test_loglikelihood)

        kdes = self.mix_densities
        test_densities = np.asarray([self.pdf(kde_i, X_probs, self.kernel) for kde_i in kdes])

        p0 = F.uniform_prevalence_sampling(ndim, nwalkers)
        p0 = f(p0)
        sampler = emcee.EnsembleSampler(nwalkers, ndim, log_likelihood, args=[test_densities])

        state = sampler.run_mcmc(p0, self.num_warmup, skip_initial_state_check=True)
        sampler.reset()
        sampler.run_mcmc(state, self.num_samples, skip_initial_state_check=True)
        samples = sampler.get_chain(flat=True)
        samples = f.inverse(samples)
        return samples

    def _bayesian_numpyro(self, X_probs):
        kdes = self.mix_densities
        test_log_densities = np.asarray(
            [self.pdf(kde_i, X_probs, self.kernel, log_densities=True) for kde_i in kdes]
        )

        n_classes = len(kdes)
        if isinstance(self.prior, str) and self.prior == 'uniform':
            alpha = [1.] * n_classes
        else:
            alpha = self.prior

        kernel = NUTS(self._numpyro_model)
        mcmc = MCMC(
            kernel,
            num_warmup=self.num_warmup,
            num_samples=self.num_samples,
            num_chains=1,
            progress_bar=self.verbose,
        )

        rng_key = jax.random.PRNGKey(self.mcmc_seed)
        mcmc.run(rng_key, test_log_densities=test_log_densities, alpha=alpha)

        samples_z = mcmc.get_samples()["z"]

        # map back to the simplex
        samples_prev = np.asarray(self.ilr.inverse(np.asarray(samples_z)))

        return samples_prev

    def _numpyro_model(self, test_log_densities, alpha):
        """
        test_log_densities: shape (n_classes, n_instances)
        """
        n_classes = test_log_densities.shape[0]

        # sample in the unconstrained space R^(n_classes-1)
        z = numpyro.sample(
            "z",
            dist.Normal(0.0, 1.0).expand([n_classes - 1])
        )

        prev = self.ilr.inverse(z)  # simplex, shape (n_classes,)

        # prior
        if alpha is not None:
            alpha = jnp.asarray(alpha)
            numpyro.factor(
                'log_prior', dist.Dirichlet(alpha).log_prob(prev)
            )
        # if alpha is None, then this corresponds to a weak logistic-normal prior

        # likelihood
        log_likelihood = jnp.sum(
            logsumexp(jnp.log(prev)[:, None] + test_log_densities, axis=0)
        )
        numpyro.factor(
            "loglik", (1.0 / self.temperature) * log_likelihood
        )
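A minimal usage sketch for BayesianKDEy as defined above, assuming a quapy LabelledCollection `train` and a test matrix `X_test` are available (both names are illustrative); it follows the fit/predict_conf API documented in the class:

from sklearn.linear_model import LogisticRegression

quantifier = BayesianKDEy(LogisticRegression(), kernel='gaussian', bandwidth=0.1,
                          engine='numpyro', region='intervals', confidence_level=0.95)
quantifier.fit(*train.Xy)                                  # fit on labelled training data
point_estimate, conf_region = quantifier.predict_conf(X_test)
# point_estimate: posterior mean prevalence vector
# conf_region: confidence region built from the posterior samples (here, per-class intervals)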
@@ -0,0 +1,322 @@
import jax.numpy as jnp
import numpy as np
import numpyro
import numpyro.distributions as dist
from numpyro.infer import MCMC, NUTS, HMC
import jax.random as random
from sklearn.base import BaseEstimator
from jax.scipy.special import logsumexp
from BayesianKDEy.commons import ILRtransformation
from quapy.method.aggregative import AggregativeSoftQuantifier
from quapy.method.confidence import WithConfidenceABC, ConfidenceRegionABC
import quapy.functional as F


class BayesianMAPLS(AggregativeSoftQuantifier, WithConfidenceABC):
    """
    Bayesian adaptation of MAPLS (Maximum A Posteriori Label Shift).

    :param classifier: a scikit-learn BaseEstimator, or None, in which case the classifier is taken to be
        the one indicated in `qp.environ['DEFAULT_CLS']`
    :param fit_classifier: whether to fit the classifier or to use it as given
    :param val_split: specifies the data used for generating classifier predictions (as in BayesianKDEy)
    :param exact_train_prev: set to True (default) for using the true training prevalence as the initial
        observation; set to False for computing the training prevalence as an estimate of it, i.e., as the
        expected value of the posterior probabilities of the training instances.
    :param num_samples: number of samples to draw from the posterior (default 1000)
    :param mcmc_seed: random seed for the MCMC sampler (default 0)
    :param confidence_level: float in [0,1] to construct a confidence region around the point estimate (default 0.95)
    :param region: string, set to `intervals` (default), `ellipse`, or `ellipse-clr`
    """

    def __init__(self,
                 classifier: BaseEstimator = None,
                 fit_classifier=True,
                 val_split: int = 5,
                 exact_train_prev=True,
                 num_warmup: int = 500,
                 num_samples: int = 1_000,
                 mcmc_seed: int = 0,
                 confidence_level: float = 0.95,
                 region: str = 'intervals',
                 temperature=1.,
                 prior='uniform',
                 mapls_chain_init=True,
                 verbose=False
                 ):

        if num_samples <= 0:
            raise ValueError(f'parameter {num_samples=} must be a positive integer')
        super().__init__(classifier, fit_classifier, val_split)
        self.exact_train_prev = exact_train_prev
        self.num_warmup = num_warmup
        self.num_samples = num_samples
        self.mcmc_seed = mcmc_seed
        self.confidence_level = confidence_level
        self.region = region
        self.temperature = temperature
        self.prior = prior
        self.mapls_chain_init = mapls_chain_init
        self.verbose = verbose

    def aggregation_fit(self, classif_predictions, labels):
        self.train_post = classif_predictions
        if self.exact_train_prev:
            self.train_prevalence = F.prevalence_from_labels(labels, classes=self.classes_)
        else:
            self.train_prevalence = F.prevalence_from_probabilities(classif_predictions)
        self.ilr = ILRtransformation(jax_mode=True)
        return self

    def aggregate(self, classif_predictions: np.ndarray):
        n_test, n_classes = classif_predictions.shape

        pi_star, lam = mapls(
            self.train_post,
            test_probs=classif_predictions,
            pz=self.train_prevalence,
            return_lambda=True
        )

        # pi_star: MAP estimate in the simplex, shape (n_classes,); convert it to ILR space
        z0 = self.ilr(pi_star)

        if self.prior == 'uniform':
            alpha = [1.] * n_classes
        elif self.prior == 'map':
            alpha_0 = alpha0_from_lamda(lam, n_test=n_test, n_classes=n_classes)
            alpha = [alpha_0] * n_classes
        elif self.prior == 'map2':
            lam2 = get_lamda(
                test_probs=classif_predictions,
                pz=self.train_prevalence,
                q_prior=pi_star,
                dvg=kl_div
            )
            alpha_0 = alpha0_from_lamda(lam2, n_test=n_test, n_classes=n_classes)
            alpha = [alpha_0] * n_classes
        else:
            alpha = self.prior

        kernel = NUTS(self.model)
        mcmc = MCMC(
            kernel,
            num_warmup=self.num_warmup,
            num_samples=self.num_samples,
            num_chains=1,
            progress_bar=self.verbose
        )

        mcmc.run(
            random.PRNGKey(self.mcmc_seed),
            test_posteriors=classif_predictions,
            alpha=alpha,
            init_params={"z": z0} if self.mapls_chain_init else None
        )

        samples = mcmc.get_samples()["z"]
        self.prevalence_samples = self.ilr.inverse(samples)
        return self.prevalence_samples.mean(axis=0)

    def predict_conf(self, instances, confidence_level=None) -> (np.ndarray, ConfidenceRegionABC):
        if confidence_level is None:
            confidence_level = self.confidence_level
        classif_predictions = self.classify(instances)
        point_estimate = self.aggregate(classif_predictions)
        samples = self.prevalence_samples  # available after calling the "aggregate" function
        region = WithConfidenceABC.construct_region(samples, confidence_level=confidence_level, method=self.region)
        return point_estimate, region

    def log_likelihood(self, test_classif, test_prev, train_prev):
        log_w = jnp.log(test_prev) - jnp.log(train_prev)
        return jnp.sum(
            logsumexp(jnp.log(test_classif) + log_w, axis=-1)
        )

    def model(self, test_posteriors, alpha):
        test_posteriors = jnp.array(test_posteriors)
        n_classes = test_posteriors.shape[1]

        # prior in ILR space
        z = numpyro.sample(
            "z",
            dist.Normal(jnp.zeros(n_classes-1), 1.0)
        )

        # back to the simplex
        prev = self.ilr.inverse(z)
        train_prev = jnp.array(self.train_prevalence)

        # prior
        alpha = jnp.array(alpha)
        numpyro.factor(
            "dirichlet_prior",
            dist.Dirichlet(alpha).log_prob(prev)
        )

        # likelihood
        numpyro.factor(
            "likelihood",
            (1.0 / self.temperature) * self.log_likelihood(test_posteriors, test_prev=prev, train_prev=train_prev)
        )


# adapted from https://github.com/ChangkunYe/MAPLS/blob/main/label_shift/mapls.py

def mapls(train_probs: np.ndarray,
          test_probs: np.ndarray,
          pz: np.ndarray,
          qy_mode: str = 'soft',
          max_iter: int = 100,
          init_mode: str = 'identical',
          lam: float = None,
          dvg_name='kl',
          return_lambda=False
          ):
    r"""
    Implementation of Maximum A Posteriori Label Shift (MAPLS),
    for estimating an unknown target label distribution.

    Given the source-domain posteriors P(Y_s=i|X_s=x) = f(x) and priors P(Y_s=i),
    estimate the target-domain priors P(Y_t=i) on the test set.
    """
    # Sanity check
    cls_num = len(pz)
    assert test_probs.shape[-1] == cls_num
    if type(max_iter) != int or max_iter < 0:
        raise Exception('max_iter should be a positive integer, not ' + str(max_iter))

    # Set up the d(p,q) divergence measure
    if dvg_name == 'kl':
        dvg = kl_div
    elif dvg_name == 'js':
        dvg = js_div
    else:
        raise Exception('Unsupported distribution distance measure; expected kl or js.')

    # Set the prior of the target label distribution
    q_prior = np.ones(cls_num) / cls_num
    # q_prior = pz.copy()

    # Lambda estimation
    if lam is None:
        lam = get_lamda(test_probs, pz, q_prior, dvg=dvg, max_iter=max_iter)

    # EM algorithm
    qz = mapls_EM(test_probs, pz, lam, q_prior, cls_num,
                  init_mode=init_mode, max_iter=max_iter, qy_mode=qy_mode)

    if return_lambda:
        return qz, lam
    else:
        return qz


def mapls_EM(probs, pz, lam, q_prior, cls_num, init_mode='identical', max_iter=100, qy_mode='soft'):
    # Normalize the source label distribution pz
    pz = np.array(pz) / np.sum(pz)
    # Initialize the target label distribution qz
    if init_mode == 'uniform':
        qz = np.ones(cls_num) / cls_num
    elif init_mode == 'identical':
        qz = pz.copy()
    else:
        raise ValueError('init_mode should be either "uniform" or "identical"')

    # Initialize w
    w = (np.array(qz) / np.array(pz))
    # EM algorithm with MAP estimation
    for i in range(max_iter):
        # E-step
        mapls_probs = normalized(probs * w, axis=-1, order=1)

        # M-step
        if qy_mode == 'hard':
            pred = np.argmax(mapls_probs, axis=-1)
            qz_new = np.bincount(pred.reshape(-1), minlength=cls_num)
        elif qy_mode == 'soft':
            qz_new = np.mean(mapls_probs, axis=0)
        # elif qy_mode == 'topk':
        #     qz_new = Topk_qy(mapls_probs, cls_num, topk_ratio=0.9, head=0)
        else:
            raise Exception('mapls mode should be either "soft" or "hard".')

        # Update w with the MAP estimate of the target label distribution qz
        # qz = (qz_new + alpha) / (N + np.sum(alpha))
        qz = lam * qz_new + (1 - lam) * q_prior
        qz /= qz.sum()
        w = qz / pz

    return qz


def get_lamda(test_probs, pz, q_prior, dvg, max_iter=50):
    K = len(pz)

    # MLLS estimation of the source- and target-domain label distributions
    qz_pred = mapls_EM(test_probs, pz, 1, 0, K, max_iter=max_iter)

    TU_div = dvg(qz_pred, q_prior)
    TS_div = dvg(qz_pred, pz)
    SU_div = dvg(pz, q_prior)

    SU_conf = 1 - lam_forward(SU_div, lam_inv(dpq=0.5, lam=0.2))
    TU_conf = lam_forward(TU_div, lam_inv(dpq=0.5, lam=SU_conf))
    TS_conf = lam_forward(TS_div, lam_inv(dpq=0.5, lam=SU_conf))

    confs = np.array([TU_conf, 1 - TS_conf])
    w = np.array([0.9, 0.1])
    lam = np.sum(w * confs)

    return lam


def lam_inv(dpq, lam):
    return (1 / (1 - lam) - 1) / dpq


def lam_forward(dpq, gamma):
    return gamma * dpq / (1 + gamma * dpq)


def kl_div(p, q, eps=1e-12):
    p = np.asarray(p, dtype=float)
    q = np.asarray(q, dtype=float)

    mask = p > 0
    return np.sum(p[mask] * np.log(p[mask] / (q[mask] + eps)))


def js_div(p, q):
    assert (np.abs(np.sum(p) - 1) < 1e-6) and (np.abs(np.sum(q) - 1) < 1e-6)
    m = (p + q) / 2
    return kl_div(p, m) / 2 + kl_div(q, m) / 2


def normalized(a, axis=-1, order=2):
    r"""
    Prediction normalization
    """
    l2 = np.atleast_1d(np.linalg.norm(a, order, axis))
    l2[l2 == 0] = 1
    return a / np.expand_dims(l2, axis)


def alpha0_from_lamda(lam, n_test, n_classes):
    return 1 + n_test*(1-lam) / (lam*n_classes)
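A small worked example of the prior-strength mapping alpha0_from_lamda defined above; the numbers are purely illustrative:

alpha0_from_lamda(lam=0.80, n_test=1000, n_classes=10)   # = 1 + 1000*0.20/(0.80*10) = 26.0
alpha0_from_lamda(lam=0.95, n_test=1000, n_classes=10)   # = 1 + 1000*0.05/(0.95*10) ≈ 6.26
# a larger lam (more weight on the EM estimate) thus yields a smaller alpha_0,
# i.e., a weaker symmetric Dirichlet prior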
@@ -0,0 +1,78 @@
import numpy as np
import matplotlib.pyplot as plt
from sklearn.neighbors import KernelDensity
import quapy.functional as F


# aitchison=True
aitchison = False
clr = F.CLRtransformation()

# h = 0.1
# dims = list(range(5, 100, 5))
dims = [10, 28, 100]

center_densities = []
vertex_densities = []

center_densities_scaled = []
vertex_densities_scaled = []


for n in dims:
    h0 = 0.4
    simplex_center = F.uniform_prevalence(n)
    simplex_vertex = np.asarray([.9] + [.1 / (n - 1)] * (n - 1), dtype=float)

    # KDE trained on a single point (the center)
    kde = KernelDensity(bandwidth=h0)
    X = simplex_center[None, :]
    if aitchison:
        X = clr(X)
    kde.fit(X)

    X = np.vstack([simplex_center, simplex_vertex])
    if aitchison:
        X = clr(X)
    density = np.exp(kde.score_samples(X))

    center_densities.append(density[0])
    vertex_densities.append(density[1])

    # same experiment, with the bandwidth scaled with the dimension
    h1 = h0 * np.sqrt(n / 2)
    # KDE trained on a single point (the center)
    kde = KernelDensity(bandwidth=h1)
    X = simplex_center[None, :]
    if aitchison:
        X = clr(X)
    kde.fit(X)

    X = np.vstack([simplex_center, simplex_vertex])
    if aitchison:
        X = clr(X)
    density = np.exp(kde.score_samples(X))

    center_densities_scaled.append(density[0])
    vertex_densities_scaled.append(density[1])

# Plot
plt.figure(figsize=(6*4, 4*4))
plt.plot(dims, center_densities, marker='o', label='Center of simplex')
plt.plot(dims, vertex_densities, marker='s', label='Vertex of simplex')
plt.plot(dims, center_densities_scaled, marker='o', label='Center of simplex (scaled)')
plt.plot(dims, vertex_densities_scaled, marker='s', label='Vertex of simplex (scaled)')

plt.xlabel('Number of classes (simplex dimension)')
# plt.ylim(min(center_densities+vertex_densities), max(center_densities+vertex_densities))
plt.ylabel('Kernel density')
plt.yscale('log')  # crucial to see anything meaningful
plt.title(f'KDE density vs dimension (bandwidth = {h0}) in {"Simplex" if not aitchison else "CLR-space"}')
plt.legend()
plt.grid(alpha=0.3)

plt.tight_layout()
plt.show()
@@ -0,0 +1,195 @@
import os
from functools import lru_cache
from pathlib import Path

from jax import numpy as jnp
from sklearn.base import BaseEstimator
from sklearn.decomposition import PCA

import quapy.functional as F

import quapy as qp
import numpy as np

from quapy.method.aggregative import KDEyML
from quapy.functional import ILRtransformation
from scipy.stats import entropy

RESULT_DIR = Path('results')


# utils
def experiment_path(dir: Path, dataset_name: str, method_name: str):
    os.makedirs(dir, exist_ok=True)
    return dir/f'{dataset_name}__{method_name}.pkl'


def normalized_entropy(p):
    """
    Normalized Shannon entropy in [0, 1]
    p: array-like, prevalence vector (sums to 1)
    """
    p = np.asarray(p)
    H = entropy(p)  # Shannon entropy
    H_max = np.log(len(p))
    return np.clip(H / H_max, 0, 1)


def antagonistic_prevalence(p, strength=1):
    ilr = ILRtransformation()
    z = ilr(p)
    z_ant = - strength * z
    p_ant = ilr.inverse(z_ant)
    return p_ant


"""
class KDEyScaledB(KDEyML):
    def __init__(self, classifier: BaseEstimator=None, fit_classifier=True, val_split=5, bandwidth=1., random_state=None):
        super().__init__(
            classifier=classifier, fit_classifier=fit_classifier, val_split=val_split, bandwidth=bandwidth,
            random_state=random_state, kernel='gaussian'
        )

    def aggregation_fit(self, classif_predictions, labels):
        if not hasattr(self, '_changed'):
            def scale_bandwidth(n_classes, beta=0.5):
                return self.bandwidth * np.power(n_classes, beta)
            n_classes = len(set(y))
            scaled = scale_bandwidth(n_classes)
            print(f'bandwidth scaling: {self.bandwidth:.4f} => {scaled:.4f}')
            self.bandwidth = scaled
            self._changed = True
        return super().aggregation_fit(classif_predictions, labels)
"""
class KDEyScaledB(KDEyML):
    def __init__(self, classifier: BaseEstimator=None, fit_classifier=True, val_split=5, bandwidth=1., random_state=None):
        super().__init__(
            classifier=classifier, fit_classifier=fit_classifier, val_split=val_split, bandwidth=bandwidth,
            random_state=random_state, kernel='gaussian'
        )

class KDEyFresh(KDEyML):
    def __init__(self, classifier: BaseEstimator=None, fit_classifier=True, val_split=5, bandwidth=1., random_state=None):
        super().__init__(
            classifier=classifier, fit_classifier=fit_classifier, val_split=val_split, bandwidth=bandwidth,
            random_state=random_state, kernel='gaussian'
        )

class KDEyReduce(KDEyML):
    def __init__(self, classifier: BaseEstimator=None, fit_classifier=True, val_split=5, bandwidth=1., n_components=10, random_state=None):
        super().__init__(
            classifier=classifier, fit_classifier=fit_classifier, val_split=val_split, bandwidth=bandwidth,
            random_state=random_state, kernel='gaussian'
        )
        self.n_components = n_components

    def aggregation_fit(self, classif_predictions, labels):
        self.pca = PCA(n_components=self.n_components)
        classif_predictions = self.pca.fit_transform(classif_predictions)
        return super().aggregation_fit(classif_predictions, labels)

    def aggregate(self, posteriors: np.ndarray):
        posteriors = self.pca.transform(posteriors)
        return super().aggregate(posteriors)


class KDEyCLR(KDEyML):
    def __init__(self, classifier: BaseEstimator=None, fit_classifier=True, val_split=5, bandwidth=1., random_state=None):
        super().__init__(
            classifier=classifier, fit_classifier=fit_classifier, val_split=val_split, bandwidth=bandwidth,
            random_state=random_state, kernel='aitchison'
        )

class KDEyCLR2(KDEyML):
    def __init__(self, classifier: BaseEstimator=None, fit_classifier=True, val_split=5, bandwidth=1., random_state=None):
        super().__init__(
            classifier=classifier, fit_classifier=fit_classifier, val_split=val_split, bandwidth=bandwidth,
            random_state=random_state, kernel='aitchison'
        )


class KDEyILR(KDEyML):
    def __init__(self, classifier: BaseEstimator=None, fit_classifier=True, val_split=5, bandwidth=1., random_state=None):
        super().__init__(
            classifier=classifier, fit_classifier=fit_classifier, val_split=val_split, bandwidth=bandwidth,
            random_state=random_state, kernel='ilr'
        )


class ILRtransformation(F.CompositionalTransformation):
    def __init__(self, jax_mode=False):
        self.jax_mode = jax_mode

    def array(self, X):
        if self.jax_mode:
            return jnp.array(X)
        else:
            return np.asarray(X)

    def __call__(self, X):
        X = self.array(X)
        X = qp.error.smooth(X, self.EPSILON)
        k = X.shape[-1]
        V = self.array(self.get_V(k))
        logp = jnp.log(X) if self.jax_mode else np.log(X)
        return logp @ V.T

    def inverse(self, Z):
        Z = self.array(Z)
        k_minus_1 = Z.shape[-1]
        k = k_minus_1 + 1
        V = self.array(self.get_V(k))
        logp = Z @ V
        p = jnp.exp(logp) if self.jax_mode else np.exp(logp)
        p = p / jnp.sum(p, axis=-1, keepdims=True) if self.jax_mode else p / np.sum(p, axis=-1, keepdims=True)
        return p

    @lru_cache(maxsize=None)
    def get_V(self, k):
        def helmert_matrix(k):
            """
            Returns the (k x k) Helmert matrix.
            """
            H = np.zeros((k, k))
            for i in range(1, k):
                H[i, :i] = 1
                H[i, i] = -(i)
                H[i] = H[i] / np.sqrt(i * (i + 1))
            # row 0 stays zeros; it will be discarded
            return H

        def ilr_basis(k):
            """
            Constructs an orthonormal ILR basis using the Helmert submatrix.
            Output shape: (k-1, k)
            """
            H = helmert_matrix(k)
            V = H[1:, :]  # remove the first row of zeros
            return V

        return ilr_basis(k)


def in_simplex(x, atol=1e-8):
    x = np.asarray(x)

    non_negative = np.all(x >= 0, axis=-1)
    sum_to_one = np.isclose(x.sum(axis=-1), 1.0, atol=atol)

    return non_negative & sum_to_one


class MockClassifierFromPosteriors(BaseEstimator):
    def __init__(self):
        pass

    def fit(self, X, y):
        self.classes_ = sorted(np.unique(y))
        return self

    def predict(self, X):
        return np.argmax(X, axis=1)

    def predict_proba(self, X):
        return X
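A minimal round-trip sketch for the ILRtransformation defined above, using a purely illustrative 3-class prevalence vector:

import numpy as np

ilr = ILRtransformation()
p = np.asarray([0.2, 0.3, 0.5])            # a point in the 3-class simplex
z = ilr(p)                                 # unconstrained coordinates in R^2
p_back = ilr.inverse(z)                    # back to the simplex
print(np.allclose(p, p_back, atol=1e-6))   # expected: True (up to the smoothing applied in __call__)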
@@ -0,0 +1,39 @@
Dataset: mnist
Model: basiccnn
Number of classes: 10
Train samples: 42000
Validation samples: 18000
Test samples: 10000
Validation accuracy: 0.9895
Test accuracy: 0.9901
Github repository: ....

Data Loading Instructions:
The extracted features, predictions, targets, and logits are saved in .npz files.
To load the data in Python:

import numpy as np

# Load training data
train_data = np.load('mnist_basiccnn_train_out.npz')
train_features = train_data['features']
train_predictions = train_data['predictions']
train_targets = train_data['targets']
train_logits = train_data['logits']

# Load validation data
val_data = np.load('mnist_basiccnn_val_out.npz')
val_features = val_data['features']
val_predictions = val_data['predictions']
val_targets = val_data['targets']
val_logits = val_data['logits']

# Load test data
test_data = np.load('mnist_basiccnn_test_out.npz')
test_features = test_data['features']
test_predictions = test_data['predictions']
test_targets = test_data['targets']
test_logits = test_data['logits']

Note: features are the output of the feature extractor (before the final classification layer),
predictions are softmax probabilities, targets are the true labels, and logits are the raw model outputs.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,345 @@
import inspect
from abc import ABC, abstractmethod
from functools import lru_cache

import numpy as np
from pathlib import Path
import quapy as qp

from commons import MockClassifierFromPosteriors
from quapy.protocol import UPP
from quapy.data import LabelledCollection
from quapy.method.aggregative import EMQ


def fetchVisual(modality, dataset, net, data_home='./data'):
    MODALITY = ('features', 'predictions', 'logits')
    assert modality in MODALITY, f'unknown modality, valid ones are {MODALITY}'

    file_prefix = f'{dataset}_{net}'
    data_home = Path(data_home) / file_prefix

    # Load training data
    train_data = np.load(data_home/f'{file_prefix}_train_out.npz')
    train_X = train_data[modality]
    train_y = train_data['targets']

    # Load validation data
    val_data = np.load(data_home/f'{file_prefix}_val_out.npz')
    val_X = val_data[modality]
    val_y = val_data['targets']

    # Load test data
    test_data = np.load(data_home/f'{file_prefix}_test_out.npz')
    test_X = test_data[modality]
    test_y = test_data['targets']

    train = LabelledCollection(train_X, train_y)
    val = LabelledCollection(val_X, val_y, classes=train.classes_)
    test = LabelledCollection(test_X, test_y, classes=train.classes_)

    def show_prev_stats(data: LabelledCollection):
        p = data.prevalence()
        return f'prevs in [{min(p)*100:.3f}%, {max(p)*100:.3f}%]'
    print(f'loaded {dataset} ({modality=}): '
          f'#train={len(train)}({show_prev_stats(train)}), '
          f'#val={len(val)}({show_prev_stats(val)}), '
          f'#test={len(test)}({show_prev_stats(test)}), '
          f'#features={train_X.shape[1]}, '
          f'#classes={len(set(train_y))}')

    return train, val, test


def fetchMNIST(modality, data_home='./data'):
    return fetchVisual(modality, dataset='mnist', net='basiccnn', data_home=data_home)

def fetchCIFAR100coarse(modality, data_home='./data'):
    return fetchVisual(modality, dataset='cifar100coarse', net='resnet18', data_home=data_home)

def fetchCIFAR100(modality, data_home='./data'):
    return fetchVisual(modality, dataset='cifar100', net='resnet18', data_home=data_home)

def fetchCIFAR10(modality, data_home='./data'):
    return fetchVisual(modality, dataset='cifar10', net='resnet18', data_home=data_home)

def fetchFashionMNIST(modality, data_home='./data'):
    return fetchVisual(modality, dataset='fashionmnist', net='basiccnn', data_home=data_home)

def fetchSVHN(modality, data_home='./data'):
    return fetchVisual(modality, dataset='svhn', net='resnet18', data_home=data_home)


class DatasetHandler(ABC):

    def __init__(self, name):
        self.name = name

    @classmethod
    def get_defaults(cls):
        sig = inspect.signature(cls.__init__)

        defaults = {
            name: param.default
            for name, param in sig.parameters.items()
            if param.default is not inspect.Parameter.empty
        }

        return defaults

    @abstractmethod
    def get_training(self): ...

    @abstractmethod
    def get_train_testprot_for_eval(self): ...

    @abstractmethod
    def get_train_valprot_for_modsel(self): ...

    @classmethod
    @abstractmethod
    def get_datasets(cls): ...

    @classmethod
    def iter(cls, **kwargs):
        for name in cls.get_datasets():
            yield cls(name, **kwargs)

    def __repr__(self):
        return f'{self.__class__.__name__}({self.name})'

    @classmethod
    @abstractmethod
    def is_binary(self): ...


class VisualDataHandler(DatasetHandler):

    def __init__(self, name, n_val_samples=100, n_test_samples=100, sample_size=500, random_state=0):
        # mode features    : feature-based; the idea is to learn a LogisticRegression on top
        # mode predictions : posterior probabilities
        # assert modality in ['features', 'predictions'], f'unknown {modality=}'
        super().__init__(name=name)
        modality = 'predictions'
        if name.endswith('-f'):
            modality = 'features'
        elif name.endswith('-l'):
            modality = 'logits'
        self.modality = modality
        self.n_val_samples = n_val_samples
        self.n_test_samples = n_test_samples
        self.sample_size = sample_size
        self.random_state = random_state

    @lru_cache(maxsize=None)
    def dataset(self):
        name = self.name.lower()
        name = name.replace('-f', '')
        name = name.replace('-l', '')

        if name == 'mnist':
            data = fetchMNIST(modality=self.modality)
        elif name == 'cifar100coarse':
            data = fetchCIFAR100coarse(modality=self.modality)
        elif name == 'cifar100':
            data = fetchCIFAR100(modality=self.modality)
        elif name == 'cifar10':
            data = fetchCIFAR10(modality=self.modality)
        elif name == 'fashionmnist':
            data = fetchFashionMNIST(modality=self.modality)
        elif name == 'svhn':
            data = fetchSVHN(modality=self.modality)
        else:
            raise ValueError(f'unknown dataset {name}')

        # the training set was used to extract features;
        # we use the validation portion as a training set for quantifiers
        net_train, val, test = data
        train, val = val.split_stratified(train_prop=0.6, random_state=self.random_state)
        return train, val, test

    def get_training(self):
        train, val, test = self.dataset()
        return train

    def get_validation(self):
        train, val, test = self.dataset()
        return val

    def get_train_testprot_for_eval(self):
        train, val, test = self.dataset()
        test_prot = UPP(test, sample_size=self.sample_size, repeats=self.n_test_samples, random_state=self.random_state)
        return train+val, test_prot

    def get_train_valprot_for_modsel(self):
        train, val, test = self.dataset()
        val_prot = UPP(val, sample_size=self.sample_size, repeats=self.n_val_samples, random_state=self.random_state)
        return train, val_prot

    @classmethod
    def get_datasets(cls):
        datasets = ['cifar100coarse', 'cifar10', 'mnist', 'fashionmnist', 'svhn']  # + ['cifar100']
        # datasets_feat = [f'{d}-f' for d in datasets]
        datasets_feat = [f'{d}-l' for d in datasets]
        return datasets_feat  # + datasets

    @classmethod
    def iter(cls, **kwargs):
        for name in cls.get_datasets():
            yield cls(name, **kwargs)

    def __repr__(self):
        return f'{self.name}'

    @classmethod
    def is_binary(self):
        return False


class CIFAR100Handler(VisualDataHandler):
    def __init__(self, name, n_val_samples=100, n_test_samples=100, sample_size=2000, random_state=0):
        super().__init__(name=name, n_val_samples=n_val_samples, n_test_samples=n_test_samples, sample_size=sample_size, random_state=random_state)

    @classmethod
    def get_datasets(cls):
        datasets = ['cifar100']
        # datasets_feat = [f'{d}-f' for d in datasets]
        datasets_feat = [f'{d}-l' for d in datasets]
        return datasets_feat  # + datasets


# LeQua multiclass tasks
class LeQuaHandler(DatasetHandler):

    DATASETS = ['LeQua2022', 'LeQua2024']

    def __init__(self, name):
        super().__init__(name)
        self.sample_size = 1000

    def get_training(self):
        return self.dataset()[0]

    def get_train_testprot_for_eval(self):
        training, _, test_generator = self.dataset()
        return training, test_generator

    def get_train_valprot_for_modsel(self):
        training, val_generator, _ = self.dataset()
        return training, val_generator

    @lru_cache(maxsize=None)
    def dataset(self):
        if self.name == 'LeQua2022':
            return qp.datasets.fetch_lequa2022(task='T1B')
        elif self.name == 'LeQua2024':
            return qp.datasets.fetch_lequa2024(task='T2')
        else:
            raise ValueError(f'unexpected dataset name {self.name}; valid ones are {self.DATASETS}')

    def __repr__(self):
        return self.name

    @classmethod
    def iter(cls):
        for name in cls.DATASETS:
            yield cls(name)

    @classmethod
    def is_binary(self):
        return False

    @classmethod
    def get_datasets(cls):
        return cls.DATASETS


class UCIDatasetHandler(DatasetHandler, ABC):
    DATASETS = []

    def __init__(self, name, n_val_samples, n_test_samples, sample_size, random_state):
        super().__init__(name=name)
        self.sample_size = sample_size
        self.n_val_samples = n_val_samples
        self.n_test_samples = n_test_samples
        self.random_state = random_state

    def get_training(self):
        return self.dataset().training

    def get_train_testprot_for_eval(self):
        training, test = self.dataset().train_test
        test_generator = qp.protocol.UPP(test, sample_size=self.sample_size, repeats=self.n_test_samples,
                                         random_state=self.random_state)
        return training, test_generator

    def get_train_valprot_for_modsel(self):
        training = self.dataset().training
        training, val = training.split_stratified(train_prop=0.6, random_state=self.random_state)
        val_generator = qp.protocol.UPP(val, sample_size=self.sample_size, repeats=self.n_val_samples,
                                        random_state=self.random_state)
        return training, val_generator

    @classmethod
    def get_datasets(cls):
        return cls.DATASETS

    @classmethod
    def iter(cls):
        for name in cls.DATASETS:
            yield cls(name)


class UCIMulticlassHandler(UCIDatasetHandler):

    DATASETS = sorted([d for d in qp.datasets.UCI_MULTICLASS_DATASETS if d not in frozenset(['hcv', 'poker_hand'])])

    def __init__(self, name, n_val_samples=100, n_test_samples=100, sample_size=1000, random_state=0):
        super().__init__(name, n_val_samples, n_test_samples, sample_size, random_state)

    @lru_cache(maxsize=None)
    def dataset(self):
        return qp.datasets.fetch_UCIMulticlassDataset(self.name, min_class_support=0.01)

    def __repr__(self):
        return f'{self.name}(UCI-multiclass)'

    @classmethod
    def is_binary(self):
        return False


class UCIBinaryHandler(UCIDatasetHandler):

    DATASETS = qp.datasets.UCI_BINARY_DATASETS.copy()

    def __init__(self, name, n_val_samples=100, n_test_samples=100, sample_size=500, random_state=0):
        super().__init__(name, n_val_samples, n_test_samples, sample_size, random_state)

    @lru_cache(maxsize=None)
    def dataset(self):
        return qp.datasets.fetch_UCIBinaryDataset(self.name)

    def __repr__(self):
        return f'{self.name}(UCI-binary)'

    @classmethod
    def is_binary(self):
        return True


if __name__ == '__main__':
    train, val, test = fetchMNIST(modality='predictions')
    cls = MockClassifierFromPosteriors()
    cls.fit(*train.Xy)
    # q = KDEyML(classifier=cls, fit_classifier=False)
    # q = PACC(classifier=cls, fit_classifier=False)
    q = EMQ(classifier=cls, fit_classifier=False)
    q.fit(*val.Xy)
    test_prot = UPP(test, repeats=100, sample_size=500)
    report = qp.evaluation.evaluation_report(q, test_prot, verbose=True)
    print(report.mean(numeric_only=True))
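A minimal sketch of how the handler interface defined above might be iterated to obtain training data and an evaluation protocol (it assumes the corresponding quapy datasets can be fetched locally; the loop body is illustrative):

for handler in UCIMulticlassHandler.iter():
    training, test_prot = handler.get_train_testprot_for_eval()
    print(f'{handler}: {len(training)} training instances, '
          f'protocol with sample_size={handler.sample_size} and {handler.n_test_samples} test samples')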
@@ -0,0 +1,9 @@
import quapy as qp

from quapy.error import mae
import quapy.functional as F

for n in range(2,100,5):
    a = F.uniform_prevalence_sampling(n_classes=n, size=10_000)
    b = F.uniform_prevalence_sampling(n_classes=n, size=10_000)
    print(f'{n=} ae={mae(a, b):.5f}')
@@ -0,0 +1,230 @@
from pathlib import Path

from sklearn.linear_model import LogisticRegression
from copy import deepcopy as cp
import quapy as qp
from BayesianKDEy.commons import KDEyReduce
from BayesianKDEy.methods import get_experimental_methods, MethodDescriptor
from _bayeisan_kdey import BayesianKDEy
from _bayesian_mapls import BayesianMAPLS
from commons import experiment_path, KDEyCLR, RESULT_DIR, MockClassifierFromPosteriors, KDEyScaledB, KDEyFresh
# import datasets
from datasets import LeQuaHandler, UCIMulticlassHandler, DatasetHandler, VisualDataHandler, CIFAR100Handler
from temperature_calibration import temp_calibration
from quapy.data import LabelledCollection
from quapy.method.aggregative import DistributionMatchingY as DMy, AggregativeQuantifier, EMQ, CC
from quapy.model_selection import GridSearchQ
from quapy.data import Dataset
from quapy.method.confidence import BayesianCC, AggregativeBootstrap
from quapy.method.aggregative import KDEyML, ACC
from quapy.protocol import UPP
import numpy as np
from tqdm import tqdm
from collections import defaultdict
from time import time


def methods___depr():
    """
    Returns a tuple (name, quantifier, hyperparams, bayesian/bootstrap_constructor), where:
    - name: a str representing the name of the method (e.g., 'BayesianKDEy')
    - quantifier: the base model (e.g., KDEyML())
    - hyperparams: a dictionary of hyperparameters for the quantifier (e.g., {'bandwidth': [0.001, 0.005, 0.01, 0.05, 0.1, 0.2]})
    - bayesian/bootstrap_constructor: a function that instantiates the bayesian or bootstrap method with the
        quantifier with optimized hyperparameters
    """

    Cls = LogisticRegression
    cls_hyper = {'classifier__C': np.logspace(-4, 4, 9), 'classifier__class_weight': ['balanced', None]}
    val_split = 5  # k-fold cross-validation
    cc_hyper = cls_hyper
    acc_hyper = cls_hyper
    # emq_hyper = {'calib': ['nbvs', 'bcts', 'ts', 'vs'], **cls_hyper}
    hdy_hyper = {'nbins': [3, 4, 5, 8, 16, 32], **cls_hyper}
    kdey_hyper = {'bandwidth': np.logspace(-3, -1, 10), **cls_hyper}
    kdey_hyper_clr = {'bandwidth': np.logspace(-2, 2, 10), **cls_hyper}
    band = {'bandwidth': np.logspace(-3, -1, 10)}
    multiclass_method = 'multiclass'
    only_binary = 'only_binary'
    only_multiclass = 'only_multiclass'

    # surrogate quantifiers
    cc = CC(Cls())
    acc = ACC(Cls(), val_split=val_split)
    hdy = DMy(Cls(), val_split=val_split)
    kde_gau = KDEyML(Cls(), val_split=val_split)
    kde_gau_scale = KDEyScaledB(Cls(), val_split=val_split)
    kde_gau_pca = KDEyReduce(Cls(), val_split=val_split, n_components=5)
    kde_gau_pca10 = KDEyReduce(Cls(), val_split=val_split, n_components=10)
    kde_ait = KDEyCLR(Cls(), val_split=val_split)
    emq = EMQ(Cls(), exact_train_prev=False, val_split=val_split)

    # Bootstrap approaches:
    # --------------------------------------------------------------------------------------------------------
    # yield 'BootstrapCC', cc, cc_hyper, lambda hyper: AggregativeBootstrap(CC(Cls()), n_test_samples=1000, random_state=0), multiclass_method
    # yield 'BootstrapACC', acc, acc_hyper, lambda hyper: _AggregativeBootstrap(ACC(Cls()), n_test_samples=1000, random_state=0), multiclass_method
    # yield 'BootstrapEMQ', emq, on_calib_error='backup', val_split=5), emq_hyper, lambda hyper: _AggregativeBootstrap(EMQ(Cls(), on_calib_error='backup', calib=hyper['calib'], val_split=5), n_test_samples=1000, random_state=0), multiclass_method
    # yield 'BootstrapHDy', hdy, hdy_hyper, lambda hyper: _AggregativeBootstrap(DMy(Cls(), **hyper), n_test_samples=1000, random_state=0), multiclass_method
    # yield 'BootstrapKDEy', kde_gau, kdey_hyper, lambda hyper: _AggregativeBootstrap(KDEyML(Cls(), **hyper), n_test_samples=1000, random_state=0, verbose=True), multiclass_method

    # Bayesian approaches: (* = temperature calibration with auto threshold and coverage similar to nominal; + = temperature calibration w/o amplitude coverage, for the Winkler criterion; ! = same but alpha=0.005 for Winkler)
    # --------------------------------------------------------------------------------------------------------
    # yield 'BayesianACC', acc, acc_hyper, lambda hyper: BayesianCC(Cls(), val_split=val_split, mcmc_seed=0), multiclass_method
    # yield 'BayesianACC*', acc, acc_hyper, lambda hyper: BayesianCC(Cls(), val_split=val_split, temperature=None, mcmc_seed=0), multiclass_method
    # yield 'BayesianACC+', acc, acc_hyper, lambda hyper: BayesianCC(Cls(), val_split=val_split, temperature=None, mcmc_seed=0), multiclass_method
    # yield 'BayesianACC!', acc, acc_hyper, lambda hyper: BayesianCC(Cls(), val_split=val_split, temperature=None, mcmc_seed=0), multiclass_method
    # yield 'BayesianHDy', hdy, hdy_hyper, lambda hyper: PQ(Cls(), val_split=val_split, stan_seed=0, **hyper), only_binary
    # yield f'BaKDE-Ait-numpyro', kde_ait, kdey_hyper_clr, lambda hyper: BayesianKDEy(Cls(), kernel='aitchison', mcmc_seed=0, engine='numpyro', val_split=val_split, **hyper), multiclass_method
    # yield f'BaKDE-Gau-numpyro', kde_gau, kdey_hyper, lambda hyper: BayesianKDEy(Cls(), kernel='gaussian', mcmc_seed=0, engine='numpyro', val_split=val_split, **hyper), multiclass_method
    # yield f'BaKDE-Gau-scale', kde_gau_scale, kdey_hyper, lambda hyper: BayesianKDEy(Cls(), kernel='gaussian', mcmc_seed=0, engine='numpyro', val_split=val_split, **hyper), multiclass_method
    # yield f'BaKDE-Gau-pca5', kde_gau_pca, band, lambda hyper: BayesianKDEy(Cls(), reduce=5, kernel='gaussian', mcmc_seed=0, engine='numpyro', val_split=val_split, **hyper), multiclass_method
    # yield f'BaKDE-Gau-pca5*', kde_gau_pca, band, lambda hyper: BayesianKDEy(Cls(), reduce=5, temperature=None, kernel='gaussian', mcmc_seed=0, engine='numpyro', val_split=val_split, **hyper), multiclass_method
    # yield f'BaKDE-Gau-pca10', kde_gau_pca10, band, lambda hyper: BayesianKDEy(Cls(), reduce=10, kernel='gaussian', mcmc_seed=0, engine='numpyro', val_split=val_split, **hyper), multiclass_method
    # yield f'BaKDE-Gau-pca10*', kde_gau_pca10, band, lambda hyper: BayesianKDEy(Cls(), reduce=10, temperature=None, kernel='gaussian', mcmc_seed=0, engine='numpyro', val_split=val_split, **hyper), multiclass_method
    # yield f'BaKDE-Gau-H0', KDEyFresh(Cls(), bandwidth=0.4), cls_hyper, lambda hyper: BayesianKDEy(Cls(), bandwidth=0.4, kernel='gaussian', mcmc_seed=0, engine='numpyro', **hyper), multiclass_method
    # yield f'BaKDE-Gau-H1', KDEyFresh(Cls(), bandwidth=1.), cls_hyper, lambda hyper: BayesianKDEy(Cls(), bandwidth=1., kernel='gaussian', mcmc_seed=0, engine='numpyro', **hyper), multiclass_method
    # yield f'BaKDE-Gau-H2', KDEyFresh(Cls(), bandwidth=1.5), cls_hyper, lambda hyper: BayesianKDEy(Cls(), bandwidth=1.5, kernel='gaussian', mcmc_seed=0, engine='numpyro', **hyper), multiclass_method
    # yield f'BaKDE-Ait-T*', kde_ait, kdey_hyper_clr, lambda hyper: BayesianKDEy(Cls(), kernel='aitchison', mcmc_seed=0, engine='numpyro', temperature=None, val_split=val_split, **hyper), multiclass_method
    # yield f'BaKDE-Ait-T!', kde_ait, kdey_hyper_clr, lambda hyper: BayesianKDEy(Cls(), kernel='aitchison', mcmc_seed=0, engine='numpyro', temperature=None, val_split=val_split, **hyper), multiclass_method
    # yield f'BaKDE-Gau-T*', kde_gau, kdey_hyper, lambda hyper: BayesianKDEy(Cls(), kernel='gaussian', mcmc_seed=0, engine='numpyro', temperature=None, val_split=val_split, **hyper), multiclass_method
    # yield f'BaKDE-Gau-T!', kde_gau, kdey_hyper, lambda hyper: BayesianKDEy(Cls(), kernel='gaussian', mcmc_seed=0, engine='numpyro', temperature=None, val_split=val_split, **hyper), multiclass_method
    # yield 'BayEMQ', emq, acc_hyper, lambda hyper: BayesianMAPLS(Cls(), prior='uniform', temperature=1, exact_train_prev=False, val_split=val_split), multiclass_method
    # yield 'BayEMQ*', emq, acc_hyper, lambda hyper: BayesianMAPLS(Cls(), prior='uniform', temperature=None, exact_train_prev=False, val_split=val_split), multiclass_method
    # yield 'BayEMQ!', emq, acc_hyper, lambda hyper: BayesianMAPLS(Cls(), prior='uniform', temperature=None, exact_train_prev=False, val_split=val_split), multiclass_method
|
||||
# yield 'BaEMQ', emq, acc_hyper, lambda hyper: BayesianMAPLS(Cls(**{k.replace('classifier__', ''): v for k, v in hyper.items()}), prior='uniform', temperature=1, exact_train_prev=False, val_split=val_split), multiclass_method
|
||||
|
||||
# yield 'BaACC!', acc, acc_hyper, lambda hyper: BayesianCC(Cls(**{k.replace('classifier__', ''): v for k, v in hyper.items()}), temperature=None, mcmc_seed=0), multiclass_method
|
||||
# yield 'BaEMQ!', emq, acc_hyper, lambda hyper: BayesianMAPLS(Cls(**{k.replace('classifier__', ''): v for k, v in hyper.items()}), prior='uniform', temperature=None, exact_train_prev=False), multiclass_method
|
||||
|
||||
|
||||
def model_selection(dataset: DatasetHandler, point_quantifier: AggregativeQuantifier, grid: dict):
|
||||
with qp.util.temp_seed(0):
|
||||
point_quantifier = cp(point_quantifier)
|
||||
print(f'performing model selection for {point_quantifier.__class__.__name__} with grid {grid}')
|
||||
# model selection
|
||||
if len(grid)>0:
|
||||
train, val_prot = dataset.get_train_valprot_for_modsel()
|
||||
mod_sel = GridSearchQ(
|
||||
model=point_quantifier,
|
||||
param_grid=grid,
|
||||
protocol=val_prot,
|
||||
refit=False,
|
||||
n_jobs=-1,
|
||||
verbose=True
|
||||
).fit(*train.Xy)
|
||||
best_params = mod_sel.best_params_
|
||||
else:
|
||||
best_params = {}
|
||||
|
||||
return best_params
|
||||
|
||||
|
||||
def temperature_calibration(dataset: DatasetHandler, uncertainty_quantifier):
|
||||
temperature = None
|
||||
if hasattr(uncertainty_quantifier, 'temperature'):
|
||||
if uncertainty_quantifier.temperature is None:
|
||||
print('calibrating temperature')
|
||||
train, val_prot = dataset.get_train_valprot_for_modsel()
|
||||
temp_grid=[1., .5, 1.5, 2., 5., 10., 100., 1000.]
|
||||
temperature = temp_calibration(uncertainty_quantifier, train, val_prot, temp_grid=temp_grid, n_jobs=-1, amplitude_threshold=1., criterion='winkler')
|
||||
uncertainty_quantifier.temperature = temperature
|
||||
else:
|
||||
temperature = uncertainty_quantifier.temperature
|
||||
return temperature
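# Minimal sketch (illustrative, not the library internals) of the transform that the
# temperature grid above is assumed to tune: temperature scaling of the classifier
# posteriors, p_T = p**(1/T) renormalized. T > 1 flattens the posteriors (and tends to
# widen the credible regions), T < 1 sharpens them; how T is applied internally is up
# to each uncertainty-aware quantifier.
def _temperature_scale_sketch(posteriors, T):
    import numpy as np
    scaled = np.power(np.asarray(posteriors, dtype=float), 1.0 / T)
    return scaled / scaled.sum(axis=-1, keepdims=True)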
|
||||
|
||||
|
||||
def experiment(dataset: DatasetHandler, method: MethodDescriptor, hyper_choice_path: Path):
|
||||
|
||||
with qp.util.temp_seed(0):
|
||||
|
||||
# model selection
|
||||
best_hyperparams = qp.util.pickled_resource(
|
||||
hyper_choice_path, model_selection, dataset, method.surrogate_quantifier(), method.hyper_parameters
|
||||
)
|
||||
print(f'{best_hyperparams=}')
|
||||
|
||||
t_init = time()
|
||||
uncertainty_quantifier = method.uncertainty_aware_quantifier(best_hyperparams)
|
||||
temperature = temperature_calibration(dataset, uncertainty_quantifier)
|
||||
training, test_generator = dataset.get_train_testprot_for_eval()
|
||||
uncertainty_quantifier.fit(*training.Xy)
|
||||
tr_time = time() - t_init
|
||||
|
||||
# test
|
||||
train_prevalence = training.prevalence()
|
||||
results = defaultdict(list)
|
||||
pbar = tqdm(enumerate(test_generator()), total=test_generator.total())
|
||||
for i, (sample_X, true_prevalence) in pbar:
|
||||
t_init = time()
|
||||
point_estimate, region = uncertainty_quantifier.predict_conf(sample_X)
|
||||
ttime = time()-t_init
|
||||
|
||||
results['true-prevs'].append(true_prevalence)
|
||||
results['point-estim'].append(point_estimate)
|
||||
results['shift'].append(qp.error.ae(true_prevalence, train_prevalence))
|
||||
results['ae'].append(qp.error.ae(prevs_true=true_prevalence, prevs_hat=point_estimate))
|
||||
results['rae'].append(qp.error.rae(prevs_true=true_prevalence, prevs_hat=point_estimate))
|
||||
results['sre'].append(qp.error.sre(prevs_true=true_prevalence, prevs_hat=point_estimate, prevs_train=train_prevalence))
|
||||
results['coverage'].append(region.coverage(true_prevalence))
|
||||
results['amplitude'].append(region.montecarlo_proportion(n_trials=50_000))
|
||||
results['test-time'].append(ttime)
|
||||
results['samples'].append(region.samples)
|
||||
|
||||
pbar.set_description(
|
||||
f'{method.name} '
|
||||
f'MAE={np.mean(results["ae"]):.5f} '
|
||||
f'W={np.mean(results["sre"]):.5f} '
|
||||
f'Cov={np.mean(results["coverage"]):.5f} '
|
||||
f'AMP={np.mean(results["amplitude"]):.5f}'
|
||||
)
|
||||
|
||||
report = {
|
||||
'optim_hyper': best_hyperparams,
|
||||
'train_time': tr_time,
|
||||
'train-prev': train_prevalence,
|
||||
'results': {k:np.asarray(v) for k,v in results.items()},
|
||||
'temperature': temperature
|
||||
}
|
||||
|
||||
return report
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
|
||||
result_dir = RESULT_DIR
|
||||
|
||||
for data_handler in [LeQuaHandler]:#, UCIMulticlassHandler, LeQuaHandler, VisualDataHandler, CIFAR100Handler]:
|
||||
for dataset in data_handler.iter():
|
||||
qp.environ['SAMPLE_SIZE'] = dataset.sample_size
|
||||
print(f'dataset={dataset.name}')
|
||||
|
||||
problem_type = 'binary' if dataset.is_binary() else 'multiclass'
|
||||
|
||||
for method in get_experimental_methods():
|
||||
# skip combination?
|
||||
if method.binary_only() and not dataset.is_binary():
|
||||
continue
|
||||
|
||||
result_path = experiment_path(result_dir / problem_type, dataset.name, method.name)
|
||||
hyper_path = experiment_path(result_dir / 'hyperparams' / problem_type, dataset.name, method.surrogate_quantifier_name())
|
||||
|
||||
report = qp.util.pickled_resource(
|
||||
result_path, experiment, dataset, method, hyper_path
|
||||
)
|
||||
|
||||
print(f'dataset={dataset.name}, '
|
||||
f'method={method.name}: '
|
||||
f'mae={report["results"]["ae"].mean():.5f}, '
|
||||
f'W={report["results"]["sre"].mean():.5f}, '
|
||||
f'coverage={report["results"]["coverage"].mean():.5f}, '
|
||||
f'amplitude={report["results"]["amplitude"].mean():.5f}, ')
|
||||
|
||||
|
||||
|
||||
|
||||
|
|
@ -0,0 +1,407 @@
|
|||
import pickle
|
||||
from collections import defaultdict
|
||||
import matplotlib.pyplot as plt
|
||||
import seaborn as sns
|
||||
from joblib import Parallel, delayed
|
||||
from tqdm import tqdm
|
||||
import pandas as pd
|
||||
from glob import glob
|
||||
from pathlib import Path
|
||||
import quapy as qp
|
||||
from BayesianKDEy.commons import RESULT_DIR
|
||||
from BayesianKDEy.datasets import LeQuaHandler, UCIMulticlassHandler, VisualDataHandler, CIFAR100Handler
|
||||
from comparison_group import SelectGreaterThan, SelectByName, SelectSmallerThan
|
||||
from format import FormatModifierSelectColor
|
||||
from quapy.error import dist_aitchison
|
||||
from quapy.method.confidence import ConfidenceIntervals, ConfidenceIntervalsILR, ConfidenceIntervalsCLR
|
||||
from quapy.method.confidence import ConfidenceEllipseSimplex, ConfidenceEllipseCLR, ConfidenceEllipseILR, ConfidenceIntervals, ConfidenceRegionABC
|
||||
import quapy.functional as F
|
||||
from result_path.src.table import LatexTable
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
from itertools import chain
|
||||
|
||||
pd.set_option('display.max_columns', None)
|
||||
pd.set_option('display.width', 2000)
|
||||
pd.set_option('display.max_rows', None)
|
||||
pd.set_option("display.expand_frame_repr", False)
|
||||
pd.set_option("display.precision", 4)
|
||||
pd.set_option("display.float_format", "{:.4f}".format)
|
||||
|
||||
|
||||
# methods = None # show all methods
|
||||
methods = ['BoCC',
|
||||
'BaACC!',
|
||||
'BaEMQ!',
|
||||
'BaKDE-Gau-T!',
|
||||
'BaKDE-Ait-T!',
|
||||
'BaKDE-Ait-T!2'
|
||||
#'BootstrapACC',
|
||||
#'BootstrapHDy',
|
||||
#'BootstrapKDEy',
|
||||
#'BootstrapEMQ'
|
||||
]
|
||||
|
||||
def region_score(true_prev, region: ConfidenceRegionABC):
|
||||
amp = region.montecarlo_proportion(50_000)
|
||||
if true_prev in region:
|
||||
cost = 0
|
||||
else:
|
||||
scale_cost = 1/region.alpha
|
||||
cost = scale_cost * dist_aitchison(true_prev, region.closest_point_in_region(true_prev))
|
||||
return amp + cost
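# For reference, a minimal sketch of the classical Winkler (interval) score that
# region_score mimics at the region level (amplitude plus a miss penalty scaled by
# the inverse of alpha) and that the w-CI columns report per class. The helper name
# winkler_score_1d is illustrative and not part of quapy.
def winkler_score_1d(y, low, high, alpha=0.05):
    # interval width, plus 2/alpha times the distance to the violated bound (if any)
    width = high - low
    if y < low:
        return width + (2.0 / alpha) * (low - y)
    if y > high:
        return width + (2.0 / alpha) * (y - high)
    return width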
|
||||
|
||||
|
||||
|
||||
def compute_coverage_amplitude(region_constructor, **kwargs):
|
||||
all_samples = results['samples']
|
||||
all_true_prevs = results['true-prevs']
|
||||
|
||||
def process_one(samples, true_prevs):
|
||||
region = region_constructor(samples, **kwargs)
|
||||
if isinstance(region, ConfidenceIntervals) or isinstance(region, ConfidenceIntervalsCLR) or isinstance(region, ConfidenceIntervalsILR):
|
||||
winkler = region.mean_winkler_score(true_prevs)
|
||||
# winkler_e = region.mean_winkler_score(true_prevs, add_ae=True)
|
||||
cov_soft = region.coverage_soft(true_prevs)
|
||||
else:
|
||||
winkler = None
|
||||
# winkler_e = None
|
||||
cov_soft = None
|
||||
return region.coverage(true_prevs), region.montecarlo_proportion(), winkler, cov_soft
|
||||
|
||||
out = Parallel(n_jobs=3)(
|
||||
delayed(process_one)(samples, true_prevs)
|
||||
for samples, true_prevs in tqdm(
|
||||
zip(all_samples, all_true_prevs),
|
||||
total=len(all_samples),
|
||||
desc='constructing ellipses'
|
||||
)
|
||||
)
|
||||
|
||||
# unzip results
|
||||
coverage, amplitude, winkler, cov_soft = zip(*out)
|
||||
return list(coverage), list(amplitude), list(winkler), list(cov_soft)
|
||||
|
||||
|
||||
def update_pickle(report, pickle_path, updated_dict:dict):
|
||||
for k,v in updated_dict.items():
|
||||
report[k]=v
|
||||
pickle.dump(report, open(pickle_path, 'wb'), protocol=pickle.HIGHEST_PROTOCOL)
|
||||
|
||||
|
||||
def update_pickle_with_region(report, file, conf_name, conf_region_class, **kwargs):
|
||||
if f'coverage-{conf_name}' not in report:
|
||||
covs, amps, winkler, cov_soft = compute_coverage_amplitude(conf_region_class, **kwargs)
|
||||
|
||||
update_fields = {
|
||||
f'coverage-{conf_name}': covs,
|
||||
f'amplitude-{conf_name}': amps,
|
||||
f'winkler-{conf_name}': winkler,
|
||||
f'coverage-soft-{conf_name}': cov_soft
|
||||
}
|
||||
|
||||
update_pickle(report, file, update_fields)
|
||||
|
||||
|
||||
def pareto_front(df, x_col, y_col, maximize_y=True, minimize_x=True):
|
||||
"""
|
||||
Returns a boolean mask indicating whether each row is Pareto-optimal.
|
||||
"""
|
||||
X = df[x_col].values
|
||||
Y = df[y_col].values
|
||||
|
||||
is_pareto = np.ones(len(df), dtype=bool)
|
||||
|
||||
for i in range(len(df)):
|
||||
if not is_pareto[i]:
|
||||
continue
|
||||
for j in range(len(df)):
|
||||
if i == j:
|
||||
continue
|
||||
|
||||
better_or_equal_x = X[j] <= X[i] if minimize_x else X[j] >= X[i]
|
||||
better_or_equal_y = Y[j] >= Y[i] if maximize_y else Y[j] <= Y[i]
|
||||
|
||||
strictly_better = (
|
||||
(X[j] < X[i] if minimize_x else X[j] > X[i]) or
|
||||
(Y[j] > Y[i] if maximize_y else Y[j] < Y[i])
|
||||
)
|
||||
|
||||
if better_or_equal_x and better_or_equal_y and strictly_better:
|
||||
is_pareto[i] = False
|
||||
break
|
||||
|
||||
return is_pareto
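# Minimal usage sketch with illustrative data (not experimental results): keep the
# methods that are Pareto-optimal when amplitude should be small and coverage high.
# The helper name _pareto_front_example is only for illustration.
def _pareto_front_example():
    demo = pd.DataFrame({
        'method': ['A', 'B', 'C'],
        'a-CI': [0.10, 0.20, 0.15],   # amplitude: lower is better
        'c-CI': [0.90, 0.96, 0.88],   # coverage: higher is better
    })
    mask = pareto_front(demo, x_col='a-CI', y_col='c-CI', maximize_y=True, minimize_x=True)
    return demo[mask]  # keeps 'A' and 'B'; 'C' is dominated by 'A'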
|
||||
|
||||
def plot_coverage_vs_amplitude(
|
||||
df,
|
||||
coverage_col,
|
||||
amplitude_col="a-CI",
|
||||
method_col="method",
|
||||
dataset_col=None,
|
||||
error_col=None,
|
||||
error_threshold=None,
|
||||
nominal_coverage=0.95,
|
||||
title=None,
|
||||
):
|
||||
df_plot = df.copy()
|
||||
|
||||
# Optional error filtering
|
||||
if error_col is not None and error_threshold is not None:
|
||||
df_plot = df_plot[df_plot[error_col] <= error_threshold]
|
||||
|
||||
# Compute Pareto front
|
||||
pareto_mask = pareto_front(
|
||||
df_plot,
|
||||
x_col=amplitude_col,
|
||||
y_col=coverage_col,
|
||||
maximize_y=True,
|
||||
minimize_x=True
|
||||
)
|
||||
|
||||
plt.figure(figsize=(7, 6))
|
||||
|
||||
# Base scatter
|
||||
sns.scatterplot(
|
||||
data=df_plot,
|
||||
x=amplitude_col,
|
||||
y=coverage_col,
|
||||
hue=method_col,
|
||||
# style=dataset_col,
|
||||
alpha=0.6,
|
||||
s=60,
|
||||
legend=True
|
||||
)
|
||||
|
||||
# Highlight Pareto front
|
||||
plt.scatter(
|
||||
df_plot.loc[pareto_mask, amplitude_col],
|
||||
df_plot.loc[pareto_mask, coverage_col],
|
||||
facecolors='none',
|
||||
edgecolors='black',
|
||||
s=120,
|
||||
linewidths=1.5,
|
||||
label="Pareto front"
|
||||
)
|
||||
|
||||
# Nominal coverage line
|
||||
plt.axhline(
|
||||
nominal_coverage,
|
||||
linestyle="--",
|
||||
color="gray",
|
||||
linewidth=1,
|
||||
label="Nominal coverage"
|
||||
)
|
||||
|
||||
plt.xlabel("Amplitude (fraction of simplex)")
|
||||
plt.ylabel("Coverage")
|
||||
plt.ylim(0, 1.05)
|
||||
|
||||
if title is not None:
|
||||
plt.title(title)
|
||||
|
||||
plt.legend(bbox_to_anchor=(1.05, 1), loc="upper left")
|
||||
plt.tight_layout()
|
||||
plt.show()
|
||||
|
||||
|
||||
def nicer_method(name:str):
|
||||
replacements = {
|
||||
# 'Bayesian': 'Ba',
|
||||
'Bootstrap': 'Bo',
|
||||
'-numpyro': '',
|
||||
'emcee': 'emc',
|
||||
'-T*': '*',
|
||||
'-T!': '',
|
||||
'!': '',
|
||||
'-Ait': r'$^{(\mathrm{Ait})}$',
|
||||
'-Gau': r'$^{(\mathrm{Gau})}$'
|
||||
}
|
||||
for k, v in replacements.items():
|
||||
name = name.replace(k,v)
|
||||
return name
|
||||
|
||||
|
||||
def nicer_data(name:str):
|
||||
replacements = {
|
||||
'cifar': 'CIFAR',
|
||||
'-l': '',
|
||||
'mnist': 'MNIST',
|
||||
'fashionmnist': 'fashionMNIST',
|
||||
'svhn': 'SVHN',
|
||||
'100coarse': '100(20)',
|
||||
}
|
||||
for k, v in replacements.items():
|
||||
name = name.replace(k, v)
|
||||
return name
|
||||
|
||||
|
||||
base_dir = RESULT_DIR
|
||||
|
||||
table = defaultdict(list)
|
||||
n_classes = {}
|
||||
tr_size = {}
|
||||
tr_prev = {}
|
||||
|
||||
dataset_class = [UCIMulticlassHandler, CIFAR100Handler, VisualDataHandler, LeQuaHandler]
|
||||
dataset_order = []
|
||||
for handler in dataset_class:
|
||||
for dataset in handler.iter():
|
||||
dataset_order.append(dataset.name)
|
||||
train = dataset.get_training()
|
||||
n_classes[dataset.name] = train.n_classes
|
||||
tr_size[dataset.name] = len(train)
|
||||
tr_prev[dataset.name] = F.strprev(train.prevalence())
|
||||
|
||||
|
||||
problem_type = 'multiclass'
|
||||
path = f'./{base_dir}/{problem_type}/*.pkl'
|
||||
|
||||
for file in tqdm(glob(path), desc='processing results', total=len(glob(path))):
|
||||
file = Path(file)
|
||||
dataset, method = file.name.replace('.pkl', '').split('__')
|
||||
if (method not in methods) or (dataset not in dataset_order):
|
||||
continue
|
||||
|
||||
report = pickle.load(open(file, 'rb'))
|
||||
results = report['results']
|
||||
n_samples = len(results['ae'])
|
||||
table['method'].extend([nicer_method(method)] * n_samples)
|
||||
table['dataset'].extend([nicer_data(dataset)] * n_samples)
|
||||
table['ae'].extend(results['ae'])
|
||||
table['rae'].extend(results['rae'])
|
||||
# table['c-CI'].extend(results['coverage'])
|
||||
# table['a-CI'].extend(results['amplitude'])
|
||||
|
||||
# update_pickle_with_region(report, file, conf_name='CI-ILR', conf_region_class=ConfidenceIntervalsILR, bonferroni_correction=True)
|
||||
# update_pickle_with_region(report, file, conf_name='CI-CLR', conf_region_class=ConfidenceIntervalsCLR, bonferroni_correction=True)
|
||||
update_pickle_with_region(report, file, conf_name='CI', conf_region_class=ConfidenceIntervals, bonferroni_correction=True)
|
||||
update_pickle_with_region(report, file, conf_name='CInb', conf_region_class=ConfidenceIntervals, bonferroni_correction=False) # no Bonferroni-correction
|
||||
# update_pickle_with_region(report, file, conf_name='CE', conf_region_class=ConfidenceEllipseSimplex)
|
||||
# update_pickle_with_region(report, file, conf_name='CLR', conf_region_class=ConfidenceEllipseCLR)
|
||||
# update_pickle_with_region(report, file, conf_name='ILR', conf_region_class=ConfidenceEllipseILR)
|
||||
|
||||
conf_bonferroni = 'CI'
|
||||
conf_name='CInb'
|
||||
table['c-CI'].extend(report[f'coverage-{conf_bonferroni}']) # the true coverage is better measured with Bonferroni-correction
|
||||
table['w-CI'].extend(report[f'winkler-{conf_name}'])
|
||||
table['cs-CI'].extend(report[f'coverage-soft-{conf_name}'])
|
||||
table['a-CI'].extend(report[f'amplitude-{conf_name}'])
|
||||
|
||||
# table['aitch'].extend(qp.error.dist_aitchison(results['true-prevs'], results['point-estim'])) # not in this paper...
|
||||
table['SRE'].extend(qp.error.sre(results['true-prevs'], results['point-estim'], report['train-prev'], eps=0.001))
|
||||
|
||||
|
||||
|
||||
# remove datasets with more than max_classes classes
|
||||
# max_classes = 25
|
||||
# min_train = 500
|
||||
# ignore_datasets = ['poker_hand', 'hcv']
|
||||
# for data_name, n in n_classes.items():
|
||||
# if n > max_classes:
|
||||
# df = df[df["dataset"] != data_name]
|
||||
# for data_name, n in tr_size.items():
|
||||
# if n < min_train:
|
||||
# df = df[df["dataset"] != data_name]
|
||||
# for data_name, n in tr_size.items():
|
||||
# if data_name in ignore_datasets:
|
||||
# df = df[df["dataset"] != data_name]
|
||||
|
||||
|
||||
|
||||
|
||||
df = pd.DataFrame(table)
|
||||
df['a-CI'] *= 100
|
||||
df['c-CI'] *= 100
|
||||
df['cs-CI'] *= 100
|
||||
|
||||
for region in ['CI']: #, 'CLR', 'ILR', 'CI']:
|
||||
if problem_type == 'binary' and region=='ILR':
|
||||
continue
|
||||
for column in [f'a-{region}', 'ae', 'SRE', f'c-{region}', f'cs-{region}']: # f'w-{region}'
|
||||
pv = pd.pivot_table(
|
||||
df, index='dataset', columns='method', values=column, margins=True
|
||||
)
|
||||
pv['n_classes'] = pv.index.map(n_classes).astype('Int64')
|
||||
pv['tr_size'] = pv.index.map(tr_size).astype('Int64')
|
||||
#pv['tr-prev'] = pv.index.map(tr_prev)
|
||||
pv = pv.drop(columns=[col for col in pv.columns if col == "All" or col[-1]=='All'])
|
||||
print(f'{problem_type=} {column=}')
|
||||
print(pv)
|
||||
print('-'*80)
|
||||
|
||||
latex = LatexTable.from_dataframe(df, method='method', benchmark='dataset', value=column, name=column)
|
||||
latex.format.configuration.show_std = False
|
||||
#latex.reorder_methods([nicer_method(m) for m in methods])
|
||||
latex.reorder_benchmarks([nicer_data(d) for d in dataset_order])
|
||||
if column in ['ae', 'SRE']:
|
||||
latex.format.configuration.lower_is_better = True
|
||||
latex.format.configuration.stat_test = 'wilcoxon'
|
||||
#latex.format.configuration.stat_test = None
|
||||
# latex.format.configuration.show_std = True
|
||||
|
||||
if column in [f'c-{region}', f'cs-{region}']:
|
||||
latex.format.configuration.lower_is_better = False
|
||||
latex.format.configuration.stat_test = None
|
||||
latex.format.configuration.with_color = False
|
||||
latex.format.configuration.best_in_bold = False
|
||||
latex.format.configuration.with_rank = False
|
||||
latex.format.configuration.mean_prec = 0
|
||||
latex.add_format_modifier(
|
||||
format_modifier=FormatModifierSelectColor(
|
||||
comparison=SelectGreaterThan(reference_selector=89, input_selector=SelectByName())
|
||||
)
|
||||
)
|
||||
if column in [f'a-{region}']:
|
||||
latex.format.configuration.lower_is_better = True
|
||||
latex.format.configuration.stat_test = None
|
||||
latex.format.configuration.with_color = False
|
||||
latex.format.configuration.best_in_bold = False
|
||||
latex.format.configuration.mean_prec = 2
|
||||
latex.add_format_modifier(
|
||||
format_modifier=FormatModifierSelectColor(
|
||||
comparison=SelectSmallerThan(reference_selector=11, input_selector=SelectByName())
|
||||
)
|
||||
)
|
||||
# latex.add_format_modifier(
|
||||
# format_modifier=FormatModifierSelectColor(
|
||||
# comparison=SelectSmallerThan(reference_selector=0.01, input_selector=SelectByName()),
|
||||
# intensity=50
|
||||
# )
|
||||
# )
|
||||
|
||||
latex.format.configuration.resizebox=.5
|
||||
latex.latexPDF(pdf_path=f'./tables/{latex.name}.pdf')
|
||||
|
||||
|
||||
df = df[df['method']!='BaACC']
|
||||
df = df[df['method']!='BaACC*']
|
||||
df = df[df['method']!='BaACC+']
|
||||
df = df[df['method']!='BaKDE-Ait*']
|
||||
df = df[df['method']!='BaKDE-Gau*']
|
||||
df = df[df['method']!='BayEMQ*']
|
||||
grouped = df.groupby(["method", "dataset"])
|
||||
agg = grouped.agg(
|
||||
ae_mean=("ae", "mean"),
|
||||
ae_std=("ae", "std"),
|
||||
sre_mean=("SRE", "mean"),
|
||||
sre_std=("SRE", "std"),
|
||||
coverage_mean=("c-CI", "mean"),
|
||||
coverage_std=("c-CI", "std"),
|
||||
coverage_soft_mean=("cs-CI", "mean"),
|
||||
amplitude_mean=("a-CI", "mean"),
|
||||
amplitude_std=("a-CI", "std"),
|
||||
).reset_index()
|
||||
|
||||
#plot_coverage_vs_amplitude(
|
||||
# agg,
|
||||
# coverage_col="coverage_soft_mean",
|
||||
# amplitude_col="amplitude_mean",
|
||||
# method_col="method",
|
||||
# dataset_col="dataset",
|
||||
# nominal_coverage=0.95,
|
||||
# title="Marginal coverage vs amplitude"
|
||||
#)
|
||||
|
||||
|
||||
#print('RESTORE THE WILCOXON TEST')
|
||||
|
|
@ -0,0 +1,169 @@
|
|||
import os.path
|
||||
from pathlib import Path
|
||||
import pandas as pd
|
||||
from sklearn.linear_model import LogisticRegression
|
||||
from copy import deepcopy as cp
|
||||
import quapy as qp
|
||||
from BayesianKDEy.commons import KDEyReduce
|
||||
from _bayeisan_kdey import BayesianKDEy
|
||||
from _bayesian_mapls import BayesianMAPLS
|
||||
from commons import experiment_path, KDEyCLR, RESULT_DIR, MockClassifierFromPosteriors, KDEyScaledB, KDEyFresh
|
||||
# import datasets
|
||||
from datasets import LeQuaHandler, UCIMulticlassHandler, DatasetHandler, VisualDataHandler, CIFAR100Handler
|
||||
from method.confidence import ConfidenceIntervals
|
||||
from temperature_calibration import temp_calibration
|
||||
from quapy.data import LabelledCollection
|
||||
from quapy.method.aggregative import DistributionMatchingY as DMy, AggregativeQuantifier, EMQ, CC
|
||||
from quapy.model_selection import GridSearchQ
|
||||
from quapy.data import Dataset
|
||||
from quapy.method.confidence import BayesianCC, AggregativeBootstrap
|
||||
from quapy.method.aggregative import KDEyML, ACC
|
||||
from quapy.protocol import UPP
|
||||
import numpy as np
|
||||
from tqdm import tqdm
|
||||
from collections import defaultdict
|
||||
from time import time
|
||||
|
||||
|
||||
def methods(data_handler: DatasetHandler):
|
||||
"""
|
||||
Yields a tuple (name, hyperparams, quantifier), where:
|
||||
- name: is a str representing the name of the method (e.g., 'BayesianKDEy')
|
||||
- quantifier: is the base model (e.g., KDEyML())
|
||||
- hyperparams: is a dictionary for the quantifier (e.g., {'bandwidth': [0.001, 0.005, 0.01, 0.05, 0.1, 0.2]})
|
||||
(note: unlike the full experiments, this generator does not yield a bayesian/bootstrap constructor)
|
||||
"""
|
||||
if isinstance(data_handler, VisualDataHandler):
|
||||
Cls = LogisticRegression
|
||||
cls_hyper = {}
|
||||
else:
|
||||
Cls = LogisticRegression
|
||||
cls_hyper = {'classifier__C': np.logspace(-4, 4, 9), 'classifier__class_weight': ['balanced', None]}
|
||||
kdey_hyper = {'bandwidth': np.logspace(-3, -1, 10), **cls_hyper}
|
||||
kdey_hyper_larger = {'bandwidth': np.logspace(-1, 0, 10), **cls_hyper}
|
||||
kdey_hyper_clr = {'bandwidth': np.logspace(-2, 2, 10), **cls_hyper}
|
||||
|
||||
# surrogate quantifiers
|
||||
kde_gau_scale = KDEyScaledB(Cls())
|
||||
|
||||
yield 'KDEy-G-exp', kdey_hyper, KDEyML(Cls())
|
||||
# yield 'KDEy-G-exp2', kdey_hyper_larger, KDEyML(Cls())
|
||||
# yield 'KDEy-G-log', kdey_hyper, KDEyML(Cls(), logdensities=True)
|
||||
yield 'KDEy-Ait', kdey_hyper_clr, KDEyCLR(Cls())
|
||||
|
||||
|
||||
def model_selection(dataset: DatasetHandler, point_quantifier: AggregativeQuantifier, grid: dict):
|
||||
with qp.util.temp_seed(0):
|
||||
if isinstance(point_quantifier, KDEyScaledB) and 'bandwidth' in grid:
|
||||
def scale_bandwidth(bandwidth, n_classes, beta=0.5):
|
||||
return bandwidth * np.power(n_classes, beta)
|
||||
|
||||
n = dataset.get_training().n_classes
|
||||
grid['bandwidth'] = [scale_bandwidth(b, n) for b in grid['bandwidth']]
|
||||
print('bandwidth scaled')
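# e.g., with beta=0.5 and 10 classes, a base bandwidth of 0.1 is rescaled to
# 0.1*sqrt(10) ~= 0.316 (illustrative numbers)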
|
||||
print(f'performing model selection for {point_quantifier.__class__.__name__} with grid {grid}')
|
||||
# model selection
|
||||
if len(grid) > 0:
|
||||
train, val_prot = dataset.get_train_valprot_for_modsel()
|
||||
mod_sel = GridSearchQ(
|
||||
model=point_quantifier,
|
||||
param_grid=grid,
|
||||
protocol=val_prot,
|
||||
refit=False,
|
||||
n_jobs=-1,
|
||||
verbose=True
|
||||
).fit(*train.Xy)
|
||||
best_params = mod_sel.best_params_
|
||||
else:
|
||||
best_params = {}
|
||||
|
||||
return best_params
|
||||
|
||||
|
||||
def experiment(dataset: DatasetHandler,
|
||||
point_quantifier: AggregativeQuantifier,
|
||||
method_name: str,
|
||||
grid: dict,
|
||||
hyper_choice_path: Path):
|
||||
|
||||
with qp.util.temp_seed(0):
|
||||
# model selection
|
||||
best_hyperparams = qp.util.pickled_resource(
|
||||
hyper_choice_path, model_selection, dataset, cp(point_quantifier), grid
|
||||
)
|
||||
print(f'{best_hyperparams=}')
|
||||
|
||||
t_init = time()
|
||||
training, test_generator = dataset.get_train_testprot_for_eval()
|
||||
point_quantifier.fit(*training.Xy)
|
||||
tr_time = time() - t_init
|
||||
|
||||
# test
|
||||
train_prevalence = training.prevalence()
|
||||
results = defaultdict(list)
|
||||
pbar = tqdm(enumerate(test_generator()), total=test_generator.total())
|
||||
for i, (sample_X, true_prevalence) in pbar:
|
||||
t_init = time()
|
||||
point_estimate = point_quantifier.predict(sample_X)
|
||||
ttime = time() - t_init
|
||||
|
||||
results['true-prevs'].append(true_prevalence)
|
||||
results['point-estim'].append(point_estimate)
|
||||
results['shift'].append(qp.error.ae(true_prevalence, train_prevalence))
|
||||
results['ae'].append(qp.error.ae(prevs_true=true_prevalence, prevs_hat=point_estimate))
|
||||
results['rae'].append(qp.error.rae(prevs_true=true_prevalence, prevs_hat=point_estimate))
|
||||
results['sre'].append(qp.error.sre(prevs_true=true_prevalence, prevs_hat=point_estimate, prevs_train=train_prevalence))
|
||||
results['test-time'].append(ttime)
|
||||
|
||||
pbar.set_description(
|
||||
f'{method_name} MAE={np.mean(results["ae"]):.5f} W={np.mean(results["sre"]):.5f}')
|
||||
|
||||
report = {
|
||||
'optim_hyper': best_hyperparams,
|
||||
'train_time': tr_time,
|
||||
'train-prev': train_prevalence,
|
||||
'results': {k: np.asarray(v) for k, v in results.items()},
|
||||
}
|
||||
|
||||
return report
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
|
||||
result_dir = Path('results_map')
|
||||
|
||||
reports = defaultdict(list)
|
||||
|
||||
for data_handler in [UCIMulticlassHandler]: # , UCIMulticlassHandler, LeQuaHandler, VisualDataHandler, CIFAR100Handler]:
|
||||
for dataset in data_handler.iter():
|
||||
qp.environ['SAMPLE_SIZE'] = dataset.sample_size
|
||||
# print(f'dataset={dataset.name}')
|
||||
|
||||
problem_type = 'binary' if dataset.is_binary() else 'multiclass'
|
||||
for method_name, hyper_params, quantifier in methods(dataset):
|
||||
|
||||
result_path = experiment_path(result_dir / problem_type, dataset.name, method_name)
|
||||
hyper_path = experiment_path(result_dir / 'hyperparams' / problem_type, dataset.name, method_name)
|
||||
|
||||
# if os.path.exists(result_path):
|
||||
report = qp.util.pickled_resource(
|
||||
result_path, experiment, dataset, quantifier, method_name, hyper_params, hyper_path
|
||||
)
|
||||
reports['dataset'].append(dataset.name)
|
||||
reports['method'].append(method_name)
|
||||
reports['MAE'].append(report["results"]["ae"].mean())
|
||||
reports['SRE'].append(report["results"]["sre"].mean())
|
||||
reports['h'].append(report["optim_hyper"]["bandwidth"])
|
||||
|
||||
print(f'dataset={dataset.name}, '
|
||||
f'method={method_name}: '
|
||||
f'mae={reports["MAE"][-1]:.5f}, '
|
||||
f'W={reports["SRE"][-1]:.5f} '
|
||||
f'h={reports["h"][-1]}')
|
||||
|
||||
pv = pd.DataFrame(reports).pivot_table(values=['MAE', 'SRE', 'h'], index='dataset', columns='method', margins=True)
|
||||
print(pv)
|
||||
|
||||
|
||||
|
||||
|
|
@ -0,0 +1,168 @@
|
|||
from abc import ABC, abstractmethod
|
||||
import numpy as np
|
||||
from sklearn.linear_model import LogisticRegression
|
||||
|
||||
from BayesianKDEy._bayeisan_kdey import BayesianKDEy
|
||||
from BayesianKDEy._bayesian_mapls import BayesianMAPLS
|
||||
from BayesianKDEy.commons import KDEyCLR, KDEyCLR2
|
||||
from quapy.method.aggregative import CC, ACC, EMQ, DMy, KDEyML
|
||||
from quapy.method.base import BaseQuantifier
|
||||
from quapy.method.confidence import AggregativeBootstrap, BayesianCC
|
||||
|
||||
|
||||
def get_experimental_methods():
|
||||
#yield BootsCC()
|
||||
#yield BayACC()
|
||||
#yield BayEMQ()
|
||||
#yield BayKDEyGau()
|
||||
#yield BayKDEyAit()
|
||||
yield BayKDEyAit2()
|
||||
|
||||
|
||||
# commons
|
||||
# ------------------------------------------------------------
|
||||
Cls = LogisticRegression
|
||||
cls_hyper = {
|
||||
'classifier__C': np.logspace(-4,4,9),
|
||||
'classifier__class_weight': ['balanced', None]
|
||||
}
|
||||
hdy_hyper = {'nbins': [3, 4, 5, 8, 9, 10, 12, 14, 16, 32], **cls_hyper}
|
||||
kdey_hyper = {'bandwidth': np.logspace(-3, -1, 10), **cls_hyper}
|
||||
kdey_hyper_clr = {'bandwidth': np.logspace(-2, 2, 10), **cls_hyper}
|
||||
|
||||
def hyper2cls(hyperparams):
|
||||
return {k.replace('classifier__', ''): v for k, v in hyperparams.items()}
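# e.g. hyper2cls({'classifier__C': 10.0, 'classifier__class_weight': 'balanced'})
# returns {'C': 10.0, 'class_weight': 'balanced'}, ready to be passed as Cls(**...)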
|
||||
|
||||
|
||||
# method descriptor logic
|
||||
# ------------------------------------------------------------
|
||||
class MethodDescriptor(ABC):
|
||||
def __init__(self,
|
||||
name: str,
|
||||
surrogate_quantifier: BaseQuantifier,
|
||||
hyper_parameters: dict,
|
||||
):
|
||||
self.name = name
|
||||
self.surrogate_quantifier_ = surrogate_quantifier
|
||||
self.hyper_parameters = hyper_parameters
|
||||
|
||||
@abstractmethod
|
||||
def binary_only(self): ...
|
||||
|
||||
def surrogate_quantifier(self):
|
||||
return self.surrogate_quantifier_
|
||||
|
||||
def surrogate_quantifier_name(self):
|
||||
return self.surrogate_quantifier_.__class__.__name__
|
||||
|
||||
@abstractmethod
|
||||
def uncertainty_aware_quantifier(self, hyperparameters): ...
|
||||
|
||||
|
||||
class MulticlassMethodDescriptor(MethodDescriptor):
|
||||
|
||||
def binary_only(self):
|
||||
return False
|
||||
|
||||
|
||||
# specific methods definitions
|
||||
# ------------------------------------------------------------
|
||||
# ------------------------------------------------------------
|
||||
|
||||
# Bootstrap approaches:
|
||||
# ------------------------------------------------------------
|
||||
class BootsCC(MulticlassMethodDescriptor):
|
||||
def __init__(self):
|
||||
super().__init__(name='BoCC', surrogate_quantifier=CC(Cls()), hyper_parameters=cls_hyper)
|
||||
|
||||
def uncertainty_aware_quantifier(self, hyperparameters):
|
||||
quantifier = CC(Cls()).set_params(**hyperparameters)
|
||||
return AggregativeBootstrap(quantifier, n_test_samples=1000, random_state=0)
|
||||
|
||||
|
||||
# class BootsACC(MulticlassMethodDescriptor):
|
||||
# def __init__(self):
|
||||
# super().__init__(name='BoACC', surrogate_quantifier=ACC(Cls()), hyper_parameters=cls_hyper)
|
||||
#
|
||||
# def uncertainty_aware_quantifier(self, hyperparameters):
|
||||
# quantifier = ACC(Cls()).set_params(**hyperparameters)
|
||||
# return AggregativeBootstrap(quantifier, n_test_samples=1000, random_state=0)
|
||||
#
|
||||
#
|
||||
# class BootsEMQ(MulticlassMethodDescriptor):
|
||||
# def __init__(self):
|
||||
# super().__init__(name='BoEMQ', surrogate_quantifier=EMQ(Cls(), exact_train_prev=False), hyper_parameters=cls_hyper)
|
||||
#
|
||||
# def uncertainty_aware_quantifier(self, hyperparameters):
|
||||
# quantifier = EMQ(Cls(), exact_train_prev=False).set_params(**hyperparameters)
|
||||
# return AggregativeBootstrap(quantifier, n_test_samples=1000, random_state=0)
|
||||
|
||||
|
||||
# class BootsHDy(MethodDescriptor):
|
||||
# def __init__(self):
|
||||
# super().__init__(name='BoHDy', surrogate_quantifier=DMy(Cls()), hyper_parameters=hdy_hyper)
|
||||
#
|
||||
# def uncertainty_aware_quantifier(self, hyperparameters):
|
||||
# quantifier = DMy(Cls()).set_params(**hyperparameters)
|
||||
# return AggregativeBootstrap(quantifier, n_test_samples=1000, random_state=0)
|
||||
#
|
||||
# def binary_only(self):
|
||||
# return True
|
||||
|
||||
# class BootsKDEy(MulticlassMethodDescriptor):
|
||||
# def __init__(self):
|
||||
# super().__init__(name='BoKDEy', surrogate_quantifier=KDEyML(Cls()), hyper_parameters=kdey_hyper)
|
||||
#
|
||||
# def uncertainty_aware_quantifier(self, hyperparameters):
|
||||
# quantifier = KDEyML(Cls()).set_params(**hyperparameters)
|
||||
# return AggregativeBootstrap(quantifier, n_test_samples=1000, random_state=0)
|
||||
|
||||
|
||||
# Bayesian approaches:
|
||||
# ------------------------------------------------------------
|
||||
class BayACC(MulticlassMethodDescriptor):
|
||||
def __init__(self):
|
||||
super().__init__(name='BaACC!', surrogate_quantifier=ACC(Cls()), hyper_parameters=cls_hyper)
|
||||
|
||||
def uncertainty_aware_quantifier(self, hyperparameters):
|
||||
classifier = Cls(**hyper2cls(hyperparameters))
|
||||
return BayesianCC(classifier, temperature=None, mcmc_seed=0) # is actually a Bayesian variant of ACC
|
||||
|
||||
|
||||
class BayEMQ(MulticlassMethodDescriptor):
|
||||
def __init__(self):
|
||||
super().__init__(name='BaEMQ!', surrogate_quantifier=EMQ(Cls(), exact_train_prev=False), hyper_parameters=cls_hyper)
|
||||
|
||||
def uncertainty_aware_quantifier(self, hyperparameters):
|
||||
classifier = Cls(**hyper2cls(hyperparameters))
|
||||
return BayesianMAPLS(classifier, prior='uniform', temperature=None, exact_train_prev=False)
|
||||
|
||||
|
||||
class BayKDEyGau(MulticlassMethodDescriptor):
|
||||
def __init__(self):
|
||||
kdey_hyper = {'bandwidth': np.logspace(-3, -1, 10), **cls_hyper}
|
||||
super().__init__(name='BaKDE-Gau-T!', surrogate_quantifier=KDEyML(Cls()), hyper_parameters=kdey_hyper)
|
||||
|
||||
def uncertainty_aware_quantifier(self, hyperparameters):
|
||||
return BayesianKDEy(Cls(), kernel='gaussian', temperature=None, mcmc_seed=0, **hyperparameters)
|
||||
|
||||
|
||||
class BayKDEyAit(MulticlassMethodDescriptor):
|
||||
def __init__(self):
|
||||
kdey_hyper = {'bandwidth': np.logspace(-2, 2, 10), **cls_hyper}
|
||||
super().__init__(name='BaKDE-Ait-T!', surrogate_quantifier=KDEyCLR(Cls()), hyper_parameters=kdey_hyper)
|
||||
|
||||
def uncertainty_aware_quantifier(self, hyperparameters):
|
||||
return BayesianKDEy(Cls(), kernel='aitchison', temperature=None, mcmc_seed=0, **hyperparameters)
|
||||
|
||||
|
||||
class BayKDEyAit2(MulticlassMethodDescriptor):
|
||||
def __init__(self):
|
||||
kdey_hyper = {'bandwidth': np.linspace(0.05, 2., 10), **cls_hyper}
|
||||
super().__init__(name='BaKDE-Ait-T!2', surrogate_quantifier=KDEyCLR2(Cls()), hyper_parameters=kdey_hyper)
|
||||
|
||||
def uncertainty_aware_quantifier(self, hyperparameters):
|
||||
return BayesianKDEy(Cls(), kernel='aitchison', temperature=None, mcmc_seed=0, **hyperparameters)
|
||||
|
||||
|
||||
|
||||
|
|
@ -0,0 +1,560 @@
|
|||
import os
|
||||
import pickle
|
||||
from pathlib import Path
|
||||
|
||||
import numpy as np
|
||||
import matplotlib.pyplot as plt
|
||||
from matplotlib.colors import ListedColormap, LinearSegmentedColormap
|
||||
from scipy.stats import gaussian_kde
|
||||
from sklearn.preprocessing import MinMaxScaler
|
||||
|
||||
from BayesianKDEy.commons import antagonistic_prevalence, in_simplex
|
||||
from method.confidence import (ConfidenceIntervals as CI,
|
||||
ConfidenceIntervalsCLR as CICLR,
|
||||
ConfidenceIntervalsILR as CIILR,
|
||||
ConfidenceEllipseSimplex as CE,
|
||||
ConfidenceEllipseCLR as CLR,
|
||||
ConfidenceEllipseILR as ILR)
|
||||
|
||||
|
||||
|
||||
def get_region_colormap(name="blue", alpha=0.40):
|
||||
name = name.lower()
|
||||
if name == "blue":
|
||||
base = (76/255, 114/255, 176/255)
|
||||
elif name == "orange":
|
||||
base = (221/255, 132/255, 82/255)
|
||||
elif name == "violet":
|
||||
base = (129/255, 114/255, 178/255)
|
||||
else:
|
||||
raise ValueError(f"Unknown palette name: {name}")
|
||||
|
||||
cmap = ListedColormap([
|
||||
(1, 1, 1, 0), # 0: transparent white
|
||||
(base[0], base[1], base[2], alpha) # 1: color
|
||||
])
|
||||
|
||||
return cmap
|
||||
|
||||
|
||||
def plot_prev_points(samples=None,
|
||||
show_samples=True,
|
||||
true_prev=None,
|
||||
point_estim=None,
|
||||
train_prev=None,
|
||||
show_mean=True,
|
||||
show_legend=True,
|
||||
region=None,
|
||||
region_resolution=1000,
|
||||
confine_region_in_simplex=False,
|
||||
color='blue',
|
||||
save_path=None):
|
||||
|
||||
plt.rcParams.update({
|
||||
'font.size': 10,  # base size for all text
|
||||
'axes.titlesize': 12,  # axes title
|
||||
'axes.labelsize': 10,  # axis labels
|
||||
'xtick.labelsize': 8,  # tick labels
|
||||
'ytick.labelsize': 8,
|
||||
'legend.fontsize': 9,  # legend
|
||||
})
|
||||
|
||||
def cartesian(p):
|
||||
dim = p.shape[-1]
|
||||
p = p.reshape(-1,dim)
|
||||
x = p[:, 1] + p[:, 2] * 0.5
|
||||
y = p[:, 2] * np.sqrt(3) / 2
|
||||
return x, y
|
||||
|
||||
def barycentric_from_xy(x, y):
|
||||
"""
|
||||
Given cartesian (x, y) coordinates in the simplex plot, returns the barycentric coordinates (p1, p2, p3).
|
||||
"""
|
||||
p3 = 2 * y / np.sqrt(3)
|
||||
p2 = x - 0.5 * p3
|
||||
p1 = 1 - p2 - p3
|
||||
return np.stack([p1, p2, p3], axis=-1)
|
||||
|
||||
# simplex coordinates
|
||||
v1 = np.array([0, 0])
|
||||
v2 = np.array([1, 0])
|
||||
v3 = np.array([0.5, np.sqrt(3)/2])
|
||||
|
||||
# Plot
|
||||
fig, ax = plt.subplots(figsize=(6, 6))
|
||||
|
||||
if region is not None:
|
||||
if callable(region):
|
||||
region_list = [("region", region)]
|
||||
else:
|
||||
region_list = region # lista de (name, fn)
|
||||
|
||||
if region is not None:
|
||||
# rectangular mesh
|
||||
x_min, x_max = -0.2, 1.2
|
||||
y_min, y_max = -0.2, np.sqrt(3) / 2 + 0.2
|
||||
|
||||
xs = np.linspace(x_min, x_max, region_resolution)
|
||||
ys = np.linspace(y_min, y_max, region_resolution)
|
||||
grid_x, grid_y = np.meshgrid(xs, ys)
|
||||
|
||||
# barycentric
|
||||
pts_bary = barycentric_from_xy(grid_x, grid_y)
|
||||
|
||||
# mask within simplex
|
||||
if confine_region_in_simplex:
|
||||
in_simplex = np.all(pts_bary >= 0, axis=-1)
|
||||
else:
|
||||
in_simplex = np.full(shape=(region_resolution, region_resolution), fill_value=True, dtype=bool)
|
||||
|
||||
# iterate over regions
|
||||
for (rname, rfun) in region_list:
|
||||
mask = np.zeros_like(in_simplex, dtype=float)
|
||||
valid_pts = pts_bary[in_simplex]
|
||||
mask_vals = np.array([float(rfun(p)) for p in valid_pts])
|
||||
mask[in_simplex] = mask_vals
|
||||
|
||||
ax.pcolormesh(
|
||||
xs, ys, mask,
|
||||
shading='auto',
|
||||
cmap=get_region_colormap(color),
|
||||
alpha=0.3,
|
||||
)
|
||||
|
||||
if samples is not None:
|
||||
if show_samples:
|
||||
ax.scatter(*cartesian(samples), s=15, alpha=0.5, edgecolors='none', label='samples', color='black', linewidth=0.5)
|
||||
if show_mean is not None:
|
||||
if isinstance(show_mean, np.ndarray):
|
||||
ax.scatter(*cartesian(show_mean), s=10, alpha=1, label='sample-mean', edgecolors='black')
|
||||
elif show_mean==True and samples is not None:
|
||||
ax.scatter(*cartesian(samples.mean(axis=0)), s=10, alpha=1, label='sample-mean', edgecolors='black')
|
||||
else:
|
||||
raise ValueError(f'show_mean should either be a boolean (if True, then samples must be provided) or '
|
||||
f'the mean point itself')
|
||||
if true_prev is not None:
|
||||
ax.scatter(*cartesian(true_prev), s=10, alpha=1, label='true-prev', edgecolors='black')
|
||||
if point_estim is not None:
|
||||
ax.scatter(*cartesian(point_estim), s=10, alpha=1, label='KDEy-estim', edgecolors='black')
|
||||
if train_prev is not None:
|
||||
ax.scatter(*cartesian(train_prev), s=10, alpha=1, label='train-prev', edgecolors='black')
|
||||
|
||||
# edges
|
||||
triangle = np.array([v1, v2, v3, v1])
|
||||
ax.plot(triangle[:, 0], triangle[:, 1], color='black')
|
||||
|
||||
# vertex labels
|
||||
ax.text(-0.05, -0.05, "Y=1", ha='right', va='top')
|
||||
ax.text(1.05, -0.05, "Y=2", ha='left', va='top')
|
||||
ax.text(0.5, np.sqrt(3)/2 + 0.05, "Y=3", ha='center', va='bottom')
|
||||
|
||||
ax.set_aspect('equal')
|
||||
ax.axis('off')
|
||||
if show_legend:
|
||||
plt.legend(
|
||||
loc='center left',
|
||||
bbox_to_anchor=(1.05, 0.5),
|
||||
)
|
||||
plt.tight_layout()
|
||||
if save_path is None:
|
||||
plt.show()
|
||||
else:
|
||||
os.makedirs(Path(save_path).parent, exist_ok=True)
|
||||
plt.savefig(save_path)
|
||||
|
||||
|
||||
def plot_prev_points_matplot(points):
|
||||
|
||||
# project 2D
|
||||
v1 = np.array([0, 0])
|
||||
v2 = np.array([1, 0])
|
||||
v3 = np.array([0.5, np.sqrt(3) / 2])
|
||||
x = points[:, 1] + points[:, 2] * 0.5
|
||||
y = points[:, 2] * np.sqrt(3) / 2
|
||||
|
||||
# kde
|
||||
xy = np.vstack([x, y])
|
||||
kde = gaussian_kde(xy, bw_method=0.25)
|
||||
xmin, xmax = 0, 1
|
||||
ymin, ymax = 0, np.sqrt(3) / 2
|
||||
|
||||
# grid
|
||||
xx, yy = np.mgrid[xmin:xmax:200j, ymin:ymax:200j]
|
||||
positions = np.vstack([xx.ravel(), yy.ravel()])
|
||||
zz = np.reshape(kde(positions).T, xx.shape)
|
||||
|
||||
# mask points in simplex
|
||||
def in_triangle(x, y):
|
||||
return (y >= 0) & (y <= np.sqrt(3) * np.minimum(x, 1 - x))
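# the 2-simplex is drawn as the triangle with vertices (0,0), (1,0), (0.5, sqrt(3)/2):
# a projected point lies inside iff 0 <= y <= sqrt(3)*min(x, 1-x)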
|
||||
|
||||
mask = in_triangle(xx, yy)
|
||||
zz_masked = np.ma.array(zz, mask=~mask)
|
||||
|
||||
# plot
|
||||
fig, ax = plt.subplots(figsize=(6, 6))
|
||||
ax.imshow(
|
||||
np.rot90(zz_masked),
|
||||
cmap=plt.cm.viridis,
|
||||
extent=[xmin, xmax, ymin, ymax],
|
||||
alpha=0.8,
|
||||
)
|
||||
|
||||
# triangle edges
|
||||
triangle = np.array([v1, v2, v3, v1])
|
||||
ax.plot(triangle[:, 0], triangle[:, 1], color='black', lw=2)
|
||||
|
||||
# points (optional)
|
||||
ax.scatter(x, y, s=5, c='white', alpha=0.3)
|
||||
|
||||
# vertex labels
|
||||
ax.text(-0.05, -0.05, "A (1,0,0)", ha='right', va='top')
|
||||
ax.text(1.05, -0.05, "B (0,1,0)", ha='left', va='top')
|
||||
ax.text(0.5, np.sqrt(3) / 2 + 0.05, "C (0,0,1)", ha='center', va='bottom')
|
||||
|
||||
ax.set_aspect('equal')
|
||||
ax.axis('off')
|
||||
plt.show()
|
||||
|
||||
# -------- new function
|
||||
|
||||
def cartesian(p):
|
||||
dim = p.shape[-1]
|
||||
p = np.atleast_2d(p)
|
||||
x = p[:, 1] + p[:, 2] * 0.5
|
||||
y = p[:, 2] * np.sqrt(3) / 2
|
||||
return x, y
|
||||
|
||||
|
||||
def barycentric_from_xy(x, y):
|
||||
"""
|
||||
Given cartesian (x, y) coordinates in the simplex plot, returns the barycentric coordinates (p1, p2, p3).
|
||||
"""
|
||||
p3 = 2 * y / np.sqrt(3)
|
||||
p2 = x - 0.5 * p3
|
||||
p1 = 1 - p2 - p3
|
||||
return np.stack([p1, p2, p3], axis=-1)
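# Quick round-trip sketch (illustrative): projecting a prevalence vector onto the 2D
# simplex plot and mapping it back should recover the original coordinates. The helper
# name _barycentric_roundtrip_example is not used elsewhere.
def _barycentric_roundtrip_example():
    p = np.array([[0.2, 0.3, 0.5]])
    x, y = cartesian(p)                 # 2D plotting coordinates
    p_back = barycentric_from_xy(x, y)  # back to (p1, p2, p3)
    assert np.allclose(p, p_back)
    return p_back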
|
||||
|
||||
|
||||
def plot_regions(ax, region_layers, resolution, confine):
|
||||
xs = np.linspace(-0.2, 1.2, resolution)
|
||||
ys = np.linspace(-0.2, np.sqrt(3)/2 + 0.2, resolution)
|
||||
grid_x, grid_y = np.meshgrid(xs, ys)
|
||||
|
||||
pts_bary = barycentric_from_xy(grid_x, grid_y)
|
||||
|
||||
if confine:
|
||||
mask_simplex = np.all(pts_bary >= 0, axis=-1)
|
||||
else:
|
||||
mask_simplex = np.ones(grid_x.shape, dtype=bool)
|
||||
|
||||
for region in region_layers:
|
||||
mask = np.zeros_like(mask_simplex, dtype=float)
|
||||
valid_pts = pts_bary[mask_simplex]
|
||||
mask_vals = np.array([float(region["fn"](p)) for p in valid_pts])
|
||||
mask[mask_simplex] = mask_vals
|
||||
|
||||
ax.pcolormesh(
|
||||
xs, ys, mask,
|
||||
shading="auto",
|
||||
cmap=get_region_colormap(region.get("color", "blue")),
|
||||
alpha=region.get("alpha", 0.3),
|
||||
label=region.get("label", None),
|
||||
)
|
||||
|
||||
|
||||
|
||||
|
||||
def plot_points(ax, point_layers):
|
||||
for layer in point_layers:
|
||||
pts = layer["points"]
|
||||
style = layer.get("style", {})
|
||||
ax.scatter(
|
||||
*cartesian(pts),
|
||||
label=layer.get("label", None),
|
||||
**style
|
||||
)
|
||||
|
||||
def plot_density(
|
||||
ax,
|
||||
density_fn,
|
||||
resolution,
|
||||
densecolor="blue",
|
||||
alpha=1,
|
||||
high_contrast=True # maps the density values to [0,1] to enhance visualization with the cmap
|
||||
):
|
||||
xs = np.linspace(-0.2, 1.2, resolution)
|
||||
ys = np.linspace(-0.2, np.sqrt(3)/2 + 0.2, resolution)
|
||||
grid_x, grid_y = np.meshgrid(xs, ys)
|
||||
|
||||
white_to_color = LinearSegmentedColormap.from_list(
|
||||
"white_to_color",
|
||||
["white", densecolor]
|
||||
)
|
||||
|
||||
pts_bary = barycentric_from_xy(grid_x, grid_y)
|
||||
|
||||
density = np.zeros(grid_x.shape)
|
||||
flat_pts = pts_bary.reshape(-1, 3)
|
||||
|
||||
density_vals = np.array(density_fn(flat_pts))
|
||||
#density_vals = np.array([density_fn(p) for p in flat_pts])
|
||||
if high_contrast:
|
||||
min_v, max_v = np.min(density_vals), np.max(density_vals)
|
||||
density_vals = (density_vals - min_v)/(max_v-min_v)
|
||||
density[:] = density_vals.reshape(grid_x.shape)
|
||||
|
||||
|
||||
ax.pcolormesh(
|
||||
xs,
|
||||
ys,
|
||||
density,
|
||||
shading="auto",
|
||||
cmap=white_to_color,
|
||||
alpha=alpha,
|
||||
)
|
||||
|
||||
|
||||
def plot_simplex(
|
||||
point_layers=None,
|
||||
region_layers=None,
|
||||
density_function=None,
|
||||
density_color="#1f77b4", # blue from tab10
|
||||
density_alpha=1,
|
||||
resolution=1000,
|
||||
confine_region_in_simplex=False,
|
||||
show_legend=True,
|
||||
save_path=None,
|
||||
):
|
||||
fig, ax = plt.subplots(figsize=(6, 6))
|
||||
|
||||
if density_function is not None:
|
||||
plot_density(
|
||||
ax,
|
||||
density_function,
|
||||
resolution,
|
||||
density_color,
|
||||
density_alpha,
|
||||
)
|
||||
|
||||
if region_layers:
|
||||
plot_regions(ax, region_layers, resolution, confine_region_in_simplex)
|
||||
|
||||
if point_layers:
|
||||
plot_points(ax, point_layers)
|
||||
|
||||
# simplex edges
|
||||
triangle = np.array([[0,0],[1,0],[0.5,np.sqrt(3)/2],[0,0]])
|
||||
ax.plot(triangle[:,0], triangle[:,1], color="black")
|
||||
|
||||
# labels
|
||||
ax.text(-0.05, -0.05, "Y=1", ha="right", va="top")
|
||||
ax.text(1.05, -0.05, "Y=2", ha="left", va="top")
|
||||
ax.text(0.5, np.sqrt(3)/2 + 0.05, "Y=3", ha="center", va="bottom")
|
||||
|
||||
ax.set_aspect("equal")
|
||||
ax.axis("off")
|
||||
|
||||
if show_legend:
|
||||
ax.legend(loc="center left", bbox_to_anchor=(1.05, 0.5))
|
||||
|
||||
plt.tight_layout()
|
||||
if save_path:
|
||||
os.makedirs(Path(save_path).parent, exist_ok=True)
|
||||
plt.savefig(save_path, bbox_inches="tight")
|
||||
else:
|
||||
plt.show()
|
||||
|
||||
|
||||
def gaussian_kernel(p, mu=np.array([0.6, 0.2, 0.2]), sigma=0.5):
|
||||
return np.exp(-np.sum((p - mu) ** 2) / (2 * sigma ** 2))
|
||||
|
||||
|
||||
def plot_kernels():
|
||||
from quapy.method._kdey import KDEBase
|
||||
|
||||
# kernel = 'aitchison'  # unused: immediately shadowed by the kernel() helper defined below
|
||||
|
||||
def kernel(p, center, bandwidth=0.1, kernel='gaussian', within_simplex=False):
|
||||
assert within_simplex or kernel == 'gaussian', f'cannot compute {kernel=} outside the simplex'
|
||||
X = np.asarray(center).reshape(-1, 3)
|
||||
p = np.asarray(p).reshape(-1, 3)
|
||||
KDE = KDEBase()
|
||||
kde = KDE.get_kde_function(X, bandwidth=bandwidth, kernel=kernel)
|
||||
|
||||
if within_simplex:
|
||||
density = np.zeros(shape=p.shape[0])
|
||||
p_mask = in_simplex(p)
|
||||
density_vals = KDE.pdf(kde, p[p_mask], kernel=kernel)
|
||||
density[p_mask] = density_vals
|
||||
else:
|
||||
density = KDE.pdf(kde, p, kernel=kernel)
|
||||
return density
|
||||
|
||||
plt.rcParams.update({
|
||||
'font.size': 15,
|
||||
'axes.titlesize': 12,
|
||||
'axes.labelsize': 10,
|
||||
'xtick.labelsize': 8,
|
||||
'ytick.labelsize': 8,
|
||||
'legend.fontsize': 9,
|
||||
})
|
||||
|
||||
dot_style = {"color": "red", "alpha": 1, "s": 100, 'linewidth': 1.5, 'edgecolors': "white"}
|
||||
# center = np.asarray([[0.05, 0.03, 0.92],[0.5, 0.4, 0.1]])
|
||||
center = np.asarray([0.33, 0.33, 0.34])
|
||||
point_layer = [
|
||||
{"points": center, "label": "kernels", "style": dot_style},
|
||||
]
|
||||
density_fn = lambda p: kernel(p, center, bandwidth=0.2, kernel='gaussian', within_simplex=False)
|
||||
plot_simplex(point_layers=point_layer, density_function=density_fn, show_legend=False,
|
||||
save_path='./plots/kernels/gaussian_center.png')
|
||||
|
||||
density_fn = lambda p: kernel(p, center, bandwidth=.6, kernel='aitchison', within_simplex=True)
|
||||
plot_simplex(point_layers=point_layer, density_function=density_fn, show_legend=False,
|
||||
save_path='./plots/kernels/aitchison_center.png')
|
||||
|
||||
center = np.asarray([0.05, 0.05, 0.9])
|
||||
point_layer[0]['points'] = center
|
||||
density_fn = lambda p: kernel(p, center, bandwidth=0.2, kernel='gaussian', within_simplex=False)
|
||||
plot_simplex(point_layers=point_layer, density_function=density_fn, show_legend=False,
|
||||
save_path='./plots/kernels/gaussian_vertex_005_005_09.png')
|
||||
|
||||
density_fn = lambda p: kernel(p, center, bandwidth=1, kernel='aitchison', within_simplex=True)
|
||||
plot_simplex(point_layers=point_layer, density_function=density_fn, show_legend=False,
|
||||
save_path='./plots/kernels/aitchison_vertex_005_005_09.png')
|
||||
|
||||
center = np.asarray([0.45, 0.45, 0.1])
|
||||
point_layer[0]['points'] = center
|
||||
density_fn = lambda p: kernel(p, center, bandwidth=0.2, kernel='gaussian', within_simplex=False)
|
||||
plot_simplex(point_layers=point_layer, density_function=density_fn, show_legend=False,
|
||||
save_path='./plots/kernels/gaussian_border_045_045_01.png')
|
||||
|
||||
density_fn = lambda p: kernel(p, center, bandwidth=.7, kernel='aitchison', within_simplex=True)
|
||||
plot_simplex(point_layers=point_layer, density_function=density_fn, show_legend=False,
|
||||
save_path='./plots/kernels/aitchison_border_045_045_01.png')
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
np.random.seed(1)
|
||||
|
||||
n = 1000
|
||||
alpha = [15,10,7]
|
||||
prevs = np.random.dirichlet(alpha, size=n)
|
||||
|
||||
def regions():
|
||||
confs = [0.9, 0.95, 0.99]
|
||||
# yield 'CI', [(f'{int(c*100)}%', CI(prevs, confidence_level=c).coverage) for c in confs]
|
||||
# yield 'CI-b', [(f'{int(c * 100)}%', CI(prevs, confidence_level=c, bonferroni_correction=True).coverage) for c in confs]
|
||||
# yield 'CI-CLR', [(f'{int(c * 100)}%', CICLR(prevs, confidence_level=c).coverage) for c in confs]
|
||||
# yield 'CI-CLR-b', [(f'{int(c * 100)}%', CICLR(prevs, confidence_level=c, bonferroni_correction=True).coverage) for c in confs]
|
||||
# yield 'CI-ILR', [(f'{int(c * 100)}%', CIILR(prevs, confidence_level=c).coverage) for c in confs]
|
||||
yield 'CI-ILR-b', [(f'{int(c * 100)}%', CIILR(prevs, confidence_level=c, bonferroni_correction=True).coverage) for c in confs]
|
||||
# yield 'CE', [(f'{int(c*100)}%', CE(prevs, confidence_level=c).coverage) for c in confs]
|
||||
# yield 'CE-CLR', [(f'{int(c*100)}%', CLR(prevs, confidence_level=c).coverage) for c in confs]
|
||||
# yield 'CE-ILR', [(f'{int(c*100)}%', ILR(prevs, confidence_level=c).coverage) for c in confs]
|
||||
|
||||
resolution = 1000
|
||||
alpha_str = ','.join([f'{str(i)}' for i in alpha])
|
||||
|
||||
dot_style = {"color": "gray", "alpha": .5, "s": 15, 'linewidth': .25, 'edgecolors': "black"}
|
||||
point_layer = [
|
||||
{"points": prevs, "label": "points", "style": dot_style},
|
||||
]
|
||||
|
||||
for crname, cr in regions():
|
||||
region = [{'fn': fn, 'alpha':.6, 'label':label} for label, fn in cr]
|
||||
|
||||
plot_simplex(point_layers=point_layer, region_layers=region, show_legend=False, resolution=resolution, save_path=f'./plots/regions/{crname}.png')
|
||||
|
||||
# def regions():
|
||||
# confs = [0.99, 0.95, 0.90]
|
||||
# yield 'CI', [(f'{int(c*100)}%', CI(prevs, confidence_level=c).coverage) for c in confs]
|
||||
# yield 'CI-b', [(f'{int(c * 100)}%', CI(prevs, confidence_level=c, bonferroni_correction=True).coverage) for c in confs]
|
||||
# yield 'CE', [(f'{int(c*100)}%', CE(prevs, confidence_level=c).coverage) for c in confs]
|
||||
# yield 'CLR', [(f'{int(c*100)}%', CLR(prevs, confidence_level=c).coverage) for c in confs]
|
||||
# yield 'ILR', [(f'{int(c*100)}%', ILR(prevs, confidence_level=c).coverage) for c in confs]
|
||||
|
||||
# resolution = 100
|
||||
# alpha_str = ','.join([f'{str(i)}' for i in alpha])
|
||||
# region = CI(prevs, confidence_level=.95, bonferroni_correction=True)
|
||||
# p = None # np.asarray([0.1, 0.8, 0.1])
|
||||
# plot_prev_points(prevs,
|
||||
# show_samples=True,
|
||||
# show_mean=None,
|
||||
# # show_mean=prevs.mean(axis=0),
|
||||
# show_legend=False,
|
||||
# # region=[('', region.coverage)],
|
||||
# # region_resolution=resolution,
|
||||
# color='blue',
|
||||
# true_prev=p,
|
||||
# # train_prev=region.closest_point_in_region(p),
|
||||
# save_path=f'./plots/prior_test/uniform.png',
|
||||
# )
|
||||
|
||||
plt.rcParams.update({
|
||||
'font.size': 10,
|
||||
'axes.titlesize': 12,
|
||||
'axes.labelsize': 10,
|
||||
'xtick.labelsize': 8,
|
||||
'ytick.labelsize': 8,
|
||||
'legend.fontsize': 9,
|
||||
})
|
||||
|
||||
n = 1000
|
||||
train_style = {"color": "blue", "alpha": 0.5, "s":15, 'linewidth':0.5, 'edgecolors':None}
|
||||
test_style = {"color": "red", "alpha": 0.5, "s": 15, 'linewidth': 0.5, 'edgecolors': None}
|
||||
|
||||
|
||||
# train_prevs = np.random.dirichlet(alpha=[1, 1, 1], size=n)
|
||||
# test_prevs = np.random.dirichlet(alpha=[1, 1, 1], size=n)
|
||||
# plot_simplex(
|
||||
# point_layers=[
|
||||
# {"points": train_prevs, "label": "train", "style": train_style},
|
||||
# {"points": test_prevs, "label": "test", "style": test_style},
|
||||
# ],
|
||||
# save_path=f'./plots/prior_test/uniform.png'
|
||||
# )
|
||||
#
|
||||
# alpha = [40, 10, 10]
|
||||
# train_prevs = np.random.dirichlet(alpha=alpha, size=n)
|
||||
# test_prevs = np.random.dirichlet(alpha=alpha, size=n)
|
||||
# plot_simplex(
|
||||
# point_layers=[
|
||||
# {"points": train_prevs, "label": "train", "style": train_style},
|
||||
# {"points": test_prevs, "label": "test", "style": test_style},
|
||||
# ],
|
||||
# save_path=f'./plots/prior_test/informative.png'
|
||||
# )
|
||||
|
||||
# train_prevs = np.random.dirichlet(alpha=[8, 1, 1], size=n)
|
||||
# test_prevs = np.random.dirichlet(alpha=[1, 8, 1], size=n)
|
||||
# plot_simplex(
|
||||
# point_layers=[
|
||||
# {"points": train_prevs, "label": "train", "style": train_style},
|
||||
# {"points": test_prevs, "label": "test", "style": test_style},
|
||||
# ],
|
||||
# save_path=f'./plots/prior_test/wrong.png'
|
||||
# )
|
||||
|
||||
# p = 0.6
|
||||
#
|
||||
# K = 3
|
||||
# # alpha = [p] + [(1. - p) / (K - 1)] * (K - 1)
|
||||
# alpha = [0.095, 0.246, 0.658] # connect-4
|
||||
# alpha = np.array(alpha)
|
||||
#
|
||||
#
|
||||
# for c in [50, 500, 5_000]:
|
||||
# alpha_tr = alpha * c
|
||||
# alpha_te = antagonistic_prevalence(alpha, strength=1) * c
|
||||
# train_prevs = np.random.dirichlet(alpha=alpha_tr, size=n)
|
||||
# test_prevs = np.random.dirichlet(alpha=alpha_te, size=n)
|
||||
# plot_simplex(
|
||||
# point_layers=[
|
||||
# {"points": train_prevs, "label": "train", "style": train_style},
|
||||
# {"points": test_prevs, "label": "test", "style": test_style},
|
||||
# ],
|
||||
# save_path=f'./plots/prior_test/concentration_{c}.png'
|
||||
# )
|
||||
|
||||
# plot_kernels()
|
||||
|
|
@ -0,0 +1,297 @@
|
|||
from collections import defaultdict
|
||||
import os
from pathlib import Path

import pandas as pd
|
||||
|
||||
import model_selection
|
||||
import quapy as qp
|
||||
from BayesianKDEy._bayeisan_kdey import BayesianKDEy
|
||||
from BayesianKDEy.temperature_calibration import temp_calibration
|
||||
from commons import *
|
||||
from data import Dataset
|
||||
from protocol import DirichletProtocol
|
||||
from quapy.method.confidence import BayesianCC
|
||||
from quapy.method.aggregative import ACC, AggregativeQuantifier
|
||||
from sklearn.linear_model import LogisticRegression as LR
|
||||
from copy import deepcopy as cp
|
||||
from tqdm import tqdm
|
||||
from full_experiments import model_selection
|
||||
from itertools import chain
|
||||
|
||||
|
||||
def select_imbalanced_datasets(top_m=10):
|
||||
datasets_prevs = []
|
||||
# choose top-m imbalanced datasets
|
||||
for data_name in multiclass['datasets']:
|
||||
data_prev = multiclass['fetch_fn'](data_name).training.prevalence()
|
||||
balance = normalized_entropy(data_prev)
|
||||
datasets_prevs.append((data_name, balance))
|
||||
datasets_prevs.sort(key=lambda x: x[1])
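# note: lower normalized entropy means a more imbalanced class distribution, so sorting in ascending
# order places the most imbalanced datasets first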
|
||||
data_selected = [data_name for data_name, balance in datasets_prevs[:top_m]]
|
||||
return data_selected
|
||||
|
||||
|
||||
def methods():
|
||||
acc_hyper = {}
|
||||
kdey_hyper = {'bandwidth': [0.001, 0.005, 0.01, 0.05, 0.1, 0.2]}
|
||||
kdey_hyper_clr = {'bandwidth': [0.05, 0.1, 0.5, 1., 2., 5.]}
|
||||
|
||||
yield 'BayesianACC', ACC(LR()), acc_hyper, lambda hyper: BayesianCC(LR(), mcmc_seed=0, prior='uniform')
|
||||
yield f'BaKDE-Ait', KDEyCLR(LR()), kdey_hyper_clr, lambda hyper: BayesianKDEy(kernel='aitchison', mcmc_seed=0, engine='numpyro', temperature=None, prior='uniform', **hyper)
|
||||
yield f'BaKDE-Ait-T2', KDEyCLR(LR()), kdey_hyper_clr, lambda hyper: BayesianKDEy(kernel='aitchison', mcmc_seed=0,
|
||||
engine='numpyro', temperature=2.,
|
||||
prior='uniform', **hyper)
|
||||
yield f'BaKDE-Ait-T1', KDEyCLR(LR()), kdey_hyper_clr, lambda hyper: BayesianKDEy(kernel='aitchison', mcmc_seed=0,
|
||||
engine='numpyro', temperature=1.,
|
||||
prior='uniform', **hyper)
|
||||
|
||||
|
||||
def run_test(test, alpha_test, alpha_train, concentration, prior_type, bay_quant, train_prev, dataset_name, method_name, results):
|
||||
test_generator = DirichletProtocol(test, alpha=alpha_test, repeats=100, random_state=0)
|
||||
for i, (sample_X, true_prev) in tqdm(enumerate(test_generator()), total=test_generator.total(),
|
||||
desc=f'{method_name} {prior_type} alpha with {concentration=}'):
|
||||
estim_prev, region = bay_quant.predict_conf(sample_X)
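# descriptive note on the recorded fields: 'shift' is the AE between the sample's true prevalence and the
# training prevalence (amount of prior probability shift); 'coverage' is 1.0 if the true prevalence falls
# within the returned confidence region and 0.0 otherwise; 'amplitude' is a Monte Carlo estimate of the
# portion of the simplex occupied by the region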
|
||||
|
||||
results['dataset'].append(dataset_name)
|
||||
results['method_name'].append(method_name)
|
||||
results['prior-type'].append(prior_type)
|
||||
results['train-prev'].append(train_prev)
|
||||
results['concentration'].append(concentration)
|
||||
results['train-alpha'].append(alpha_train)
|
||||
results['test-alpha'].append(alpha_test)
|
||||
results['true-prevs'].append(true_prev)
|
||||
results['point-estim'].append(estim_prev)
|
||||
results['shift'].append(qp.error.ae(true_prev, train_prev))
|
||||
results['ae'].append(qp.error.ae(prevs_true=true_prev, prevs_hat=estim_prev))
|
||||
results['sre'].append(qp.error.sre(prevs_true=true_prev, prevs_hat=estim_prev, prevs_train=train_prev))
|
||||
results['rae'].append(qp.error.rae(prevs_true=true_prev, prevs_hat=estim_prev))
|
||||
results['coverage'].append(region.coverage(true_prev))
|
||||
results['amplitude'].append(region.montecarlo_proportion(n_trials=50_000))
|
||||
results['samples'].append(region.samples)
|
||||
|
||||
|
||||
|
||||
def experiment(dataset: Dataset,
|
||||
dataset_name: str,
|
||||
point_quantifier: AggregativeQuantifier,
|
||||
grid: dict,
|
||||
bay_constructor,
|
||||
method_name:str,
|
||||
hyper_choice_path: Path):
|
||||
|
||||
with qp.util.temp_seed(0):
|
||||
|
||||
training, test = dataset.train_test
|
||||
|
||||
# model selection
|
||||
best_hyperparams = qp.util.pickled_resource(
|
||||
hyper_choice_path, model_selection, training, cp(point_quantifier), grid
|
||||
)
|
||||
|
||||
bay_quant = bay_constructor(best_hyperparams)
|
||||
if hasattr(bay_quant, 'temperature') and bay_quant.temperature is None:
|
||||
train, val = training.split_stratified(train_prop=0.6, random_state=0)
|
||||
temperature = temp_calibration(bay_quant, train, val, temp_grid=[.5, 1., 1.5, 2., 5., 10., 100.], n_jobs=-1)
|
||||
bay_quant.temperature = temperature
|
||||
bay_quant.fit(*training.Xy)
|
||||
|
||||
# test
|
||||
train_prev = training.prevalence()
|
||||
results = defaultdict(list)
|
||||
|
||||
for concentration in [50, 500, 5_000]:
|
||||
alpha_train = train_prev * concentration
|
||||
bay_quant.prior = alpha_train
|
||||
|
||||
# informative prior
|
||||
alpha_test_informative = alpha_train
|
||||
prior_type = 'informative'
|
||||
run_test(test, alpha_test_informative, alpha_train, concentration, prior_type, bay_quant, train_prev, dataset_name, method_name, results)
|
||||
|
||||
# wrong (antagonistic) prior
|
||||
alpha_test_wrong = antagonistic_prevalence(train_prev, strength=0.25) * concentration
|
||||
prior_type = 'wrong'
|
||||
run_test(test, alpha_test_wrong, alpha_train, concentration, prior_type, bay_quant, train_prev, dataset_name, method_name, results)
|
||||
|
||||
return results
|
||||
|
||||
|
||||
def concat_reports(reports):
|
||||
final_report = {
|
||||
k: list(chain.from_iterable(report[k] for report in reports))
|
||||
for k in reports[0]
|
||||
}
|
||||
df = pd.DataFrame(final_report)
|
||||
return df
|
||||
|
||||
|
||||
def error_vs_concentration_plot(df, err='ae', save_path=None):
|
||||
import seaborn as sns
|
||||
import matplotlib.pyplot as plt
|
||||
|
||||
sns.set_theme(style="whitegrid", context="paper")
|
||||
|
||||
fig, axes = plt.subplots(1, 2, figsize=(10, 4), sharey=True)
|
||||
|
||||
for ax, prior in zip(axes, ['informative', 'wrong']):
|
||||
sub = df[df['prior-type'] == prior]
|
||||
|
||||
sns.lineplot(
|
||||
data=sub,
|
||||
x='concentration',
|
||||
y=err,
|
||||
hue='method_name',
|
||||
marker='o',
|
||||
errorbar='se',  # or 'sd'
|
||||
ax=ax
|
||||
)
|
||||
|
||||
ax.set_xscale('log')
|
||||
ax.set_title(f'Prior: {prior}')
|
||||
ax.set_xlabel('Concentration')
|
||||
ax.set_ylabel('M'+err.upper())
|
||||
|
||||
plt.tight_layout()
|
||||
if save_path is None:
|
||||
plt.show()
|
||||
else:
|
||||
os.makedirs(Path(save_path).parent, exist_ok=True)
|
||||
plt.savefig(save_path)
|
||||
|
||||
|
||||
def coverage_vs_concentration_plot(df, save_path=None):
|
||||
import seaborn as sns
|
||||
import matplotlib.pyplot as plt
|
||||
|
||||
fig, axes = plt.subplots(1, 2, figsize=(10, 4), sharey=True)
|
||||
|
||||
for ax, prior in zip(axes, ['informative', 'wrong']):
|
||||
sub = df[df['prior-type'] == prior]
|
||||
|
||||
sns.lineplot(
|
||||
data=sub,
|
||||
x='concentration',
|
||||
y='coverage',
|
||||
hue='method_name',
|
||||
marker='o',
|
||||
errorbar='se',
|
||||
ax=ax
|
||||
)
|
||||
|
||||
ax.set_xscale('log')
|
||||
ax.set_ylim(0, 1.05)
|
||||
ax.set_title(f'Prior: {prior}')
|
||||
ax.set_xlabel('Concentration')
|
||||
ax.set_ylabel('Coverage')
|
||||
|
||||
plt.tight_layout()
|
||||
if save_path is None:
|
||||
plt.show()
|
||||
else:
|
||||
os.makedirs(Path(save_path).parent, exist_ok=True)
|
||||
plt.savefig(save_path)
|
||||
|
||||
|
||||
def amplitude_vs_concentration_plot(df, save_path=None):
|
||||
import seaborn as sns
|
||||
import matplotlib.pyplot as plt
|
||||
|
||||
fig, axes = plt.subplots(1, 2, figsize=(10, 4), sharey=True)
|
||||
|
||||
for ax, prior in zip(axes, ['informative', 'wrong']):
|
||||
sub = df[df['prior-type'] == prior]
|
||||
|
||||
sns.lineplot(
|
||||
data=sub,
|
||||
x='concentration',
|
||||
y='amplitude',
|
||||
hue='method_name',
|
||||
marker='o',
|
||||
errorbar='se',
|
||||
ax=ax
|
||||
)
|
||||
|
||||
ax.set_xscale('log')
|
||||
ax.set_title(f'Prior: {prior}')
|
||||
ax.set_xlabel('Concentration')
|
||||
ax.set_ylabel('Amplitude')
|
||||
|
||||
plt.tight_layout()
|
||||
if save_path is None:
|
||||
plt.show()
|
||||
else:
|
||||
os.makedirs(Path(save_path).parent, exist_ok=True)
|
||||
plt.savefig(save_path)
|
||||
|
||||
|
||||
def coverage_vs_amplitude_plot(df, save_path=None):
|
||||
import seaborn as sns
|
||||
import matplotlib.pyplot as plt
|
||||
|
||||
agg = (
|
||||
df
|
||||
.groupby(['prior-type', 'method_name', 'concentration'])
|
||||
.agg(
|
||||
coverage=('coverage', 'mean'),
|
||||
amplitude=('amplitude', 'mean')
|
||||
)
|
||||
.reset_index()
|
||||
)
|
||||
|
||||
fig, axes = plt.subplots(1, 2, figsize=(10, 4), sharey=True)
|
||||
|
||||
for ax, prior in zip(axes, ['informative', 'wrong']):
|
||||
sub = agg[agg['prior-type'] == prior]
|
||||
|
||||
sns.scatterplot(
|
||||
data=sub,
|
||||
x='amplitude',
|
||||
y='coverage',
|
||||
hue='method_name',
|
||||
style='concentration',
|
||||
s=80,
|
||||
ax=ax
|
||||
)
|
||||
|
||||
ax.set_ylim(0, 1.05)
|
||||
ax.set_title(f'Prior: {prior}')
|
||||
ax.set_xlabel('Amplitude')
|
||||
ax.set_ylabel('Coverage')
|
||||
ax.axhline(0.95, linestyle='--', color='gray', alpha=0.7)
|
||||
|
||||
plt.tight_layout()
|
||||
if save_path is None:
|
||||
plt.show()
|
||||
else:
|
||||
os.makedirs(Path(save_path).parent, exist_ok=True)
|
||||
plt.savefig(save_path)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
result_dir = Path('./results/prior_effect')
|
||||
selected = select_imbalanced_datasets(10)
|
||||
print(f'selected datasets={selected}')
|
||||
qp.environ['SAMPLE_SIZE'] = multiclass['sample_size']
|
||||
reports = []
|
||||
for data_name in selected:
|
||||
data = multiclass['fetch_fn'](data_name)
|
||||
for method_name, surrogate_quant, hyper_params, bay_constructor in methods():
|
||||
result_path = experiment_path(result_dir, data_name, method_name)
|
||||
hyper_path = experiment_path(result_dir/'hyperparams', data_name, surrogate_quant.__class__.__name__)
|
||||
|
||||
print(f'Launching {method_name} in dataset {data_name}')
|
||||
report = qp.util.pickled_resource(
|
||||
result_path, experiment, data, data_name, surrogate_quant, hyper_params, bay_constructor, method_name, hyper_path
|
||||
)
|
||||
reports.append(report)
|
||||
|
||||
# concat all reports as a dataframe
|
||||
df = concat_reports(reports)
|
||||
|
||||
# for data_name in selected:
|
||||
# print(data_name)
|
||||
# df_ = df[df['dataset']==data_name]
|
||||
df_ = df
|
||||
error_vs_concentration_plot(df_, save_path='./plots/prior_effect/error_vs_concentration.pdf')
|
||||
coverage_vs_concentration_plot(df_, save_path='./plots/prior_effect/coverage_vs_concentration.pdf')
|
||||
amplitude_vs_concentration_plot(df_, save_path='./plots/prior_effect/amplitude_vs_concentration.pdf')
|
||||
coverage_vs_amplitude_plot(df_, save_path='./plots/prior_effect/coverage_vs_amplitude.pdf')
|
||||
|
||||
|
|
@ -0,0 +1,56 @@
|
|||
import warnings
|
||||
|
||||
from sklearn.linear_model import LogisticRegression
|
||||
import quapy as qp
|
||||
from BayesianKDEy._bayeisan_kdey import BayesianKDEy
|
||||
from BayesianKDEy.plot_simplex import plot_prev_points, plot_prev_points_matplot
|
||||
from method.confidence import ConfidenceIntervals
|
||||
from quapy.functional import strprev
|
||||
from quapy.method.aggregative import KDEyML
|
||||
from quapy.protocol import UPP
|
||||
import quapy.functional as F
|
||||
import numpy as np
|
||||
from tqdm import tqdm
|
||||
from scipy.stats import dirichlet
|
||||
|
||||
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
qp.environ["SAMPLE_SIZE"] = 500
|
||||
cls = LogisticRegression()
|
||||
bayes_kdey = BayesianKDEy(cls, bandwidth=.3, kernel='aitchison', mcmc_seed=0)
|
||||
|
||||
datasets = qp.datasets.UCI_BINARY_DATASETS
|
||||
train, test = qp.datasets.fetch_UCIBinaryDataset(datasets[0]).train_test
|
||||
|
||||
# train, test = qp.datasets.fetch_UCIMulticlassDataset('academic-success', standardize=True).train_test
|
||||
|
||||
with qp.util.temp_seed(0):
|
||||
print('fitting KDEy')
|
||||
bayes_kdey.fit(*train.Xy)
|
||||
|
||||
shifted = test.sampling(500, *[0.2, 0.8])
|
||||
# shifted = test.sampling(500, *test.prevalence()[::-1])
|
||||
# shifted = test.sampling(500, *F.uniform_prevalence_sampling(train.n_classes))
|
||||
prev_hat = bayes_kdey.predict(shifted.X)
|
||||
mae = qp.error.mae(shifted.prevalence(), prev_hat)
|
||||
print(f'true_prev={strprev(shifted.prevalence())}')
|
||||
print(f'prev_hat={strprev(prev_hat)}, {mae=:.4f}')
|
||||
|
||||
prev_hat, conf_interval = bayes_kdey.predict_conf(shifted.X)
|
||||
|
||||
mae = qp.error.mae(shifted.prevalence(), prev_hat)
|
||||
print(f'mean posterior {strprev(prev_hat)}, {mae=:.4f}')
|
||||
print(f'CI={conf_interval}')
|
||||
print(f'\tcontains true={conf_interval.coverage(true_value=shifted.prevalence())==1}')
|
||||
print(f'\tamplitude={conf_interval.montecarlo_proportion(50_000)*100.:.3f}%')
|
||||
|
||||
if train.n_classes == 3:
|
||||
plot_prev_points(bayes_kdey.prevalence_samples, true_prev=shifted.prevalence(), point_estim=prev_hat, train_prev=train.prevalence())
|
||||
# plot_prev_points_matplot(samples)
|
||||
|
||||
# report = qp.evaluation.evaluation_report(kdey, protocol=UPP(test), verbose=True)
|
||||
# print(report.mean(numeric_only=True))
|
||||
|
||||
|
||||
|
|
@ -0,0 +1,99 @@
|
|||
import os
|
||||
import pickle
|
||||
import warnings
|
||||
from os.path import join
|
||||
from pathlib import Path
|
||||
|
||||
from sklearn.calibration import CalibratedClassifierCV
|
||||
from sklearn.linear_model import LogisticRegression as LR
|
||||
from sklearn.model_selection import GridSearchCV, StratifiedKFold
|
||||
from copy import deepcopy as cp
|
||||
import quapy as qp
|
||||
from BayesianKDEy._bayeisan_kdey import BayesianKDEy
|
||||
from BayesianKDEy.full_experiments import experiment
|
||||
from BayesianKDEy.commons import experiment_path, KDEyCLR
|
||||
from quapy.data import LabelledCollection
|
||||
from quapy.method.aggregative import DistributionMatchingY as DMy, AggregativeQuantifier
|
||||
from quapy.method.base import BinaryQuantifier, BaseQuantifier
|
||||
from quapy.model_selection import GridSearchQ
|
||||
from quapy.data import Dataset
|
||||
# from BayesianKDEy.plot_simplex import plot_prev_points, plot_prev_points_matplot
|
||||
from quapy.method.confidence import ConfidenceIntervals, BayesianCC, PQ, WithConfidenceABC, AggregativeBootstrap
|
||||
from quapy.functional import strprev
|
||||
from quapy.method.aggregative import KDEyML, ACC
|
||||
from quapy.protocol import UPP
|
||||
import quapy.functional as F
|
||||
import numpy as np
|
||||
from tqdm import tqdm
|
||||
from scipy.stats import dirichlet
|
||||
from collections import defaultdict
|
||||
from time import time
|
||||
from sklearn.base import clone, BaseEstimator
|
||||
from temperature_calibration import temp_calibration
|
||||
|
||||
def methods():
|
||||
"""
|
||||
Returns a tuple (name, quantifier, hyperparams, bayesian/bootstrap_constructor), where:
|
||||
- name: is a str representing the name of the method (e.g., 'BayesianKDEy')
|
||||
- quantifier: is the base model (e.g., KDEyML())
|
||||
- hyperparams: is a dictionary for the quantifier (e.g., {'bandwidth': [0.001, 0.005, 0.01, 0.05, 0.1, 0.2]})
|
||||
- bayesian/bootstrap_constructor: is a function that instantiates the bayesian or bootstrap method with the
|
||||
quantifier with optimized hyperparameters
|
||||
"""
|
||||
acc_hyper = {}
|
||||
hdy_hyper = {'nbins': [3,4,5,8,16,32]}
|
||||
kdey_hyper = {'bandwidth': [0.001, 0.005, 0.01, 0.05, 0.1, 0.2]}
|
||||
kdey_hyper_clr = {'bandwidth': [0.05, 0.1, 0.5, 1., 2., 5.]}
|
||||
|
||||
wrap_hyper = lambda dic: {f'quantifier__{k}':v for k,v in dic.items()}
|
||||
|
||||
# yield 'BootstrapKDEy', KDEyML(LR()), kdey_hyper, lambda hyper: AggregativeBootstrap(KDEyML(LR(), **hyper), n_test_samples=1000, random_state=0, verbose=True),
|
||||
# yield 'BayesianKDEy', KDEyML(LR()), kdey_hyper, lambda hyper: BayesianKDEy(mcmc_seed=0, **hyper),
|
||||
# for T in [10, 20, 50, 100., 500]:
|
||||
# yield f'BaKDE-CLR-T{T}', KDEyCLR(LR()), kdey_hyper_clr, lambda hyper: BayesianKDEy(kernel='aitchison', explore='ilr', mcmc_seed=0, temperature=T, num_warmup=3000, num_samples=1000, step_size=.1, **hyper),
|
||||
|
||||
# yield f'BaKDE-emcee', KDEyML(LR()), kdey_hyper, lambda hyper: BayesianKDEy(mcmc_seed=0, num_warmup=100, num_samples=100, step_size=.1, **hyper),
|
||||
yield f'BaKDE-numpyro', KDEyML(LR()), kdey_hyper, lambda hyper: BayesianKDEy(mcmc_seed=0, engine='numpyro', **hyper)
|
||||
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
|
||||
binary = {
|
||||
'fetch_fn': qp.datasets.fetch_UCIBinaryDataset,
|
||||
'sample_size': 500
|
||||
}
|
||||
|
||||
multiclass = {
|
||||
'fetch_fn': qp.datasets.fetch_UCIMulticlassDataset,
|
||||
'sample_size': 1000
|
||||
}
|
||||
|
||||
setup = multiclass
|
||||
# data_name = 'isolet'
|
||||
# data_name = 'academic-success'
|
||||
data_name = 'poker_hand'
|
||||
|
||||
qp.environ['SAMPLE_SIZE'] = setup['sample_size']
|
||||
print(f'dataset={data_name}')
|
||||
data = setup['fetch_fn'](data_name)
|
||||
is_binary = data.n_classes==2
|
||||
hyper_subdir = Path('./results') / 'hyperparams' / ('binary' if is_binary else 'multiclass')
|
||||
for method_name, method, hyper_params, withconf_constructor in methods():
|
||||
hyper_path = experiment_path(hyper_subdir, data_name, method.__class__.__name__)
|
||||
report = experiment(data, method, method_name, hyper_params, withconf_constructor, hyper_path)
|
||||
|
||||
print(f'dataset={data_name}, '
|
||||
f'method={method_name}: '
|
||||
f'mae={report["results"]["ae"].mean():.3f}, '
|
||||
f'coverage={report["results"]["coverage"].mean():.5f}, '
|
||||
f'amplitude={report["results"]["amplitude"].mean():.5f}, ')
|
||||
|
||||
# best_hyperparams = pickle.load(open(hyper_path, 'rb'))
|
||||
# method = withconf_constructor(best_hyperparams)
|
||||
#
|
||||
# train, val = data.training.split_stratified(train_prop=0.6, random_state=0)
|
||||
# temp_calibration(method, train, val, amplitude_threshold=0.1475, temp_grid=[2., 10., 100., 1000.])#temp_grid=[1., 1.25, 1.5, 1.75, 2.])
|
||||
|
||||
|
||||
|
||||
|
|
@ -0,0 +1,129 @@
|
|||
import os.path
|
||||
import pickle
|
||||
from collections import defaultdict
|
||||
from pathlib import Path
|
||||
import pandas as pd
|
||||
import quapy as qp
|
||||
from BayesianKDEy._bayeisan_kdey import BayesianKDEy
|
||||
from BayesianKDEy.commons import experiment_path
|
||||
from quapy.protocol import UPP
|
||||
import numpy as np
|
||||
from tqdm import tqdm
|
||||
from joblib import Parallel, delayed
|
||||
from itertools import product
|
||||
|
||||
# Is there a correspondence between smaller bandwidths and overconfident likelihoods? If so, smaller bandwidths
|
||||
# should correlate with higher calibrated temperatures; this script aims at analyzing this trend.
|
||||
|
||||
datasets = qp.datasets.UCI_MULTICLASS_DATASETS
|
||||
|
||||
def show(results, values):
|
||||
df_res = pd.DataFrame(results)
|
||||
df_mean = (
|
||||
df_res
|
||||
.groupby(['bandwidth', 'temperature'], as_index=False)
|
||||
.mean(numeric_only=True)
|
||||
)
|
||||
pivot = df_mean.pivot(
|
||||
index='bandwidth',
|
||||
columns='temperature',
|
||||
values=values
|
||||
)
|
||||
import matplotlib.pyplot as plt
|
||||
|
||||
|
||||
plt.imshow(pivot, origin='lower', aspect='auto')
|
||||
plt.colorbar(label=values)
|
||||
|
||||
plt.xticks(range(len(pivot.columns)), pivot.columns)
|
||||
plt.yticks(range(len(pivot.index)), pivot.index)
|
||||
|
||||
plt.xlabel('Temperature')
|
||||
plt.ylabel('Bandwidth')
|
||||
plt.title(f'{values} heatmap')
|
||||
|
||||
plt.savefig(f'./plotcorr/{values}.png')
|
||||
plt.cla()
|
||||
plt.clf()
|
||||
|
||||
def run_experiment(
|
||||
bandwidth,
|
||||
temperature,
|
||||
train,
|
||||
test,
|
||||
dataset,
|
||||
):
|
||||
qp.environ['SAMPLE_SIZE'] = 1000
|
||||
bakde = BayesianKDEy(
|
||||
engine='numpyro',
|
||||
bandwidth=bandwidth,
|
||||
temperature=temperature,
|
||||
)
|
||||
bakde.fit(*train.Xy)
|
||||
|
||||
test_generator = UPP(test, repeats=20, random_state=0)
|
||||
|
||||
rows = []
|
||||
|
||||
for i, (sample, prev) in enumerate(
|
||||
tqdm(
|
||||
test_generator(),
|
||||
desc=f"bw={bandwidth}, T={temperature}",
|
||||
total=test_generator.total(),
|
||||
leave=False,
|
||||
)
|
||||
):
|
||||
point_estimate, region = bakde.predict_conf(sample)
|
||||
|
||||
rows.append({
|
||||
"bandwidth": bandwidth,
|
||||
"temperature": temperature,
|
||||
"dataset": dataset,
|
||||
"sample": i,
|
||||
"mae": qp.error.mae(prev, point_estimate),
|
||||
"cov": region.coverage(prev),
|
||||
"amp": region.montecarlo_proportion(n_trials=50_000),
|
||||
})
|
||||
|
||||
return rows
|
||||
|
||||
|
||||
bandwidths = [0.001, 0.005, 0.01, 0.05, 0.1]
|
||||
temperatures = [0.5, 0.75, 1., 2., 5.]
|
||||
|
||||
res_dir = './plotcorr/results'
|
||||
os.makedirs(res_dir, exist_ok=True)
|
||||
|
||||
all_rows = []
|
||||
for i, dataset in enumerate(datasets):
|
||||
if dataset in ['letter', 'isolet']: continue
|
||||
res_path = f'{res_dir}/{dataset}.pkl'
|
||||
if os.path.exists(res_path):
|
||||
print(f'loading results from pickle {res_path}')
|
||||
results_data_rows = pickle.load(open(res_path, 'rb'))
|
||||
else:
|
||||
print(f'{dataset=} [complete={i}/{len(datasets)}]')
|
||||
data = qp.datasets.fetch_UCIMulticlassDataset(dataset)
|
||||
train, test = data.train_test
|
||||
jobs = list(product(bandwidths, temperatures))
|
||||
results_data_rows = Parallel(n_jobs=-1,backend="loky")(
|
||||
delayed(run_experiment)(bw, T, train, test, dataset) for bw, T in jobs
|
||||
)
|
||||
pickle.dump(results_data_rows, open(res_path, 'wb'), pickle.HIGHEST_PROTOCOL)
|
||||
all_rows.extend(results_data_rows)
|
||||
|
||||
results = defaultdict(list)
|
||||
for rows in all_rows:
|
||||
for row in rows:
|
||||
for k, v in row.items():
|
||||
results[k].append(v)
|
||||
|
||||
show(results, 'mae')
|
||||
show(results, 'cov')
|
||||
show(results, 'amp')
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
|
@ -0,0 +1,116 @@
|
|||
from quapy.data import LabelledCollection
|
||||
from quapy.method.confidence import WithConfidenceABC
|
||||
from quapy.protocol import AbstractProtocol
|
||||
import numpy as np
|
||||
from tqdm import tqdm
|
||||
import quapy as qp
|
||||
from joblib import Parallel, delayed
|
||||
import copy
|
||||
|
||||
|
||||
def temp_calibration(method:WithConfidenceABC,
|
||||
train:LabelledCollection,
|
||||
val_prot:AbstractProtocol,
|
||||
temp_grid=[.5, 1., 1.5, 2., 5., 10., 100.],
|
||||
nominal_coverage=0.95,
|
||||
amplitude_threshold=1.,
|
||||
criterion='winkler',
|
||||
n_jobs=1,
|
||||
verbose=True):
|
||||
|
||||
assert (amplitude_threshold == 'auto' or (isinstance(amplitude_threshold, float)) and amplitude_threshold <= 1.), \
|
||||
f'wrong value for {amplitude_threshold=}, it must either be "auto" or a float <= 1.0.'
|
||||
assert criterion in {'auto', 'winkler'}, f'unknown {criterion=}; valid ones are auto or winkler'
|
||||
|
||||
if amplitude_threshold=='auto':
|
||||
n_classes = train.n_classes
|
||||
amplitude_threshold = .1/np.log(n_classes+1)
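# note: this 'auto' threshold decreases with the number of classes, i.e., problems with more classes
# tolerate only regions covering a smaller fraction of the simplex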
|
||||
|
||||
if isinstance(amplitude_threshold, float) and amplitude_threshold > 0.1:
|
||||
print(f'warning: the {amplitude_threshold=} is too large; this may lead to uninformative regions')
|
||||
|
||||
def _evaluate_temperature_job(job_id, temp):
|
||||
# if verbose:
|
||||
# print(f'\tstarting exploration with temperature={temp}...')
|
||||
|
||||
local_method = copy.deepcopy(method)
|
||||
local_method.temperature = temp
|
||||
|
||||
coverage = 0
|
||||
amplitudes = []
|
||||
winklers = []
|
||||
# errs = []
|
||||
|
||||
pbar = tqdm(enumerate(val_prot()), position=job_id, total=val_prot.total(), disable=not verbose)
|
||||
|
||||
for i, (sample, prev) in pbar:
|
||||
point_estim, conf_region = local_method.predict_conf(sample)
|
||||
|
||||
if prev in conf_region:
|
||||
coverage += 1
|
||||
|
||||
amplitudes.append(conf_region.montecarlo_proportion(n_trials=50_000))
|
||||
winkler = None
|
||||
if criterion=='winkler':
|
||||
winkler = conf_region.mean_winkler_score(true_prev=prev, alpha=0.005)
|
||||
winklers.append(winkler)
|
||||
|
||||
# errs.append(qp.error.mae(prev, point_estim))
|
||||
pbar.set_description(
|
||||
f'job={job_id} T={temp}: '
|
||||
f'coverage={coverage/(i+1)*100:.2f}% '
|
||||
f'amplitude={np.mean(amplitudes)*100:.4f}% '
|
||||
+ (f'winkler={np.mean(winklers):.4f}%' if criterion=='winkler' else '')  # parenthesized so the conditional applies only to the winkler fragment
|
||||
)
|
||||
|
||||
mean_coverage = coverage / val_prot.total()
|
||||
mean_amplitude = np.mean(amplitudes)
|
||||
winkler_mean = np.mean(winklers) if criterion=='winkler' else None
|
||||
|
||||
# if verbose:
|
||||
# print(
|
||||
# f'Temperature={temp} got '
|
||||
# f'coverage={mean_coverage*100:.2f}% '
|
||||
# f'amplitude={mean_amplitude*100:.2f}% '
|
||||
# + f'winkler={winkler_mean:.4f}' if criterion == 'winkler' else ''
|
||||
# )
|
||||
|
||||
return temp, mean_coverage, mean_amplitude, winkler_mean
|
||||
|
||||
temp_grid = sorted(temp_grid)
|
||||
method.fit(*train.Xy)
|
||||
|
||||
raw_results = Parallel(n_jobs=n_jobs, backend="loky")(
|
||||
delayed(_evaluate_temperature_job)(job_id, temp)
|
||||
for job_id, temp in tqdm(enumerate(temp_grid), disable=not verbose)
|
||||
)
|
||||
results = [
|
||||
(temp, cov, amp, wink)
|
||||
for temp, cov, amp, wink in raw_results
|
||||
if amp < amplitude_threshold
|
||||
]
|
||||
|
||||
chosen_temperature = 1.
|
||||
if len(results) > 0:
|
||||
if criterion=='winkler':
|
||||
# choose min winkler
|
||||
chosen_temperature, ccov, camp, cwink = min(results, key=lambda x: x[3])
|
||||
else:
|
||||
# choose best coverage (regardless of amplitude), i.e., closest to nominal
|
||||
chosen_temperature, ccov, camp, cwink = min(results, key=lambda x: abs(x[1]-nominal_coverage))
|
||||
|
||||
if verbose:
|
||||
print(
|
||||
f'\nChosen_temperature={chosen_temperature:.2f} got '
|
||||
f'coverage={ccov*100:.2f}% '
|
||||
f'amplitude={camp*100:.4f}% '
|
||||
+ (f'winkler={cwink:.4f}' if criterion=='winkler' else '')  # parenthesized so the conditional applies only to the winkler fragment
|
||||
)
|
||||
|
||||
return chosen_temperature
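# Illustrative usage sketch (here "data" stands for a quapy Dataset and "method" for any quantifier exposing a
# temperature parameter, e.g., BayesianKDEy; the protocol configuration is an assumption, not prescribed by
# this module):
#
#   from quapy.protocol import UPP
#   train, val = data.training.split_stratified(train_prop=0.6, random_state=0)
#   val_prot = UPP(val, repeats=100, random_state=0)
#   method.temperature = temp_calibration(method, train, val_prot, criterion='winkler', n_jobs=-1)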
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
|
@ -1,7 +1,24 @@
|
|||
Change Log 0.2.0
|
||||
Change Log 0.2.1
|
||||
-----------------
|
||||
|
||||
CLEAN TODO-FILE
|
||||
- Added mechanisms for Temperature Calibration for coverage
|
||||
- Added MAPLS and BayesianEMQ
|
||||
- Added DirichletProtocol, which allows generating samples according to a parameterized Dirichlet prior.
|
||||
- Improved efficiency of confidence regions coverage functions
|
||||
- Added Precise Quantifier to WithConfidence methods (a Bayesian adaptation of HDy)
|
||||
- Added Temperature parameter to BayesianCC
|
||||
- Improved documentation of confidence regions.
|
||||
- Added ReadMe method by Daniel Hopkins and Gary King
|
||||
- Internal index in LabelledCollection is now "lazy", and is only constructed if required.
|
||||
- Added new error metrics:
|
||||
- dist_aitchison and mean_dist_aitchison as new error evaluation metrics.
|
||||
- squared ratio error.
|
||||
- Improved numerical stability of KDEyML through logsumexp; useful for cases with a large number of classes, where
|
||||
densities for small bandwidths may become huge.
|
||||
|
||||
|
||||
Change Log 0.2.0
|
||||
-----------------
|
||||
|
||||
- Base code Refactor:
|
||||
- Removing coupling between LabelledCollection and quantification methods; the fit interface changes:
|
||||
|
|
|
|||
|
|
@ -0,0 +1,56 @@
|
|||
import numpy as np
|
||||
import pandas as pd
|
||||
|
||||
from quapy.method.aggregative import EMQ, KDEyML, PACC
|
||||
from sklearn.linear_model import LogisticRegression
|
||||
|
||||
METHODS = ['PACC',
|
||||
'EMQ',
|
||||
'KDEy-ML',
|
||||
'KDEy-MLA'
|
||||
]
|
||||
|
||||
|
||||
# common hyperparameters
|
||||
hyper_LR = {
|
||||
'classifier__C': np.logspace(-3, 3, 7),
|
||||
'classifier__class_weight': ['balanced', None]
|
||||
}
|
||||
|
||||
hyper_kde = {
|
||||
'bandwidth': np.linspace(0.001, 0.5, 100)
|
||||
}
|
||||
|
||||
hyper_kde_aitchison = {
|
||||
'bandwidth': np.linspace(0.01, 2, 100)
|
||||
}
|
||||
|
||||
# instances a new quantifier based on a string name
|
||||
def new_method(method, **lr_kwargs):
|
||||
lr = LogisticRegression(**lr_kwargs)
|
||||
|
||||
if method == 'KDEy-ML':
|
||||
param_grid = {**hyper_kde, **hyper_LR}
|
||||
quantifier = KDEyML(lr, kernel='gaussian')
|
||||
elif method == 'KDEy-MLA':
|
||||
param_grid = {**hyper_kde_aitchison, **hyper_LR}
|
||||
quantifier = KDEyML(lr, kernel='aitchison')
|
||||
elif method == 'EMQ':
|
||||
param_grid = hyper_LR
|
||||
quantifier = EMQ(lr)
|
||||
elif method == 'PACC':
|
||||
param_grid = hyper_LR
|
||||
quantifier = PACC(lr)
|
||||
else:
|
||||
raise NotImplementedError('unknown method', method)
|
||||
|
||||
return param_grid, quantifier
|
||||
|
||||
|
||||
def show_results(result_path):
|
||||
df = pd.read_csv(result_path+'.csv', sep='\t')
|
||||
|
||||
pd.set_option('display.max_columns', None)
|
||||
pd.set_option('display.max_rows', None)
|
||||
pv = df.pivot_table(index='Dataset', columns="Method", values=["MAE", "MRAE"])
|
||||
print(pv)
|
||||
|
|
@ -0,0 +1,38 @@
|
|||
import pickle
|
||||
import os
|
||||
import sys
|
||||
|
||||
import pandas as pd
|
||||
|
||||
import quapy as qp
|
||||
from quapy.model_selection import GridSearchQ
|
||||
from quapy.protocol import UPP
|
||||
from commons import METHODS, new_method, show_results
|
||||
from new_table import LatexTable
|
||||
|
||||
|
||||
SEED = 1
|
||||
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
print(qp.datasets.UCI_MULTICLASS_DATASETS)
|
||||
for optim in ['mae', 'mrae']:
|
||||
table = LatexTable()
|
||||
result_dir = f'results/ucimulti/{optim}'
|
||||
|
||||
for method in METHODS:
|
||||
print()
|
||||
global_result_path = f'{result_dir}/{method}'
|
||||
print(f'Method\tDataset\tMAE\tMRAE\tKLD')
|
||||
for dataset in qp.datasets.UCI_MULTICLASS_DATASETS:
|
||||
# print(dataset)
|
||||
local_result_path = global_result_path + '_' + dataset
|
||||
if os.path.exists(local_result_path + '.dataframe'):
|
||||
report = pd.read_csv(local_result_path+'.dataframe')
|
||||
print(f'{method}\t{dataset}\t{report[optim].mean():.5f}')
|
||||
table.add(benchmark=dataset, method=method, v=report[optim].values)
|
||||
else:
|
||||
print(dataset, 'not found for method', method)
|
||||
table.latexPDF(f'./tables/{optim}.pdf', landscape=False)
|
||||
|
||||
|
|
@ -0,0 +1,94 @@
|
|||
import pickle
|
||||
import os
|
||||
import sys
|
||||
|
||||
import pandas as pd
|
||||
|
||||
import quapy as qp
|
||||
from quapy.model_selection import GridSearchQ
|
||||
from quapy.protocol import UPP
|
||||
from commons import METHODS, new_method, show_results
|
||||
|
||||
|
||||
SEED = 1
|
||||
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
|
||||
qp.environ['SAMPLE_SIZE'] = 500
|
||||
qp.environ['N_JOBS'] = -1
|
||||
n_bags_val = 250
|
||||
n_bags_test = 1000
|
||||
for optim in ['mae', 'mrae']:
|
||||
result_dir = f'results/ucimulti/{optim}'
|
||||
|
||||
os.makedirs(result_dir, exist_ok=True)
|
||||
|
||||
for method in METHODS:
|
||||
|
||||
print('Init method', method)
|
||||
|
||||
global_result_path = f'{result_dir}/{method}'
|
||||
# show_results(global_result_path)
|
||||
# sys.exit(0)
|
||||
|
||||
if not os.path.exists(global_result_path + '.csv'):
|
||||
with open(global_result_path + '.csv', 'wt') as csv:
|
||||
csv.write(f'Method\tDataset\tMAE\tMRAE\tKLD\n')
|
||||
|
||||
with open(global_result_path + '.csv', 'at') as csv:
|
||||
|
||||
for dataset in qp.datasets.UCI_MULTICLASS_DATASETS:
|
||||
|
||||
print('init', dataset)
|
||||
|
||||
local_result_path = global_result_path + '_' + dataset
|
||||
if os.path.exists(local_result_path + '.dataframe'):
|
||||
print(f'result file {local_result_path}.dataframe already exist; skipping')
|
||||
report = pd.read_csv(local_result_path+'.dataframe')
|
||||
print(report["mae"].mean())
|
||||
# data = qp.datasets.fetch_UCIMulticlassDataset(dataset)
|
||||
# csv.write(f'{method}\t{data.name}\t{report["mae"].mean():.5f}\t{report["mrae"].mean():.5f}\t{report["kld"].mean():.5f}\n')
|
||||
continue
|
||||
|
||||
with qp.util.temp_seed(SEED):
|
||||
|
||||
param_grid, quantifier = new_method(method, max_iter=3000)
|
||||
|
||||
data = qp.datasets.fetch_UCIMulticlassDataset(dataset)
|
||||
|
||||
# model selection
|
||||
train, test = data.train_test
|
||||
train, val = train.split_stratified(random_state=SEED)
|
||||
|
||||
protocol = UPP(val, repeats=n_bags_val)
|
||||
modsel = GridSearchQ(
|
||||
quantifier, param_grid, protocol, refit=True, n_jobs=-1, verbose=True, error=optim
|
||||
)
|
||||
|
||||
try:
|
||||
modsel.fit(*train.Xy)
|
||||
|
||||
print(f'best params {modsel.best_params_}')
|
||||
print(f'best score {modsel.best_score_}')
|
||||
pickle.dump(
|
||||
(modsel.best_params_, modsel.best_score_,),
|
||||
open(f'{local_result_path}.hyper.pkl', 'wb'), pickle.HIGHEST_PROTOCOL)
|
||||
|
||||
quantifier = modsel.best_model()
|
||||
except Exception:
|
||||
print('something went wrong... trying to fit the default model')
|
||||
quantifier.fit(*train.Xy)
|
||||
|
||||
|
||||
protocol = UPP(test, repeats=n_bags_test)
|
||||
report = qp.evaluation.evaluation_report(
|
||||
quantifier, protocol, error_metrics=['mae', 'mrae', 'kld'], verbose=True
|
||||
)
|
||||
report.to_csv(f'{local_result_path}.dataframe')
|
||||
print(f'{method}\t{data.name}\t{report["mae"].mean():.5f}\t{report["mrae"].mean():.5f}\t{report["kld"].mean():.5f}\n')
|
||||
csv.write(f'{method}\t{data.name}\t{report["mae"].mean():.5f}\t{report["mrae"].mean():.5f}\t{report["kld"].mean():.5f}\n')
|
||||
csv.flush()
|
||||
|
||||
show_results(global_result_path)
|
||||
TODO.txt
|
|
@ -1,6 +1,11 @@
|
|||
Adapt examples; remaining: example 4-onwards
|
||||
not working: 15 (qunfold)
|
||||
|
||||
Unify ConfidenceIntervalsTransformation with ConfidenceEllipseTransformation
|
||||
|
||||
Unify the functionality of withconfidence methods; predict_conf has a clearly similar structure across all
|
||||
variants, and should be unified in the superclass
|
||||
|
||||
Solve the warnings issue; right now there is a warning ignore in method/__init__.py:
|
||||
|
||||
Add 'platt' to calib options in EMQ?
|
||||
|
|
@ -46,14 +51,25 @@ To remove the labelledcollection from the methods:
|
|||
- proportion in [0,1]
|
||||
- fit_classifier=False:
|
||||
|
||||
|
||||
|
||||
- [TODO] add RLSbench?:
|
||||
- https://arxiv.org/pdf/2302.03020
|
||||
- https://github.com/acmi-lab/RLSbench
|
||||
- [TODO] check Table shift
|
||||
- https://proceedings.neurips.cc/paper_files/paper/2023/hash/a76a757ed479a1e6a5f8134bea492f83-Abstract-Datasets_and_Benchmarks.html
|
||||
- [TODO] have a look at TorchDrift
|
||||
- https://torchdrift.org/
|
||||
- [TODO] have a look at
|
||||
- https://github.com/SeldonIO/alibi-detect/
|
||||
- [TODO] check if the KDEyML variant with logsumexp is slower than the original one, or check whether we can explore
|
||||
an unconstrained space in which the parameter is already the log(prev); maybe also move to cvxq
|
||||
- [TODO] why not simplify the epsilon of RAE? after all, it is only meant to smooth the denominator to avoid div 0
|
||||
- [TODO] document confidence in manuals
|
||||
- [TODO] Test the return_type="index" in protocols and finish the "distributing_samples.py" example
|
||||
- [TODO] Add EDy (an implementation is available at quantificationlib)
|
||||
- [TODO] add ensemble methods SC-MQ, MC-SQ, MC-MQ
|
||||
- [TODO] add HistNetQ
|
||||
- [TODO] add CDE-iteration and Bayes-CDE methods
|
||||
- [TODO] add CDE-iteration (https://github.com/Arctickirillas/Rubrication/blob/master/quantification.py#L593 or
|
||||
Schumacher's code) and Bayes-CDE methods
|
||||
- [TODO] add Friedman's method and DeBias
|
||||
- [TODO] check ignore warning stuff
|
||||
check https://docs.python.org/3/library/warnings.html#temporarily-suppressing-warnings
|
||||
|
|
|
|||
|
|
@ -294,7 +294,7 @@ The datasets correspond to a part of the datasets that can be retrieved from the
|
|||
* containing at least 1,000 instances
|
||||
* can be imported using the Python API.
|
||||
|
||||
Some statistics about these datasets are displayed below :
|
||||
Some statistics about these datasets (after applying default filters) are displayed below:
|
||||
|
||||
| **Dataset** | **classes** | **instances** | **features** | **prevs** | **type** |
|
||||
|:------------|:-----------:|:-------------:|:------------:|:----------|:--------:|
|
||||
|
|
|
|||
|
|
@ -604,7 +604,10 @@ estim_prevalence = model.predict(dataset.test.X)
|
|||
|
||||
_(New in v0.2.0!)_ Some quantification methods go beyond providing a single point estimate of class prevalence values and also produce confidence regions, which characterize the uncertainty around the point estimate. In QuaPy, two such methods are currently implemented:
|
||||
|
||||
* Aggregative Bootstrap: The Aggregative Bootstrap method extends any aggregative quantifier by generating confidence regions for class prevalence estimates through bootstrapping. Key features of this method include:
|
||||
* Aggregative Bootstrap: The Aggregative Bootstrap method extends any aggregative quantifier by generating confidence regions for class prevalence estimates through bootstrapping. The method is described in the paper [Moreo, A., Salvati, N.
|
||||
An Efficient Method for Deriving Confidence Intervals in Aggregative Quantification.
|
||||
Learning to Quantify: Methods and Applications (LQ 2025), co-located at ECML-PKDD 2025.
|
||||
pp 12-33, Porto (Portugal)](https://lq-2025.github.io/proceedings/CompleteVolume.pdf). Key features of this method include:
|
||||
|
||||
* Optimized Computation: The bootstrap is applied to pre-classified instances, significantly speeding up training and inference.
|
||||
During training, bootstrap repetitions are performed only after training the classifier once. These repetitions are used to train multiple aggregation functions.
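A minimal usage sketch follows (the dataset, the classifier, and the constructor arguments are illustrative choices borrowed from the experiments in this changeset, not mandatory settings):

```python
import quapy as qp
from sklearn.linear_model import LogisticRegression
from quapy.method.aggregative import PACC
from quapy.method.confidence import AggregativeBootstrap

# bootstrap wrapper around an aggregative quantifier (PACC with logistic regression)
train, test = qp.datasets.fetch_UCIMulticlassDataset('academic-success').train_test
quantifier = AggregativeBootstrap(PACC(LogisticRegression()), n_test_samples=1000, random_state=0)
quantifier.fit(*train.Xy)

# predict_conf returns the point estimate together with the confidence region
point_estimate, conf_region = quantifier.predict_conf(test.X)
print(point_estimate)                            # point estimate of the class prevalence values
print(conf_region.coverage(test.prevalence()))   # 1.0 if the true prevalence lies within the region
```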
|
||||
|
|
|
|||
|
|
@ -60,6 +60,14 @@ quapy.method.composable module
|
|||
:undoc-members:
|
||||
:show-inheritance:
|
||||
|
||||
quapy.method.confidence module
|
||||
------------------------------
|
||||
|
||||
.. automodule:: quapy.method.confidence
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
|
||||
Module contents
|
||||
---------------
|
||||
|
||||
|
|
|
|||
|
|
@ -36,7 +36,7 @@ with qp.util.temp_seed(0):
|
|||
true_prev = shifted_test.prevalence()
|
||||
|
||||
# by calling "quantify_conf", we obtain the point estimate and the confidence intervals around it
|
||||
pred_prev, conf_intervals = pacc.quantify_conf(shifted_test.X)
|
||||
pred_prev, conf_intervals = pacc.predict_conf(shifted_test.X)
|
||||
|
||||
# conf_intervals is an instance of ConfidenceRegionABC, which provides some useful utilities like:
|
||||
# - coverage: a function which computes the fraction of true values that belong to the confidence region
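#     e.g. (illustrative): conf_intervals.coverage(true_prev) would return 1.0 here if the single true
#     prevalence vector computed above falls within the region, and 0.0 otherwise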
|
||||
|
|
|
|||
|
|
@ -0,0 +1,60 @@
|
|||
from sklearn.feature_extraction.text import CountVectorizer
|
||||
from sklearn.feature_selection import SelectKBest, chi2
|
||||
|
||||
import quapy as qp
|
||||
from quapy.method.non_aggregative import ReadMe
|
||||
import quapy.functional as F
|
||||
from sklearn.pipeline import Pipeline
|
||||
|
||||
"""
|
||||
This example showcases how to use the non-aggregative method ReadMe proposed by Hopkins and King.
|
||||
This method is for text analysis, so let us first instantiate a dataset for sentiment quantification (we
|
||||
use IMDb for this example). The method is quite computationally expensive, so we will restrict the training
|
||||
set to 1000 documents only.
|
||||
"""
|
||||
reviews = qp.datasets.fetch_reviews('imdb').reduce(n_train=1000, random_state=0)
|
||||
|
||||
"""
|
||||
We need to convert text to bag-of-words representations. Actually, ReadMe requires the representations to be
|
||||
binary (i.e., storing a 1 whenever a document contains a certain word, or a 0 otherwise), so we will not use
|
||||
TFIDF weighting. We will also retain the top 1000 most important features according to chi2.
|
||||
"""
|
||||
encode_0_1 = Pipeline([
|
||||
('0_1_terms', CountVectorizer(min_df=5, binary=True)),
|
||||
('feat_sel', SelectKBest(chi2, k=1000))
|
||||
])
|
||||
train, test = qp.data.preprocessing.instance_transformation(reviews, encode_0_1, inplace=True).train_test
|
||||
|
||||
"""
|
||||
We now instantiate ReadMe, with the prob_model='full' (default behaviour, implementing the Hopkins and King original
|
||||
idea). This method consists of estimating Q(Y) by solving:
|
||||
|
||||
Q(X) = \sum_i Q(X|Y=i) Q(Y=i)
|
||||
|
||||
without resorting to estimating the posteriors Q(Y=i|X), by solving a linear least-squares problem.
|
||||
However, since Q(X) and Q(X|Y=i) are matrices of shape (2^K, 1) and (2^K, n), with K the number of features
|
||||
and n the number of classes, their calculation becomes intractable. ReadMe instead performs bagging (i.e., it
|
||||
samples small sets of features and averages the results) thus reducing K to a few terms. In our example we
|
||||
set K (bagging_range) to 20, and the number of bagging_trials to 100.
|
||||
|
||||
ReadMe also computes confidence intervals via bootstrap. We set the number of bootstrap trials to 100.
|
||||
"""
|
||||
readme = ReadMe(prob_model='full', bootstrap_trials=100, bagging_trials=100, bagging_range=20, random_state=0, verbose=True)
|
||||
readme.fit(*train.Xy) # <- there is actually nothing happening here (only bootstrap resampling); the method is "lazy"
|
||||
# and postpones most of the calculations to the test phase.
|
||||
|
||||
# since the method is slow, we will only test 3 cases with different imbalances
|
||||
few_negatives = [0.25, 0.75]
|
||||
balanced = [0.5, 0.5]
|
||||
few_positives = [0.75, 0.25]
|
||||
|
||||
for test_prev in [few_negatives, balanced, few_positives]:
|
||||
sample = reviews.test.sampling(500, *test_prev, random_state=0) # draw sets of 500 documents with desired prevs
|
||||
prev_estim, conf = readme.predict_conf(sample.X)
|
||||
err = qp.error.mae(sample.prevalence(), prev_estim)
|
||||
print(f'true-prevalence={F.strprev(sample.prevalence())},\n'
|
||||
f'predicted-prevalence={F.strprev(prev_estim)}, with confidence intervals {conf},\n'
|
||||
f'MAE={err:.4f}')
|
||||
|
||||
|
||||
|
||||
|
|
@ -0,0 +1,254 @@
|
|||
from scipy.sparse import csc_matrix, csr_matrix
|
||||
from sklearn.base import BaseEstimator, TransformerMixin
|
||||
from sklearn.feature_extraction.text import TfidfTransformer, TfidfVectorizer, CountVectorizer
|
||||
import numpy as np
|
||||
from joblib import Parallel, delayed
|
||||
import sklearn
|
||||
import math
|
||||
from scipy.stats import t
|
||||
|
||||
|
||||
class ContTable:
|
||||
def __init__(self, tp=0, tn=0, fp=0, fn=0):
|
||||
self.tp=tp
|
||||
self.tn=tn
|
||||
self.fp=fp
|
||||
self.fn=fn
|
||||
|
||||
def get_d(self): return self.tp + self.tn + self.fp + self.fn
|
||||
|
||||
def get_c(self): return self.tp + self.fn
|
||||
|
||||
def get_not_c(self): return self.tn + self.fp
|
||||
|
||||
def get_f(self): return self.tp + self.fp
|
||||
|
||||
def get_not_f(self): return self.tn + self.fn
|
||||
|
||||
def p_c(self): return (1.0*self.get_c())/self.get_d()
|
||||
|
||||
def p_not_c(self): return 1.0-self.p_c()
|
||||
|
||||
def p_f(self): return (1.0*self.get_f())/self.get_d()
|
||||
|
||||
def p_not_f(self): return 1.0-self.p_f()
|
||||
|
||||
def p_tp(self): return (1.0*self.tp) / self.get_d()
|
||||
|
||||
def p_tn(self): return (1.0*self.tn) / self.get_d()
|
||||
|
||||
def p_fp(self): return (1.0*self.fp) / self.get_d()
|
||||
|
||||
def p_fn(self): return (1.0*self.fn) / self.get_d()
|
||||
|
||||
def tpr(self):
|
||||
c = 1.0*self.get_c()
|
||||
return self.tp / c if c > 0.0 else 0.0
|
||||
|
||||
def fpr(self):
|
||||
_c = 1.0*self.get_not_c()
|
||||
return self.fp / _c if _c > 0.0 else 0.0
|
||||
|
||||
|
||||
def __ig_factor(p_tc, p_t, p_c):
|
||||
den = p_t * p_c
|
||||
if den != 0.0 and p_tc != 0:
|
||||
return p_tc * math.log(p_tc / den, 2)
|
||||
else:
|
||||
return 0.0
|
||||
|
||||
|
||||
def information_gain(cell):
|
||||
return __ig_factor(cell.p_tp(), cell.p_f(), cell.p_c()) + \
|
||||
__ig_factor(cell.p_fp(), cell.p_f(), cell.p_not_c()) +\
|
||||
__ig_factor(cell.p_fn(), cell.p_not_f(), cell.p_c()) + \
|
||||
__ig_factor(cell.p_tn(), cell.p_not_f(), cell.p_not_c())
|
||||
|
||||
|
||||
def squared_information_gain(cell):
|
||||
return information_gain(cell)**2
|
||||
|
||||
|
||||
def posneg_information_gain(cell):
|
||||
ig = information_gain(cell)
|
||||
if cell.tpr() < cell.fpr():
|
||||
return -ig
|
||||
else:
|
||||
return ig
|
||||
|
||||
|
||||
def pos_information_gain(cell):
|
||||
if cell.tpr() < cell.fpr():
|
||||
return 0
|
||||
else:
|
||||
return information_gain(cell)
|
||||
|
||||
def pointwise_mutual_information(cell):
|
||||
return __ig_factor(cell.p_tp(), cell.p_f(), cell.p_c())
|
||||
|
||||
|
||||
def gss(cell):
|
||||
return cell.p_tp()*cell.p_tn() - cell.p_fp()*cell.p_fn()
|
||||
|
||||
|
||||
def chi_square(cell):
|
||||
den = cell.p_f() * cell.p_not_f() * cell.p_c() * cell.p_not_c()
|
||||
if den==0.0: return 0.0
|
||||
num = gss(cell)**2
|
||||
return num / den
|
||||
|
||||
|
||||
def conf_interval(xt, n):
|
||||
if n>30:
|
||||
z2 = 3.84145882069 # norm.ppf(0.5+0.95/2.0)**2
|
||||
else:
|
||||
z2 = t.ppf(0.5 + 0.95 / 2.0, df=max(n-1,1)) ** 2
|
||||
p = (xt + 0.5 * z2) / (n + z2)
|
||||
amplitude = 0.5 * z2 * math.sqrt((p * (1.0 - p)) / (n + z2))
|
||||
return p, amplitude
|
||||
|
||||
|
||||
def strength(minPosRelFreq, minPos, maxNeg):
|
||||
if minPos > maxNeg:
|
||||
return math.log(2.0 * minPosRelFreq, 2.0)
|
||||
else:
|
||||
return 0.0
|
||||
|
||||
|
||||
#set cancel_features=True to allow some features to be weighted as 0 (as in the original article)
|
||||
#however, for some extremely imbalanced datasets this caused all documents to be weighted as 0
|
||||
def conf_weight(cell, cancel_features=False):
|
||||
c = cell.get_c()
|
||||
not_c = cell.get_not_c()
|
||||
tp = cell.tp
|
||||
fp = cell.fp
|
||||
|
||||
pos_p, pos_amp = conf_interval(tp, c)
|
||||
neg_p, neg_amp = conf_interval(fp, not_c)
|
||||
|
||||
min_pos = pos_p-pos_amp
|
||||
max_neg = neg_p+neg_amp
|
||||
den = (min_pos + max_neg)
|
||||
minpos_relfreq = min_pos / (den if den != 0 else 1)
|
||||
|
||||
str_tplus = strength(minpos_relfreq, min_pos, max_neg)
|
||||
|
||||
if str_tplus == 0 and not cancel_features:
|
||||
return 1e-20
|
||||
|
||||
return str_tplus
|
||||
|
||||
|
||||
def get_tsr_matrix(cell_matrix, tsr_score_function):
|
||||
nC = len(cell_matrix)
|
||||
nF = len(cell_matrix[0])
|
||||
tsr_matrix = [[tsr_score_function(cell_matrix[c,f]) for f in range(nF)] for c in range(nC)]
|
||||
return np.array(tsr_matrix)
|
||||
|
||||
|
||||
def feature_label_contingency_table(positive_document_indexes, feature_document_indexes, nD):
|
||||
tp_ = len(positive_document_indexes & feature_document_indexes)
|
||||
fp_ = len(feature_document_indexes - positive_document_indexes)
|
||||
fn_ = len(positive_document_indexes - feature_document_indexes)
|
||||
tn_ = nD - (tp_ + fp_ + fn_)
|
||||
return ContTable(tp=tp_, tn=tn_, fp=fp_, fn=fn_)
|
||||
|
||||
|
||||
def category_tables(feature_sets, category_sets, c, nD, nF):
|
||||
return [feature_label_contingency_table(category_sets[c], feature_sets[f], nD) for f in range(nF)]
|
||||
|
||||
|
||||
def get_supervised_matrix(coocurrence_matrix, label_matrix, n_jobs=-1):
|
||||
"""
|
||||
Computes the nC x nF supervised matrix M where Mcf is the 4-cell contingency table for feature f and class c.
|
||||
Efficiency O(nF x nC x log(S)) where S is the sparse factor
|
||||
"""
|
||||
|
||||
nD, nF = coocurrence_matrix.shape
|
||||
nD2, nC = label_matrix.shape
|
||||
|
||||
if nD != nD2:
|
||||
raise ValueError('Number of rows in coocurrence matrix shape %s and label matrix shape %s is not consistent' %
|
||||
(coocurrence_matrix.shape,label_matrix.shape))
|
||||
|
||||
def nonzero_set(matrix, col):
|
||||
return set(matrix[:, col].nonzero()[0])
|
||||
|
||||
if isinstance(coocurrence_matrix, csr_matrix):
|
||||
coocurrence_matrix = csc_matrix(coocurrence_matrix)
|
||||
feature_sets = [nonzero_set(coocurrence_matrix, f) for f in range(nF)]
|
||||
category_sets = [nonzero_set(label_matrix, c) for c in range(nC)]
|
||||
cell_matrix = Parallel(n_jobs=n_jobs, backend="threading")(
|
||||
delayed(category_tables)(feature_sets, category_sets, c, nD, nF) for c in range(nC)
|
||||
)
|
||||
return np.array(cell_matrix)
|
||||
|
||||
|
||||
class TSRweighting(BaseEstimator,TransformerMixin):
|
||||
"""
|
||||
Supervised Term Weighting function based on any Term Selection Reduction (TSR) function (e.g., information gain,
|
||||
chi-square, etc.) or, more generally, on any function that could be computed on the 4-cell contingency table for
|
||||
each category-feature pair.
|
||||
The supervised_4cell_matrix is a `(n_classes, n_words)` matrix containing the 4-cell contingency tables
|
||||
for each class-word pair, and can be pre-computed (e.g., during the feature selection phase) and passed as an
|
||||
argument.
|
||||
When `n_classes>1`, i.e., in multiclass scenarios, a global_policy is used in order to determine a
|
||||
single feature-score which informs about its relevance. Accepted policies include "max" (takes the max score
|
||||
across categories), "ave" and "wave" (take the average, or weighted average, across all categories -- weights
|
||||
correspond to the class prevalence), and "sum" (which sums all category scores).
|
||||
"""
|
||||
|
||||
def __init__(self, tsr_function, global_policy='max', supervised_4cell_matrix=None, sublinear_tf=True, norm='l2', min_df=3, n_jobs=-1):
|
||||
if global_policy not in ['max', 'ave', 'wave', 'sum']: raise ValueError('Global policy should be in {"max", "ave", "wave", "sum"}')
|
||||
self.tsr_function = tsr_function
|
||||
self.global_policy = global_policy
|
||||
self.supervised_4cell_matrix = supervised_4cell_matrix
|
||||
self.sublinear_tf = sublinear_tf
|
||||
self.norm = norm
|
||||
self.min_df = min_df
|
||||
self.n_jobs = n_jobs
|
||||
|
||||
def fit(self, X, y):
|
||||
self.count_vectorizer = CountVectorizer(min_df=self.min_df)
|
||||
X = self.count_vectorizer.fit_transform(X)
|
||||
|
||||
self.tf_vectorizer = TfidfTransformer(
|
||||
norm=None, use_idf=False, smooth_idf=False, sublinear_tf=self.sublinear_tf
|
||||
).fit(X)
|
||||
|
||||
if len(y.shape) == 1:
|
||||
y = np.expand_dims(y, axis=1)
|
||||
|
||||
nD, nC = y.shape
|
||||
nF = len(self.tf_vectorizer.get_feature_names_out())
|
||||
|
||||
if self.supervised_4cell_matrix is None:
|
||||
self.supervised_4cell_matrix = get_supervised_matrix(X, y, n_jobs=self.n_jobs)
|
||||
else:
|
||||
if self.supervised_4cell_matrix.shape != (nC, nF):
|
||||
raise ValueError("Shape of supervised information matrix is inconsistent with X and y")
|
||||
|
||||
tsr_matrix = get_tsr_matrix(self.supervised_4cell_matrix, self.tsr_function)
|
||||
|
||||
if self.global_policy == 'ave':
|
||||
self.global_tsr_vector = np.average(tsr_matrix, axis=0)
|
||||
elif self.global_policy == 'wave':
|
||||
category_prevalences = [sum(y[:,c])*1.0/nD for c in range(nC)]
|
||||
self.global_tsr_vector = np.average(tsr_matrix, axis=0, weights=category_prevalences)
|
||||
elif self.global_policy == 'sum':
|
||||
self.global_tsr_vector = np.sum(tsr_matrix, axis=0)
|
||||
elif self.global_policy == 'max':
|
||||
self.global_tsr_vector = np.amax(tsr_matrix, axis=0)
|
||||
return self
|
||||
|
||||
def fit_transform(self, X, y):
|
||||
return self.fit(X,y).transform(X)
|
||||
|
||||
def transform(self, X):
|
||||
if not hasattr(self, 'global_tsr_vector'): raise NameError('TSRweighting: transform method called before fit.')
|
||||
X = self.count_vectorizer.transform(X)
|
||||
tf_X = self.tf_vectorizer.transform(X).toarray()
|
||||
weighted_X = np.multiply(tf_X, self.global_tsr_vector)
|
||||
if self.norm is not None and self.norm!='none':
|
||||
weighted_X = sklearn.preprocessing.normalize(weighted_X, norm=self.norm, axis=1, copy=False)
|
||||
return csr_matrix(weighted_X)
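# Illustrative usage sketch (train_texts, train_labels and test_texts are hypothetical variables, not part
# of this module):
#
#   tsr_vect = TSRweighting(tsr_function=information_gain, global_policy='max', min_df=3)
#   Xtr = tsr_vect.fit_transform(train_texts, train_labels)   # labels: (nD,) vector or (nD, nC) binary matrix
#   Xte = tsr_vect.transform(test_texts)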
|
||||
|
|
@ -0,0 +1,208 @@
|
|||
from scipy.sparse import issparse
|
||||
from sklearn.decomposition import TruncatedSVD
|
||||
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
|
||||
from sklearn.linear_model import LogisticRegression
|
||||
from sklearn.preprocessing import StandardScaler
|
||||
|
||||
import quapy as qp
|
||||
from data import LabelledCollection
|
||||
import numpy as np
|
||||
|
||||
from experimental_non_aggregative.custom_vectorizers import *
|
||||
from method._kdey import KDEBase
|
||||
from protocol import APP
|
||||
from quapy.method.aggregative import HDy, DistributionMatchingY
|
||||
from quapy.method.base import BaseQuantifier
|
||||
from scipy import optimize
|
||||
import pandas as pd
|
||||
import quapy.functional as F
|
||||
|
||||
|
||||
# TODO: explore the bernoulli (term presence/absence) variant
|
||||
# TODO: explore the multinomial (term frequency) variant
|
||||
# TODO: explore the multinomial + length normalization variant
|
||||
# TODO: consolidate the TSR-variant (e.g., using information gain) variant;
|
||||
# - works better with the idf?
|
||||
# - works better with length normalization?
|
||||
# - etc
|
||||
|
||||
class DxS(BaseQuantifier):
|
||||
def __init__(self, vectorizer=None, divergence='topsoe'):
|
||||
self.vectorizer = vectorizer
|
||||
self.divergence = divergence
|
||||
|
||||
# def __as_distribution(self, instances):
|
||||
# return np.asarray(instances.sum(axis=0) / instances.sum()).flatten()
|
||||
|
||||
def __as_distribution(self, instances):
|
||||
dist = instances.mean(axis=0)
|
||||
return np.asarray(dist).flatten()
|
||||
|
||||
def fit(self, text_instances, labels):
|
||||
|
||||
classes = np.unique(labels)
|
||||
|
||||
if self.vectorizer is not None:
|
||||
text_instances = self.vectorizer.fit_transform(text_instances, y=labels)
|
||||
|
||||
distributions = []
|
||||
for class_i in classes:
|
||||
distributions.append(self.__as_distribution(text_instances[labels == class_i]))
|
||||
|
||||
self.validation_distribution = np.asarray(distributions)
|
||||
|
||||
return self
|
||||
|
||||
def predict(self, text_instances):
|
||||
if self.vectorizer is not None:
|
||||
text_instances = self.vectorizer.transform(text_instances)
|
||||
|
||||
test_distribution = self.__as_distribution(text_instances)
|
||||
divergence = qp.functional.get_divergence(self.divergence)
|
||||
n_classes, n_feats = self.validation_distribution.shape
|
||||
|
||||
def match(prev):
|
||||
prev = np.expand_dims(prev, axis=0)
|
||||
mixture_distribution = (prev @ self.validation_distribution).flatten()
|
||||
return divergence(test_distribution, mixture_distribution)
|
||||
|
||||
# the initial point is set as the uniform distribution
|
||||
uniform_distribution = np.full(fill_value=1 / n_classes, shape=(n_classes,))
|
||||
|
||||
# solutions are bounded to those contained in the unit-simplex
|
||||
bounds = tuple((0, 1) for x in range(n_classes)) # values in [0,1]
|
||||
constraints = ({'type': 'eq', 'fun': lambda x: 1 - sum(x)}) # values summing up to 1
|
||||
r = optimize.minimize(match, x0=uniform_distribution, method='SLSQP', bounds=bounds, constraints=constraints)
|
||||
return r.x
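# Illustrative usage sketch (not part of the original code; `texts`, `labels` and
# `test_texts` are assumed to exist): DxS builds one feature distribution per class
# from the training documents and then searches, via constrained SLSQP, for the
# prevalence vector whose mixture of those distributions is closest, under the
# chosen divergence, to the test distribution.
#
#   >>> dxs = DxS(vectorizer=TfidfVectorizer(min_df=5, norm=None), divergence='topsoe')
#   >>> dxs.fit(texts, labels)
#   >>> estimated_prevalence = dxs.predict(test_texts)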
|
||||
|
||||
|
||||
|
||||
class KDExML(BaseQuantifier, KDEBase):
|
||||
|
||||
def __init__(self, bandwidth=0.1, standardize=False):
|
||||
self._check_bandwidth(bandwidth)
|
||||
self.bandwidth = bandwidth
|
||||
self.standardize = standardize
|
||||
|
||||
def fit(self, X, y):
|
||||
classes = sorted(np.unique(y))
|
||||
|
||||
if self.standardize:
|
||||
self.scaler = StandardScaler()
|
||||
X = self.scaler.fit_transform(X)
|
||||
|
||||
if issparse(X):
|
||||
X = X.toarray()
|
||||
|
||||
self.mix_densities = self.get_mixture_components(X, y, classes, self.bandwidth)
|
||||
return self
|
||||
|
||||
def predict(self, X):
|
||||
"""
|
||||
Searches for the mixture model parameter (the sought prevalence values) that maximizes the likelihood
|
||||
of the data (i.e., that minimizes the negative log-likelihood)
|
||||
|
||||
:param X: instances in the sample
|
||||
:return: a vector of class prevalence estimates
|
||||
"""
|
||||
epsilon = 1e-10
|
||||
if issparse(X):
|
||||
X = X.toarray()
|
||||
n_classes = len(self.mix_densities)
|
||||
if self.standardize:
|
||||
X = self.scaler.transform(X)
|
||||
test_densities = [self.pdf(kde_i, X) for kde_i in self.mix_densities]
|
||||
|
||||
def neg_loglikelihood(prev):
|
||||
test_mixture_likelihood = sum(prev_i * dens_i for prev_i, dens_i in zip (prev, test_densities))
|
||||
test_loglikelihood = np.log(test_mixture_likelihood + epsilon)
|
||||
return -np.sum(test_loglikelihood)
|
||||
|
||||
return F.optim_minimize(neg_loglikelihood, n_classes)
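# Minimal usage sketch (illustrative; X_train, y_train, X_test are assumed to exist):
# KDExML fits one KDE per class directly on the covariates and returns the mixture
# weights (class prevalences) that maximize the likelihood of the test sample.
#
#   >>> kdex = KDExML(bandwidth=0.1, standardize=True)
#   >>> kdex.fit(X_train, y_train)
#   >>> prev_hat = kdex.predict(X_test)   # prevalence vector, sums (approximately) to 1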
|
||||
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
|
||||
qp.environ['SAMPLE_SIZE'] = 250
|
||||
qp.environ['N_JOBS'] = -1
|
||||
min_df = 10
|
||||
# dataset = 'imdb'
|
||||
repeats = 10
|
||||
error = 'mae'
|
||||
|
||||
div = 'topsoe'
|
||||
|
||||
# generates tuples (dataset, method, method_name)
|
||||
# (the dataset is needed for methods that process the dataset differently)
|
||||
def gen_methods():
|
||||
|
||||
for dataset in qp.datasets.REVIEWS_SENTIMENT_DATASETS:
|
||||
|
||||
data = qp.datasets.fetch_reviews(dataset, tfidf=False)
|
||||
|
||||
# bernoulli_vectorizer = CountVectorizer(min_df=min_df, binary=True)
|
||||
# dxs = DxS(divergence=div, vectorizer=bernoulli_vectorizer)
|
||||
# yield data, dxs, 'DxS-Bernoulli'
|
||||
#
|
||||
# multinomial_vectorizer = CountVectorizer(min_df=min_df, binary=False)
|
||||
# dxs = DxS(divergence=div, vectorizer=multinomial_vectorizer)
|
||||
# yield data, dxs, 'DxS-multinomial'
|
||||
#
|
||||
# tf_vectorizer = TfidfVectorizer(sublinear_tf=False, use_idf=False, min_df=min_df, norm=None)
|
||||
# dxs = DxS(divergence=div, vectorizer=tf_vectorizer)
|
||||
# yield data, dxs, 'DxS-TF'
|
||||
#
|
||||
# logtf_vectorizer = TfidfVectorizer(sublinear_tf=True, use_idf=False, min_df=min_df, norm=None)
|
||||
# dxs = DxS(divergence=div, vectorizer=logtf_vectorizer)
|
||||
# yield data, dxs, 'DxS-logTF'
|
||||
#
|
||||
# tfidf_vectorizer = TfidfVectorizer(use_idf=True, min_df=min_df, norm=None)
|
||||
# dxs = DxS(divergence=div, vectorizer=tfidf_vectorizer)
|
||||
# yield data, dxs, 'DxS-TFIDF'
|
||||
#
|
||||
# tfidf_vectorizer = TfidfVectorizer(use_idf=True, min_df=min_df, norm='l2')
|
||||
# dxs = DxS(divergence=div, vectorizer=tfidf_vectorizer)
|
||||
# yield data, dxs, 'DxS-TFIDF-l2'
|
||||
|
||||
tsr_vectorizer = TSRweighting(tsr_function=information_gain, min_df=min_df, norm='l2')
|
||||
dxs = DxS(divergence=div, vectorizer=tsr_vectorizer)
|
||||
yield data, dxs, 'DxS-TFTSR-l2'
|
||||
|
||||
data = qp.datasets.fetch_reviews(dataset, tfidf=True, min_df=min_df)
|
||||
|
||||
kdex = KDExML()
|
||||
reduction = TruncatedSVD(n_components=100, random_state=0)
|
||||
red_data = qp.data.preprocessing.instance_transformation(data, transformer=reduction, inplace=False)
|
||||
yield red_data, kdex, 'KDEx'
|
||||
|
||||
hdy = HDy(LogisticRegression())
|
||||
yield data, hdy, 'HDy'
|
||||
|
||||
# dm = DistributionMatchingY(LogisticRegression(), divergence=div, nbins=5)
|
||||
# yield data, dm, 'DM-5b'
|
||||
#
|
||||
# dm = DistributionMatchingY(LogisticRegression(), divergence=div, nbins=10)
|
||||
# yield data, dm, 'DM-10b'
|
||||
|
||||
|
||||
|
||||
|
||||
result_path = 'results.csv'
|
||||
with open(result_path, 'wt') as csv:
|
||||
csv.write(f'Method\tDataset\tMAE\tMRAE\n')
|
||||
for data, quantifier, quant_name in gen_methods():
|
||||
quantifier.fit(*data.training.Xy)
|
||||
report = qp.evaluation.evaluation_report(quantifier, APP(data.test, repeats=repeats), error_metrics=['mae','mrae'], verbose=True)
|
||||
means = report.mean(numeric_only=True)
|
||||
csv.write(f'{quant_name}\t{data.name}\t{means["mae"]:.5f}\t{means["mrae"]:.5f}\n')
|
||||
|
||||
df = pd.read_csv(result_path, sep='\t')
|
||||
# print(df)
|
||||
|
||||
pv = df.pivot_table(index='Method', columns="Dataset", values=["MAE", "MRAE"])
|
||||
print(pv)
|
||||
|
||||
@ -13,7 +13,7 @@ from . import model_selection
|
|||
from . import classification
|
||||
import os
|
||||
|
||||
__version__ = '0.2.0'
|
||||
__version__ = '0.2.1'
|
||||
|
||||
|
||||
def _default_cls():
|
||||
|
|
|
|||
|
|
@ -99,6 +99,9 @@ class SamplesFromDir(AbstractProtocol):
|
|||
sample, _ = self.load_fn(os.path.join(self.path_dir, f'{id}.txt'))
|
||||
yield sample, prevalence
|
||||
|
||||
def total(self):
|
||||
return len(self.true_prevs)
|
||||
|
||||
|
||||
class LabelledCollectionsFromDir(AbstractProtocol):
|
||||
|
||||
|
|
@ -113,6 +116,9 @@ class LabelledCollectionsFromDir(AbstractProtocol):
|
|||
lc = LabelledCollection.load(path=collection_path, loader_func=self.load_fn)
|
||||
yield lc
|
||||
|
||||
def total(self):
|
||||
return len(self.true_prevs)
|
||||
|
||||
|
||||
class ResultSubmission:
|
||||
|
||||
|
|
|
|||
|
|
@ -33,7 +33,6 @@ class LabelledCollection:
|
|||
else:
|
||||
self.instances = np.asarray(instances)
|
||||
self.labels = np.asarray(labels)
|
||||
n_docs = len(self)
|
||||
if classes is None:
|
||||
self.classes_ = F.classes_from_labels(self.labels)
|
||||
else:
|
||||
|
|
@ -41,7 +40,13 @@ class LabelledCollection:
|
|||
self.classes_.sort()
|
||||
if len(set(self.labels).difference(set(classes))) > 0:
|
||||
raise ValueError(f'labels ({set(self.labels)}) contain values not included in classes_ ({set(classes)})')
|
||||
self.index = {class_: np.arange(n_docs)[self.labels == class_] for class_ in self.classes_}
|
||||
self._index = None
|
||||
|
||||
@property
|
||||
def index(self):
|
||||
if not hasattr(self, '_index') or self._index is None:
|
||||
self._index = {class_: np.arange(len(self))[self.labels == class_] for class_ in self.classes_}
|
||||
return self._index
|
||||
|
||||
@classmethod
|
||||
def load(cls, path: str, loader_func: callable, classes=None, **loader_kwargs):
|
||||
|
|
|
|||
|
|
@ -663,8 +663,8 @@ def fetch_UCIMulticlassLabelledCollection(dataset_name, data_home=None, min_clas
|
|||
:param dataset_name: a dataset name
|
||||
:param data_home: specify the quapy home directory where the dataset will be dumped (leave empty to use the default
|
||||
~/quapy_data/ directory)
|
||||
:param min_class_support: minimum number of istances per class. Classes with fewer instances
|
||||
are discarded (deafult is 100)
|
||||
:param min_class_support: integer or float, the minimum number or proportion of instances per class.
Classes with fewer instances are discarded (default is 100).
|
||||
:param standardize: indicates whether the covariates should be standardized or not (default is True).
|
||||
:param verbose: set to True (default is False) to get information (stats) about the dataset
|
||||
:return: a :class:`quapy.data.base.LabelledCollection` instance
|
||||
|
|
@ -674,6 +674,11 @@ def fetch_UCIMulticlassLabelledCollection(dataset_name, data_home=None, min_clas
|
|||
f'UCI Machine Learning datasets repository (multiclass). ' \
|
||||
f'Valid ones are {UCI_MULTICLASS_DATASETS}'
|
||||
|
||||
assert (min_class_support is None or
|
||||
((isinstance(min_class_support, int) and min_class_support>=0) or
|
||||
(isinstance(min_class_support, float) and 0. <= min_class_support < 1.))), \
|
||||
f'invalid value for {min_class_support=}; expected a non-negative integer or a float in [0,1)'
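# For illustration (added note, not in the original): min_class_support=100 keeps only
# classes with at least 100 instances, whereas min_class_support=0.01 keeps classes
# accounting for at least 1% of the dataset (the float is converted into an absolute
# count inside filter_classes below).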
|
||||
|
||||
if data_home is None:
|
||||
data_home = get_quapy_home()
|
||||
|
||||
|
|
@ -739,26 +744,43 @@ def fetch_UCIMulticlassLabelledCollection(dataset_name, data_home=None, min_clas
|
|||
|
||||
file = join(data_home, 'uci_multiclass', dataset_name+'.pkl')
|
||||
|
||||
def dummify_categorical_features(df_features, dataset_id):
|
||||
CATEGORICAL_FEATURES = {
|
||||
158: ["S1", "C1", "S2", "C2", "S3", "C3", "S4", "C4", "S5", "C5"], # poker_hand
|
||||
}
|
||||
|
||||
categorical = CATEGORICAL_FEATURES.get(dataset_id, [])
|
||||
|
||||
X = df_features.copy()
|
||||
if categorical:
|
||||
X[categorical] = X[categorical].astype("category")
|
||||
X = pd.get_dummies(X, columns=categorical, drop_first=True)
|
||||
|
||||
return X
|
||||
|
||||
def download(id, name):
|
||||
df = fetch_ucirepo(id=id)
|
||||
|
||||
df.data.features = pd.get_dummies(df.data.features, drop_first=True)
|
||||
X, y = df.data.features.to_numpy(dtype=np.float64), df.data.targets.to_numpy().squeeze()
|
||||
X_df = dummify_categorical_features(df.data.features, id)
|
||||
X = X_df.to_numpy(dtype=np.float64)
|
||||
y = df.data.targets.to_numpy().squeeze()
|
||||
|
||||
assert y.ndim == 1, 'more than one y'
|
||||
assert y.ndim == 1, f'error: the dataset {id=} {name=} has more than one target variable'
|
||||
|
||||
classes = np.sort(np.unique(y))
|
||||
y = np.searchsorted(classes, y)
|
||||
return LabelledCollection(X, y)
|
||||
|
||||
def filter_classes(data: LabelledCollection, min_ipc):
|
||||
if min_ipc is None:
|
||||
min_ipc = 0
|
||||
def filter_classes(data: LabelledCollection, min_class_support):
|
||||
if min_class_support is None or min_class_support == 0.:
|
||||
return data
|
||||
if isinstance(min_class_support, float):
|
||||
min_class_support = int(len(data) * min_class_support)
|
||||
classes = data.classes_
|
||||
# restrict classes to only those with at least min_ipc instances
|
||||
classes = classes[data.counts() >= min_ipc]
|
||||
# restrict classes to only those with at least min_class_support instances
|
||||
classes = classes[data.counts() >= min_class_support]
|
||||
# filter X and y keeping only datapoints belonging to valid classes
|
||||
filter_idx = np.in1d(data.y, classes)
|
||||
filter_idx = np.isin(data.y, classes)
|
||||
X, y = data.X[filter_idx], data.y[filter_idx]
|
||||
# map classes to range(len(classes))
|
||||
y = np.searchsorted(classes, y)
|
||||
|
|
|
|||
|
|
@ -10,6 +10,37 @@ from quapy.util import map_parallel
|
|||
from .base import LabelledCollection
|
||||
|
||||
|
||||
def instance_transformation(dataset:Dataset, transformer, inplace=False):
|
||||
"""
|
||||
Transforms a :class:`quapy.data.base.Dataset` applying the `fit_transform` and `transform` functions
|
||||
of a (sklearn's) transformer.
|
||||
|
||||
:param dataset: a :class:`quapy.data.base.Dataset` where the instances of training and test collections are
|
||||
in a format accepted by the transformer (e.g., lists of str, or feature matrices)
|
||||
:param transformer: TransformerMixin implementing `fit_transform` and `transform` functions
|
||||
:param inplace: whether or not to apply the transformation inplace (True), or to a new copy (False, default)
|
||||
:return: a new :class:`quapy.data.base.Dataset` with transformed instances (if inplace=False) or a reference to the
|
||||
current Dataset (if inplace=True) where the instances have been transformed
|
||||
"""
|
||||
training_transformed = transformer.fit_transform(*dataset.training.Xy)
|
||||
test_transformed = transformer.transform(dataset.test.X)
|
||||
orig_name = dataset.name
|
||||
|
||||
if inplace:
|
||||
dataset.training = LabelledCollection(training_transformed, dataset.training.labels, dataset.classes_)
|
||||
dataset.test = LabelledCollection(test_transformed, dataset.test.labels, dataset.classes_)
|
||||
if hasattr(transformer, 'vocabulary_'):
|
||||
dataset.vocabulary = transformer.vocabulary_
|
||||
return dataset
|
||||
else:
|
||||
training = LabelledCollection(training_transformed, dataset.training.labels.copy(), dataset.classes_)
|
||||
test = LabelledCollection(test_transformed, dataset.test.labels.copy(), dataset.classes_)
|
||||
vocab = None
|
||||
if hasattr(transformer, 'vocabulary_'):
|
||||
vocab = transformer.vocabulary_
|
||||
return Dataset(training, test, vocabulary=vocab, name=orig_name)
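# Hedged usage sketch (illustrative; `dataset` is assumed to be a tfidf Dataset): any
# sklearn-style transformer exposing fit_transform/transform can be plugged in, e.g.
# to project the instances onto a lower-dimensional space:
#
#   >>> from sklearn.decomposition import TruncatedSVD
#   >>> red_data = instance_transformation(dataset, transformer=TruncatedSVD(n_components=100), inplace=False)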
|
||||
|
||||
|
||||
def text2tfidf(dataset:Dataset, min_df=3, sublinear_tf=True, inplace=False, **kwargs):
|
||||
"""
|
||||
Transforms a :class:`quapy.data.base.Dataset` of textual instances into a :class:`quapy.data.base.Dataset` of
|
||||
|
|
@ -29,18 +60,7 @@ def text2tfidf(dataset:Dataset, min_df=3, sublinear_tf=True, inplace=False, **kw
|
|||
__check_type(dataset.test.instances, np.ndarray, str)
|
||||
|
||||
vectorizer = TfidfVectorizer(min_df=min_df, sublinear_tf=sublinear_tf, **kwargs)
|
||||
training_documents = vectorizer.fit_transform(dataset.training.instances)
|
||||
test_documents = vectorizer.transform(dataset.test.instances)
|
||||
|
||||
if inplace:
|
||||
dataset.training = LabelledCollection(training_documents, dataset.training.labels, dataset.classes_)
|
||||
dataset.test = LabelledCollection(test_documents, dataset.test.labels, dataset.classes_)
|
||||
dataset.vocabulary = vectorizer.vocabulary_
|
||||
return dataset
|
||||
else:
|
||||
training = LabelledCollection(training_documents, dataset.training.labels.copy(), dataset.classes_)
|
||||
test = LabelledCollection(test_documents, dataset.test.labels.copy(), dataset.classes_)
|
||||
return Dataset(training, test, vectorizer.vocabulary_)
|
||||
return instance_transformation(dataset, vectorizer, inplace)
|
||||
|
||||
|
||||
def reduce_columns(dataset: Dataset, min_df=5, inplace=False):
|
||||
|
|
|
|||
|
|
@ -3,6 +3,7 @@
|
|||
import numpy as np
|
||||
from sklearn.metrics import f1_score
|
||||
import quapy as qp
|
||||
from quapy.functional import CLRtransformation
|
||||
|
||||
|
||||
def from_name(err_name):
|
||||
|
|
@ -128,6 +129,63 @@ def se(prevs_true, prevs_hat):
|
|||
return ((prevs_hat - prevs_true) ** 2).mean(axis=-1)
|
||||
|
||||
|
||||
def sre(prevs_true, prevs_hat, prevs_train, eps=0.):
|
||||
"""
|
||||
The squared ratio error is defined as
|
||||
:math:`SRE(p,\\hat{p},p^{tr})=\\frac{1}{|\\mathcal{Y}|}\\sum_{i \\in \\mathcal{Y}}(w_i-\\hat{w}_i)^2`,
|
||||
where
|
||||
:math:`w_i=\\frac{p_i}{p^{tr}_i}=\\frac{Q(Y=i)}{P(Y=i)}`,
|
||||
and :math:`\\hat{w}_i` is the estimate obtained by replacing the true test prevalence with the estimated one, and
|
||||
:math:`\\mathcal{Y}` are the classes of interest
|
||||
:param prevs_true: array-like, true prevalence values
|
||||
:param prevs_hat: array-like, estimated prevalence values
|
||||
:param prevs_train: array-like with the training prevalence values, or a single prevalence vector when
|
||||
all the prevs_true and prevs_hat are computed on the same training set.
|
||||
:param eps: float, for smoothing the prevalence values. It is 0 by default (no smoothing), meaning the
|
||||
training prevalence is expected to be >0 everywhere.
|
||||
:return: the squared ratio error
|
||||
"""
|
||||
prevs_true = np.asarray(prevs_true)
|
||||
prevs_hat = np.asarray(prevs_hat)
|
||||
prevs_train = np.asarray(prevs_train)
|
||||
assert prevs_true.shape == prevs_hat.shape, f'wrong shape {prevs_true.shape=} vs {prevs_hat.shape=}'
|
||||
assert prevs_true.shape[-1]==prevs_train.shape[-1], 'wrong shape for training prevalence'
|
||||
if prevs_true.ndim == 2 and prevs_train.ndim == 1:
|
||||
prevs_train = np.tile(prevs_train, reps=(prevs_true.shape[0], 1))
|
||||
if eps>0:
|
||||
prevs_true = smooth(prevs_true, eps)
|
||||
prevs_hat = smooth(prevs_hat, eps)
|
||||
prevs_train = smooth(prevs_train, eps)
|
||||
|
||||
N = prevs_true.shape[-1]
|
||||
w = prevs_true / prevs_train
|
||||
w_hat = prevs_hat / prevs_train
|
||||
return (1./N) * np.sum((w - w_hat)**2., axis=-1)
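# Worked example (illustrative): with prevs_true=[0.5, 0.5], prevs_hat=[0.6, 0.4] and
# prevs_train=[0.5, 0.5], the importance weights are w=[1.0, 1.0] and w_hat=[1.2, 0.8],
# so SRE = ((1.0-1.2)**2 + (1.0-0.8)**2) / 2 = 0.04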
|
||||
|
||||
|
||||
def msre(prevs_true, prevs_hat, prevs_train, eps=0.):
|
||||
"""
|
||||
Returns the mean :func:`sre` across all experiments.
|
||||
|
||||
:param prevs_true: array-like, true prevalence values of shape (n_experiments, n_classes,) or (n_classes,)
|
||||
:param prevs_hat: array-like, estimated prevalence values of shape equal to prevs_true
|
||||
:param prevs_train: array-like, training prevalence values of (n_experiments, n_classes,) or (n_classes,)
|
||||
:param eps: float, for smoothing the prevalence values. It is 0 by default (no smoothing), meaning the
|
||||
training prevalence is expected to be >0 everywhere.
|
||||
:return: the mean squared ratio error
|
||||
"""
|
||||
return np.mean(squared_ratio_error(prevs_true, prevs_hat, prevs_train, eps))
|
||||
|
||||
|
||||
def dist_aitchison(prevs_true, prevs_hat):
|
||||
clr = CLRtransformation()
|
||||
return np.linalg.norm(clr(prevs_true) - clr(prevs_hat), axis=-1)
|
||||
|
||||
|
||||
def mean_dist_aitchison(prevs_true, prevs_hat):
|
||||
return np.mean(dist_aitchison(prevs_true, prevs_hat))
|
||||
|
||||
|
||||
def mkld(prevs_true, prevs_hat, eps=None):
|
||||
"""Computes the mean Kullback-Leibler divergence (see :meth:`quapy.error.kld`) across the
|
||||
sample pairs. The distributions are smoothed using the `eps` factor
|
||||
|
|
@ -374,8 +432,8 @@ def __check_eps(eps=None):
|
|||
|
||||
|
||||
CLASSIFICATION_ERROR = {f1e, acce}
|
||||
QUANTIFICATION_ERROR = {mae, mnae, mrae, mnrae, mse, mkld, mnkld}
|
||||
QUANTIFICATION_ERROR_SINGLE = {ae, nae, rae, nrae, se, kld, nkld}
|
||||
QUANTIFICATION_ERROR = {mae, mnae, mrae, mnrae, mse, mkld, mnkld, msre, mean_dist_aitchison}
|
||||
QUANTIFICATION_ERROR_SINGLE = {ae, nae, rae, nrae, se, kld, nkld, sre, dist_aitchison}
|
||||
QUANTIFICATION_ERROR_SMOOTH = {kld, nkld, rae, nrae, mkld, mnkld, mrae}
|
||||
CLASSIFICATION_ERROR_NAMES = {func.__name__ for func in CLASSIFICATION_ERROR}
|
||||
QUANTIFICATION_ERROR_NAMES = {func.__name__ for func in QUANTIFICATION_ERROR}
|
||||
|
|
@ -387,6 +445,7 @@ ERROR_NAMES = \
|
|||
f1_error = f1e
|
||||
acc_error = acce
|
||||
mean_absolute_error = mae
|
||||
squared_ratio_error = sre
|
||||
absolute_error = ae
|
||||
mean_relative_absolute_error = mrae
|
||||
relative_absolute_error = rae
|
||||
|
|
|
|||
|
|
@ -1,10 +1,15 @@
|
|||
import warnings
|
||||
from abc import ABC, abstractmethod
|
||||
from collections import defaultdict
|
||||
from functools import lru_cache
|
||||
from typing import Literal, Union, Callable
|
||||
from numpy.typing import ArrayLike
|
||||
|
||||
import scipy
|
||||
import numpy as np
|
||||
from scipy.special import softmax
|
||||
|
||||
import quapy as qp
|
||||
|
||||
|
||||
# ------------------------------------------------------------------------------------------
|
||||
|
|
@ -277,7 +282,7 @@ def l1_norm(prevalences: ArrayLike) -> np.ndarray:
|
|||
"""
|
||||
n_classes = prevalences.shape[-1]
|
||||
accum = prevalences.sum(axis=-1, keepdims=True)
|
||||
prevalences = np.true_divide(prevalences, accum, where=accum > 0)
|
||||
prevalences = np.true_divide(prevalences, accum, where=accum > 0, out=None)
|
||||
allzeros = accum.flatten() == 0
|
||||
if any(allzeros):
|
||||
if prevalences.ndim == 1:
|
||||
|
|
@ -583,8 +588,8 @@ def solve_adjustment(
|
|||
"""
|
||||
Function that tries to solve for :math:`p` the equation :math:`q = M p`, where :math:`q` is the vector of
|
||||
`unadjusted counts` (as estimated, e.g., via classify and count) with :math:`q_i` an estimate of
|
||||
:math:`P(\hat{Y}=y_i)`, and where :math:`M` is the matrix of `class-conditional rates` with :math:`M_{ij}` an
|
||||
estimate of :math:`P(\hat{Y}=y_i|Y=y_j)`.
|
||||
:math:`P(\\hat{Y}=y_i)`, and where :math:`M` is the matrix of `class-conditional rates` with :math:`M_{ij}` an
|
||||
estimate of :math:`P(\\hat{Y}=y_i|Y=y_j)`.
|
||||
|
||||
:param class_conditional_rates: array of shape `(n_classes, n_classes,)` with entry `(i,j)` being the estimate
|
||||
of :math:`P(\hat{Y}=y_i|Y=y_j)`, that is, the probability that an instance that belongs to class :math:`y_j`
|
||||
|
|
@ -649,3 +654,96 @@ def solve_adjustment(
|
|||
raise ValueError(f'unknown {solver=}')
|
||||
|
||||
|
||||
# ------------------------------------------------------------------------------------------
|
||||
# Transformations from Compositional analysis
|
||||
# ------------------------------------------------------------------------------------------
|
||||
|
||||
class CompositionalTransformation(ABC):
|
||||
"""
|
||||
Abstract class of transformations from compositional data.
|
||||
Basically, callable functions with an "inverse" function.
|
||||
"""
|
||||
@abstractmethod
|
||||
def __call__(self, X): ...
|
||||
|
||||
@abstractmethod
|
||||
def inverse(self, Z): ...
|
||||
|
||||
EPSILON=1e-6
|
||||
|
||||
|
||||
class CLRtransformation(CompositionalTransformation):
|
||||
"""
|
||||
Centered log-ratio (CLR), from compositional analysis
|
||||
"""
|
||||
def __call__(self, X):
|
||||
"""
|
||||
Applies the CLR function to X thus mapping the instances, which are contained in :math:`\\mathcal{R}^{n}` but
actually lie on an :math:`(n-1)`-dimensional simplex embedded in :math:`\\mathcal{R}^{n}`, onto an unrestricted space in :math:`\\mathcal{R}^{n}`.
The prevalence vectors are smoothed with a small constant (`EPSILON`) before taking logarithms.

:param X: np.ndarray of (n_instances, n_dimensions) to be transformed
:return: np.ndarray of (n_instances, n_dimensions), the CLR-transformed points
|
||||
"""
|
||||
X = np.asarray(X)
|
||||
X = qp.error.smooth(X, self.EPSILON)
|
||||
G = np.exp(np.mean(np.log(X), axis=-1, keepdims=True)) # geometric mean
|
||||
return np.log(X / G)
|
||||
|
||||
def inverse(self, Z):
|
||||
"""
|
||||
Inverse function. However, clr.inverse(clr(X)) does not exactly coincide with X due to smoothing.
|
||||
|
||||
:param Z: np.ndarray of (n_instances, n_dimensions) to be transformed
|
||||
:return: np.ndarray of (n_instances, n_dimensions), the points mapped back onto the probability simplex
|
||||
"""
|
||||
return scipy.special.softmax(Z, axis=-1)
|
||||
|
||||
|
||||
class ILRtransformation(CompositionalTransformation):
|
||||
"""
|
||||
Isometric log-ratio (ILR), from compositional analysis
|
||||
"""
|
||||
def __call__(self, X):
|
||||
X = np.asarray(X)
|
||||
X = qp.error.smooth(X, self.EPSILON)
|
||||
k = X.shape[-1]
|
||||
V = self.get_V(k) # (k-1, k)
|
||||
logp = np.log(X)
|
||||
return logp @ V.T
|
||||
|
||||
def inverse(self, Z):
|
||||
Z = np.asarray(Z)
|
||||
# get dimension
|
||||
k_minus_1 = Z.shape[-1]
|
||||
k = k_minus_1 + 1
|
||||
V = self.get_V(k) # (k-1, k)
|
||||
logp = Z @ V
|
||||
p = np.exp(logp)
|
||||
p = p / np.sum(p, axis=-1, keepdims=True)
|
||||
return p
|
||||
|
||||
@lru_cache(maxsize=None)
|
||||
def get_V(self, k):
|
||||
def helmert_matrix(k):
|
||||
"""
|
||||
Returns the (k x k) Helmert matrix.
|
||||
"""
|
||||
H = np.zeros((k, k))
|
||||
for i in range(1, k):
|
||||
H[i, :i] = 1
|
||||
H[i, i] = -(i)
|
||||
H[i] = H[i] / np.sqrt(i * (i + 1))
|
||||
# row 0 stays zeros; will be discarded
|
||||
return H
|
||||
|
||||
def ilr_basis(k):
|
||||
"""
|
||||
Constructs an orthonormal ILR basis using the Helmert submatrix.
|
||||
Output shape: (k-1, k)
|
||||
"""
|
||||
H = helmert_matrix(k)
|
||||
V = H[1:, :] # remove first row of zeros
|
||||
return V
|
||||
|
||||
return ilr_basis(k)
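# Quick sanity checks (illustrative notes, not part of the original code):
#   >>> ilr = ILRtransformation()
#   >>> np.allclose(ilr([0.25, 0.25, 0.25, 0.25]), 0.)   # the uniform composition maps to the origin
#   True
#   >>> clr = CLRtransformation()
#   >>> np.allclose(clr.inverse(clr([0.2, 0.3, 0.5])), [0.2, 0.3, 0.5], atol=1e-4)  # round-trip, up to smoothing
#   True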
|
||||
|
|
|
|||
|
|
@ -29,6 +29,7 @@ AGGREGATIVE_METHODS = {
|
|||
aggregative.KDEyHD,
|
||||
# aggregative.OneVsAllAggregative,
|
||||
confidence.BayesianCC,
|
||||
confidence.PQ,
|
||||
}
|
||||
|
||||
BINARY_METHODS = {
|
||||
|
|
@ -40,6 +41,7 @@ BINARY_METHODS = {
|
|||
aggregative.MAX,
|
||||
aggregative.MS,
|
||||
aggregative.MS2,
|
||||
confidence.PQ,
|
||||
}
|
||||
|
||||
MULTICLASS_METHODS = {
|
||||
|
|
|
|||
|
|
@ -1,13 +1,21 @@
|
|||
"""
|
||||
Utility functions for `Bayesian quantification <https://arxiv.org/abs/2302.09159>`_ methods.
|
||||
"""
|
||||
import contextlib
|
||||
import os
|
||||
import sys
|
||||
|
||||
import numpy as np
|
||||
import importlib.resources
|
||||
|
||||
try:
|
||||
import jax
|
||||
import jax.numpy as jnp
|
||||
import numpyro
|
||||
import numpyro.distributions as dist
|
||||
import stan
|
||||
import logging
|
||||
import stan.common
|
||||
|
||||
DEPENDENCIES_INSTALLED = True
|
||||
except ImportError:
|
||||
|
|
@ -15,6 +23,7 @@ except ImportError:
|
|||
jnp = None
|
||||
numpyro = None
|
||||
dist = None
|
||||
stan = None
|
||||
|
||||
DEPENDENCIES_INSTALLED = False
|
||||
|
||||
|
|
@ -24,7 +33,7 @@ P_TEST_C: str = "P_test(C)"
|
|||
P_C_COND_Y: str = "P(C|Y)"
|
||||
|
||||
|
||||
def model(n_c_unlabeled: np.ndarray, n_y_and_c_labeled: np.ndarray) -> None:
|
||||
def model_bayesianCC(n_c_unlabeled: np.ndarray, n_y_and_c_labeled: np.ndarray, temperature:float, alpha: np.ndarray) -> None:
|
||||
"""
|
||||
Defines a probabilistic model in `NumPyro <https://num.pyro.ai/>`_.
|
||||
|
||||
|
|
@ -38,21 +47,52 @@ def model(n_c_unlabeled: np.ndarray, n_y_and_c_labeled: np.ndarray) -> None:
|
|||
K = len(n_c_unlabeled)
|
||||
L = len(n_y_labeled)
|
||||
|
||||
pi_ = numpyro.sample(P_TEST_Y, dist.Dirichlet(jnp.ones(L)))
|
||||
pi_ = numpyro.sample(P_TEST_Y, dist.Dirichlet(jnp.asarray(alpha, dtype=jnp.float32)))
|
||||
p_c_cond_y = numpyro.sample(P_C_COND_Y, dist.Dirichlet(jnp.ones(K).repeat(L).reshape(L, K)))
|
||||
|
||||
if temperature==1:
|
||||
# original implementation
|
||||
with numpyro.plate('plate', L):
|
||||
numpyro.sample('F_yc', dist.Multinomial(n_y_labeled, p_c_cond_y), obs=n_y_and_c_labeled)
|
||||
|
||||
p_c = numpyro.deterministic(P_TEST_C, jnp.einsum("yc,y->c", p_c_cond_y, pi_))
|
||||
numpyro.sample('N_c', dist.Multinomial(jnp.sum(n_c_unlabeled), p_c), obs=n_c_unlabeled)
|
||||
else:
|
||||
# with temperature modification
|
||||
with numpyro.plate('plate_y', L):
|
||||
logp_F = dist.Multinomial(
|
||||
n_y_labeled,
|
||||
p_c_cond_y
|
||||
).log_prob(n_y_and_c_labeled)
|
||||
|
||||
numpyro.factor(
|
||||
'F_yc_loglik',
|
||||
jnp.sum(logp_F) / temperature
|
||||
)
|
||||
p_c = numpyro.deterministic(
|
||||
P_TEST_C,
|
||||
jnp.einsum("yc,y->c", p_c_cond_y, pi_)
|
||||
)
|
||||
|
||||
# likelihood of the unlabeled data
|
||||
logp_N = dist.Multinomial(
|
||||
jnp.sum(n_c_unlabeled),
|
||||
p_c
|
||||
).log_prob(n_c_unlabeled)
|
||||
|
||||
numpyro.factor(
|
||||
'N_c_loglik',
|
||||
logp_N / temperature
|
||||
)
|
||||
|
||||
|
||||
def sample_posterior(
|
||||
def sample_posterior_bayesianCC(
|
||||
n_c_unlabeled: np.ndarray,
|
||||
n_y_and_c_labeled: np.ndarray,
|
||||
num_warmup: int,
|
||||
num_samples: int,
|
||||
alpha: np.ndarray,
|
||||
temperature = 1.,
|
||||
seed: int = 0,
|
||||
) -> dict:
|
||||
"""
|
||||
|
|
@ -65,15 +105,86 @@ def sample_posterior(
|
|||
with entry `(y, c)` being the number of instances labeled as class `y` and predicted as class `c`.
|
||||
:param num_warmup: the number of warmup steps.
|
||||
:param num_samples: the number of samples to draw.
|
||||
:param alpha: a `np.ndarray` of shape `(n_classes,)` with the alpha parameters of the
|
||||
Dirichlet prior
|
||||
:param temperature: float, temperature used to temper the labeled and unlabeled likelihood terms (1. recovers the original model).
:param seed: the random seed.
|
||||
:return: a `dict` with the samples. The keys are the names of the latent variables.
|
||||
"""
|
||||
|
||||
mcmc = numpyro.infer.MCMC(
|
||||
numpyro.infer.NUTS(model),
|
||||
numpyro.infer.NUTS(model_bayesianCC),
|
||||
num_warmup=num_warmup,
|
||||
num_samples=num_samples,
|
||||
progress_bar=False
|
||||
)
|
||||
rng_key = jax.random.PRNGKey(seed)
|
||||
mcmc.run(rng_key, n_c_unlabeled=n_c_unlabeled, n_y_and_c_labeled=n_y_and_c_labeled)
|
||||
mcmc.run(rng_key, n_c_unlabeled=n_c_unlabeled, n_y_and_c_labeled=n_y_and_c_labeled, temperature=temperature, alpha=alpha)
|
||||
return mcmc.get_samples()
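# Illustrative call (the count arrays below are made up): for a binary problem with
# 100 unlabeled predictions and a 2x2 labeled confusion matrix,
#
#   >>> samples = sample_posterior_bayesianCC(
#   ...     n_c_unlabeled=np.array([30, 70]),
#   ...     n_y_and_c_labeled=np.array([[40, 10], [5, 45]]),
#   ...     num_warmup=500, num_samples=1000,
#   ...     alpha=np.ones(2),        # uninformative (uniform) Dirichlet prior
#   ...     temperature=1.0)         # 1.0 recovers the original model
#   >>> samples[P_TEST_Y].shape      # posterior draws of the test prevalence
#   (1000, 2)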
|
||||
|
||||
|
||||
|
||||
def load_stan_file():
|
||||
return importlib.resources.files('quapy.method').joinpath('stan/pq.stan').read_text(encoding='utf-8')
|
||||
|
||||
|
||||
@contextlib.contextmanager
|
||||
def _suppress_stan_logging():
|
||||
with open(os.devnull, "w") as devnull:
|
||||
old_stderr = sys.stderr
|
||||
sys.stderr = devnull
|
||||
try:
|
||||
yield
|
||||
finally:
|
||||
sys.stderr = old_stderr
|
||||
|
||||
|
||||
def pq_stan(stan_code, n_bins, pos_hist, neg_hist, test_hist, number_of_samples, num_warmup, stan_seed):
|
||||
"""
|
||||
Perform Bayesian prevalence estimation using a Stan model for probabilistic quantification.
|
||||
|
||||
This function builds and samples from a Stan model that implements a bin-based Bayesian
|
||||
quantifier. It uses the class-conditional histograms of the classifier
|
||||
outputs for positive and negative examples, along with the test histogram, to estimate
|
||||
the posterior distribution of prevalence in the test set.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
stan_code : str
|
||||
The Stan model code as a string.
|
||||
n_bins : int
|
||||
Number of bins used to build the histograms for positive and negative examples.
|
||||
pos_hist : array-like of shape (n_bins,)
|
||||
Histogram counts of the classifier outputs for the positive class.
|
||||
neg_hist : array-like of shape (n_bins,)
|
||||
Histogram counts of the classifier outputs for the negative class.
|
||||
test_hist : array-like of shape (n_bins,)
|
||||
Histogram counts of the classifier outputs for the test set, binned using the same bins.
|
||||
number_of_samples : int
|
||||
Number of post-warmup samples to draw from the Stan posterior.
|
||||
num_warmup : int
|
||||
Number of warmup iterations for the sampler.
|
||||
stan_seed : int
|
||||
Random seed for Stan model compilation and sampling, ensuring reproducibility.
|
||||
|
||||
Returns
|
||||
-------
|
||||
prev_samples : numpy.ndarray
|
||||
An array of posterior samples of the prevalence (`prev`) in the test set.
|
||||
Each element corresponds to one draw from the posterior distribution.
|
||||
"""
|
||||
|
||||
logging.getLogger("stan.common").setLevel(logging.ERROR)
|
||||
|
||||
stan_data = {
|
||||
'n_bucket': n_bins,
|
||||
'train_neg': neg_hist.tolist(),
|
||||
'train_pos': pos_hist.tolist(),
|
||||
'test': test_hist.tolist(),
|
||||
'posterior': 1
|
||||
}
|
||||
|
||||
with _suppress_stan_logging():
|
||||
stan_model = stan.build(stan_code, data=stan_data, random_seed=stan_seed)
|
||||
fit = stan_model.sample(num_chains=1, num_samples=number_of_samples,num_warmup=num_warmup)
|
||||
|
||||
return fit['prev']
|
||||
|
|
|
|||
|
|
@ -5,7 +5,7 @@ from sklearn.neighbors import KernelDensity
|
|||
import quapy as qp
|
||||
from quapy.method.aggregative import AggregativeSoftQuantifier
|
||||
import quapy.functional as F
|
||||
|
||||
from scipy.special import logsumexp
|
||||
from sklearn.metrics.pairwise import rbf_kernel
|
||||
|
||||
|
||||
|
|
@ -15,9 +15,11 @@ class KDEBase:
|
|||
"""
|
||||
|
||||
BANDWIDTH_METHOD = ['scott', 'silverman']
|
||||
KERNELS = ['gaussian', 'aitchison', 'ilr']
|
||||
|
||||
|
||||
@classmethod
|
||||
def _check_bandwidth(cls, bandwidth):
|
||||
def _check_bandwidth(cls, bandwidth, kernel):
|
||||
"""
|
||||
Checks that the bandwidth parameter is correct
|
||||
|
||||
|
|
@ -27,32 +29,61 @@ class KDEBase:
|
|||
assert bandwidth in KDEBase.BANDWIDTH_METHOD or isinstance(bandwidth, float), \
|
||||
f'invalid bandwidth, valid ones are {KDEBase.BANDWIDTH_METHOD} or float values'
|
||||
if isinstance(bandwidth, float):
|
||||
assert 0 < bandwidth < 1, \
|
||||
"the bandwith for KDEy should be in (0,1), since this method models the unit simplex"
|
||||
#assert kernel!='gaussian' or (0 < bandwidth < 1), \
|
||||
# ("the bandwidth for a Gaussian kernel in KDEy should be in (0,1), "
|
||||
# "since this method models the unit simplex")
|
||||
pass
|
||||
return bandwidth
|
||||
|
||||
def get_kde_function(self, X, bandwidth):
|
||||
@classmethod
|
||||
def _check_kernel(cls, kernel):
|
||||
"""
|
||||
Checks that the kernel parameter is correct
|
||||
|
||||
:param kernel: str
|
||||
:return: the validated kernel
|
||||
"""
|
||||
assert kernel in KDEBase.KERNELS, f'unknown {kernel=}'
|
||||
return kernel
|
||||
|
||||
def get_kde_function(self, X, bandwidth, kernel):
|
||||
"""
|
||||
Wraps the KDE function from scikit-learn.
|
||||
|
||||
:param X: data for which the density function is to be estimated
|
||||
:param bandwidth: the bandwidth of the kernel
|
||||
:param kernel: the kernel (valid ones are in KDEBase.KERNELS)
|
||||
:return: a scikit-learn's KernelDensity object
|
||||
"""
|
||||
if kernel == 'aitchison':
|
||||
X = self.clr_transform(X)
|
||||
elif kernel == 'ilr':
|
||||
X = self.ilr_transform(X)
|
||||
|
||||
return KernelDensity(bandwidth=bandwidth).fit(X)
|
||||
|
||||
def pdf(self, kde, X):
|
||||
def pdf(self, kde, X, kernel, log_densities=False):
|
||||
"""
|
||||
Wraps the density evaluation of scikit-learn's KDE. Scikit-learn returns log-scores (s), so this
function returns :math:`e^{s}` (or the log-scores themselves if `log_densities=True`)
|
||||
|
||||
:param kde: a previously fit KDE function
|
||||
:param X: the data for which the density is to be estimated
|
||||
:param kernel: the kernel (valid ones are in KDEBase.KERNELS)
:param log_densities: bool, if True returns the log-densities instead of the densities (default False)
|
||||
:return: np.ndarray with the densities
|
||||
"""
|
||||
return np.exp(kde.score_samples(X))
|
||||
if kernel == 'aitchison':
|
||||
X = self.clr_transform(X)
|
||||
elif kernel == 'ilr':
|
||||
X = self.ilr_transform(X)
|
||||
|
||||
def get_mixture_components(self, X, y, classes, bandwidth):
|
||||
log_density = kde.score_samples(X)
|
||||
if log_densities:
|
||||
return log_density
|
||||
else:
|
||||
return np.exp(log_density)
|
||||
|
||||
def get_mixture_components(self, X, y, classes, bandwidth, kernel):
|
||||
"""
|
||||
Returns an array containing the mixture components, i.e., the KDE functions for each class.
|
||||
|
||||
|
|
@ -60,15 +91,29 @@ class KDEBase:
|
|||
:param y: the class labels
|
||||
:param n_classes: integer, the number of classes
|
||||
:param bandwidth: float, the bandwidth of the kernel
|
||||
:param kernel: the kernel (valid ones are in KDEBase.KERNELS)
|
||||
:return: a list of KernelDensity objects, each fitted with the corresponding class-specific covariates
|
||||
"""
|
||||
class_cond_X = []
|
||||
for cat in classes:
|
||||
selX = X[y==cat]
|
||||
if selX.size==0:
|
||||
print(f'WARNING: empty class {cat}')
|
||||
raise ValueError(f'WARNING: empty class {cat}')
|
||||
selX = [F.uniform_prevalence(len(classes))]
|
||||
|
||||
class_cond_X.append(selX)
|
||||
return [self.get_kde_function(X_cond_yi, bandwidth) for X_cond_yi in class_cond_X]
|
||||
return [self.get_kde_function(X_cond_yi, bandwidth, kernel) for X_cond_yi in class_cond_X]
|
||||
|
||||
def clr_transform(self, X):
|
||||
if not hasattr(self, 'clr'):
|
||||
self.clr = F.CLRtransformation()
|
||||
return self.clr(X)
|
||||
|
||||
def ilr_transform(self, X):
|
||||
if not hasattr(self, 'ilr'):
|
||||
self.ilr = F.ILRtransformation()
|
||||
return self.ilr(X)
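# Note (added for clarity): with kernel='aitchison' or kernel='ilr' the posteriors are first
# mapped out of the simplex (CLR or ILR transformation, respectively) and a standard Gaussian
# KDE is then fitted in the transformed space; kernel='gaussian' fits the KDE directly on the
# simplex coordinates.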
|
||||
|
||||
|
||||
class KDEyML(AggregativeSoftQuantifier, KDEBase):
|
||||
|
|
@ -107,17 +152,19 @@ class KDEyML(AggregativeSoftQuantifier, KDEBase):
|
|||
are to be generated in a `k`-fold cross-validation manner (with this integer indicating the value
|
||||
for `k`); or as a tuple (X,y) defining the specific set of data to use for validation.
|
||||
:param bandwidth: float, the bandwidth of the Kernel
|
||||
:param kernel: kernel of KDE, valid ones are in KDEBase.KERNELS
|
||||
:param random_state: a seed to be set before fitting any base quantifier (default None)
|
||||
"""
|
||||
|
||||
def __init__(self, classifier: BaseEstimator=None, fit_classifier=True, val_split=5, bandwidth=0.1,
|
||||
def __init__(self, classifier: BaseEstimator=None, fit_classifier=True, val_split=5, bandwidth=0.1, kernel='gaussian',
|
||||
random_state=None):
|
||||
super().__init__(classifier, fit_classifier, val_split)
|
||||
self.bandwidth = KDEBase._check_bandwidth(bandwidth)
|
||||
self.bandwidth = KDEBase._check_bandwidth(bandwidth, kernel)
|
||||
self.kernel = self._check_kernel(kernel)
|
||||
self.random_state=random_state
|
||||
|
||||
def aggregation_fit(self, classif_predictions, labels):
|
||||
self.mix_densities = self.get_mixture_components(classif_predictions, labels, self.classes_, self.bandwidth)
|
||||
self.mix_densities = self.get_mixture_components(classif_predictions, labels, self.classes_, self.bandwidth, self.kernel)
|
||||
return self
|
||||
|
||||
def aggregate(self, posteriors: np.ndarray):
|
||||
|
|
@ -129,12 +176,24 @@ class KDEyML(AggregativeSoftQuantifier, KDEBase):
|
|||
:return: a vector of class prevalence estimates
|
||||
"""
|
||||
with qp.util.temp_seed(self.random_state):
|
||||
epsilon = 1e-10
|
||||
epsilon = 1e-12
|
||||
n_classes = len(self.mix_densities)
|
||||
test_densities = [self.pdf(kde_i, posteriors) for kde_i in self.mix_densities]
|
||||
if n_classes>=30:
|
||||
# new version: improves numerical stability with logsumexp, at the cost of optimization efficiency.
|
||||
# needed if the number of classes is large (approx >= 30) because densities tend to grow exponentially
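# the identity being exploited is: log Σ_i prev_i·f_i(x) = logsumexp_i( log(prev_i) + log f_i(x) )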
|
||||
test_log_densities = [self.pdf(kde_i, posteriors, self.kernel, log_densities=True) for kde_i in self.mix_densities]
|
||||
|
||||
def neg_loglikelihood(prev):
|
||||
test_mixture_likelihood = sum(prev_i * dens_i for prev_i, dens_i in zip (prev, test_densities))
|
||||
prev = np.clip(prev, epsilon, 1.0)
|
||||
test_loglikelihood = logsumexp(np.log(prev)[:,None] + test_log_densities, axis=0)
|
||||
return -np.sum(test_loglikelihood)
|
||||
else:
|
||||
# original implementation
|
||||
test_densities = [self.pdf(kde_i, posteriors, self.kernel) for kde_i in self.mix_densities]
|
||||
|
||||
def neg_loglikelihood(prev):
|
||||
# prev = np.clip(prev, epsilon, 1.0)
|
||||
test_mixture_likelihood = prev @ test_densities
|
||||
test_loglikelihood = np.log(test_mixture_likelihood + epsilon)
|
||||
return -np.sum(test_loglikelihood)
|
||||
|
||||
|
|
@ -191,7 +250,7 @@ class KDEyHD(AggregativeSoftQuantifier, KDEBase):
|
|||
|
||||
super().__init__(classifier, fit_classifier, val_split)
|
||||
self.divergence = divergence
|
||||
self.bandwidth = KDEBase._check_bandwidth(bandwidth)
|
||||
self.bandwidth = KDEBase._check_bandwidth(bandwidth, kernel='gaussian')
|
||||
self.random_state=random_state
|
||||
self.montecarlo_trials = montecarlo_trials
|
||||
|
||||
|
|
@ -278,7 +337,7 @@ class KDEyCS(AggregativeSoftQuantifier):
|
|||
|
||||
def __init__(self, classifier: BaseEstimator=None, fit_classifier=True, val_split=5, bandwidth=0.1):
|
||||
super().__init__(classifier, fit_classifier, val_split)
|
||||
self.bandwidth = KDEBase._check_bandwidth(bandwidth)
|
||||
self.bandwidth = KDEBase._check_bandwidth(bandwidth, kernel='gaussian')
|
||||
|
||||
def gram_matrix_mix_sum(self, X, Y=None):
|
||||
# this adapts the output of the rbf_kernel function (pairwise evaluations of Gaussian kernels k(x,y))
|
||||
|
|
|
|||
|
|
@ -163,6 +163,7 @@ class AggregativeQuantifier(BaseQuantifier, ABC):
|
|||
|
||||
:param X: array-like of shape `(n_samples, n_features)`, the training instances
|
||||
:param y: array-like of shape `(n_samples,)`, the labels
|
||||
:return: a tuple (predictions, labels)
|
||||
"""
|
||||
self._check_classifier(adapt_if_necessary=self.fit_classifier)
|
||||
|
||||
|
|
@ -673,7 +674,7 @@ class PACC(AggregativeSoftQuantifier):
|
|||
class EMQ(AggregativeSoftQuantifier):
|
||||
"""
|
||||
`Expectation Maximization for Quantification <https://ieeexplore.ieee.org/abstract/document/6789744>`_ (EMQ),
|
||||
aka `Saerens-Latinne-Decaestecker` (SLD) algorithm.
|
||||
aka `Saerens-Latinne-Decaestecker` (SLD) algorithm, or `Maximum Likelihood Label Shift` (MLLS).
|
||||
EMQ consists of using the well-known `Expectation Maximization algorithm` to iteratively update the posterior
|
||||
probabilities generated by a probabilistic classifier and the class prevalence estimates obtained via
|
||||
maximum-likelihood estimation, in a mutually recursive way, until convergence.
|
||||
|
|
|
|||
|
|
@ -1,19 +1,24 @@
|
|||
from numbers import Number
|
||||
from typing import Iterable
|
||||
|
||||
import numpy as np
|
||||
from joblib import Parallel, delayed
|
||||
from sklearn.base import BaseEstimator
|
||||
from sklearn.metrics import confusion_matrix
|
||||
|
||||
import quapy as qp
|
||||
import quapy.functional as F
|
||||
from quapy.functional import CompositionalTransformation, CLRtransformation, ILRtransformation
|
||||
from quapy.method import _bayesian
|
||||
from quapy.method.aggregative import AggregativeCrispQuantifier
|
||||
from quapy.data import LabelledCollection
|
||||
from quapy.method.aggregative import AggregativeQuantifier
|
||||
from quapy.method.aggregative import AggregativeQuantifier, AggregativeCrispQuantifier, AggregativeSoftQuantifier, BinaryAggregativeQuantifier
|
||||
from scipy.stats import chi2
|
||||
from sklearn.utils import resample
|
||||
from abc import ABC, abstractmethod
|
||||
from scipy.special import softmax, factorial
|
||||
from scipy.special import factorial
|
||||
import copy
|
||||
from functools import lru_cache
|
||||
from tqdm import tqdm
|
||||
|
||||
"""
|
||||
This module provides implementation of different types of confidence regions, and the implementation of Bootstrap
|
||||
|
|
@ -80,28 +85,92 @@ class ConfidenceRegionABC(ABC):
|
|||
proportion = np.clip(self.coverage(uniform_simplex), 0., 1.)
|
||||
return proportion
|
||||
|
||||
@property
|
||||
@abstractmethod
|
||||
def samples(self):
|
||||
""" Returns internal samples """
|
||||
...
|
||||
|
||||
def __contains__(self, p):
|
||||
"""
|
||||
Overloads the `in` operator; checks whether `p` is contained in the region
|
||||
|
||||
:param p: array-like
|
||||
:return: boolean
|
||||
"""
|
||||
p = np.asarray(p)
|
||||
assert p.ndim==1, f'unexpected shape for point parameter'
|
||||
return self.coverage(p)==1.
|
||||
|
||||
def closest_point_in_region(self, p, tol=1e-6, max_iter=30):
|
||||
"""
|
||||
Finds the closest point to p that belongs to the region. Assumes the region is convex.
|
||||
|
||||
:param p: array-like, the point
|
||||
:param tol: float, error tolerance
|
||||
:param max_iter: int, max number of iterations
|
||||
:returns: array-like, the closest point to p in the segment between p and the center of the region, that
|
||||
belongs to the region
|
||||
"""
|
||||
p = np.asarray(p, dtype=float)
|
||||
|
||||
# if p in region, returns p itself
|
||||
if p in self:
|
||||
return p.copy()
|
||||
|
||||
# center of the region
|
||||
c = self.point_estimate()
|
||||
|
||||
# binary search in [0,1], interpolation parameter
|
||||
# low=closest to p, high=closest to c
|
||||
low, high = 0.0, 1.0
|
||||
for _ in range(max_iter):
|
||||
mid = 0.5 * (low + high)
|
||||
x = p*(1-mid) + c*mid
|
||||
if x in self:
|
||||
high = mid
|
||||
else:
|
||||
low = mid
|
||||
if high - low < tol:
|
||||
break
|
||||
|
||||
in_boundary = p*(1-high) + c*high
|
||||
return in_boundary
|
||||
|
||||
|
||||
class WithConfidenceABC(ABC):
|
||||
"""
|
||||
Abstract class for quantifiers that, in addition to point estimates, return confidence regions.
|
||||
"""
|
||||
METHODS = ['intervals', 'ellipse', 'ellipse-clr']
|
||||
REGION_TYPE = ['intervals', 'ellipse', 'ellipse-clr', 'ellipse-ilr']
|
||||
|
||||
@abstractmethod
|
||||
def quantify_conf(self, instances, confidence_level=None) -> (np.ndarray, ConfidenceRegionABC):
|
||||
def predict_conf(self, instances, confidence_level=0.95) -> (np.ndarray, ConfidenceRegionABC):
|
||||
"""
|
||||
Adds the method `quantify_conf` to the interface. This method returns not only the point-estimate, but
|
||||
Adds the method `predict_conf` to the interface. This method returns not only the point-estimate, but
|
||||
also the confidence region around it.
|
||||
|
||||
:param instances: a np.ndarray of shape (n_instances, n_features,)
|
||||
:confidence_level: float in (0, 1)
|
||||
:param confidence_level: float in (0, 1), default is 0.95
|
||||
:return: a tuple (`point_estimate`, `conf_region`), where `point_estimate` is a np.ndarray of shape
|
||||
(n_classes,) and `conf_region` is an object from :class:`ConfidenceRegionABC`
|
||||
"""
|
||||
...
|
||||
|
||||
def quantify_conf(self, instances, confidence_level=0.95) -> (np.ndarray, ConfidenceRegionABC):
|
||||
"""
|
||||
Alias to `predict_conf`. This method returns not only the point-estimate, but
|
||||
also the confidence region around it.
|
||||
|
||||
:param instances: a np.ndarray of shape (n_instances, n_features,)
|
||||
:param confidence_level: float in (0, 1), default is 0.95
|
||||
:return: a tuple (`point_estimate`, `conf_region`), where `point_estimate` is a np.ndarray of shape
|
||||
(n_classes,) and `conf_region` is an object from :class:`ConfidenceRegionABC`
|
||||
"""
|
||||
return self.predict_conf(instances=instances, confidence_level=confidence_level)
|
||||
|
||||
@classmethod
|
||||
def construct_region(cls, prev_estims, confidence_level=0.95, method='intervals'):
|
||||
def construct_region(cls, prev_estims, confidence_level=0.95, method='intervals')->ConfidenceRegionABC:
|
||||
"""
|
||||
Construct a confidence region given many prevalence estimations.
|
||||
|
||||
|
|
@ -119,6 +188,8 @@ class WithConfidenceABC(ABC):
|
|||
region = ConfidenceEllipseSimplex(prev_estims, confidence_level=confidence_level)
|
||||
elif method == 'ellipse-clr':
|
||||
region = ConfidenceEllipseCLR(prev_estims, confidence_level=confidence_level)
|
||||
elif method == 'ellipse-ilr':
|
||||
region = ConfidenceEllipseILR(prev_estims, confidence_level=confidence_level)
|
||||
|
||||
if region is None:
|
||||
raise NotImplementedError(f'unknown method {method}')
|
||||
|
|
@ -136,7 +207,7 @@ def simplex_volume(n):
|
|||
return 1 / factorial(n)
|
||||
|
||||
|
||||
def within_ellipse_prop(values, mean, prec_matrix, chi2_critical):
|
||||
def within_ellipse_prop__(values, mean, prec_matrix, chi2_critical):
|
||||
"""
|
||||
Checks the proportion of values that belong to the ellipse with center `mean` and precision matrix `prec_matrix`
|
||||
at a distance `chi2_critical`.
|
||||
|
|
@ -169,110 +240,126 @@ def within_ellipse_prop(values, mean, prec_matrix, chi2_critical):
|
|||
return within_elipse * 1.0
|
||||
|
||||
|
||||
class ConfidenceEllipseSimplex(ConfidenceRegionABC):
|
||||
def within_ellipse_prop(values, mean, prec_matrix, chi2_critical):
|
||||
"""
|
||||
Instantiates a Confidence Ellipse in the probability simplex.
|
||||
Checks the proportion of values that belong to the ellipse with center `mean` and precision matrix `prec_matrix`
|
||||
at a distance `chi2_critical`.
|
||||
|
||||
:param X: np.ndarray of shape (n_bootstrap_samples, n_classes)
|
||||
:param confidence_level: float, the confidence level (default 0.95)
|
||||
:param values: a np.ndarray of shape (n_dim,) or (n_values, n_dim,)
|
||||
:param mean: a np.ndarray of shape (n_dim,) with the center of the ellipse
|
||||
:param prec_matrix: a np.ndarray with the precision matrix (inverse of the
|
||||
covariance matrix) of the ellipse. If this inverse cannot be computed
|
||||
then None must be passed
|
||||
:param chi2_critical: float, the chi2 critical value
|
||||
|
||||
:return: float in [0,1], the fraction of values that are contained in the ellipse
|
||||
defined by the mean (center), the precision matrix (shape), and the chi2_critical value (distance).
|
||||
If `values` is only one value, then either 0. (not contained) or 1. (contained) is returned.
|
||||
"""
|
||||
if prec_matrix is None:
|
||||
return 0.
|
||||
|
||||
values = np.atleast_2d(values)
|
||||
diff = values - mean
|
||||
d_M_squared = np.sum(diff @ prec_matrix * diff, axis=-1)
|
||||
within_ellipse = d_M_squared <= chi2_critical
|
||||
|
||||
if len(within_ellipse) == 1:
|
||||
return float(within_ellipse[0])
|
||||
else:
|
||||
return float(np.mean(within_ellipse))
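# In other words (added note): a point x is counted as inside the region iff its squared
# Mahalanobis distance (x - mean)^T P (x - mean), with P the precision matrix, does not
# exceed the chi2 critical value for the requested confidence level.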
|
||||
|
||||
|
||||
def closest_point_on_ellipsoid(p, mean, cov, chi2_critical, tol=1e-9, max_iter=100):
|
||||
"""
|
||||
Computes the closest point on the ellipsoid defined by:
|
||||
(x - mean)^T cov^{-1} (x - mean) = chi2_critical
|
||||
"""
|
||||
|
||||
def __init__(self, X, confidence_level=0.95):
|
||||
p = np.asarray(p)
|
||||
mean = np.asarray(mean)
|
||||
Sigma = np.asarray(cov)
|
||||
|
||||
assert 0. < confidence_level < 1., f'{confidence_level=} must be in range(0,1)'
|
||||
# Precompute precision matrix
|
||||
P = np.linalg.pinv(Sigma)
|
||||
d = P.shape[0]
|
||||
|
||||
X = np.asarray(X)
|
||||
# Define v = p - mean
|
||||
v = p - mean
|
||||
|
||||
self.mean_ = X.mean(axis=0)
|
||||
self.cov_ = np.cov(X, rowvar=False, ddof=1)
|
||||
# If p is inside the ellipsoid, return p itself
|
||||
M_dist = v @ P @ v
|
||||
if M_dist <= chi2_critical:
|
||||
return p.copy()
|
||||
|
||||
try:
|
||||
self.precision_matrix_ = np.linalg.inv(self.cov_)
|
||||
except:
|
||||
self.precision_matrix_ = None
|
||||
# Function to compute x(lambda)
|
||||
def x_lambda(lmbda):
|
||||
A = np.eye(d) + lmbda * P
|
||||
return mean + np.linalg.solve(A, v)
|
||||
|
||||
self.dim = X.shape[-1]
|
||||
self.ddof = self.dim - 1
|
||||
# Function whose root we want: f(lambda) = Mahalanobis distance - chi2
|
||||
def f(lmbda):
|
||||
x = x_lambda(lmbda)
|
||||
diff = x - mean
|
||||
return diff @ P @ diff - chi2_critical
|
||||
|
||||
# critical chi-square value
|
||||
self.confidence_level = confidence_level
|
||||
self.chi2_critical_ = chi2.ppf(confidence_level, df=self.ddof)
|
||||
# Bisection search over lambda >= 0
|
||||
l_low, l_high = 0.0, 1.0
|
||||
|
||||
def point_estimate(self):
|
||||
"""
|
||||
Returns the point estimate, the center of the ellipse.
|
||||
# Increase high until f(high) < 0
|
||||
while f(l_high) > 0:
|
||||
l_high *= 2
|
||||
if l_high > 1e12:
|
||||
raise RuntimeError("Failed to bracket the root.")
|
||||
|
||||
:return: np.ndarray of shape (n_classes,)
|
||||
"""
|
||||
return self.mean_
|
||||
# Bisection
|
||||
for _ in range(max_iter):
|
||||
l_mid = 0.5 * (l_low + l_high)
|
||||
fm = f(l_mid)
|
||||
if abs(fm) < tol:
|
||||
break
|
||||
if fm > 0:
|
||||
l_low = l_mid
|
||||
else:
|
||||
l_high = l_mid
|
||||
|
||||
def coverage(self, true_value):
|
||||
"""
|
||||
Checks whether a value, or a sets of values, are contained in the confidence region. The method computes the
|
||||
fraction of these that are contained in the region, if more than one value is passed. If only one value is
|
||||
passed, then it either returns 1.0 or 0.0, for indicating the value is in the region or not, respectively.
|
||||
|
||||
:param true_value: a np.ndarray of shape (n_classes,) or shape (n_values, n_classes,)
|
||||
:return: float in [0,1]
|
||||
"""
|
||||
return within_ellipse_prop(true_value, self.mean_, self.precision_matrix_, self.chi2_critical_)
|
||||
|
||||
|
||||
class ConfidenceEllipseCLR(ConfidenceRegionABC):
|
||||
"""
|
||||
Instantiates a Confidence Ellipse in the Centered-Log Ratio (CLR) space.
|
||||
|
||||
:param X: np.ndarray of shape (n_bootstrap_samples, n_classes)
|
||||
:param confidence_level: float, the confidence level (default 0.95)
|
||||
"""
|
||||
|
||||
def __init__(self, X, confidence_level=0.95):
|
||||
self.clr = CLRtransformation()
|
||||
Z = self.clr(X)
|
||||
self.mean_ = np.mean(X, axis=0)
|
||||
self.conf_region_clr = ConfidenceEllipseSimplex(Z, confidence_level=confidence_level)
|
||||
|
||||
def point_estimate(self):
|
||||
"""
|
||||
Returns the point estimate, the center of the ellipse.
|
||||
|
||||
:return: np.ndarray of shape (n_classes,)
|
||||
"""
|
||||
# The inverse of the CLR does not coincide with the true mean, because the geometric mean
|
||||
# requires smoothing the prevalence vectors and this affects the softmax (inverse);
|
||||
# return self.clr.inverse(self.mean_) # <- does not coincide
|
||||
return self.mean_
|
||||
|
||||
def coverage(self, true_value):
|
||||
"""
|
||||
Checks whether a value, or a sets of values, are contained in the confidence region. The method computes the
|
||||
fraction of these that are contained in the region, if more than one value is passed. If only one value is
|
||||
passed, then it either returns 1.0 or 0.0, for indicating the value is in the region or not, respectively.
|
||||
|
||||
:param true_value: a np.ndarray of shape (n_classes,) or shape (n_values, n_classes,)
|
||||
:return: float in [0,1]
|
||||
"""
|
||||
transformed_values = self.clr(true_value)
|
||||
return self.conf_region_clr.coverage(transformed_values)
|
||||
l_opt = l_mid
|
||||
return x_lambda(l_opt)
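# Sketch of the reasoning behind x_lambda (added comment): minimizing ||x - p||^2 subject to
# (x - mean)^T P (x - mean) = chi2_critical yields the stationarity condition
# (x - p) + lambda * P (x - mean) = 0, i.e. (I + lambda*P)(x - mean) = (p - mean), so
# x(lambda) = mean + (I + lambda*P)^{-1} (p - mean); the bisection above searches for the
# lambda >= 0 at which x(lambda) lands on the ellipsoid boundary.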
|
||||
|
||||
|
||||
class ConfidenceIntervals(ConfidenceRegionABC):
|
||||
"""
|
||||
Instantiates a region based on (independent) Confidence Intervals.
|
||||
|
||||
:param X: np.ndarray of shape (n_bootstrap_samples, n_classes)
|
||||
:param samples: np.ndarray of shape (n_bootstrap_samples, n_classes)
|
||||
:param confidence_level: float, the confidence level (default 0.95)
|
||||
:param bonferroni_correction: bool (default False), if True, a Bonferroni correction
|
||||
is applied to the significance level (`alpha`) before computing confidence intervals.
|
||||
The correction consists of replacing `alpha` with `alpha/n_classes`. When
|
||||
`n_classes=2` the correction is not applied because there is only one verification test
|
||||
since the other class is constrained. This is not necessarily true for n_classes>2.
|
||||
"""
|
||||
def __init__(self, X, confidence_level=0.95):
|
||||
def __init__(self, samples, confidence_level=0.95, bonferroni_correction=False):
|
||||
assert 0 < confidence_level < 1, f'{confidence_level=} must be in range(0,1)'
|
||||
assert samples.ndim == 2, 'unexpected shape; must be (n_bootstrap_samples, n_classes)'
|
||||
|
||||
X = np.asarray(X)
|
||||
samples = np.asarray(samples)
|
||||
|
||||
self.means_ = X.mean(axis=0)
|
||||
self.means_ = samples.mean(axis=0)
|
||||
alpha = 1-confidence_level
|
||||
if bonferroni_correction:
|
||||
n_classes = samples.shape[-1]
|
||||
if n_classes>2:
|
||||
alpha = alpha/n_classes
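# e.g. (illustrative): with confidence_level=0.95 and n_classes=5, the corrected per-class
# alpha is 0.05/5 = 0.01, so each interval is taken between the 0.5 and 99.5 percentiles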
|
||||
low_perc = (alpha/2.)*100
|
||||
high_perc = (1-alpha/2.)*100
|
||||
self.I_low, self.I_high = np.percentile(X, q=[low_perc, high_perc], axis=0)
|
||||
self.I_low, self.I_high = np.percentile(samples, q=[low_perc, high_perc], axis=0)
|
||||
self._samples = samples
|
||||
self.alpha = alpha
|
||||
|
||||
@property
|
||||
def samples(self):
|
||||
return self._samples
|
||||
|
||||
def point_estimate(self):
|
||||
"""
|
||||
|
|
@ -297,33 +384,275 @@ class ConfidenceIntervals(ConfidenceRegionABC):
|
|||
|
||||
return proportion
|
||||
|
||||
def coverage_soft(self, true_value):
|
||||
within_intervals = np.logical_and(self.I_low <= true_value, true_value <= self.I_high)
|
||||
return np.mean(within_intervals.astype(float))
|
||||
|
||||
class CLRtransformation:
|
||||
"""
|
||||
Centered log-ratio, from component analysis
|
||||
"""
|
||||
def __call__(self, X, epsilon=1e-6):
|
||||
"""
|
||||
Applies the CLR function to X thus mapping the instances, which are contained in `\\mathcal{R}^{n}` but
|
||||
actually lie on a `\\mathcal{R}^{n-1}` simplex, onto an unrestricted space in :math:`\\mathcal{R}^{n}`
|
||||
def __repr__(self):
|
||||
return '['+', '.join(f'({low:.4f}, {high:.4f})' for (low,high) in zip(self.I_low, self.I_high))+']'
|
||||
|
||||
:param X: np.ndarray of (n_instances, n_dimensions) to be transformed
|
||||
:param epsilon: small float for prevalence smoothing
|
||||
:return: np.ndarray of (n_instances, n_dimensions), the CLR-transformed points
|
||||
"""
|
||||
X = np.asarray(X)
|
||||
X = qp.error.smooth(X, epsilon)
|
||||
G = np.exp(np.mean(np.log(X), axis=-1, keepdims=True)) # geometric mean
|
||||
return np.log(X / G)
|
||||
@property
|
||||
def n_dim(self):
|
||||
return len(self.I_low)
|
||||
|
||||
def inverse(self, X):
|
||||
"""
|
||||
Inverse function. However, clr.inverse(clr(X)) does not exactly coincide with X due to smoothing.
|
||||
def winkler_scores(self, true_prev, alpha=None, add_ae=False):
|
||||
true_prev = np.asarray(true_prev)
|
||||
assert true_prev.ndim == 1, 'unexpected dimensionality for true_prev'
|
||||
assert len(true_prev)==self.n_dim, \
|
||||
f'unexpected number of dimensions; found {len(true_prev)}, expected {self.n_dim}'
|
||||
|
||||
:param X: np.ndarray of (n_instances, n_dimensions) to be transformed
|
||||
:return: np.ndarray of (n_instances, n_dimensions), the CLR-transformed points
|
||||
def winkler_score(low, high, true_val, alpha, center):
|
||||
amp = high-low
|
||||
scale_cost = 2./alpha
|
||||
cost = np.max([0, low-true_val], axis=0) + np.max([0, true_val-high], axis=0)
|
||||
err = 0
|
||||
if add_ae:
|
||||
err = abs(true_val - center)
|
||||
return amp + scale_cost*cost + err
|
||||
|
||||
alpha = alpha or self.alpha
|
||||
return np.asarray(
|
||||
[winkler_score(low_i, high_i, true_v, alpha, center)
|
||||
for (low_i, high_i, true_v, center) in zip(self.I_low, self.I_high, true_prev, self.point_estimate())]
|
||||
)
|
||||
|
||||
def mean_winkler_score(self, true_prev, alpha=None, add_ae=False):
|
||||
return np.mean(self.winkler_scores(true_prev, alpha=alpha, add_ae=add_ae))
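# Restating what winkler_scores computes for each class i (added note): with interval
# [l_i, u_i], true prevalence p_i and significance alpha,
#   W_i = (u_i - l_i) + (2/alpha) * max(0, l_i - p_i) + (2/alpha) * max(0, p_i - u_i)
# plus, optionally (add_ae=True), the absolute error |p_i - center_i| w.r.t. the point estimate.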
|
||||
|
||||
|
||||
|
||||
class ConfidenceEllipseSimplex(ConfidenceRegionABC):
|
||||
"""
|
||||
return softmax(X, axis=-1)
|
||||
Instantiates a Confidence Ellipse in the probability simplex.
|
||||
|
||||
:param samples: np.ndarray of shape (n_bootstrap_samples, n_classes)
|
||||
:param confidence_level: float, the confidence level (default 0.95)
|
||||
"""
|
||||
|
||||
def __init__(self, samples, confidence_level=0.95):
|
||||
|
||||
assert 0. < confidence_level < 1., f'{confidence_level=} must be in range(0,1)'
|
||||
|
||||
samples = np.asarray(samples)
|
||||
|
||||
self.mean_ = samples.mean(axis=0)
|
||||
self.cov_ = np.cov(samples, rowvar=False, ddof=1)
|
||||
|
||||
try:
|
||||
self.precision_matrix_ = np.linalg.pinv(self.cov_)
|
||||
except np.linalg.LinAlgError:
|
||||
self.precision_matrix_ = None
|
||||
|
||||
self.dim = samples.shape[-1]
|
||||
self.ddof = self.dim - 1
|
||||
|
||||
# critical chi-square value
|
||||
self.confidence_level = confidence_level
|
||||
self.chi2_critical_ = chi2.ppf(confidence_level, df=self.ddof)
|
||||
self._samples = samples
|
||||
self.alpha = 1.-confidence_level
|
||||
|
||||
@property
|
||||
def samples(self):
|
||||
return self._samples
|
||||
|
||||
def point_estimate(self):
|
||||
"""
|
||||
Returns the point estimate, the center of the ellipse.
|
||||
|
||||
:return: np.ndarray of shape (n_classes,)
|
||||
"""
|
||||
return self.mean_
|
||||
|
||||
def coverage(self, true_value):
|
||||
"""
|
||||
Checks whether a value, or a set of values, is contained in the confidence region. If more than one
value is passed, the method returns the fraction of values contained in the region; if a single value
is passed, it returns either 1.0 or 0.0, indicating whether the value lies inside the region or not.
|
||||
|
||||
:param true_value: a np.ndarray of shape (n_classes,) or shape (n_values, n_classes,)
|
||||
:return: float in [0,1]
|
||||
"""
|
||||
return within_ellipse_prop(true_value, self.mean_, self.precision_matrix_, self.chi2_critical_)
|
||||
|
||||
def closest_point_in_region(self, p, tol=1e-6, max_iter=30):
|
||||
return closest_point_on_ellipsoid(
|
||||
p,
|
||||
mean=self.mean_,
|
||||
cov=self.cov_,
|
||||
chi2_critical=self.chi2_critical_
|
||||
)
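# Editor's sketch (illustrative only, not part of the library): the coverage test above presumably
# reduces to a Mahalanobis-distance check against the chi-square critical value, along these lines:
def _in_ellipse_sketch(x, mean, precision, chi2_critical):
    # squared Mahalanobis distance from x to the center of the ellipse
    d2 = (x - mean) @ precision @ (x - mean)
    return float(d2 <= chi2_critical)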
|
||||
|
||||
|
||||
class ConfidenceEllipseTransformed(ConfidenceRegionABC):
|
||||
"""
|
||||
Instantiates a Confidence Ellipse in a transformed space.
|
||||
|
||||
:param samples: np.ndarray of shape (n_bootstrap_samples, n_classes)
|
||||
:param confidence_level: float, the confidence level (default 0.95)
|
||||
"""
|
||||
|
||||
def __init__(self, samples, transformation: CompositionalTransformation, confidence_level=0.95):
|
||||
samples = np.asarray(samples)
|
||||
self.transformation = transformation
|
||||
Z = self.transformation(samples)
|
||||
self.mean_ = np.mean(samples, axis=0)
|
||||
# self.mean_ = self.transformation.inverse(np.mean(Z, axis=0))
|
||||
self.conf_region_z = ConfidenceEllipseSimplex(Z, confidence_level=confidence_level)
|
||||
self._samples = samples
|
||||
self.alpha = 1.-confidence_level
|
||||
|
||||
@property
|
||||
def samples(self):
|
||||
return self._samples
|
||||
|
||||
def point_estimate(self):
|
||||
"""
|
||||
Returns the point estimate, the center of the ellipse.
|
||||
|
||||
:return: np.ndarray of shape (n_classes,)
|
||||
"""
|
||||
# The inverse of the CLR does not coincide with the true mean, because the geometric mean
|
||||
# requires smoothing the prevalence vectors and this affects the softmax (inverse);
|
||||
# return self.clr.inverse(self.mean_) # <- does not coincide
|
||||
return self.mean_
|
||||
|
||||
def coverage(self, true_value):
|
||||
"""
|
||||
Checks whether a value, or a set of values, is contained in the confidence region. If more than one
value is passed, the method returns the fraction of values contained in the region; if a single value
is passed, it returns either 1.0 or 0.0, indicating whether the value lies inside the region or not.
|
||||
|
||||
:param true_value: a np.ndarray of shape (n_classes,) or shape (n_values, n_classes,)
|
||||
:return: float in [0,1]
|
||||
"""
|
||||
transformed_values = self.transformation(true_value)
|
||||
return self.conf_region_z.coverage(transformed_values)
|
||||
|
||||
def closest_point_in_region(self, p, tol=1e-6, max_iter=30):
|
||||
p_prime = self.transformation(p)
|
||||
b_prime = self.conf_region_z.closest_point_in_region(p_prime, tol=tol, max_iter=max_iter)
|
||||
b = self.transformation.inverse(b_prime)
|
||||
return b
|
||||
|
||||
|
||||
class ConfidenceEllipseCLR(ConfidenceEllipseTransformed):
|
||||
"""
|
||||
Instantiates a Confidence Ellipse in the Centered-Log Ratio (CLR) space.
|
||||
|
||||
:param samples: np.ndarray of shape (n_bootstrap_samples, n_classes)
|
||||
:param confidence_level: float, the confidence level (default 0.95)
|
||||
"""
|
||||
def __init__(self, samples, confidence_level=0.95):
|
||||
super().__init__(samples, CLRtransformation(), confidence_level=confidence_level)
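# Editor's usage sketch (illustrative; the Dirichlet draws stand in for bootstrap prevalence samples):
# samples = np.random.dirichlet([5, 3, 2], size=1000)              # (n_bootstrap_samples, n_classes)
# region = ConfidenceEllipseCLR(samples, confidence_level=0.95)
# region.point_estimate()                                          # center, a vector in the simplex
# region.coverage(np.asarray([0.5, 0.3, 0.2]))                     # 1.0 if inside the region, else 0.0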
|
||||
|
||||
|
||||
class ConfidenceEllipseILR(ConfidenceEllipseTransformed):
|
||||
"""
|
||||
Instantiates a Confidence Ellipse in the Isometric-Log Ratio (ILR) space.
|
||||
|
||||
:param samples: np.ndarray of shape (n_bootstrap_samples, n_classes)
|
||||
:param confidence_level: float, the confidence level (default 0.95)
|
||||
"""
|
||||
def __init__(self, samples, confidence_level=0.95):
|
||||
super().__init__(samples, ILRtransformation(), confidence_level=confidence_level)
|
||||
|
||||
|
||||
|
||||
class ConfidenceIntervalsTransformed(ConfidenceRegionABC):
|
||||
"""
|
||||
Instantiates a Confidence Interval region in a transformed space.
|
||||
|
||||
:param samples: np.ndarray of shape (n_bootstrap_samples, n_classes)
|
||||
:param confidence_level: float, the confidence level (default 0.95)
|
||||
:param bonferroni_correction: bool (default False), if True, a Bonferroni correction
|
||||
is applied to the significance level (`alpha`) before computing confidence intervals.
|
||||
The correction consists of replacing `alpha` with `alpha/n_classes`. When
|
||||
`n_classes=2` the correction is not applied, because there is effectively a single test (the prevalence
of the other class is fully determined); for `n_classes>2`, several intervals are checked simultaneously,
so the correction does apply.
|
||||
"""
|
||||
|
||||
def __init__(self, samples, transformation: CompositionalTransformation, confidence_level=0.95, bonferroni_correction=False):
|
||||
samples = np.asarray(samples)
|
||||
self.transformation = transformation
|
||||
Z = self.transformation(samples)
|
||||
self.mean_ = np.mean(samples, axis=0)
|
||||
# self.mean_ = self.transformation.inverse(np.mean(Z, axis=0))
|
||||
self.conf_region_z = ConfidenceIntervals(Z, confidence_level=confidence_level, bonferroni_correction=bonferroni_correction)
|
||||
self._samples = samples
|
||||
self.alpha = 1.-confidence_level
|
||||
|
||||
@property
|
||||
def samples(self):
|
||||
return self._samples
|
||||
|
||||
def point_estimate(self):
|
||||
"""
|
||||
Returns the point estimate, the center of the region.
|
||||
|
||||
:return: np.ndarray of shape (n_classes,)
|
||||
"""
|
||||
# The inverse of the CLR does not coincide with the true mean, because the geometric mean
|
||||
# requires smoothing the prevalence vectors and this affects the softmax (inverse);
|
||||
# return self.clr.inverse(self.mean_) # <- does not coincide
|
||||
return self.mean_
|
||||
|
||||
def coverage(self, true_value):
|
||||
"""
|
||||
Checks whether a value, or a set of values, is contained in the confidence region. If more than one
value is passed, the method returns the fraction of values contained in the region; if a single value
is passed, it returns either 1.0 or 0.0, indicating whether the value lies inside the region or not.
|
||||
|
||||
:param true_value: a np.ndarray of shape (n_classes,) or shape (n_values, n_classes,)
|
||||
:return: float in [0,1]
|
||||
"""
|
||||
transformed_values = self.transformation(true_value)
|
||||
return self.conf_region_z.coverage(transformed_values)
|
||||
|
||||
def coverage_soft(self, true_value):
|
||||
transformed_values = self.transformation(true_value)
|
||||
return self.conf_region_z.coverage_soft(transformed_values)
|
||||
|
||||
def winkler_scores(self, true_prev, alpha=None, add_ae=False):
|
||||
transformed_values = self.transformation(true_prev)
|
||||
return self.conf_region_z.winkler_scores(transformed_values, alpha=alpha, add_ae=add_ae)
|
||||
|
||||
def mean_winkler_score(self, true_prev, alpha=None, add_ae=False):
|
||||
transformed_values = self.transformation(true_prev)
|
||||
return self.conf_region_z.mean_winkler_score(transformed_values, alpha=alpha, add_ae=add_ae)
|
||||
|
||||
|
||||
class ConfidenceIntervalsCLR(ConfidenceIntervalsTransformed):
|
||||
"""
|
||||
Instantiates Confidence Intervals in the Centered-Log Ratio (CLR) space.
|
||||
|
||||
:param samples: np.ndarray of shape (n_bootstrap_samples, n_classes)
|
||||
:param confidence_level: float, the confidence level (default 0.95)
|
||||
:param bonferroni_correction: bool (default False), if True, a Bonferroni correction
|
||||
is applied to the significance level (`alpha`) before computing confidence intervals.
|
||||
The correction consists of replacing `alpha` with `alpha/n_classes`. When
|
||||
`n_classes=2` the correction is not applied, because there is effectively a single test (the prevalence
of the other class is fully determined); for `n_classes>2`, several intervals are checked simultaneously,
so the correction does apply.
|
||||
"""
|
||||
def __init__(self, samples, confidence_level=0.95, bonferroni_correction=False):
|
||||
super().__init__(samples, CLRtransformation(), confidence_level=confidence_level, bonferroni_correction=bonferroni_correction)
|
||||
|
||||
|
||||
class ConfidenceIntervalsILR(ConfidenceIntervalsTransformed):
|
||||
"""
|
||||
Instantiates Confidence Intervals in the Isometric-Log Ratio (ILR) space.
|
||||
|
||||
:param samples: np.ndarray of shape (n_bootstrap_samples, n_classes)
|
||||
:param confidence_level: float, the confidence level (default 0.95)
|
||||
:param bonferroni_correction: bool (default False), if True, a Bonferroni correction
|
||||
is applied to the significance level (`alpha`) before computing confidence intervals.
|
||||
The correction consists of replacing `alpha` with `alpha/n_classes`. When
|
||||
`n_classes=2` the correction is not applied, because there is effectively a single test (the prevalence
of the other class is fully determined); for `n_classes>2`, several intervals are checked simultaneously,
so the correction does apply.
|
||||
"""
|
||||
def __init__(self, samples, confidence_level=0.95, bonferroni_correction=False):
|
||||
super().__init__(samples, ILRtransformation(), confidence_level=confidence_level, bonferroni_correction=bonferroni_correction)
|
||||
|
||||
|
||||
|
||||
class AggregativeBootstrap(WithConfidenceABC, AggregativeQuantifier):
|
||||
|
|
@ -339,6 +668,12 @@ class AggregativeBootstrap(WithConfidenceABC, AggregativeQuantifier):
|
|||
|
||||
During inference, the bootstrap repetitions are applied to the pre-classified test instances.
|
||||
|
||||
See
|
||||
`Moreo, A., Salvati, N.
|
||||
An Efficient Method for Deriving Confidence Intervals in Aggregative Quantification.
|
||||
Learning to Quantify: Methods and Applications (LQ 2025), co-located at ECML-PKDD 2025.
|
||||
pp 12-33 <https://lq-2025.github.io/proceedings/CompleteVolume.pdf>`_
|
||||
|
||||
:param quantifier: an aggregative quantifier
|
||||
:param n_train_samples: int, the number of training resamplings (defaults to 1, set to > 1 to activate a
|
||||
model-based bootstrap approach)
|
||||
|
|
@ -357,7 +692,8 @@ class AggregativeBootstrap(WithConfidenceABC, AggregativeQuantifier):
|
|||
n_test_samples=500,
|
||||
confidence_level=0.95,
|
||||
region='intervals',
|
||||
random_state=None):
|
||||
random_state=None,
|
||||
verbose=False):
|
||||
|
||||
assert isinstance(quantifier, AggregativeQuantifier), \
|
||||
f'base quantifier does not seem to be an instance of {AggregativeQuantifier.__name__}'
|
||||
|
|
@ -374,15 +710,28 @@ class AggregativeBootstrap(WithConfidenceABC, AggregativeQuantifier):
|
|||
self.confidence_level = confidence_level
|
||||
self.region = region
|
||||
self.random_state = random_state
|
||||
self.verbose = verbose
|
||||
|
||||
def aggregation_fit(self, classif_predictions, labels):
|
||||
data = LabelledCollection(classif_predictions, labels, classes=self.classes_)
|
||||
|
||||
self.quantifiers = []
|
||||
if self.n_train_samples==1:
|
||||
self.quantifier.aggregation_fit(classif_predictions, labels)
|
||||
self.quantifiers.append(self.quantifier)
|
||||
else:
|
||||
if classif_predictions is None or labels is None:
|
||||
# The entire dataset was consumed for classifier training, implying there is no need to train
# an aggregation function. If the bootstrap method was configured to train different aggregators
# (i.e., self.n_train_samples>1), then an error is raised. Otherwise, the method simply ends.
|
||||
if self.n_train_samples > 1:
|
||||
raise ValueError(
|
||||
f'The underlying quantifier ({self.quantifier.__class__.__name__}) has consumed all the training '
f'data, meaning the aggregation function needs no further training, but {self.n_train_samples=} '
f'is > 1, which is inconsistent.'
|
||||
)
|
||||
else:
|
||||
# model-based bootstrap (only on the aggregative part)
|
||||
data = LabelledCollection(classif_predictions, labels, classes=self.classes_)
|
||||
n_examples = len(data)
|
||||
full_index = np.arange(n_examples)
|
||||
with qp.util.temp_seed(self.random_state):
|
||||
|
|
@ -399,7 +748,7 @@ class AggregativeBootstrap(WithConfidenceABC, AggregativeQuantifier):
|
|||
prev_mean, self.confidence = self.aggregate_conf(classif_predictions)
|
||||
return prev_mean
|
||||
|
||||
def aggregate_conf(self, classif_predictions: np.ndarray, confidence_level=None):
|
||||
def aggregate_conf_sequential__(self, classif_predictions: np.ndarray, confidence_level=None):
|
||||
if confidence_level is None:
|
||||
confidence_level = self.confidence_level
|
||||
|
||||
|
|
@ -407,7 +756,7 @@ class AggregativeBootstrap(WithConfidenceABC, AggregativeQuantifier):
|
|||
prevs = []
|
||||
with qp.util.temp_seed(self.random_state):
|
||||
for quantifier in self.quantifiers:
|
||||
for i in range(self.n_test_samples):
|
||||
for i in tqdm(range(self.n_test_samples), desc='resampling', total=self.n_test_samples, disable=not self.verbose):
|
||||
sample_i = resample(classif_predictions, n_samples=n_samples)
|
||||
prev_i = quantifier.aggregate(sample_i)
|
||||
prevs.append(prev_i)
|
||||
|
|
@ -417,13 +766,33 @@ class AggregativeBootstrap(WithConfidenceABC, AggregativeQuantifier):
|
|||
|
||||
return prev_estim, conf
|
||||
|
||||
def aggregate_conf(self, classif_predictions: np.ndarray, confidence_level=None):
|
||||
confidence_level = confidence_level or self.confidence_level
|
||||
|
||||
|
||||
n_samples = classif_predictions.shape[0]
|
||||
prevs = []
|
||||
with qp.util.temp_seed(self.random_state):
|
||||
for quantifier in self.quantifiers:
|
||||
results = Parallel(n_jobs=-1)(
|
||||
delayed(bootstrap_once)(i, classif_predictions, quantifier, n_samples)
|
||||
for i in range(self.n_test_samples)
|
||||
)
|
||||
prevs.extend(results)
|
||||
|
||||
prevs = np.array(prevs)
|
||||
conf = WithConfidenceABC.construct_region(prevs, confidence_level, method=self.region)
|
||||
prev_estim = conf.point_estimate()
|
||||
|
||||
return prev_estim, conf
|
||||
|
||||
def fit(self, X, y):
|
||||
self.quantifier._check_init_parameters()
|
||||
classif_predictions, labels = self.quantifier.classifier_fit_predict(X, y)
|
||||
self.aggregation_fit(classif_predictions, labels)
|
||||
return self
|
||||
|
||||
def quantify_conf(self, instances, confidence_level=None) -> (np.ndarray, ConfidenceRegionABC):
|
||||
def predict_conf(self, instances, confidence_level=None) -> (np.ndarray, ConfidenceRegionABC):
|
||||
predictions = self.quantifier.classify(instances)
|
||||
return self.aggregate_conf(predictions, confidence_level=confidence_level)
|
||||
|
||||
|
|
@ -435,9 +804,16 @@ class AggregativeBootstrap(WithConfidenceABC, AggregativeQuantifier):
|
|||
return self.quantifier._classifier_method()
|
||||
|
||||
|
||||
def bootstrap_once(i, classif_predictions, quantifier, n_samples):
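    # draws one bootstrap resample (with replacement) of the pre-classified test instances and
    # aggregates it with the given quantifier; dispatched in parallel from aggregate_conf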
|
||||
idx = np.random.randint(0, len(classif_predictions), n_samples)
|
||||
sample = classif_predictions[idx]
|
||||
prev = quantifier.aggregate(sample)
|
||||
return prev
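# Editor's usage sketch (illustrative only; the data variables and classifier choice are assumptions):
# from sklearn.linear_model import LogisticRegression
# from quapy.method.aggregative import PACC
# quant = AggregativeBootstrap(PACC(LogisticRegression()), n_test_samples=500, region='intervals')
# quant.fit(Xtr, ytr)                                              # Xtr, ytr: any labelled training data
# prev_hat, conf_region = quant.predict_conf(Xte)                  # point estimate + confidence region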
|
||||
|
||||
|
||||
class BayesianCC(AggregativeCrispQuantifier, WithConfidenceABC):
|
||||
"""
|
||||
`Bayesian quantification <https://arxiv.org/abs/2302.09159>`_ method,
|
||||
`Bayesian quantification <https://arxiv.org/abs/2302.09159>`_ method (by Albert Ziegler and Paweł Czyż),
|
||||
which is a variant of :class:`ACC` that calculates the posterior probability distribution
|
||||
over the prevalence vectors, rather than providing a point estimate obtained
|
||||
by matrix inversion.
|
||||
|
|
@ -464,6 +840,8 @@ class BayesianCC(AggregativeCrispQuantifier, WithConfidenceABC):
|
|||
:param region: string, set to `intervals` for constructing confidence intervals (default), or to
|
||||
`ellipse` for constructing an ellipse in the probability simplex, or to `ellipse-clr` for
|
||||
constructing an ellipse in the Centered-Log Ratio (CLR) unconstrained space.
|
||||
:param prior: an array-like with the alpha parameters of a Dirichlet prior, or the string 'uniform'
|
||||
for a uniform, uninformative prior (default)
|
||||
"""
|
||||
def __init__(self,
|
||||
classifier: BaseEstimator=None,
|
||||
|
|
@ -473,12 +851,17 @@ class BayesianCC(AggregativeCrispQuantifier, WithConfidenceABC):
|
|||
num_samples: int = 1_000,
|
||||
mcmc_seed: int = 0,
|
||||
confidence_level: float = 0.95,
|
||||
region: str = 'intervals'):
|
||||
region: str = 'intervals',
|
||||
temperature = 1.,
|
||||
prior = 'uniform'):
|
||||
|
||||
if num_warmup <= 0:
|
||||
raise ValueError(f'parameter {num_warmup=} must be a positive integer')
|
||||
if num_samples <= 0:
|
||||
raise ValueError(f'parameter {num_samples=} must be a positive integer')
|
||||
assert ((isinstance(prior, str) and prior == 'uniform') or
|
||||
(isinstance(prior, Iterable) and all(isinstance(v, Number) for v in prior))), \
|
||||
f'wrong type for {prior=}; expected "uniform" or an array-like of real values'
|
||||
|
||||
if _bayesian.DEPENDENCIES_INSTALLED is False:
|
||||
raise ImportError("Auxiliary dependencies are required. "
|
||||
|
|
@ -490,6 +873,8 @@ class BayesianCC(AggregativeCrispQuantifier, WithConfidenceABC):
|
|||
self.mcmc_seed = mcmc_seed
|
||||
self.confidence_level = confidence_level
|
||||
self.region = region
|
||||
self.temperature = temperature
|
||||
self.prior = prior
|
||||
|
||||
# Array of shape (n_classes, n_predicted_classes,) where entry (y, c) is the number of instances
|
||||
# labeled as class y and predicted as class c.
|
||||
|
|
@ -520,11 +905,19 @@ class BayesianCC(AggregativeCrispQuantifier, WithConfidenceABC):
|
|||
|
||||
n_c_unlabeled = F.counts_from_labels(classif_predictions, self.classifier.classes_).astype(float)
|
||||
|
||||
self._samples = _bayesian.sample_posterior(
|
||||
n_classes = len(self.classifier.classes_)
|
||||
if isinstance(self.prior, str) and self.prior == 'uniform':
|
||||
alpha = np.asarray([1.] * n_classes)
|
||||
else:
|
||||
alpha = np.asarray(self.prior)
|
||||
|
||||
self._samples = _bayesian.sample_posterior_bayesianCC(
|
||||
n_c_unlabeled=n_c_unlabeled,
|
||||
n_y_and_c_labeled=self._n_and_c_labeled,
|
||||
num_warmup=self.num_warmup,
|
||||
num_samples=self.num_samples,
|
||||
alpha=alpha,
|
||||
temperature=self.temperature,
|
||||
seed=self.mcmc_seed,
|
||||
)
|
||||
return self._samples
|
||||
|
|
@ -543,9 +936,115 @@ class BayesianCC(AggregativeCrispQuantifier, WithConfidenceABC):
|
|||
samples = self.sample_from_posterior(classif_predictions)[_bayesian.P_TEST_Y]
|
||||
return np.asarray(samples.mean(axis=0), dtype=float)
|
||||
|
||||
def quantify_conf(self, instances, confidence_level=None) -> (np.ndarray, ConfidenceRegionABC):
|
||||
def predict_conf(self, instances, confidence_level=None) -> (np.ndarray, ConfidenceRegionABC):
|
||||
if confidence_level is None:
|
||||
confidence_level = self.confidence_level
|
||||
classif_predictions = self.classify(instances)
|
||||
point_estimate = self.aggregate(classif_predictions)
|
||||
samples = self.get_prevalence_samples() # available after calling "aggregate" function
|
||||
region = WithConfidenceABC.construct_region(samples, confidence_level=self.confidence_level, method=self.region)
|
||||
region = WithConfidenceABC.construct_region(samples, confidence_level=confidence_level, method=self.region)
|
||||
return point_estimate, region
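# Editor's usage sketch (illustrative): an informative prior can be passed as Dirichlet alphas, e.g.,
# alpha=(3, 2, 1) if the expected class proportions are roughly (1/2, 1/3, 1/6); the default 'uniform'
# keeps an uninformative prior.
# bcc = BayesianCC(LogisticRegression(), prior=[3., 2., 1.], region='ellipse-clr')
# bcc.fit(Xtr, ytr)
# prev_hat, conf_region = bcc.predict_conf(Xte, confidence_level=0.95)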
|
||||
|
||||
|
||||
class PQ(AggregativeSoftQuantifier, BinaryAggregativeQuantifier):
|
||||
"""
|
||||
`Precise Quantifier: Bayesian distribution matching quantifier <https://arxiv.org/abs/2507.06061>`_,
|
||||
which is a variant of :class:`HDy` that calculates the posterior probability distribution
|
||||
over the prevalence vectors, rather than providing a point estimate.
|
||||
|
||||
This method relies on extra dependencies, which have to be installed via:
|
||||
`$ pip install quapy[bayes]`
|
||||
|
||||
:param classifier: a scikit-learn's BaseEstimator, or None, in which case the classifier is taken to be
|
||||
the one indicated in `qp.environ['DEFAULT_CLS']`
|
||||
:param val_split: specifies the data used for generating classifier predictions. This specification
|
||||
can be made as float in (0, 1) indicating the proportion of stratified held-out validation set to
|
||||
be extracted from the training set; or as an integer (default 5), indicating that the predictions
|
||||
are to be generated in a `k`-fold cross-validation manner (with this integer indicating the value
|
||||
for `k`); or as a tuple `(X,y)` defining the specific set of data to use for validation. Set to
|
||||
None when the method does not require any validation data, in order to avoid that some portion of
|
||||
the training data be wasted.
|
||||
:param num_warmup: number of warmup iterations for the STAN sampler (default 500)
|
||||
:param num_samples: number of samples to draw from the posterior (default 1000)
|
||||
:param stan_seed: random seed for the STAN sampler (default 0)
|
||||
:param region: string, set to `intervals` for constructing confidence intervals (default), or to
|
||||
`ellipse` for constructing an ellipse in the probability simplex, or to `ellipse-clr` for
|
||||
constructing an ellipse in the Centered-Log Ratio (CLR) unconstrained space.
|
||||
"""
|
||||
def __init__(self,
|
||||
classifier: BaseEstimator=None,
|
||||
fit_classifier=True,
|
||||
val_split: int = 5,
|
||||
nbins: int = 4,
|
||||
fixed_bins: bool = False,
|
||||
num_warmup: int = 500,
|
||||
num_samples: int = 1_000,
|
||||
stan_seed: int = 0,
|
||||
confidence_level: float = 0.95,
|
||||
region: str = 'intervals'):
|
||||
|
||||
if num_warmup <= 0:
|
||||
raise ValueError(f'parameter {num_warmup=} must be a positive integer')
|
||||
if num_samples <= 0:
|
||||
raise ValueError(f'parameter {num_samples=} must be a positive integer')
|
||||
|
||||
if not _bayesian.DEPENDENCIES_INSTALLED:
|
||||
raise ImportError("Auxiliary dependencies are required. "
|
||||
"Run `$ pip install quapy[bayes]` to install them.")
|
||||
|
||||
super().__init__(classifier, fit_classifier, val_split)
|
||||
|
||||
self.nbins = nbins
|
||||
self.fixed_bins = fixed_bins
|
||||
self.num_warmup = num_warmup
|
||||
self.num_samples = num_samples
|
||||
self.stan_seed = stan_seed
|
||||
self.stan_code = _bayesian.load_stan_file()
|
||||
self.confidence_level = confidence_level
|
||||
self.region = region
|
||||
|
||||
def aggregation_fit(self, classif_predictions, labels):
|
||||
y_pred = classif_predictions[:, self.pos_label]
|
||||
|
||||
# Compute bin limits
|
||||
if self.fixed_bins:
|
||||
# Uniform bins in [0,1]
|
||||
self.bin_limits = np.linspace(0, 1, self.nbins + 1)
|
||||
else:
|
||||
# Quantile bins
|
||||
self.bin_limits = np.quantile(y_pred, np.linspace(0, 1, self.nbins + 1))
|
||||
|
||||
# Assign each prediction to a bin
|
||||
bin_indices = np.digitize(y_pred, self.bin_limits[1:-1], right=True)
|
||||
|
||||
# Positive and negative masks
|
||||
pos_mask = (labels == self.pos_label)
|
||||
neg_mask = ~pos_mask
|
||||
|
||||
# Count positives and negatives per bin
|
||||
self.pos_hist = np.bincount(bin_indices[pos_mask], minlength=self.nbins)
|
||||
self.neg_hist = np.bincount(bin_indices[neg_mask], minlength=self.nbins)
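# Editor's note (illustrative): with nbins=4 and quantile binning, bin_limits are the 0/25/50/75/100
# percentiles of the training scores, and np.digitize assigns each score to one of the 4 buckets, e.g.:
#   y_pred = np.asarray([.1, .2, .4, .6, .9])
#   bin_limits = np.quantile(y_pred, np.linspace(0, 1, 5))          # -> [.1, .2, .4, .6, .9]
#   np.digitize(y_pred, bin_limits[1:-1], right=True)               # -> [0, 0, 1, 2, 3]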
|
||||
|
||||
def aggregate(self, classif_predictions):
|
||||
Px_test = classif_predictions[:, self.pos_label]
|
||||
test_hist, _ = np.histogram(Px_test, bins=self.bin_limits)
|
||||
prevs = _bayesian.pq_stan(
|
||||
self.stan_code, self.nbins, self.pos_hist, self.neg_hist, test_hist,
|
||||
self.num_samples, self.num_warmup, self.stan_seed
|
||||
).flatten()
|
||||
self.prev_distribution = np.vstack([1-prevs, prevs]).T
|
||||
return self.prev_distribution.mean(axis=0)
|
||||
|
||||
def aggregate_conf(self, predictions, confidence_level=None):
|
||||
if confidence_level is None:
|
||||
confidence_level = self.confidence_level
|
||||
point_estimate = self.aggregate(predictions)
|
||||
samples = self.prev_distribution
|
||||
region = WithConfidenceABC.construct_region(samples, confidence_level=confidence_level, method=self.region)
|
||||
return point_estimate, region
|
||||
|
||||
def predict_conf(self, instances, confidence_level=None) -> (np.ndarray, ConfidenceRegionABC):
|
||||
predictions = self.classify(instances)
|
||||
return self.aggregate_conf(predictions, confidence_level=confidence_level)
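# Editor's usage sketch (illustrative; assumes the optional `quapy[bayes]` dependencies and a binary task):
# pq = PQ(LogisticRegression(), nbins=4, num_samples=1000, region='intervals')
# pq.fit(Xtr, ytr)
# prev_hat, conf_region = pq.predict_conf(Xte)
# conf_region.coverage(true_prev)                                   # 1.0/0.0 membership of the true prevalence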
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -1,11 +1,17 @@
|
|||
from typing import Union, Callable
|
||||
from itertools import product
|
||||
from tqdm import tqdm
|
||||
from typing import Union, Callable, Counter
|
||||
import numpy as np
|
||||
from sklearn.feature_extraction.text import CountVectorizer
|
||||
from sklearn.utils import resample
|
||||
from sklearn.preprocessing import normalize
|
||||
|
||||
from quapy.method.confidence import WithConfidenceABC, ConfidenceRegionABC
|
||||
from quapy.functional import get_divergence
|
||||
from quapy.data import LabelledCollection
|
||||
from quapy.method.base import BaseQuantifier, BinaryQuantifier
|
||||
import quapy.functional as F
|
||||
from scipy.optimize import lsq_linear
|
||||
from scipy import sparse
|
||||
|
||||
|
||||
class MaximumLikelihoodPrevalenceEstimation(BaseQuantifier):
|
||||
|
|
@ -149,53 +155,164 @@ class DMx(BaseQuantifier):
|
|||
return F.argmin_prevalence(loss, n_classes, method=self.search)
|
||||
|
||||
|
||||
# class ReadMe(BaseQuantifier):
|
||||
#
|
||||
# def __init__(self, bootstrap_trials=100, bootstrap_range=100, bagging_trials=100, bagging_range=25, **vectorizer_kwargs):
|
||||
# raise NotImplementedError('under development ...')
|
||||
# self.bootstrap_trials = bootstrap_trials
|
||||
# self.bootstrap_range = bootstrap_range
|
||||
# self.bagging_trials = bagging_trials
|
||||
# self.bagging_range = bagging_range
|
||||
# self.vectorizer_kwargs = vectorizer_kwargs
|
||||
#
|
||||
# def fit(self, data: LabelledCollection):
|
||||
# X, y = data.Xy
|
||||
# self.vectorizer = CountVectorizer(binary=True, **self.vectorizer_kwargs)
|
||||
# X = self.vectorizer.fit_transform(X)
|
||||
# self.class_conditional_X = {i: X[y==i] for i in range(data.classes_)}
|
||||
#
|
||||
# def predict(self, X):
|
||||
# X = self.vectorizer.transform(X)
|
||||
#
|
||||
# # number of features
|
||||
# num_docs, num_feats = X.shape
|
||||
#
|
||||
# # bootstrap
|
||||
# p_boots = []
|
||||
# for _ in range(self.bootstrap_trials):
|
||||
# docs_idx = np.random.choice(num_docs, size=self.bootstra_range, replace=False)
|
||||
# class_conditional_X = {i: X[docs_idx] for i, X in self.class_conditional_X.items()}
|
||||
# Xboot = X[docs_idx]
|
||||
#
|
||||
# # bagging
|
||||
# p_bags = []
|
||||
# for _ in range(self.bagging_trials):
|
||||
# feat_idx = np.random.choice(num_feats, size=self.bagging_range, replace=False)
|
||||
# class_conditional_Xbag = {i: X[:, feat_idx] for i, X in class_conditional_X.items()}
|
||||
# Xbag = Xboot[:,feat_idx]
|
||||
# p = self.std_constrained_linear_ls(Xbag, class_conditional_Xbag)
|
||||
# p_bags.append(p)
|
||||
# p_boots.append(np.mean(p_bags, axis=0))
|
||||
#
|
||||
# p_mean = np.mean(p_boots, axis=0)
|
||||
# p_std = np.std(p_bags, axis=0)
|
||||
#
|
||||
# return p_mean
|
||||
#
|
||||
#
|
||||
# def std_constrained_linear_ls(self, X, class_cond_X: dict):
|
||||
# pass
|
||||
|
||||
|
||||
class ReadMe(BaseQuantifier, WithConfidenceABC):
|
||||
"""
|
||||
ReadMe is a non-aggregative quantification system proposed by
|
||||
`Daniel Hopkins and Gary King, 2007. A method of automated nonparametric content analysis for
|
||||
social science. American Journal of Political Science, 54(1):229–247.
|
||||
<https://onlinelibrary.wiley.com/doi/abs/10.1111/j.1540-5907.2009.00428.x>`_.
|
||||
The idea is to estimate `Q(Y=i)` directly from:
|
||||
|
||||
:math:`Q(X)=\\sum_{i=1} Q(X|Y=i) Q(Y=i)`
|
||||
|
||||
via least-squares regression, i.e., without incurring the cost of computing posterior probabilities.
|
||||
However, this representation is problematic because the vector `Q(X)` and the matrix `Q(X|Y=i)`
can be of very high dimensionality. In order to render the problem tractable, ReadMe performs bagging in
the feature space. ReadMe also combines bagging with bootstrap in order to derive confidence intervals
around the point estimates.
|
||||
|
||||
We use the same default parameters as in the official
|
||||
`R implementation <https://github.com/iqss-research/ReadMeV1/blob/master/R/prototype.R>`_.
|
||||
|
||||
:param prob_model: str ('naive', or 'full'), selects the modality in which the probabilities `Q(X)` and
|
||||
`Q(X|Y)` are to be modelled. Options include "full", which corresponds to the original formulation of
|
||||
ReadMe, in which X is constrained to be a binary matrix (e.g., of term presence/absence) and in which
|
||||
`Q(X)` and `Q(X|Y)` are modelled, respectively, as matrices of `(2^K, 1)` and `(2^K, n)` values, where
|
||||
`K` is the number of columns in the data matrix (i.e., `bagging_range`), and `n` is the number of classes.
|
||||
Of course, this approach is computationally prohibitive for large `K`, so the computation is restricted to data
matrices with `K<=25` (although we recommend even smaller values of `K`). A much faster model is "naive", which
considers `Q(X)` and `Q(X|Y)` to be multinomial distributions under the `bag-of-words` perspective. In this
case, `bagging_range` can be set to much larger values. Default is "full" (i.e., the original ReadMe behavior).
|
||||
:param bootstrap_trials: int, number of bootstrap trials (default 300)
|
||||
:param bagging_trials: int, number of bagging trials (default 300)
|
||||
:param bagging_range: int, number of features to keep for each bagging trial (default 15)
|
||||
:param confidence_level: float, a value in (0,1) reflecting the desired confidence level (default 0.95)
|
||||
:param region: str in 'intervals', 'ellipse', 'ellipse-clr'; indicates the preferred method for
|
||||
defining the confidence region (see :class:`WithConfidenceABC`)
|
||||
:param random_state: int or None, allows replicability (default None)
|
||||
:param verbose: bool, whether to display information during the process (default False)
|
||||
"""
|
||||
|
||||
MAX_FEATURES_FOR_EMPIRICAL_ESTIMATION = 25
|
||||
PROBABILISTIC_MODELS = ["naive", "full"]
|
||||
|
||||
def __init__(self,
|
||||
prob_model="full",
|
||||
bootstrap_trials=300,
|
||||
bagging_trials=300,
|
||||
bagging_range=15,
|
||||
confidence_level=0.95,
|
||||
region='intervals',
|
||||
random_state=None,
|
||||
verbose=False):
|
||||
assert prob_model in ReadMe.PROBABILISTIC_MODELS, \
|
||||
f'unknown {prob_model=}, valid ones are {ReadMe.PROBABILISTIC_MODELS=}'
|
||||
self.prob_model = prob_model
|
||||
self.bootstrap_trials = bootstrap_trials
|
||||
self.bagging_trials = bagging_trials
|
||||
self.bagging_range = bagging_range
|
||||
self.confidence_level = confidence_level
|
||||
self.region = region
|
||||
self.random_state = random_state
|
||||
self.verbose = verbose
|
||||
|
||||
def fit(self, X, y):
|
||||
self._check_matrix(X)
|
||||
|
||||
self.rng = np.random.default_rng(self.random_state)
|
||||
self.classes_ = np.unique(y)
|
||||
|
||||
|
||||
Xsize = X.shape[0]
|
||||
|
||||
# Bootstrap loop
|
||||
self.Xboots, self.yboots = [], []
|
||||
for _ in range(self.bootstrap_trials):
|
||||
idx = self.rng.choice(Xsize, size=Xsize, replace=True)
|
||||
self.Xboots.append(X[idx])
|
||||
self.yboots.append(y[idx])
|
||||
|
||||
return self
|
||||
|
||||
def predict_conf(self, X, confidence_level=0.95) -> (np.ndarray, ConfidenceRegionABC):
|
||||
self._check_matrix(X)
|
||||
|
||||
n_features = X.shape[1]
|
||||
boots_prevalences = []
|
||||
for Xboots, yboots in tqdm(
|
||||
zip(self.Xboots, self.yboots),
|
||||
desc='bootstrap predictions', total=self.bootstrap_trials, disable=not self.verbose
|
||||
):
|
||||
bagging_estimates = []
|
||||
for _ in range(self.bagging_trials):
|
||||
feat_idx = self.rng.choice(n_features, size=self.bagging_range, replace=False)
|
||||
Xboots_bagging = Xboots[:, feat_idx]    # bootstrap training matrix restricted to the bagged features
Xtest_bagging = X[:, feat_idx]          # test matrix restricted to the same features
bagging_prev = self._quantify_iteration(Xboots_bagging, yboots, Xtest_bagging)
|
||||
bagging_estimates.append(bagging_prev)
|
||||
|
||||
boots_prevalences.append(np.mean(bagging_estimates, axis=0))
|
||||
|
||||
conf = WithConfidenceABC.construct_region(boots_prevalences, confidence_level, method=self.region)
|
||||
prev_estim = conf.point_estimate()
|
||||
|
||||
return prev_estim, conf
|
||||
|
||||
def predict(self, X):
|
||||
prev_estim, _ = self.predict_conf(X)
|
||||
return prev_estim
|
||||
|
||||
def _quantify_iteration(self, Xtr, ytr, Xte):
|
||||
"""Single ReadMe estimate."""
|
||||
PX_given_Y = np.asarray([self._compute_P(Xtr[ytr == c]) for c in self.classes_])
|
||||
PX = self._compute_P(Xte)
|
||||
|
||||
res = lsq_linear(A=PX_given_Y.T, b=PX, bounds=(0, 1))
|
||||
pY = np.maximum(res.x, 0)
|
||||
return pY / pY.sum()
|
||||
|
||||
def _check_matrix(self, X):
|
||||
"""the "full" model requires estimating empirical distributions; due to the high computational cost,
|
||||
this function is only made available for binary matrices"""
|
||||
if self.prob_model == 'full' and not self._is_binary_matrix(X):
|
||||
raise ValueError('the empirical distribution can only be computed efficiently on binary matrices')
|
||||
|
||||
def _is_binary_matrix(self, X):
|
||||
data = X.data if sparse.issparse(X) else X
|
||||
return np.all((data == 0) | (data == 1))
|
||||
|
||||
def _compute_P(self, X):
|
||||
if self.prob_model == 'naive':
|
||||
return self._multinomial_distribution(X)
|
||||
elif self.prob_model == 'full':
|
||||
return self._empirical_distribution(X)
|
||||
else:
|
||||
raise ValueError(f'unknown {self.prob_model}; valid ones are {ReadMe.PROBABILISTIC_MODELS=}')
|
||||
|
||||
def _empirical_distribution(self, X):
|
||||
|
||||
if X.shape[1] > self.MAX_FEATURES_FOR_EMPIRICAL_ESTIMATION:
|
||||
raise ValueError(f'the empirical distribution can only be computed efficiently for dimensions '
|
||||
f'less or equal than {self.MAX_FEATURES_FOR_EMPIRICAL_ESTIMATION}')
|
||||
|
||||
# we convert every binary row (e.g., 0 0 1 0 1) into the equivalent number (e.g., 5)
|
||||
K = X.shape[1]
|
||||
binary_powers = 1 << np.arange(K-1, -1, -1)  # (2^(K-1), ..., 8, 4, 2, 1)
|
||||
X_as_binary_numbers = X @ binary_powers
|
||||
|
||||
# count occurrences and compute probs
|
||||
counts = np.bincount(X_as_binary_numbers, minlength=2 ** K).astype(float)
|
||||
probs = counts / counts.sum()
|
||||
return probs
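# Editor's note (illustrative): each binary row is encoded as the integer it spells in base 2; for K=4,
# the row [0, 1, 0, 1] combined with powers [8, 4, 2, 1] maps to 4 + 1 = 5, so the bincount over these
# integers is the empirical distribution over the 2^K possible feature profiles.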
|
||||
|
||||
def _multinomial_distribution(self, X):
|
||||
PX = np.asarray(X.sum(axis=0))
|
||||
PX = normalize(PX, norm='l1', axis=1)
|
||||
return PX.ravel()
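# Editor's usage sketch (illustrative; the corpus, vectorizer and split are assumptions, not part of the API):
# from sklearn.feature_extraction.text import CountVectorizer
# vect = CountVectorizer(binary=True)                               # the "full" model requires binary features
# Xtr, Xte = vect.fit_transform(train_texts), vect.transform(test_texts)
# readme = ReadMe(prob_model='full', bagging_range=15, random_state=0)
# readme.fit(Xtr, ytr)
# prev_hat, conf_region = readme.predict_conf(Xte, confidence_level=0.95)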
|
||||
|
||||
|
||||
|
||||
|
||||
def _get_features_range(X):
|
||||
|
|
|
|||
|
|
@ -0,0 +1,39 @@
|
|||
data {
|
||||
int<lower=0> n_bucket;
|
||||
array[n_bucket] int<lower=0> train_pos;
|
||||
array[n_bucket] int<lower=0> train_neg;
|
||||
array[n_bucket] int<lower=0> test;
|
||||
int<lower=0,upper=1> posterior;
|
||||
}
|
||||
|
||||
transformed data{
|
||||
row_vector<lower=0>[n_bucket] train_pos_rv;
|
||||
row_vector<lower=0>[n_bucket] train_neg_rv;
|
||||
row_vector<lower=0>[n_bucket] test_rv;
|
||||
real n_test;
|
||||
|
||||
train_pos_rv = to_row_vector( train_pos );
|
||||
train_neg_rv = to_row_vector( train_neg );
|
||||
test_rv = to_row_vector( test );
|
||||
n_test = sum( test );
|
||||
}
|
||||
|
||||
parameters {
|
||||
simplex[n_bucket] p_neg;
|
||||
simplex[n_bucket] p_pos;
|
||||
real<lower=0,upper=1> prev_prior;
|
||||
}
|
||||
|
||||
model {
|
||||
if( posterior ) {
|
||||
target += train_neg_rv * log( p_neg );
|
||||
target += train_pos_rv * log( p_pos );
|
||||
target += test_rv * log( p_neg * ( 1 - prev_prior) + p_pos * prev_prior );
|
||||
}
|
||||
}
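// Editor's note (illustrative): the model block above accumulates the multinomial log-likelihood of the
// training histograms under p_pos and p_neg, plus that of the test histogram under the mixture
// (1 - prev_prior) * p_neg + prev_prior * p_pos. The generated quantities block below draws, per bucket,
// the number of test items attributed to the positive class (via the bucket-wise posterior probability of
// being positive) and normalizes by the test size, yielding one posterior sample of the prevalence.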
|
||||
|
||||
generated quantities {
|
||||
real<lower=0,upper=1> prev;
|
||||
prev = sum( binomial_rng(test, 1 / ( 1 + (p_neg./p_pos) *(1-prev_prior)/prev_prior ) ) ) / n_test;
|
||||
}
|
||||
|
||||
|
|
@ -410,7 +410,7 @@ def group_params(param_grid: dict):
|
|||
"""
|
||||
classifier_params, quantifier_params = {}, {}
|
||||
for key, values in param_grid.items():
|
||||
if key.startswith('classifier__') or key == 'val_split':
|
||||
if 'classifier__' in key or key == 'val_split':
|
||||
classifier_params[key] = values
|
||||
else:
|
||||
quantifier_params[key] = values
|
||||
|
|
|
|||
|
|
@ -171,7 +171,7 @@ class AbstractStochasticSeededProtocol(AbstractProtocol):
|
|||
return sample
|
||||
|
||||
|
||||
class OnLabelledCollectionProtocol:
|
||||
class OnLabelledCollectionProtocol(AbstractStochasticSeededProtocol):
|
||||
"""
|
||||
Protocols that generate samples from a :class:`qp.data.LabelledCollection` object.
|
||||
"""
|
||||
|
|
@ -229,8 +229,17 @@ class OnLabelledCollectionProtocol:
|
|||
elif return_type=='index':
|
||||
return lambda lc,params:params
|
||||
|
||||
def sample(self, index):
|
||||
"""
|
||||
Realizes the sample given the index of the instances.
|
||||
|
||||
class APP(AbstractStochasticSeededProtocol, OnLabelledCollectionProtocol):
|
||||
:param index: indexes of the instances to select
|
||||
:return: an instance of :class:`qp.data.LabelledCollection`
|
||||
"""
|
||||
return self.data.sampling_from_index(index)
|
||||
|
||||
|
||||
class APP(OnLabelledCollectionProtocol):
|
||||
"""
|
||||
Implementation of the artificial prevalence protocol (APP).
|
||||
The APP consists of exploring a grid of prevalence values containing `n_prevalences` points (e.g.,
|
||||
|
|
@ -311,15 +320,6 @@ class APP(AbstractStochasticSeededProtocol, OnLabelledCollectionProtocol):
|
|||
indexes.append(index)
|
||||
return indexes
|
||||
|
||||
def sample(self, index):
|
||||
"""
|
||||
Realizes the sample given the index of the instances.
|
||||
|
||||
:param index: indexes of the instances to select
|
||||
:return: an instance of :class:`qp.data.LabelledCollection`
|
||||
"""
|
||||
return self.data.sampling_from_index(index)
|
||||
|
||||
def total(self):
|
||||
"""
|
||||
Returns the number of samples that will be generated
|
||||
|
|
@ -329,7 +329,7 @@ class APP(AbstractStochasticSeededProtocol, OnLabelledCollectionProtocol):
|
|||
return F.num_prevalence_combinations(self.n_prevalences, self.data.n_classes, self.repeats)
|
||||
|
||||
|
||||
class NPP(AbstractStochasticSeededProtocol, OnLabelledCollectionProtocol):
|
||||
class NPP(OnLabelledCollectionProtocol):
|
||||
"""
|
||||
A generator of samples that implements the natural prevalence protocol (NPP). The NPP consists of drawing
|
||||
samples uniformly at random, therefore approximately preserving the natural prevalence of the collection.
|
||||
|
|
@ -365,15 +365,6 @@ class NPP(AbstractStochasticSeededProtocol, OnLabelledCollectionProtocol):
|
|||
indexes.append(index)
|
||||
return indexes
|
||||
|
||||
def sample(self, index):
|
||||
"""
|
||||
Realizes the sample given the index of the instances.
|
||||
|
||||
:param index: indexes of the instances to select
|
||||
:return: an instance of :class:`qp.data.LabelledCollection`
|
||||
"""
|
||||
return self.data.sampling_from_index(index)
|
||||
|
||||
def total(self):
|
||||
"""
|
||||
Returns the number of samples that will be generated (equals to "repeats")
|
||||
|
|
@ -383,7 +374,7 @@ class NPP(AbstractStochasticSeededProtocol, OnLabelledCollectionProtocol):
|
|||
return self.repeats
|
||||
|
||||
|
||||
class UPP(AbstractStochasticSeededProtocol, OnLabelledCollectionProtocol):
|
||||
class UPP(OnLabelledCollectionProtocol):
|
||||
"""
|
||||
A variant of :class:`APP` that, instead of using a grid of equidistant prevalence values,
|
||||
relies on the Kraemer algorithm for sampling unit (k-1)-simplex uniformly at random, with
|
||||
|
|
@ -423,14 +414,53 @@ class UPP(AbstractStochasticSeededProtocol, OnLabelledCollectionProtocol):
|
|||
indexes.append(index)
|
||||
return indexes
|
||||
|
||||
def sample(self, index):
|
||||
def total(self):
|
||||
"""
|
||||
Realizes the sample given the index of the instances.
|
||||
Returns the number of samples that will be generated (equals to "repeats")
|
||||
|
||||
:param index: indexes of the instances to select
|
||||
:return: an instance of :class:`qp.data.LabelledCollection`
|
||||
:return: int
|
||||
"""
|
||||
return self.data.sampling_from_index(index)
|
||||
return self.repeats
|
||||
|
||||
|
||||
class DirichletProtocol(OnLabelledCollectionProtocol):
|
||||
"""
|
||||
A protocol that establishes a prior Dirichlet distribution for the prevalence of the samples.
|
||||
Note that providing an all-ones vector of Dirichlet parameters amounts to drawing prevalence vectors
uniformly at random from the simplex, as the UPP protocol does (although each protocol will generate a
different series of samples given a fixed seed, since the implementations differ).
|
||||
|
||||
:param data: a `LabelledCollection` from which the samples will be drawn
|
||||
:param alpha: an array-like of shape (n_classes,) with the parameters of the Dirichlet distribution
|
||||
:param sample_size: integer, the number of instances in each sample; if None (default) then it is taken from
|
||||
qp.environ["SAMPLE_SIZE"]. If this is not set, a ValueError exception is raised.
|
||||
:param repeats: the number of samples to generate. Default is 100.
|
||||
:param random_state: allows replicating samples across runs (default 0, meaning that the sequence of samples
|
||||
will be the same every time the protocol is called)
|
||||
:param return_type: set to "sample_prev" (default) to get the pairs of (sample, prevalence) at each iteration, or
|
||||
to "labelled_collection" to get instead instances of LabelledCollection
|
||||
"""
|
||||
|
||||
def __init__(self, data: LabelledCollection, alpha, sample_size=None, repeats=100, random_state=0,
|
||||
return_type='sample_prev'):
|
||||
assert len(alpha)>1, 'wrong parameters: alpha must be an array-like of shape (n_classes,)'
|
||||
super(DirichletProtocol, self).__init__(random_state)
|
||||
self.data = data
|
||||
self.alpha = alpha
|
||||
self.sample_size = qp._get_sample_size(sample_size)
|
||||
self.repeats = repeats
|
||||
self.random_state = random_state
|
||||
self.collator = OnLabelledCollectionProtocol.get_collator(return_type)
|
||||
|
||||
def samples_parameters(self):
|
||||
"""
|
||||
Return all the necessary parameters to replicate the samples.
|
||||
|
||||
:return: a list of indexes that realize the sampling
|
||||
"""
|
||||
prevs = np.random.dirichlet(self.alpha, size=self.repeats)
|
||||
indexes = [self.data.sampling_index(self.sample_size, *prevs_i) for prevs_i in prevs]
|
||||
return indexes
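# Editor's usage sketch (illustrative; `data` is any LabelledCollection and the alpha values are arbitrary):
# prot = DirichletProtocol(data, alpha=[3, 2, 1], sample_size=250, repeats=100, random_state=0)
# for sample, prev in prot():                                       # yields (instances, prevalence) pairs
#     ...                                                           # e.g., evaluate a quantifier on each sample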
|
||||
|
||||
def total(self):
|
||||
"""
|
||||
|
|
@ -450,7 +480,7 @@ class DomainMixer(AbstractStochasticSeededProtocol):
|
|||
:param sample_size: integer, the number of instances in each sample; if None (default) then it is taken from
|
||||
qp.environ["SAMPLE_SIZE"]. If this is not set, a ValueError exception is raised.
|
||||
:param repeats: int, number of samples to draw for every mixture rate
|
||||
:param prevalence: the prevalence to preserv along the mixtures. If specified, should be an array containing
|
||||
:param prevalence: the prevalence to preserve along the mixtures. If specified, should be an array containing
|
||||
one prevalence value (positive float) for each class and summing up to one. If not specified, the prevalence
|
||||
will be taken from the domain A (default).
|
||||
:param mixture_points: an integer indicating the number of points to take from a linear scale (e.g., 21 will
|
||||
|
|
|
|||
|
|
@ -3,6 +3,7 @@ import unittest
|
|||
|
||||
from sklearn.linear_model import LogisticRegression
|
||||
|
||||
import BayesianKDEy.commons
|
||||
import quapy as qp
|
||||
from quapy.method.aggregative import ACC
|
||||
from quapy.method.meta import Ensemble
|
||||
|
|
@ -47,7 +48,7 @@ class TestMethods(unittest.TestCase):
|
|||
learner.fit(*dataset.training.Xy)
|
||||
|
||||
for model in AGGREGATIVE_METHODS:
|
||||
if not dataset.binary and model in BINARY_METHODS:
|
||||
if not BayesianKDEy.commons.binary and model in BINARY_METHODS:
|
||||
print(f'skipping the test of binary model {model.__name__} on multiclass dataset {dataset.name}')
|
||||
continue
|
||||
|
||||
|
|
@ -61,7 +62,7 @@ class TestMethods(unittest.TestCase):
|
|||
for dataset in TestMethods.datasets:
|
||||
|
||||
for model in NON_AGGREGATIVE_METHODS:
|
||||
if not dataset.binary and model in BINARY_METHODS:
|
||||
if not BayesianKDEy.commons.binary and model in BINARY_METHODS:
|
||||
print(f'skipping the test of binary model {model.__name__} on multiclass dataset {dataset.name}')
|
||||
continue
|
||||
|
||||
|
|
@ -76,7 +77,7 @@ class TestMethods(unittest.TestCase):
|
|||
|
||||
base_quantifier = ACC(LogisticRegression())
|
||||
for dataset, policy in itertools.product(TestMethods.datasets, Ensemble.VALID_POLICIES):
|
||||
if not dataset.binary and policy == 'ds':
|
||||
if not BayesianKDEy.commons.binary and policy == 'ds':
|
||||
print(f'skipping the test of binary policy ds on non-binary dataset {dataset}')
|
||||
continue
|
||||
|
||||
|
|
|
|||
|
|
@ -2,6 +2,7 @@ import unittest
|
|||
import numpy as np
|
||||
|
||||
import quapy.functional
|
||||
from protocol import DirichletProtocol
|
||||
from quapy.data import LabelledCollection
|
||||
from quapy.protocol import APP, NPP, UPP, DomainMixer, AbstractStochasticSeededProtocol
|
||||
|
||||
|
|
@ -138,6 +139,31 @@ class TestProtocols(unittest.TestCase):
|
|||
|
||||
self.assertNotEqual(samples1, samples2)
|
||||
|
||||
def test_dirichlet_replicate(self):
|
||||
data = mock_labelled_collection()
|
||||
p = DirichletProtocol(data, alpha=[1,2,3,4], sample_size=5, repeats=10, random_state=42)
|
||||
|
||||
samples1 = samples_to_str(p)
|
||||
samples2 = samples_to_str(p)
|
||||
|
||||
self.assertEqual(samples1, samples2)
|
||||
|
||||
p = DirichletProtocol(data, alpha=[1,2,3,4], sample_size=5, repeats=10, random_state=0)
|
||||
|
||||
samples1 = samples_to_str(p)
|
||||
samples2 = samples_to_str(p)
|
||||
|
||||
self.assertEqual(samples1, samples2)
|
||||
|
||||
def test_dirichlet_not_replicate(self):
|
||||
data = mock_labelled_collection()
|
||||
p = DirichletProtocol(data, alpha=[1,2,3,4], sample_size=5, repeats=10, random_state=None)
|
||||
|
||||
samples1 = samples_to_str(p)
|
||||
samples2 = samples_to_str(p)
|
||||
|
||||
self.assertNotEqual(samples1, samples2)
|
||||
|
||||
def test_covariate_shift_replicate(self):
|
||||
dataA = mock_labelled_collection('domA')
|
||||
dataB = mock_labelled_collection('domB')
|
||||
|
|
|
|||
|
|
@ -0,0 +1 @@
|
|||
Subproject commit 9d433e3e35b4d111a3914a1e7d3257a8fcf24a9b
|
||||
setup.py
|
|
@ -111,6 +111,12 @@ setup(
|
|||
#
|
||||
packages=find_packages(include=['quapy', 'quapy.*']), # Required
|
||||
|
||||
package_data={
|
||||
# For the 'quapy.method' package, include all files
|
||||
# in the 'stan' subdirectory that end with .stan
|
||||
'quapy.method': ['stan/*.stan']
|
||||
},
|
||||
|
||||
python_requires='>=3.8, <4',
|
||||
|
||||
install_requires=['scikit-learn', 'pandas', 'tqdm', 'matplotlib', 'joblib', 'xlrd', 'abstention', 'ucimlrepo', 'certifi'],
|
||||
|
|
@ -124,7 +130,7 @@ setup(
|
|||
# Similar to `install_requires` above, these must be valid existing
|
||||
# projects.
|
||||
extras_require={ # Optional
|
||||
'bayes': ['jax', 'jaxlib', 'numpyro'],
|
||||
'bayes': ['jax', 'jaxlib', 'numpyro', 'pystan'],
|
||||
'neural': ['torch'],
|
||||
'tests': ['certifi'],
|
||||
'docs' : ['sphinx-rtd-theme', 'myst-parser'],
|
||||
|
|
|
|||