Compare commits


76 Commits
master ... ieee

Author SHA1 Message Date
Alejandro Moreo Fernandez 1661a79dbb quantification over time experiment 2025-12-10 19:43:08 +01:00
Alejandro Moreo Fernandez 5c2554861c new plots 2025-12-10 13:11:49 +01:00
Alejandro Moreo Fernandez 1eebfbc709 refactor folders 2025-12-10 13:10:03 +01:00
Alejandro Moreo Fernandez 8c063afd1e merged from devel 2025-12-10 12:06:03 +01:00
Alejandro Moreo Fernandez befe7503bd merged from devel 2025-12-10 12:05:10 +01:00
Alejandro Moreo Fernandez 3824a19e10 merged from cacm 2025-12-10 11:57:57 +01:00
Alejandro Moreo Fernandez b2e2794012 renaming branch 2025-12-10 11:52:32 +01:00
Alejandro Moreo Fernandez 89a8cad0b3 atichison distance and tests for evaluating regions 2025-12-09 18:12:06 +01:00
Alejandro Moreo Fernandez c492415977 adding bootstrap-emq 2025-12-08 12:40:44 +01:00
Alejandro Moreo Fernandez 7cb4bd550f adding bootstrap-emq 2025-12-08 12:31:23 +01:00
Alejandro Moreo Fernandez 7b75954f9b adding bootstrap-emq 2025-12-08 12:17:54 +01:00
Alejandro Moreo Fernandez 7342e57cda trying with emcee with no success... 2025-12-08 12:14:43 +01:00
Alejandro Moreo Fernandez b2750ab6ea added temperature, and coverage increases! 2025-12-06 14:06:14 +01:00
Alejandro Moreo Fernandez 602b89bd21 added temperature, and coverage increases! 2025-12-06 13:53:34 +01:00
Alejandro Moreo Fernandez 84c11d956b added temperature, and coverage increases! 2025-12-06 13:51:49 +01:00
Alejandro Moreo Fernandez 7c7af2cfaf added temperature, and coverage increases! 2025-12-06 13:50:15 +01:00
Alejandro Moreo Fernandez 2e0068b6ae import fix 2025-12-06 13:36:13 +01:00
Alejandro Moreo Fernandez d52fc40d2b ilr added 2025-12-06 13:20:11 +01:00
Alejandro Moreo Fernandez 59ef17c86c ilr only on multiclass 2025-12-04 20:10:25 +01:00
Alejandro Moreo Fernandez a6974b7624 adding experiment with ILR 2025-12-04 20:03:30 +01:00
Alejandro Moreo Fernandez e8d175106f adding experiment with ILR 2025-12-04 20:02:26 +01:00
Alejandro Moreo Fernandez b180aae16c added ILR to conf regions 2025-12-04 19:16:52 +01:00
Alejandro Moreo Fernandez d87625bd09 adding new config for bayesian 2025-12-04 18:27:15 +01:00
Alejandro Moreo Fernandez 90981088b0 launching PQ 2025-12-04 18:00:01 +01:00
Alejandro Moreo Fernandez 78fd05ab33 repairing safe check 2025-12-04 15:43:31 +01:00
Alejandro Moreo Fernandez 1080973d25 recomputing confidence regions 2025-12-04 15:42:00 +01:00
Alejandro Moreo Fernandez bfb6482410 launching kde** 2025-12-04 10:34:38 +01:00
Alejandro Moreo Fernandez 23608f2038 adding exploration in CLR 2025-12-04 10:24:02 +01:00
Alejandro Moreo Fernandez 881e1033f1 lauching experiments 2025-11-26 15:19:08 +01:00
Alejandro Moreo Fernandez a3f0008a2a Merge branch 'devel' into kdeplus 2025-11-18 10:13:56 +01:00
Alejandro Moreo Fernandez 6db659e3c4 unifying n_bins in PQ and DMy 2025-11-18 10:13:34 +01:00
Alejandro Moreo Fernandez 277a2e617f added aggregative bootstrap 2025-11-18 10:12:41 +01:00
Alejandro Moreo Fernandez be465712cd Merge branch 'devel' into kdeplus 2025-11-17 17:55:15 +01:00
Alejandro Moreo Fernandez db49cd31be Merge branch 'devel' of github.com:HLT-ISTI/QuaPy into devel 2025-11-17 17:54:09 +01:00
Alejandro Moreo Fernandez 9da4fd57db added property samples to confidence regions 2025-11-17 17:53:56 +01:00
Alejandro Moreo Fernandez fdc0560ccc no optim classifier 2025-11-17 17:47:14 +01:00
Alejandro Moreo Fernandez fd62e73d2d adding bootstrap variants 2025-11-17 12:37:12 +01:00
Alejandro Moreo Fernandez 12b431ef4b scripting experiments binary and multiclass 2025-11-17 12:22:40 +01:00
Alejandro Moreo Fernandez 2f83a520c7 drafting experiments 2025-11-16 12:42:26 +01:00
Alejandro Moreo Fernandez 4255098ba7 creating class BayesianKDEy 2025-11-15 21:47:30 +01:00
Alejandro Moreo Fernandez 8a8988d2de Merge branch 'kdeplus' of gitea-s2i2s.isti.cnr.it:moreo/QuaPy into kdeplus 2025-11-15 19:01:42 +01:00
Alejandro Moreo Fernandez f324ca049e Merge branch 'devel' into kdeplus 2025-11-15 18:57:56 +01:00
Alejandro Moreo Fernandez 5f6a151263 Merge branch 'pglez82-precisequant' into devel 2025-11-15 18:55:38 +01:00
Alejandro Moreo Fernandez 047cb9e533 merged 2025-11-15 18:54:04 +01:00
Alejandro Moreo Fernandez 6388d9b549 merged 2025-11-15 18:03:06 +01:00
Alejandro Moreo Fernandez d9cf6cc11d index in labelled collection from versions restored 2025-11-15 17:56:37 +01:00
pglez82 46e7246f3a adding stan file to setup 2025-11-15 17:04:55 +01:00
pglez82 c6492a0f20 fixing import 2025-11-15 17:04:44 +01:00
pglez82 e4c07e1835 changing the way the file is loaded 2025-11-15 16:51:48 +01:00
Alejandro Moreo Fernandez c2044cb200 Merge branch 'precisequant' of github.com:pglez82/QuaPy into pglez82-precisequant 2025-11-15 15:03:20 +01:00
pglez82 3268e9fada PQ (precise quantifier) 2025-11-14 18:35:40 +01:00
Alejandro Moreo Fernandez a868d2d561 import fix 2025-11-14 16:10:17 +01:00
Alejandro Moreo Fernandez ccb634fae5 step rate adaptation 2025-11-14 16:09:34 +01:00
Alejandro Moreo Fernandez 3dba708fe4 Merge branch 'devel' into kdeplus 2025-11-14 12:09:23 +01:00
Alejandro Moreo Fernandez 7ad5311fac merging from devel 2025-11-14 12:07:32 +01:00
Alejandro Moreo Fernandez 400edfdb63 bayesian plot 2025-11-13 19:43:07 +01:00
Alejandro Moreo Fernandez 3c09b1c98a adding prev@densities in KDEyML, huge effiency improvement... 2025-11-13 18:45:07 +01:00
Alejandro Moreo Fernandez 3e7a431d26 bayeisan kdey 2025-11-13 18:43:03 +01:00
Alejandro Moreo Fernandez f227ed2f60 adding kdex 2025-10-23 14:12:39 +02:00
Alejandro Moreo Fernandez 41baeb78ca lazy index construction in labelled collection 2025-10-23 12:35:01 +02:00
Alejandro Moreo Fernandez 088ebcdd31 adding non aggregative methods experimental 2025-10-23 12:10:21 +02:00
Alejandro Moreo Fernandez c11b99e08a improved ReadMe method 2025-10-22 18:51:35 +02:00
Alejandro Moreo Fernandez 854b3ba3f9 documented ReadMe 2025-10-20 18:33:45 +02:00
Alejandro Moreo Fernandez eafe486893 adding readme to non-aggregative 2025-10-20 18:13:34 +02:00
Alejandro Moreo Fernandez 1fb8500e87 improved doc 2025-10-09 12:49:08 +02:00
Alejandro Moreo Fernandez d597820a59 missing doc of confidence methods 2025-10-09 12:10:26 +02:00
Alejandro Moreo Fernandez 010676df12 starting devel 0.2.1 2025-10-07 10:27:59 +02:00
Alejandro Moreo Fernandez bd9f8e2cb4 show results fix 2025-09-30 12:02:13 +02:00
Alejandro Moreo Fernandez 79f3709e6f adding mrae 2025-09-27 18:12:56 +02:00
Alejandro Moreo Fernandez e4cb7868c7 adding mrae 2025-09-27 18:03:24 +02:00
Alejandro Moreo Fernandez 606ec2b89c with tables in pdf 2025-09-27 18:01:39 +02:00
Alejandro Moreo Fernandez 27afea8bf1 with tables in pdf 2025-09-27 18:01:21 +02:00
Alejandro Moreo Fernandez 29bb261f62 testing kde normal 2025-09-27 17:47:52 +02:00
Alejandro Moreo Fernandez c3fd92efde experiment 2025-09-27 17:41:12 +02:00
Alejandro Moreo Fernandez bb0950fad5 code used to generate plots 2023-11-13 12:07:59 +01:00
Alejandro Moreo Fernandez 2e992a0b9a choosing plots for paper 2023-11-10 14:22:43 +01:00
40 changed files with 3613 additions and 210 deletions

7
BayesianKDEy/TODO.txt Normal file
View File

@ -0,0 +1,7 @@
- Add other methods that natively provide uncertainty quantification?
- MPIW (Mean Prediction Interval Width): the average of the interval amplitudes (without aggregating coverage in any way)
- Implement the Interval Score or Winkler Score
- analyze across shift
- add Bayesian EM
- also optimize C and class_weight?
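A minimal sketch of the two interval metrics mentioned above (MPIW and the Winkler/interval score), assuming per-class intervals are given as arrays of lower and upper bounds; the function names are illustrative and not part of QuaPy:

import numpy as np

def mpiw(lower, upper):
    """Mean Prediction Interval Width: average amplitude of the intervals."""
    return np.mean(np.asarray(upper) - np.asarray(lower))

def winkler_score(lower, upper, y_true, alpha=0.05):
    """Winkler (interval) score of central (1-alpha) intervals: the interval width plus a
    penalty of (2/alpha) times the amount by which the true value falls outside the interval."""
    lower, upper, y_true = map(np.asarray, (lower, upper, y_true))
    width = upper - lower
    below = (2. / alpha) * (lower - y_true) * (y_true < lower)
    above = (2. / alpha) * (y_true - upper) * (y_true > upper)
    return np.mean(width + below + above)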

View File

@ -0,0 +1,215 @@
from sklearn.base import BaseEstimator
import numpy as np
import quapy.util
from quapy.method._kdey import KDEBase
from quapy.method.confidence import WithConfidenceABC, ConfidenceRegionABC
from quapy.functional import CLRtransformation, ILRtransformation
from quapy.method.aggregative import AggregativeSoftQuantifier
from tqdm import tqdm
import quapy.functional as F
import emcee  # required by _bayesian_emcee, which is called from aggregate
class BayesianKDEy(AggregativeSoftQuantifier, KDEBase, WithConfidenceABC):
"""
Bayesian version of KDEy.
:param classifier: a scikit-learn's BaseEstimator, or None, in which case the classifier is taken to be
the one indicated in `qp.environ['DEFAULT_CLS']`
:param val_split: specifies the data used for generating classifier predictions. This specification
can be made as float in (0, 1) indicating the proportion of stratified held-out validation set to
be extracted from the training set; or as an integer (default 5), indicating that the predictions
are to be generated in a `k`-fold cross-validation manner (with this integer indicating the value
for `k`); or as a tuple `(X,y)` defining the specific set of data to use for validation. Set to
None when the method does not require any validation data, in order to avoid that some portion of
the training data be wasted.
:param num_warmup: number of warmup iterations for the MCMC sampler (default 500)
:param num_samples: number of samples to draw from the posterior (default 1000)
:param mcmc_seed: random seed for the MCMC sampler (default 0)
:param confidence_level: float in [0,1] to construct a confidence region around the point estimate (default 0.95)
:param region: string, set to `intervals` for constructing confidence intervals (default), or to
`ellipse` for constructing an ellipse in the probability simplex, or to `ellipse-clr` for
constructing an ellipse in the Centered Log-Ratio (CLR) unconstrained space.
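:param kernel: the kernel function used by the KDE (default 'gaussian')
:param bandwidth: the bandwidth of the kernel (default 0.1)
:param explore: string in {'simplex', 'clr', 'ilr'}; space in which the random-walk proposals of the
Metropolis-Hastings sampler are generated (default 'simplex')
:param step_size: float, scale of the random-walk proposal (default 0.05)
:param temperature: float > 0, tempering factor for the likelihood; the log-likelihood is multiplied
by 1/temperature (default 1.)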
:param verbose: bool, whether to display progress bar
"""
def __init__(self,
classifier: BaseEstimator=None,
fit_classifier=True,
val_split: int = 5,
kernel='gaussian',
bandwidth=0.1,
num_warmup: int = 500,
num_samples: int = 1_000,
mcmc_seed: int = 0,
confidence_level: float = 0.95,
region: str = 'intervals',
explore='simplex',
step_size=0.05,
temperature=1.,
verbose: bool = False):
if num_warmup <= 0:
raise ValueError(f'parameter {num_warmup=} must be a positive integer')
if num_samples <= 0:
raise ValueError(f'parameter {num_samples=} must be a positive integer')
assert explore in ['simplex', 'clr', 'ilr'], \
f'unexpected value for param {explore=}; valid ones are "simplex", "clr", and "ilr"'
assert temperature>0., f'temperature must be >0'
super().__init__(classifier, fit_classifier, val_split)
self.bandwidth = KDEBase._check_bandwidth(bandwidth, kernel)
self.kernel = self._check_kernel(kernel)
self.num_warmup = num_warmup
self.num_samples = num_samples
self.mcmc_seed = mcmc_seed
self.confidence_level = confidence_level
self.region = region
self.explore = explore
self.step_size = step_size
self.temperature = temperature
self.verbose = verbose
def aggregation_fit(self, classif_predictions, labels):
self.mix_densities = self.get_mixture_components(classif_predictions, labels, self.classes_, self.bandwidth, self.kernel)
return self
def aggregate(self, classif_predictions):
# self.prevalence_samples = self._bayesian_kde(classif_predictions, init=None, verbose=self.verbose)
self.prevalence_samples = self._bayesian_emcee(classif_predictions)
return self.prevalence_samples.mean(axis=0)
def predict_conf(self, instances, confidence_level=None) -> (np.ndarray, ConfidenceRegionABC):
if confidence_level is None:
confidence_level = self.confidence_level
classif_predictions = self.classify(instances)
point_estimate = self.aggregate(classif_predictions)
samples = self.prevalence_samples # available after calling "aggregate" function
region = WithConfidenceABC.construct_region(samples, confidence_level=confidence_level, method=self.region)
return point_estimate, region
def _bayesian_kde(self, X_probs, init=None, verbose=False):
"""
Bayes:
P(prev|data) = P(data|prev) P(prev) / P(data)
i.e.,
posterior = likelihood * prior / evidence
we assume the likelihood to be the product, over test points x_j, of the mixture density:
prod_j ( prev @ [kde_i(x_j) for i=1..n] )
and the prior to be uniform on the simplex
"""
rng = np.random.default_rng(self.mcmc_seed)
kdes = self.mix_densities
test_densities = np.asarray([self.pdf(kde_i, X_probs, self.kernel) for kde_i in kdes])
def log_likelihood(prev, epsilon=1e-10):
test_likelihoods = prev @ test_densities
test_loglikelihood = np.log(test_likelihoods + epsilon)
return (1./self.temperature) * np.sum(test_loglikelihood)
# def log_prior(prev):
# todo: adapt to arbitrary prior knowledge (e.g., something around training prevalence)
# return 1/np.sum((prev-init)**2) # it is not 1 but we assume uniform, so anyway it is a useless constant
# def log_prior(prev, alpha_scale=1000):
# alpha = np.array(init) * alpha_scale
# return dirichlet.logpdf(prev, alpha)
def log_prior(prev):
return 0
def sample_neighbour(prev, step_size):
# random-walk Metropolis-Hastings
d = len(prev)
neighbour = None
if self.explore=='simplex':
dir_noise = rng.normal(scale=step_size/np.sqrt(d), size=d)
neighbour = F.normalize_prevalence(prev + dir_noise, method='mapsimplex')
elif self.explore=='clr':
clr = CLRtransformation()
clr_point = clr(prev)
dir_noise = rng.normal(scale=step_size, size=d)
clr_neighbour = clr_point+dir_noise
neighbour = clr.inverse(clr_neighbour)
assert in_simplex(neighbour), 'wrong CLR transformation'
elif self.explore=='ilr':
ilr = ILRtransformation()
ilr_point = ilr(prev)
dir_noise = rng.normal(scale=step_size, size=d-1)
ilr_neighbour = ilr_point + dir_noise
neighbour = ilr.inverse(ilr_neighbour)
assert in_simplex(neighbour), 'wrong ILR transformation'
return neighbour
n_classes = X_probs.shape[1]
current_prev = F.uniform_prevalence(n_classes) if init is None else init
current_likelihood = log_likelihood(current_prev) + log_prior(current_prev)
# Metropolis-Hastings with adaptive rate
step_size = self.step_size
target_acceptance = 0.3
adapt_rate = 0.05
acceptance_history = []
samples = []
total_steps = self.num_samples + self.num_warmup
for i in tqdm(range(total_steps), total=total_steps, disable=not verbose):
proposed_prev = sample_neighbour(current_prev, step_size)
# probability of acceptance
proposed_likelihood = log_likelihood(proposed_prev) + log_prior(proposed_prev)
acceptance = proposed_likelihood - current_likelihood
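# for a symmetric random-walk proposal, Metropolis-Hastings accepts with probability
# min(1, exp(log_post(proposed) - log_post(current))); drawing u~U(0,1) and testing
# log(u) < acceptance (below) is an equivalent check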
# decide acceptance
accepted = np.log(rng.random()) < acceptance
if accepted:
current_prev = proposed_prev
current_likelihood = proposed_likelihood
samples.append(current_prev)
acceptance_history.append(1. if accepted else 0.)
# if i < self.num_warmup and i%10==0 and len(acceptance_history)>=100:
if i % 10 == 0 and len(acceptance_history) >= 100:
recent_accept_rate = np.mean(acceptance_history[-100:])
step_size *= np.exp(adapt_rate * (recent_accept_rate - target_acceptance))
# step_size = float(np.clip(step_size, min_step, max_step))
if i %100==0:
print(f'acceptance-rate={recent_accept_rate*100:.3f}%, step-size={step_size:.5f}')
# remove "warmup" initial iterations
samples = np.asarray(samples[self.num_warmup:])
return samples
def _bayesian_emcee(self, X_probs):
ndim = X_probs.shape[1]
nwalkers = 32
f = CLRtransformation()
def log_likelihood(unconstrained, test_densities, epsilon=1e-10):
prev = f.inverse(unconstrained)
test_likelihoods = prev @ test_densities
test_loglikelihood = np.log(test_likelihoods + epsilon)
return np.sum(test_loglikelihood)
kdes = self.mix_densities
test_densities = np.asarray([self.pdf(kde_i, X_probs, self.kernel) for kde_i in kdes])
# p0 = np.random.normal(nwalkers, ndim)
p0 = F.uniform_prevalence_sampling(ndim, nwalkers)
p0 = f(p0)
sampler = emcee.EnsembleSampler(nwalkers, ndim, log_likelihood, args=[test_densities])
state = sampler.run_mcmc(p0, self.num_warmup, skip_initial_state_check=True)
sampler.reset()
sampler.run_mcmc(state, self.num_samples, skip_initial_state_check=True)
samples = sampler.get_chain(flat=True)
samples = f.inverse(samples)
return samples
def in_simplex(x):
return np.all(x >= 0) and np.isclose(x.sum(), 1)
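For reference, a minimal self-contained sketch of the CLR map and of the Aitchison distance it induces, following the standard compositional-data definitions (illustrative only; QuaPy's CLRtransformation and dist_aitchison may differ in details such as zero handling):

import numpy as np

def clr(p, eps=1e-12):
    """Centered log-ratio: log(p_i / g(p)), with g(p) the geometric mean of the composition."""
    p = np.clip(np.asarray(p, dtype=float), eps, None)
    logp = np.log(p)
    return logp - logp.mean(axis=-1, keepdims=True)

def clr_inverse(z):
    """Softmax-like inverse: exponentiate and renormalize back onto the simplex."""
    e = np.exp(z - z.max(axis=-1, keepdims=True))
    return e / e.sum(axis=-1, keepdims=True)

def dist_aitchison(p, q):
    """Aitchison distance: Euclidean distance between the CLR-transformed compositions."""
    return np.linalg.norm(clr(p) - clr(q), axis=-1)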

View File

@ -0,0 +1,196 @@
import os
import warnings
from os.path import join
from pathlib import Path
from sklearn.calibration import CalibratedClassifierCV
from sklearn.linear_model import LogisticRegression as LR
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from copy import deepcopy as cp
import quapy as qp
from BayesianKDEy._bayeisan_kdey import BayesianKDEy
from quapy.data import LabelledCollection
from quapy.method.aggregative import DistributionMatchingY as DMy, AggregativeQuantifier, EMQ
from quapy.method.base import BinaryQuantifier, BaseQuantifier
from quapy.model_selection import GridSearchQ
from quapy.data import Dataset
# from BayesianKDEy.plot_simplex import plot_prev_points, plot_prev_points_matplot
from quapy.method.confidence import ConfidenceIntervals, BayesianCC, PQ, WithConfidenceABC, AggregativeBootstrap
from quapy.functional import strprev
from quapy.method.aggregative import KDEyML, ACC
from quapy.protocol import UPP
import quapy.functional as F
import numpy as np
from tqdm import tqdm
from scipy.stats import dirichlet
from collections import defaultdict
from time import time
from sklearn.base import clone, BaseEstimator
class KDEyCLR(KDEyML):
def __init__(self, classifier: BaseEstimator=None, fit_classifier=True, val_split=5, bandwidth=1., random_state=None):
super().__init__(
classifier=classifier, fit_classifier=fit_classifier, val_split=val_split, bandwidth=bandwidth,
random_state=random_state, kernel='aitchison'
)
class KDEyILR(KDEyML):
def __init__(self, classifier: BaseEstimator=None, fit_classifier=True, val_split=5, bandwidth=1., random_state=None):
super().__init__(
classifier=classifier, fit_classifier=fit_classifier, val_split=val_split, bandwidth=bandwidth,
random_state=random_state, kernel='ilr'
)
def methods():
"""
Yields tuples (name, quantifier, hyperparams, bayesian/bootstrap_constructor, scope), where:
- name: a str with the name of the method (e.g., 'BayesianKDEy')
- quantifier: the base model (e.g., KDEyML())
- hyperparams: a dictionary with the hyperparameter grid for the quantifier (e.g., {'bandwidth': [0.001, 0.005, 0.01, 0.05, 0.1, 0.2]})
- bayesian/bootstrap_constructor: a function that instantiates the bayesian or bootstrap method wrapping the
quantifier with its optimized hyperparameters
- scope: one of 'multiclass', 'only_binary', 'only_multiclass', restricting the datasets the method is run on
"""
acc_hyper = {}
emq_hyper = {'calib': ['nbvs', 'bcts', 'ts', 'vs']}
hdy_hyper = {'nbins': [3,4,5,8,16,32]}
kdey_hyper = {'bandwidth': [0.001, 0.005, 0.01, 0.05, 0.1, 0.2]}
kdey_hyper_clr = {'bandwidth': [0.05, 0.1, 0.5, 1., 2., 5.]}
multiclass_method = 'multiclass'
only_binary = 'only_binary'
only_multiclass = 'only_multiclass'
# yield 'BootstrapACC', ACC(LR()), acc_hyper, lambda hyper: AggregativeBootstrap(ACC(LR()), n_test_samples=1000, random_state=0), multiclass_method
# yield 'BayesianACC', ACC(LR()), acc_hyper, lambda hyper: BayesianCC(LR(), mcmc_seed=0), multiclass_method
yield 'BootstrapEMQ', EMQ(LR(), on_calib_error='backup', val_split=5), emq_hyper, lambda hyper: AggregativeBootstrap(EMQ(LR(), on_calib_error='backup', calib=hyper['calib'], val_split=5), n_test_samples=1000, random_state=0), multiclass_method
# yield 'BootstrapHDy', DMy(LR()), hdy_hyper, lambda hyper: AggregativeBootstrap(DMy(LR(), **hyper), n_test_samples=1000, random_state=0), multiclass_method
# yield 'BayesianHDy', DMy(LR()), hdy_hyper, lambda hyper: PQ(LR(), stan_seed=0, **hyper), only_binary
#
# yield 'BootstrapKDEy', KDEyML(LR()), kdey_hyper, lambda hyper: AggregativeBootstrap(KDEyML(LR(), **hyper), n_test_samples=1000, random_state=0, verbose=True), multiclass_method
# yield 'BayesianKDEy', KDEyML(LR()), kdey_hyper, lambda hyper: BayesianKDEy(mcmc_seed=0, **hyper), multiclass_method
# yield 'BayesianKDEy*', KDEyCLR(LR()), kdey_hyper_clr, lambda hyper: BayesianKDEy(kernel='aitchison', mcmc_seed=0, **hyper), multiclass_method
# yield 'BayKDEy*CLR', KDEyCLR(LR()), kdey_hyper_clr, lambda hyper: BayesianKDEy(kernel='aitchison', mcmc_seed=0, explore='clr', step_size=.15, **hyper), multiclass_method
# yield 'BayKDEy*CLR2', KDEyCLR(LR()), kdey_hyper_clr, lambda hyper: BayesianKDEy(kernel='aitchison', mcmc_seed=0, explore='clr', step_size=.05, **hyper), multiclass_method
# yield 'BayKDEy*ILR', KDEyCLR(LR()), kdey_hyper_clr, lambda hyper: BayesianKDEy(kernel='aitchison', mcmc_seed=0, explore='ilr', step_size=.15, **hyper), only_multiclass
# yield 'BayKDEy*ILR2', KDEyILR(LR()), kdey_hyper_clr, lambda hyper: BayesianKDEy(kernel='ilr', mcmc_seed=0, explore='ilr', step_size=.1, **hyper), only_multiclass
def model_selection(train: LabelledCollection, point_quantifier: AggregativeQuantifier, grid: dict):
with qp.util.temp_seed(0):
print(f'performing model selection for {point_quantifier.__class__.__name__} with grid {grid}')
# model selection
if len(grid)>0:
train, val = train.split_stratified(train_prop=0.6, random_state=0)
mod_sel = GridSearchQ(
model=point_quantifier,
param_grid=grid,
protocol=qp.protocol.UPP(val, repeats=250, random_state=0),
refit=False,
n_jobs=-1,
verbose=True
).fit(*train.Xy)
best_params = mod_sel.best_params_
else:
best_params = {}
return best_params
def experiment(dataset: Dataset, point_quantifier: AggregativeQuantifier, method_name:str, grid: dict, withconf_constructor, hyper_choice_path: Path):
with qp.util.temp_seed(0):
training, test = dataset.train_test
# model selection
best_hyperparams = qp.util.pickled_resource(
hyper_choice_path, model_selection, training, cp(point_quantifier), grid
)
t_init = time()
withconf_quantifier = withconf_constructor(best_hyperparams).fit(*training.Xy)
tr_time = time() - t_init
# test
train_prevalence = training.prevalence()
results = defaultdict(list)
test_generator = UPP(test, repeats=100, random_state=0)
for i, (sample_X, true_prevalence) in tqdm(enumerate(test_generator()), total=test_generator.total(), desc=f'{method_name} predictions'):
t_init = time()
point_estimate, region = withconf_quantifier.predict_conf(sample_X)
ttime = time()-t_init
results['true-prevs'].append(true_prevalence)
results['point-estim'].append(point_estimate)
results['shift'].append(qp.error.ae(true_prevalence, train_prevalence))
results['ae'].append(qp.error.ae(prevs_true=true_prevalence, prevs_hat=point_estimate))
results['rae'].append(qp.error.rae(prevs_true=true_prevalence, prevs_hat=point_estimate))
results['coverage'].append(region.coverage(true_prevalence))
results['amplitude'].append(region.montecarlo_proportion(n_trials=50_000))
results['test-time'].append(ttime)
results['samples'].append(region.samples)
report = {
'optim_hyper': best_hyperparams,
'train_time': tr_time,
'train-prev': train_prevalence,
'results': {k:np.asarray(v) for k,v in results.items()}
}
return report
def experiment_path(dir:Path, dataset_name:str, method_name:str):
os.makedirs(dir, exist_ok=True)
return dir/f'{dataset_name}__{method_name}.pkl'
if __name__ == '__main__':
binary = {
'datasets': qp.datasets.UCI_BINARY_DATASETS,
'fetch_fn': qp.datasets.fetch_UCIBinaryDataset,
'sample_size': 500
}
multiclass = {
'datasets': qp.datasets.UCI_MULTICLASS_DATASETS,
'fetch_fn': qp.datasets.fetch_UCIMulticlassDataset,
'sample_size': 1000
}
result_dir = Path('./results')
for setup in [binary, multiclass]: # [binary, multiclass]:
qp.environ['SAMPLE_SIZE'] = setup['sample_size']
for data_name in setup['datasets']:
print(f'dataset={data_name}')
# if data_name=='breast-cancer' or data_name.startswith("cmc") or data_name.startswith("ctg"):
# print(f'skipping dataset: {data_name}')
# continue
data = setup['fetch_fn'](data_name)
is_binary = data.n_classes==2
result_subdir = result_dir / ('binary' if is_binary else 'multiclass')
hyper_subdir = result_dir / 'hyperparams' / ('binary' if is_binary else 'multiclass')
for method_name, method, hyper_params, withconf_constructor, method_scope in methods():
if method_scope == 'only_binary' and not is_binary:
continue
if method_scope == 'only_multiclass' and is_binary:
continue
result_path = experiment_path(result_subdir, data_name, method_name)
hyper_path = experiment_path(hyper_subdir, data_name, method.__class__.__name__)
report = qp.util.pickled_resource(
result_path, experiment, data, method, method_name, hyper_params, withconf_constructor, hyper_path
)
print(f'dataset={data_name}, '
f'method={method_name}: '
f'mae={report["results"]["ae"].mean():.3f}, '
f'coverage={report["results"]["coverage"].mean():.5f}, '
f'amplitude={report["results"]["amplitude"].mean():.5f}, ')

View File

@ -0,0 +1,170 @@
import pickle
from collections import defaultdict
from joblib import Parallel, delayed
from tqdm import tqdm
import pandas as pd
from glob import glob
from pathlib import Path
import quapy as qp
from error import dist_aitchison
from quapy.method.confidence import ConfidenceIntervals
from quapy.method.confidence import ConfidenceEllipseSimplex, ConfidenceEllipseCLR, ConfidenceEllipseILR, ConfidenceIntervals, ConfidenceRegionABC
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 2000)
pd.set_option('display.max_rows', None)
pd.set_option("display.expand_frame_repr", False)
pd.set_option("display.precision", 4)
pd.set_option("display.float_format", "{:.4f}".format)
def region_score(true_prev, region: ConfidenceRegionABC):
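# region score (derived from the code below): the region's amplitude plus, when the true prevalence is not
# covered, a penalty of (1/alpha) times the Aitchison distance from the true prevalence to the closest
# point of the region; an interval-score-like trade-off between width and coverage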
amp = region.montecarlo_proportion(50_000)
if true_prev in region:
cost = 0
else:
scale_cost = 1/region.alpha
cost = scale_cost * dist_aitchison(true_prev, region.closest_point_in_region(true_prev))
return amp + cost
def compute_coverage_amplitude(region_constructor, **kwargs):
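# note: 'results' is not a parameter here; it is the module-level dict assigned in the processing loop below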
all_samples = results['samples']
all_true_prevs = results['true-prevs']
def process_one(samples, true_prevs):
region = region_constructor(samples, **kwargs)
if isinstance(region, ConfidenceIntervals):
winkler = region.mean_winkler_score(true_prevs)
else:
winkler = None
return region.coverage(true_prevs), region.montecarlo_proportion(), winkler
out = Parallel(n_jobs=3)(
delayed(process_one)(samples, true_prevs)
for samples, true_prevs in tqdm(
zip(all_samples, all_true_prevs),
total=len(all_samples),
desc='constructing ellipses'
)
)
# unzip results
coverage, amplitude, winkler = zip(*out)
return list(coverage), list(amplitude), list(winkler)
def update_pickle(report, pickle_path, updated_dict:dict):
for k,v in updated_dict.items():
report[k]=v
pickle.dump(report, open(pickle_path, 'wb'), protocol=pickle.HIGHEST_PROTOCOL)
def update_pickle_with_region(report, file, conf_name, conf_region_class, **kwargs):
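# lazily augments a cached report: computes coverage/amplitude/winkler for the given region type only if
# not already stored, and writes the updated report back to its pickle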
if f'coverage-{conf_name}' not in report:
covs, amps, winkler = compute_coverage_amplitude(conf_region_class, **kwargs)
update_fields = {
f'coverage-{conf_name}': covs,
f'amplitude-{conf_name}': amps,
f'winkler-{conf_name}': winkler
}
update_pickle(report, file, update_fields)
methods = None # show all methods
# methods = ['BayesianACC', 'BayesianKDEy']
for setup in ['multiclass']:
path = f'./results/{setup}/*.pkl'
table = defaultdict(list)
for file in tqdm(glob(path), desc='processing results', total=len(glob(path))):
file = Path(file)
dataset, method = file.name.replace('.pkl', '').split('__')
if methods is not None and method not in methods:
continue
report = pickle.load(open(file, 'rb'))
results = report['results']
n_samples = len(results['ae'])
table['method'].extend([method.replace('Bayesian','Ba').replace('Bootstrap', 'Bo')] * n_samples)
table['dataset'].extend([dataset] * n_samples)
table['ae'].extend(results['ae'])
table['rae'].extend(results['rae'])
# table['c-CI'].extend(results['coverage'])
# table['a-CI'].extend(results['amplitude'])
update_pickle_with_region(report, file, conf_name='CI', conf_region_class=ConfidenceIntervals, bonferroni_correction=True)
update_pickle_with_region(report, file, conf_name='CE', conf_region_class=ConfidenceEllipseSimplex)
update_pickle_with_region(report, file, conf_name='CLR', conf_region_class=ConfidenceEllipseCLR)
update_pickle_with_region(report, file, conf_name='ILR', conf_region_class=ConfidenceEllipseILR)
table['c-CI'].extend(report['coverage-CI'])
table['a-CI'].extend(report['amplitude-CI'])
table['w-CI'].extend(report['winkler-CI'])
table['c-CE'].extend(report['coverage-CE'])
table['a-CE'].extend(report['amplitude-CE'])
table['c-CLR'].extend(report['coverage-CLR'])
table['a-CLR'].extend(report['amplitude-CLR'])
table['c-ILR'].extend(report['coverage-ILR'])
table['a-ILR'].extend(report['amplitude-ILR'])
table['aitch'].extend(qp.error.dist_aitchison(results['true-prevs'], results['point-estim']))
# table['aitch-well'].extend(qp.error.dist_aitchison(results['true-prevs'], [ConfidenceEllipseILR(samples).mean_ for samples in results['samples']]))
# table['aitch'].extend()
table['reg-score-ILR'].extend(
[region_score(true_prev, ConfidenceEllipseILR(samples)) for true_prev, samples in zip(results['true-prevs'], results['samples'])]
)
df = pd.DataFrame(table)
n_classes = {}
tr_size = {}
for dataset in df['dataset'].unique():
fetch_fn = {
'binary': qp.datasets.fetch_UCIBinaryDataset,
'multiclass': qp.datasets.fetch_UCIMulticlassDataset
}[setup]
data = fetch_fn(dataset)
n_classes[dataset] = data.n_classes
tr_size[dataset] = len(data.training)
# remove datasets with more than max_classes classes
# max_classes = 30
# min_train = 1000
# for data_name, n in n_classes.items():
# if n > max_classes:
# df = df[df["dataset"] != data_name]
# for data_name, n in tr_size.items():
# if n < min_train:
# df = df[df["dataset"] != data_name]
for region in ['ILR']: # , 'CI', 'CE', 'CLR', 'ILR']:
if setup == 'binary' and region=='ILR':
continue
# pv = pd.pivot_table(
# df, index='dataset', columns='method', values=['ae', f'c-{region}', f'a-{region}'], margins=True
# )
pv = pd.pivot_table(
df, index='dataset', columns='method', values=[
#f'w-{region}',
# 'ae',
# 'rae',
# f'aitch',
# f'aitch-well'
'reg-score-ILR',
], margins=True
)
pv['n_classes'] = pv.index.map(n_classes).astype('Int64')
pv['tr_size'] = pv.index.map(tr_size).astype('Int64')
pv = pv.drop(columns=[col for col in pv.columns if col[-1] == "All"])
print(f'{setup=}')
print(pv)
print('-'*80)

View File

@ -0,0 +1,258 @@
import os
import pickle
from pathlib import Path
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from scipy.stats import gaussian_kde
from method.confidence import (ConfidenceIntervals as CI,
ConfidenceEllipseSimplex as CE,
ConfidenceEllipseCLR as CLR,
ConfidenceEllipseILR as ILR)
def get_region_colormap(name="blue", alpha=0.40):
name = name.lower()
if name == "blue":
base = (76/255, 114/255, 176/255)
elif name == "orange":
base = (221/255, 132/255, 82/255)
elif name == "violet":
base = (129/255, 114/255, 178/255)
else:
raise ValueError(f"Unknown palette name: {name}")
cmap = ListedColormap([
(1, 1, 1, 0), # 0: transparent white
(base[0], base[1], base[2], alpha) # 1: color
])
return cmap
def plot_prev_points(samples=None,
show_samples=True,
true_prev=None,
point_estim=None, train_prev=None, show_mean=True, show_legend=True,
region=None,
region_resolution=1000,
confine_region_in_simplex=False,
color='blue',
save_path=None):
plt.rcParams.update({
'font.size': 10,  # base size for all text
'axes.titlesize': 12,  # axes title
'axes.labelsize': 10,  # axes labels
'xtick.labelsize': 8,  # tick labels
'ytick.labelsize': 8,
'legend.fontsize': 9,  # legend
})
def cartesian(p):
dim = p.shape[-1]
p = p.reshape(-1,dim)
x = p[:, 1] + p[:, 2] * 0.5
y = p[:, 2] * np.sqrt(3) / 2
return x, y
def barycentric_from_xy(x, y):
"""
Given cartesian coordinates (x, y) in the simplex, returns barycentric coordinates (p1, p2, p3).
"""
p3 = 2 * y / np.sqrt(3)
p2 = x - 0.5 * p3
p1 = 1 - p2 - p3
return np.stack([p1, p2, p3], axis=-1)
# simplex coordinates
v1 = np.array([0, 0])
v2 = np.array([1, 0])
v3 = np.array([0.5, np.sqrt(3)/2])
# Plot
fig, ax = plt.subplots(figsize=(6, 6))
if region is not None:
if callable(region):
region_list = [("region", region)]
else:
region_list = region  # list of (name, fn) pairs
if region is not None:
# rectangular mesh
x_min, x_max = -0.2, 1.2
y_min, y_max = -0.2, np.sqrt(3) / 2 + 0.2
xs = np.linspace(x_min, x_max, region_resolution)
ys = np.linspace(y_min, y_max, region_resolution)
grid_x, grid_y = np.meshgrid(xs, ys)
# barycentric
pts_bary = barycentric_from_xy(grid_x, grid_y)
# mask within simplex
if confine_region_in_simplex:
in_simplex = np.all(pts_bary >= 0, axis=-1)
else:
in_simplex = np.full(shape=(region_resolution, region_resolution), fill_value=True, dtype=bool)
# --- Colormap: 0 → transparent white, 1 → semi-transparent region color ---
# iterate over all regions
for (rname, rfun) in region_list:
mask = np.zeros_like(in_simplex, dtype=float)
valid_pts = pts_bary[in_simplex]
mask_vals = np.array([float(rfun(p)) for p in valid_pts])
mask[in_simplex] = mask_vals
ax.pcolormesh(
xs, ys, mask,
shading='auto',
cmap=get_region_colormap(color),
alpha=0.3,
)
if samples is not None:
if show_samples:
ax.scatter(*cartesian(samples), s=15, alpha=0.5, edgecolors='none', label='samples', color='black', linewidth=0.5)
if show_mean is not None and show_mean is not False:
if isinstance(show_mean, np.ndarray):
ax.scatter(*cartesian(show_mean), s=10, alpha=1, label='sample-mean', edgecolors='black')
elif show_mean==True and samples is not None:
ax.scatter(*cartesian(samples.mean(axis=0)), s=10, alpha=1, label='sample-mean', edgecolors='black')
else:
raise ValueError(f'show_mean should either be a boolean (if True, then samples must be provided) or '
f'the mean point itself')
if train_prev is not None:
ax.scatter(*cartesian(true_prev), s=10, alpha=1, label='true-prev', edgecolors='black')
if point_estim is not None:
ax.scatter(*cartesian(point_estim), s=10, alpha=1, label='KDEy-estim', edgecolors='black')
if train_prev is not None:
ax.scatter(*cartesian(train_prev), s=10, alpha=1, label='train-prev', edgecolors='black')
# edges
triangle = np.array([v1, v2, v3, v1])
ax.plot(triangle[:, 0], triangle[:, 1], color='black')
# vertex labels
ax.text(-0.05, -0.05, "Y=1", ha='right', va='top')
ax.text(1.05, -0.05, "Y=2", ha='left', va='top')
ax.text(0.5, np.sqrt(3)/2 + 0.05, "Y=3", ha='center', va='bottom')
ax.set_aspect('equal')
ax.axis('off')
if show_legend:
plt.legend(
loc='center left',
bbox_to_anchor=(1.05, 0.5),
)
plt.tight_layout()
if save_path is None:
plt.show()
else:
os.makedirs(Path(save_path).parent, exist_ok=True)
plt.savefig(save_path)
def plot_prev_points_matplot(points):
# project 2D
v1 = np.array([0, 0])
v2 = np.array([1, 0])
v3 = np.array([0.5, np.sqrt(3) / 2])
x = points[:, 1] + points[:, 2] * 0.5
y = points[:, 2] * np.sqrt(3) / 2
# kde
xy = np.vstack([x, y])
kde = gaussian_kde(xy, bw_method=0.25)
xmin, xmax = 0, 1
ymin, ymax = 0, np.sqrt(3) / 2
# grid
xx, yy = np.mgrid[xmin:xmax:200j, ymin:ymax:200j]
positions = np.vstack([xx.ravel(), yy.ravel()])
zz = np.reshape(kde(positions).T, xx.shape)
# mask points in simplex
def in_triangle(x, y):
return (y >= 0) & (y <= np.sqrt(3) * np.minimum(x, 1 - x))
mask = in_triangle(xx, yy)
zz_masked = np.ma.array(zz, mask=~mask)
# plot
fig, ax = plt.subplots(figsize=(6, 6))
ax.imshow(
np.rot90(zz_masked),
cmap=plt.cm.viridis,
extent=[xmin, xmax, ymin, ymax],
alpha=0.8,
)
# triangle edges
triangle = np.array([v1, v2, v3, v1])
ax.plot(triangle[:, 0], triangle[:, 1], color='black', lw=2)
# points (optional)
ax.scatter(x, y, s=5, c='white', alpha=0.3)
# labels
ax.text(-0.05, -0.05, "A (1,0,0)", ha='right', va='top')
ax.text(1.05, -0.05, "B (0,1,0)", ha='left', va='top')
ax.text(0.5, np.sqrt(3) / 2 + 0.05, "C (0,0,1)", ha='center', va='bottom')
ax.set_aspect('equal')
ax.axis('off')
plt.show()
if __name__ == '__main__':
np.random.seed(1)
n = 1000
# alpha = [3,5,10]
alpha = [10,1,1]
prevs = np.random.dirichlet(alpha, size=n)
def regions():
confs = [0.99, 0.95, 0.90]
yield 'CI', [(f'{int(c*100)}%', CI(prevs, confidence_level=c).coverage) for c in confs]
# yield 'CI-b', [(f'{int(c * 100)}%', CI(prevs, confidence_level=c, bonferroni_correction=True).coverage) for c in confs]
# yield 'CE', [(f'{int(c*100)}%', CE(prevs, confidence_level=c).coverage) for c in confs]
# yield 'CLR', [(f'{int(c*100)}%', CLR(prevs, confidence_level=c).coverage) for c in confs]
# yield 'ILR', [(f'{int(c*100)}%', ILR(prevs, confidence_level=c).coverage) for c in confs]
# resolution = 1000
# alpha_str = ','.join([f'{str(i)}' for i in alpha])
# for crname, cr in regions():
# plot_prev_points(prevs, show_mean=True, show_legend=False, region=cr, region_resolution=resolution,
# color='blue',
# save_path=f'./plots/simplex_{crname}_alpha{alpha_str}_res{resolution}.png',
# )
def regions():
confs = [0.99, 0.95, 0.90]
yield 'CI', [(f'{int(c*100)}%', CI(prevs, confidence_level=c).coverage) for c in confs]
# yield 'CI-b', [(f'{int(c * 100)}%', CI(prevs, confidence_level=c, bonferroni_correction=True).coverage) for c in confs]
# yield 'CE', [(f'{int(c*100)}%', CE(prevs, confidence_level=c).coverage) for c in confs]
# yield 'CLR', [(f'{int(c*100)}%', CLR(prevs, confidence_level=c).coverage) for c in confs]
# yield 'ILR', [(f'{int(c*100)}%', ILR(prevs, confidence_level=c).coverage) for c in confs]
resolution = 1000
alpha_str = ','.join([f'{str(i)}' for i in alpha])
region = ILR(prevs, confidence_level=.99)
p = np.asarray([0.1, 0.8, 0.1])
plot_prev_points(prevs, show_samples=False,
show_mean=region.mean_,
# show_mean=prevs.mean(axis=0),
show_legend=False, region=[('', region.coverage)], region_resolution=resolution,
color='blue',
true_prev=p,
train_prev=region.closest_point_in_region(p),
save_path=f'./plots3/simplex_ilr.png',
)

View File

@ -0,0 +1,56 @@
import warnings
from sklearn.linear_model import LogisticRegression
import quapy as qp
from BayesianKDEy._bayeisan_kdey import BayesianKDEy
from BayesianKDEy.plot_simplex import plot_prev_points, plot_prev_points_matplot
from method.confidence import ConfidenceIntervals
from quapy.functional import strprev
from quapy.method.aggregative import KDEyML
from quapy.protocol import UPP
import quapy.functional as F
import numpy as np
from tqdm import tqdm
from scipy.stats import dirichlet
if __name__ == '__main__':
qp.environ["SAMPLE_SIZE"] = 500
cls = LogisticRegression()
bayes_kdey = BayesianKDEy(cls, bandwidth=.3, kernel='aitchison', mcmc_seed=0)
datasets = qp.datasets.UCI_BINARY_DATASETS
train, test = qp.datasets.fetch_UCIBinaryDataset(datasets[0]).train_test
# train, test = qp.datasets.fetch_UCIMulticlassDataset('academic-success', standardize=True).train_test
with qp.util.temp_seed(0):
print('fitting KDEy')
bayes_kdey.fit(*train.Xy)
shifted = test.sampling(500, *[0.2, 0.8])
# shifted = test.sampling(500, *test.prevalence()[::-1])
# shifted = test.sampling(500, *F.uniform_prevalence_sampling(train.n_classes))
prev_hat = bayes_kdey.predict(shifted.X)
mae = qp.error.mae(shifted.prevalence(), prev_hat)
print(f'true_prev={strprev(shifted.prevalence())}')
print(f'prev_hat={strprev(prev_hat)}, {mae=:.4f}')
prev_hat, conf_interval = bayes_kdey.predict_conf(shifted.X)
mae = qp.error.mae(shifted.prevalence(), prev_hat)
print(f'mean posterior {strprev(prev_hat)}, {mae=:.4f}')
print(f'CI={conf_interval}')
print(f'\tcontains true={conf_interval.coverage(true_value=shifted.prevalence())==1}')
print(f'\tamplitude={conf_interval.montecarlo_proportion(50_000)*100.:.3f}%')
if train.n_classes == 3:
plot_prev_points(bayes_kdey.prevalence_samples, true_prev=shifted.prevalence(), point_estim=prev_hat, train_prev=train.prevalence())
# plot_prev_points_matplot(samples)
# report = qp.evaluation.evaluation_report(kdey, protocol=UPP(test), verbose=True)
# print(report.mean(numeric_only=True))

View File

@ -0,0 +1,91 @@
import os
import warnings
from os.path import join
from pathlib import Path
from sklearn.calibration import CalibratedClassifierCV
from sklearn.linear_model import LogisticRegression as LR
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from copy import deepcopy as cp
import quapy as qp
from BayesianKDEy._bayeisan_kdey import BayesianKDEy
from BayesianKDEy.full_experiments import experiment, experiment_path, KDEyCLR
from quapy.data import LabelledCollection
from quapy.method.aggregative import DistributionMatchingY as DMy, AggregativeQuantifier
from quapy.method.base import BinaryQuantifier, BaseQuantifier
from quapy.model_selection import GridSearchQ
from quapy.data import Dataset
# from BayesianKDEy.plot_simplex import plot_prev_points, plot_prev_points_matplot
from quapy.method.confidence import ConfidenceIntervals, BayesianCC, PQ, WithConfidenceABC, AggregativeBootstrap
from quapy.functional import strprev
from quapy.method.aggregative import KDEyML, ACC
from quapy.protocol import UPP
import quapy.functional as F
import numpy as np
from tqdm import tqdm
from scipy.stats import dirichlet
from collections import defaultdict
from time import time
from sklearn.base import clone, BaseEstimator
def methods():
"""
Yields tuples (name, quantifier, hyperparams, bayesian/bootstrap_constructor), where:
- name: a str with the name of the method (e.g., 'BayesianKDEy')
- quantifier: the base model (e.g., KDEyML())
- hyperparams: a dictionary with the hyperparameter grid for the quantifier (e.g., {'bandwidth': [0.001, 0.005, 0.01, 0.05, 0.1, 0.2]})
- bayesian/bootstrap_constructor: a function that instantiates the bayesian or bootstrap method wrapping the
quantifier with its optimized hyperparameters
"""
acc_hyper = {}
hdy_hyper = {'nbins': [3,4,5,8,16,32]}
kdey_hyper = {'bandwidth': [0.001, 0.005, 0.01, 0.05, 0.1, 0.2]}
kdey_hyper_clr = {'bandwidth': [0.05, 0.1, 0.5, 1., 2., 5.]}
wrap_hyper = lambda dic: {f'quantifier__{k}':v for k,v in dic.items()}
# yield 'BootstrapKDEy', KDEyML(LR()), kdey_hyper, lambda hyper: AggregativeBootstrap(KDEyML(LR(), **hyper), n_test_samples=1000, random_state=0, verbose=True),
# yield 'BayesianKDEy', KDEyML(LR()), kdey_hyper, lambda hyper: BayesianKDEy(mcmc_seed=0, **hyper),
# for T in [10, 20, 50, 100., 500]:
# yield f'BaKDE-CLR-T{T}', KDEyCLR(LR()), kdey_hyper_clr, lambda hyper: BayesianKDEy(kernel='aitchison', explore='ilr', mcmc_seed=0, temperature=T, num_warmup=3000, num_samples=1000, step_size=.1, **hyper),
yield f'BaKDE-emcee', KDEyML(LR()), kdey_hyper, lambda hyper: BayesianKDEy(mcmc_seed=0, num_warmup=100, num_samples=100, step_size=.1, **hyper),
if __name__ == '__main__':
binary = {
'fetch_fn': qp.datasets.fetch_UCIBinaryDataset,
'sample_size': 500
}
multiclass = {
'fetch_fn': qp.datasets.fetch_UCIMulticlassDataset,
'sample_size': 1000
}
setup = multiclass
# data_name = 'isolet'
# data_name = 'cmc'
data_name = 'abalone'
qp.environ['SAMPLE_SIZE'] = setup['sample_size']
print(f'dataset={data_name}')
data = setup['fetch_fn'](data_name)
is_binary = data.n_classes==2
hyper_subdir = Path('./results') / 'hyperparams' / ('binary' if is_binary else 'multiclass')
for method_name, method, hyper_params, withconf_constructor in methods():
hyper_path = experiment_path(hyper_subdir, data_name, method.__class__.__name__)
report = experiment(data, method, method_name, hyper_params, withconf_constructor, hyper_path)
print(f'dataset={data_name}, '
f'method={method_name}: '
f'mae={report["results"]["ae"].mean():.3f}, '
f'coverage={report["results"]["coverage"].mean():.5f}, '
f'amplitude={report["results"]["amplitude"].mean():.5f}, ')

View File

@ -1,7 +1,16 @@
Change Log 0.2.0
Change Log 0.2.1
-----------------
CLEAN TODO-FILE
- Added squared ratio error.
- Improved efficiency of confidence regions coverage functions
- Added Precise Quantifier to WithConfidence methods (a Bayesian adaptation of HDy)
- Improved documentation of confidence regions.
- Added ReadMe method by Daniel Hopkins and Gary King
- Internal index in LabelledCollection is now "lazy", and is only constructed if required.
- Added dist_aitchison and mean_dist_aitchison as new error evaluation metrics
Change Log 0.2.0
-----------------
- Base code Refactor:
- Removing coupling between LabelledCollection and quantification methods; the fit interface changes:

View File

@ -0,0 +1,300 @@
import os
from collections import defaultdict
from typing import List, Dict
import matplotlib.pyplot as plt
import kagglehub
import pandas as pd
from pathlib import Path
import numpy as np
from qunfold import KMM
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression as LR, LogisticRegressionCV
from tqdm import tqdm
import quapy as qp
from data import LabelledCollection, Dataset
import quapy.functional as F
from method.composable import QUnfoldWrapper
from quapy.method.aggregative import DistributionMatchingY, EMQ, KDEyML
from quapy.method.non_aggregative import DistributionMatchingX
from quapy.method.aggregative import CC, ACC, HDy
from transformers import pipeline
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 2000)
pd.set_option('display.max_rows', None)
pd.set_option("display.expand_frame_repr", False)
pd.set_option("display.precision", 4)
pd.set_option("display.float_format", "{:.4f}".format)
def prepare_xy_date_blocks(df, freq="M"):
"""
df: DataFrame with columns 'text', 'airline_sentiment', 'tweet_created'
freq: frequency of the temporal blocks ('D', 'W', 'M', etc.)
Returns:
X: list of texts
y: np.ndarray of labels
date: array of integer indices identifying the temporal block of each row
idx2date: list with the temporal limits (start, end) of each block, as tuples
"""
df["tweet_created"] = pd.to_datetime(df["tweet_created"], errors="coerce")
df = df.sort_values("tweet_created").reset_index(drop=True)
X = df["text"].astype(str).values
y = df["airline_sentiment"].values
# group dates by requested frequency
date_groups = df["tweet_created"].dt.to_period(freq)
# assigns index to date blocks
unique_periods = date_groups.unique()
period_to_idx = {p: i for i, p in enumerate(unique_periods)}
date = np.asarray([period_to_idx[p] for p in date_groups])
# get true limits of period intervals
idx2date = []
for p in unique_periods:
start = p.start_time
end = p.end_time
idx2date.append((start, end))
return X, y, date, idx2date
def prepare_labelled_collections():
# loads and prepares the Twitter US Airline Sentiment dataset (from Kaggle)
# returns a labelled collection for the training data (days 0 and 1), and a list of the
# test sets (days 2 to 8) together with the time limits of each test period
# The dataset is originally ternary (negative, neutral, positive), but we binarize it by discarding the neutral class
# Download latest version
path = kagglehub.dataset_download("crowdflower/twitter-airline-sentiment")
df = pd.read_csv(Path(path) / 'Tweets.csv')
X, y, date, idx2date = prepare_xy_date_blocks(df, freq="D")
# binarize
keep_idx = (y!='neutral')
X = X[keep_idx]
y = y[keep_idx]
date = date[keep_idx]
y[y != 'negative'] = 1
y[y == 'negative'] = 0
y = y.astype(int)
# use days 0 and 1 for training, the rest for test
X_train, y_train = X[date<=1], y[date<=1]
train = LabelledCollection(X_train, y_train)
print(f'training has {len(train)} docs and prevalence={F.strprev(train.prevalence())} classes={train.classes}')
tests = []
test_init = []
for date_i in range(2, max(date)+1):
X_test_i, y_test_i = X[date==date_i], y[date==date_i]
test_i = LabelledCollection(X_test_i, y_test_i, classes=train.classes)
print(f'test-{date_i} has {len(test_i)} docs and prevalence={F.strprev(test_i.prevalence())}')
tests.append(test_i)
test_init.append(idx2date[date_i])
return train, tests, test_init
from scipy.interpolate import CubicSpline
import numpy as np
import matplotlib.pyplot as plt
def smooth_curve(dates, values, num_points=300):
"""
dates: list of timestamps
values: list of Y-values
num_points: number of points in the smooth curve
Returns new_x, new_y for plotting a smooth line.
"""
# Convert datetime to numeric (matplotlib float representation)
x = [d.timestamp() for d in dates]
x = np.array(x)
y = np.array(values)
# Create new X-axis with more points
x_new = np.linspace(x.min(), x.max(), num_points)
# Smooth spline
spline = CubicSpline(x, y)
y_new = spline(x_new)
# Convert numeric x_new back to datetime
dates_new = [pd.to_datetime(t, unit='s') for t in x_new]
return dates_new, y_new
def plot_prevalences(results_dict, target_class=1, target_label='positive', savepath=None):
"""
Plot prevalence estimates over time for each method contained in results_dict.
Parameters
----------
results_dict : dict
A dictionary where:
- "date-start" : list of datetime-like objects
- all other keys : list of prevalence vectors (arrays), e.g. [p_neg, p_pos]
Only the component indexed by `target_class` is plotted (as a percentage).
"""
dates = results_dict["date-start"]
# Create figure
plt.figure(figsize=(20, 10))
# Plot one line per method (except "date-start")
for method, values in results_dict.items():
if method == "date-start":
continue
# Extract first component from each prevalence array
target_component = [v[target_class]*100 for v in values]
dates_smooth, y_smooth = smooth_curve(dates, target_component)
if method=='true-prev':
line,=plt.plot(dates_smooth, y_smooth, label=method, linewidth=3, linestyle='-')
else:
line,=plt.plot(dates_smooth, y_smooth, label=method, linewidth=2, linestyle='--')
plt.plot(dates, target_component, 'o', markersize=10, color=line.get_color())
# Axis labels
# plt.xlabel("Date")
plt.ylabel("% of "+target_label+" tweets")
# Rotate date labels for readability
plt.xticks(rotation=45)
plt.minorticks_on()
plt.grid(which='major', linestyle='-', linewidth=0.5)
plt.grid(which='minor', linestyle=':', linewidth=0.3)
# Place the legend outside to the right
plt.legend(loc="center left", bbox_to_anchor=(1, 0.5))
plt.tight_layout()
if savepath is not None:
os.makedirs(Path(savepath).parent, exist_ok=True)
plt.savefig(savepath)
else:
plt.show()
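# the wrappers below reduce the sparse TF-IDF input to 5 TruncatedSVD components before fitting/predicting,
# since HDx and KMM work directly on the feature space (rather than on classifier outputs) and expect
# dense, low-dimensional inputs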
class HDxDensify(DistributionMatchingX):
def fit(self, X, y):
self.reductor = TruncatedSVD(n_components=5, random_state=0)
Xred = self.reductor.fit_transform(X)
return super().fit(Xred, y)
def predict(self, X):
Xred = self.reductor.transform(X)
return super().predict(Xred)
class QUnfoldWrapperDensify(QUnfoldWrapper):
def fit(self, X, y):
self.reductor = TruncatedSVD(n_components=5, random_state=0)
Xred = self.reductor.fit_transform(X)
return super().fit(Xred, y)
def predict(self, X):
Xred = self.reductor.transform(X)
return super().predict(Xred)
# A scikit-learn-style wrapper for a huggingface-based pre-trained transformer for binary sentiment classification
class HFTextClassifier(BaseEstimator, ClassifierMixin):
def __init__(self, model_name='distilbert-base-uncased-finetuned-sst-2-english'):
self.pipe = pipeline("sentiment-analysis", model=model_name)
self.classes_ = np.asarray([0,1])
def fit(self, X, y=None):
return self
def _binary_decisions(self, transformer_output: List[Dict]):
return np.array([(1 if p['label']=='POSITIVE' else 0) for p in transformer_output], dtype=int)
def predict(self, X):
X = list(map(str, X))
preds = self.pipe(X, truncation=True)
return self._binary_decisions(preds)
def predict_proba(self, X):
X = list(map(str, X))
n_examples = len(X)
preds = self.pipe(X, truncation=True)
decisions = self._binary_decisions(preds)
scores = np.array([p['score'] for p in preds], dtype=float)
probas = np.zeros(shape=(len(X), 2), dtype=float)
probas[np.arange(n_examples),decisions] = scores
probas[np.arange(n_examples),~decisions] = 1-scores
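# note: with two columns, ~decisions maps 0 -> -1 (column 1) and 1 -> -2 (column 0), i.e., it indexes the
# complementary class, which receives probability 1-score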
return probas
# def methods(pre_trained_classifier):
# yield 'CC', CC(pre_trained_classifier, fit_classifier=False)
USE_LOGISTIC_REGRESSION = True
if USE_LOGISTIC_REGRESSION:
new_classifier = lambda:LR()
to_fit = True
else:
pretrained = HFTextClassifier()
new_classifier = lambda:pretrained
to_fit = False
def methods():
yield 'CC', CC(new_classifier(), fit_classifier=to_fit)
yield 'ACC', ACC(new_classifier(), fit_classifier=to_fit)
yield 'HDy', DistributionMatchingY(new_classifier(), fit_classifier=to_fit)
yield 'HDx', HDxDensify()
yield 'KMM', QUnfoldWrapperDensify(KMM())
yield 'SLD', EMQ(new_classifier(), fit_classifier=to_fit)
yield 'KDEy', KDEyML(new_classifier(), fit_classifier=to_fit)
train, tests, test_init = prepare_labelled_collections()
if USE_LOGISTIC_REGRESSION:
# vectorize text for logistic regression
vectorizer = TfidfVectorizer(min_df=5, sublinear_tf=True)
Xtr = vectorizer.fit_transform(train.X)
train = LabelledCollection(Xtr, train.labels, train.classes_)
for i in range(len(tests)):
Xte = vectorizer.transform(tests[i].X)
tests[i] = LabelledCollection(Xte, tests[i].labels, train.classes_)
results = defaultdict(list)
for test_i, test_init_i in zip(tests, test_init):
results['true-prev'].append(test_i.prevalence())
results['date-start'].append(test_init_i[0])
for q_name, quant in methods():
quant.fit(*train.Xy)
for test_i, test_init_i in tqdm(zip(tests, test_init), desc=f'{q_name} predicting', total=len(tests)):
pred_i = quant.predict(test_i.X)
results[q_name].append(pred_i)
suffix = '_lr' if USE_LOGISTIC_REGRESSION else '_transformer'
plot_prevalences(results, savepath=f'./plots_ieee/over_time{suffix}.pdf')

View File

@ -0,0 +1,73 @@
import itertools
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
palette = itertools.cycle(sns.color_palette())
def setframe():
fig.spines['top'].set_visible(False)
fig.spines['left'].set_visible(False)
fig.get_yaxis().set_ticks([])
fig.spines['right'].set_visible(False)
# fig.axis('off')
nbins = 50
figsize = (5, 2)
ymax = 0.2
negatives = np.random.normal(loc = 0.3, scale=0.2, size=20000)
negatives = np.asarray([x for x in negatives if 0 <= x <= 1])
plt.figure(figsize=figsize)
plt.xlim(0, 1)
plt.ylim(0, ymax)
fig = sns.histplot(data=negatives, binrange=(0,1), bins=nbins, stat='probability', color=next(palette))
plt.title('Negative distribution')
fig.set(yticklabels=[])
fig.set(ylabel=None)
setframe()
# fig.get_figure().savefig('plots_cacm/negatives.pdf')
# plt.clf()
# -------------------------------------------------------------
positives1 = np.random.normal(loc = 0.75, scale=0.06, size=20000)
positives2 = np.random.normal(loc = 0.65, scale=0.1, size=1)
positives = np.concatenate([positives1, positives2])
np.random.shuffle(positives)
positives = np.asarray([x for x in positives if 0 <= x <= 1])
# plt.figure(figsize=figsize)
plt.xlim(0, 1)
plt.ylim(0, ymax)
fig = sns.histplot(data=positives, binrange=(0,1), bins=nbins, stat='probability', color=next(palette))
plt.title('')
fig.set(yticklabels=[])
fig.set(ylabel=None)
setframe()
fig.get_figure().savefig('plots_cacm/training.pdf')
# -------------------------------------------------------------
prev = 0.2
test = np.concatenate([
negatives[:int(len(negatives)*(1-prev))],
positives[:int(len(positives)*(prev))],
])
plt.figure(figsize=figsize)
plt.xlim(0, 1)
plt.ylim(0, ymax)
fig = sns.histplot(data=test, binrange=(0,1), bins=nbins, stat='probability', color=next(palette))
plt.title('')
fig.set(yticklabels=[])
fig.set(ylabel=None)
setframe()
fig.get_figure().savefig('plots_cacm/test.pdf')

View File

@ -0,0 +1,86 @@
from copy import deepcopy
import numpy as np
from sklearn.linear_model import LogisticRegression
import quapy as qp
from method.non_aggregative import DMx
from protocol import APP
from quapy.method.aggregative import CC, ACC, DMy
from sklearn.svm import LinearSVC
qp.environ['SAMPLE_SIZE'] = 100
DATASETS = qp.datasets.UCI_DATASETS[10:]
def fit_eval_task(args):
model_name, model, train, test = args
with qp.util.temp_seed(0):
model = deepcopy(model)
model.fit(train)
true_prev, estim_prev = qp.evaluation.prediction(model, APP(test, repeats=100, random_state=0))
return model_name, true_prev, estim_prev
def gen_data():
def base_classifier():
return LogisticRegression()
#return LinearSVC(class_weight='balanced')
def models():
yield 'CC', CC(base_classifier())
yield 'ACC', ACC(base_classifier())
yield 'HDy', DMy(base_classifier(), val_split=10, nbins=10, n_jobs=-1)
yield 'HDx', DMx(nbins=10, n_jobs=-1)
# train, test = qp.datasets.fetch_reviews('kindle', tfidf=True, min_df=10).train_test
method_names, true_prevs, estim_prevs, tr_prevs = [], [], [], []
for dataset_name in DATASETS:
train, test = qp.datasets.fetch_UCIDataset(dataset_name).train_test
print(dataset_name, train.X.shape)
outs = qp.util.parallel(
fit_eval_task,
((method_name, model, train, test) for method_name, model in models()),
seed=0,
n_jobs=-1
)
for method_name, true_prev, estim_prev in outs:
method_names.append(method_name)
true_prevs.append(true_prev)
estim_prevs.append(estim_prev)
tr_prevs.append(train.prevalence())
return method_names, true_prevs, estim_prevs, tr_prevs
method_names, true_prevs, estim_prevs, tr_prevs = qp.util.pickled_resource('../quick_experiment/pickled_plot_data.pkl', gen_data)
def remove_dataset(dataset_order, num_methods=4):
sel_names, sel_true, sel_estim, sel_tr = [],[],[],[]
for i, (name, true, estim, tr) in enumerate(zip(method_names, true_prevs, estim_prevs, tr_prevs)):
dataset_pos = i//num_methods
if dataset_pos not in dataset_order:
sel_names.append(name)
sel_true.append(true)
sel_estim.append(estim)
sel_tr.append(tr)
return np.asarray(sel_names), np.asarray(sel_true), np.asarray(sel_estim), np.asarray(sel_tr)
print(DATASETS)
selected = 10
for i in [selected]:
print(i, DATASETS[i])
all_ = set(range(len(DATASETS)))
remove_index = sorted(all_ - {i})
sel_names, sel_true, sel_estim, sel_tr = remove_dataset(dataset_order=remove_index, num_methods=4)
p=sel_tr[0][1]
sel_names = ['CC$_{'+str(p)+'}$' if x=='CC' else x for x in sel_names]
# qp.plot.binary_diagonal(sel_names, sel_true, sel_estim, train_prev=sel_tr[0], show_std=False, savepath=f'./plots/bin_diag_{i}.png')
qp.plot.error_by_drift(sel_names, sel_true, sel_estim, sel_tr, n_bins=10, savepath=f'./plots/err_drift_{i}.png', show_std=True, show_density=False, title="")
# qp.plot.binary_bias_global(method_names, true_prevs, estim_prevs, savepath='./plots/bin_bias.png')
# qp.plot.binary_bias_bins(method_names, true_prevs, estim_prevs, nbins=3, savepath='./plots/bin_bias_bin.png')


@ -0,0 +1,62 @@
import math
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_predict
from sklearn.neighbors import KernelDensity
import matplotlib.pyplot as plt
from data import LabelledCollection
scale = 100
import quapy as qp
negatives = np.random.normal(loc = 0.2, scale=0.2, size=20000)
negatives = np.asarray([x for x in negatives if 0 <= x <= 1])
positives = np.random.normal(loc = 0.75, scale=0.05, size=20000)
positives = np.asarray([x for x in positives if 0 <= x <= 1])
prev = 0.1
test = np.concatenate([
negatives[:int(len(negatives)*(1-prev))],
positives[:int(len(positives)*(prev))],
])
nbins = 30
plt.rcParams.update({'font.size': 7})
fig = plt.figure()
positions = np.asarray([2,1,0])
colors = ['r', 'g', 'b']
ax = fig.add_subplot(111, projection='3d')
ax.set_box_aspect((3, 1, 0.8))
for post, c, z in zip([test, positives, negatives], colors, positions):
hist, bins = np.histogram(post, bins=np.linspace(0,1, nbins+1), density=True)
xs = (bins[:-1] + bins[1:])/2
ax.bar(xs, hist, width=1 / nbins, zs=z, zdir='y', color=c, ec=c, alpha=0.6)
ax.yaxis.set_ticks(positions)
ax.yaxis.set_ticklabels([' '*20+'Test distribution', ' '*20+'Positive distribution', ' '*20+'Negative distribution'])
# ax.xaxis.set_ticks([])
# ax.xaxis.set_ticklabels([], minor=True)
ax.zaxis.set_ticks([])
ax.zaxis.set_ticklabels([], minor=True)
#plt.figure(figsize=(10,6))
#plt.show()
plt.savefig('./histograms3d_CACM2023.pdf')


@ -0,0 +1,63 @@
from sklearn.decomposition import TruncatedSVD
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.model_selection import GridSearchCV
import quapy as qp
from data import LabelledCollection
from method.non_aggregative import DMx
from protocol import APP
from quapy.method.aggregative import CC, DMy, ACC, EMQ
from sklearn.svm import LinearSVC
import numpy as np
from tqdm import tqdm
qp.environ['SAMPLE_SIZE'] = 500
def cls():
return LogisticRegressionCV(n_jobs=-1,Cs=10)
# return LogisticRegression(C=.1)
def gen_methods():
yield CC(cls()), r'CC$_{10' + r'\%}$'
yield ACC(cls()), 'ACC'
yield DMy(cls(), val_split=10, nbins=10, n_jobs=-1), 'HDy'
yield DMx(nbins=10, n_jobs=-1), 'HDx'
yield EMQ(cls()), 'SLD'
# yield EMQ(cls(), calib='vs'), 'SLD-VS'
def gen_data():
train, test = qp.datasets.fetch_reviews('imdb', tfidf=True, min_df=5).train_test
method_data = []
training_prevalence = 0.1
training_size = 5000
# the problem is binary, so it suffices to specify the negative prevalence; the positive one is determined by it
train_sample = train.sampling(training_size, 1-training_prevalence, random_state=0)
# train_sample = train
for model, method_name in tqdm(gen_methods(), total=5):  # gen_methods yields five quantifiers
with qp.util.temp_seed(1):
if method_name == 'HDx':
X, y = train_sample.Xy
svd = TruncatedSVD(n_components=5, random_state=0)
Xred = svd.fit_transform(X)
train_sample_dense = LabelledCollection(Xred, y)
X, y = test.Xy
test_dense = LabelledCollection(svd.transform(X), y)
model.fit(*train_sample_dense.Xy)
true_prev, estim_prev = qp.evaluation.prediction(model, APP(test_dense, repeats=100, random_state=0))
else:
model.fit(*train_sample.Xy)
true_prev, estim_prev = qp.evaluation.prediction(model, APP(test, repeats=100, random_state=0))
method_data.append((method_name, true_prev, estim_prev, train_sample.prevalence()))
return zip(*method_data)
method_names, true_prevs, estim_prevs, tr_prevs = gen_data()
# qp.plot.binary_diagonal(method_names, true_prevs, estim_prevs, savepath='./plots_ieee/bin_diag_4methods.pdf')
qp.plot.error_by_drift(method_names, true_prevs, estim_prevs, tr_prevs, n_bins=10, savepath='./plots_ieee/err_drift_4methods.pdf', title='', show_density=False, show_std=True)


@ -0,0 +1,40 @@
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.model_selection import GridSearchCV
import quapy as qp
from protocol import APP
from quapy.method.aggregative import CC
from sklearn.svm import LinearSVC
import numpy as np
from tqdm import tqdm
qp.environ['SAMPLE_SIZE'] = 500
def gen_data():
train, test = qp.datasets.fetch_reviews('imdb', tfidf=True, min_df=5).train_test
method_data = []
for training_prevalence in tqdm(np.linspace(0.1, 0.9, 9), total=9):
training_size = 5000
# the problem is binary, so it suffices to specify the negative prevalence; the positive one is determined by it
train_sample = train.sampling(training_size, 1-training_prevalence)
# cls = GridSearchCV(LinearSVC(), param_grid={'C': np.logspace(-2,2,5), 'class_weight':[None, 'balanced']}, n_jobs=-1)
# cls = GridSearchCV(LogisticRegression(), param_grid={'C': np.logspace(-2, 2, 5), 'class_weight': [None, 'balanced']}, n_jobs=-1)
# cls.fit(*train_sample.Xy)
model = CC(LogisticRegressionCV(n_jobs=-1,Cs=10))
model.fit(train_sample)
true_prev, estim_prev = qp.evaluation.prediction(model, APP(test, repeats=100, random_state=0))
method_name = 'CC$_{' + f'{int(100*training_prevalence)}' + r'\%}$'
method_data.append((method_name, true_prev, estim_prev, train_sample.prevalence()))
return zip(*method_data)
method_names, true_prevs, estim_prevs, tr_prevs = gen_data()
qp.plot.binary_diagonal(method_names, true_prevs, estim_prevs, savepath='./plots_cacm/bin_diag_cc.pdf')
# qp.plot.error_by_drift(method_names, true_prevs, estim_prevs, tr_prevs, n_bins=10, savepath='./plots_cacm/err_drift_cc.pdf', title='', show_density=False)


@ -0,0 +1,126 @@
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import gaussian_kde
def plot_kde_background(ax, data, cmap="Blues", alpha=0.35, gridsize=200):
"""
data: array Nx2
"""
# KDE
kde = gaussian_kde(data.T)
# Grid for evaluation
x_min, x_max = data[:, 0].min() - 1, data[:, 0].max() + 1
y_min, y_max = data[:, 1].min() - 1, data[:, 1].max() + 1
X, Y = np.meshgrid(
np.linspace(x_min, x_max, gridsize),
np.linspace(y_min, y_max, gridsize)
)
Z = kde(np.vstack([X.ravel(), Y.ravel()])).reshape(X.shape)
# Draw background density
ax.contourf(X, Y, Z, levels=30, cmap=cmap, alpha=alpha)
# ======================================================
# Define 3 Gaussian sources in 2D
# ======================================================
# Means
mu1 = np.array([0, 0]) # negative
mu2 = np.array([3, 0]) # positive
mu3 = np.array([0, 3]) # positive
# Covariances
Sigma = np.array([[1, 0.2],
[0.2, 1]])
def sample_gaussian(mu, Sigma, n):
return np.random.multivariate_normal(mu, Sigma, n)
# ======================================================
# Generate datasets for the 4 scenarios
# ======================================================
density = 20
# ---------- Scenario 1: Baseline ----------
G1_1 = sample_gaussian(mu1, Sigma, 100*density)
G2_1 = sample_gaussian(mu2, Sigma, 100*density)
G3_1 = sample_gaussian(mu3, Sigma, 100*density)
# ---------- Scenario 2: Prior Probability Shift ----------
G1_2 = sample_gaussian(mu1, Sigma, 300*density)
G2_2 = sample_gaussian(mu2, Sigma, 50*density)
G3_2 = sample_gaussian(mu3, Sigma, 50*density)
# ---------- Scenario 3: Covariate Shift ----------
# same class proportions but G3 moves (X-shift)
mu3_shift = mu3 + np.array([1.5, 0])
G1_3 = sample_gaussian(mu1, Sigma, 100*density)
G2_3 = sample_gaussian(mu2, Sigma, 100*density)
G3_3 = sample_gaussian(mu3_shift, Sigma, 100*density) # shifted covariates
# ---------- Scenario 4: Concept Shift ----------
# same data as Scenario 1, but G3 becomes negative
G1_4 = G1_1
G2_4 = G2_1
G3_4 = G3_1 # but will be colored as negative
# ======================================================
# Plotting function for each scenario
# ======================================================
def plot_scenario(ax, G1, G2, G3, title, G3_negative=False):
# plot_kde_background(ax, G1, cmap="Reds", alpha=0.75)
# plot_kde_background(ax, G2, cmap="Blues", alpha=0.75)
# plot_kde_background(ax, G3, cmap="Greens", alpha=0.75)
ax.scatter(G1[:, 0], G1[:, 1], s=12, color='red', alpha=0.1, label='Negative ($\ominus$)')
ax.scatter(G2[:, 0], G2[:, 1], s=12, color='blue', alpha=0.1, label='Positive ($\oplus$)')
if G3_negative:
ax.scatter(G3[:, 0], G3[:, 1], s=12, color='red', alpha=0.1) #, label='Negative ($\ominus$)')
else:
ax.scatter(G3[:, 0], G3[:, 1], s=12, color='blue', alpha=0.1) #, label='Positive ($\oplus$)')
ax.set_title(title)
ax.set_xlabel("$x_1$")
ax.set_ylabel("$x_2$")
ax.set_xticks([])
ax.set_yticks([])
ax.grid(alpha=0.3)
# ======================================================
# Generate 2×2 grid of subplots
# ======================================================
fig, axes = plt.subplots(2, 2, figsize=(9, 9))
plot_scenario(axes[0, 0], G1_1, G2_1, G3_1,
"Training data")
plot_scenario(axes[0, 1], G1_2, G2_2, G3_2,
"Prior Probability Shift")
plot_scenario(axes[1, 0], G1_3, G2_3, G3_3,
"Covariate Shift",
G3_negative=False)
plot_scenario(axes[1, 1], G1_4, G2_4, G3_4,
"Concept Shift",
G3_negative=True)
# One global legend
handles, labels = axes[0, 0].get_legend_handles_labels()
fig.legend(handles, labels, loc='upper center', ncol=3, fontsize=12)
plt.tight_layout(rect=[0, 0, 1, 0.95])
# plt.show()
plt.savefig('dataset_shift_types.pdf')


@ -0,0 +1,97 @@
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d.art3d import Poly3DCollection
from quapy.functional import uniform_prevalence_sampling
# Vertices of a regular tetrahedron
v1 = np.array([0, 0, 0])
v2 = np.array([1, 0, 0])
v3 = np.array([0.5, np.sqrt(3)/2, 0])
v4 = np.array([0.5, np.sqrt(3)/6, np.sqrt(6)/3])
vertices = np.array([v1, v2, v3, v4])
# Function to map (p1,p2,p3,p4) to 3D coordinates
def prob_to_xyz(p):
return p[0]*v1 + p[1]*v2 + p[2]*v3 + p[3]*v4
# --- Example: 5 random distributions inside the simplex
rand_probs = uniform_prevalence_sampling(n_classes=4, size=3000)
points_xyz = np.array([prob_to_xyz(p) for p in rand_probs])
# --- Plotting
fig = plt.figure(figsize=(8, 8))
ax = fig.add_subplot(111, projection='3d')
# Draw tetrahedron faces
faces = [
[v1, v2, v3],
[v1, v2, v4],
[v1, v3, v4],
[v2, v3, v4]
]
poly = Poly3DCollection(faces, alpha=0.15, edgecolor='k', facecolor=None)
# poly = Poly3DCollection(faces, alpha=0.15, edgecolor='k', facecolor=None)
ax.add_collection3d(poly)
edges = [
[v1, v2],
[v1, v3],
[v1, v4],
[v2, v3],
[v2, v4],
[v3, v4]
]
for edge in edges:
xs, ys, zs = zip(*edge)
ax.plot(xs, ys, zs, color='black', linewidth=1)
# Draw vertices
# ax.scatter(vertices[:,0], vertices[:,1], vertices[:,2], s=60, color='red')
# Labels
offset = 0.08
labels = ["$y_1$", "$y_2$", "$y_3$", "$y_4$"]
for i, v in enumerate(vertices):
direction = v / np.linalg.norm(v)
label_pos = v + offset * direction
ax.text(label_pos[0], label_pos[1], label_pos[2], labels[i], fontsize=14, color='black')
# Plot random points
ax.scatter(points_xyz[:,0], points_xyz[:,1], points_xyz[:,2], s=10, c='blue', alpha=0.2)
# Axes formatting
ax.set_xlabel("X")
ax.set_ylabel("Y")
ax.set_zlabel("Z")
ax.set_title("4-Class Probability Simplex (Tetrahedron)")
# ax.view_init(elev=65, azim=20)
ax.set_xticks([])
ax.set_yticks([])
ax.set_zticks([])
ax.set_xticklabels([])
ax.set_yticklabels([])
ax.set_zticklabels([])
ax.xaxis.set_ticks_position('none')  # prevent tick marks from being drawn
ax.yaxis.set_ticks_position('none')
ax.zaxis.set_ticks_position('none')
ax.xaxis.pane.set_visible(False)
ax.yaxis.pane.set_visible(False)
ax.zaxis.pane.set_visible(False)
ax.set_xlabel('')
ax.set_ylabel('')
ax.set_zlabel('')
ax.grid(False)
plt.tight_layout()
# plt.show()
plt.savefig('plots_ieee/tetrahedron.pdf')

KDEyAitchison/commons.py (new file, 56 lines)

@ -0,0 +1,56 @@
import numpy as np
import pandas as pd
from quapy.method.aggregative import EMQ, KDEyML, PACC
from sklearn.linear_model import LogisticRegression
METHODS = ['PACC',
'EMQ',
'KDEy-ML',
'KDEy-MLA'
]
# common hyperparameters
hyper_LR = {
'classifier__C': np.logspace(-3, 3, 7),
'classifier__class_weight': ['balanced', None]
}
hyper_kde = {
'bandwidth': np.linspace(0.001, 0.5, 100)
}
hyper_kde_aitchison = {
'bandwidth': np.linspace(0.01, 2, 100)
}
# instantiates a new quantifier from its string name
def new_method(method, **lr_kwargs):
lr = LogisticRegression(**lr_kwargs)
if method == 'KDEy-ML':
param_grid = {**hyper_kde, **hyper_LR}
quantifier = KDEyML(lr, kernel='gaussian')
elif method == 'KDEy-MLA':
param_grid = {**hyper_kde_aitchison, **hyper_LR}
quantifier = KDEyML(lr, kernel='aitchison')
elif method == 'EMQ':
param_grid = hyper_LR
quantifier = EMQ(lr)
elif method == 'PACC':
param_grid = hyper_LR
quantifier = PACC(lr)
else:
raise NotImplementedError('unknown method', method)
return param_grid, quantifier
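# a hedged usage sketch (illustrative): the returned pair plugs directly into model selection, e.g.
#   param_grid, quantifier = new_method('KDEy-MLA', max_iter=3000)
#   # param_grid combines the Aitchison-kernel bandwidth grid with the LR hyperparameters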
def show_results(result_path):
df = pd.read_csv(result_path+'.csv', sep='\t')
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pv = df.pivot_table(index='Dataset', columns="Method", values=["MAE", "MRAE"])
print(pv)


@ -0,0 +1,38 @@
import pickle
import os
import sys
import pandas as pd
import quapy as qp
from quapy.model_selection import GridSearchQ
from quapy.protocol import UPP
from commons import METHODS, new_method, show_results
from new_table import LatexTable
SEED = 1
if __name__ == '__main__':
print(qp.datasets.UCI_MULTICLASS_DATASETS)
for optim in ['mae', 'mrae']:
table = LatexTable()
result_dir = f'results/ucimulti/{optim}'
for method in METHODS:
print()
global_result_path = f'{result_dir}/{method}'
print(f'Method\tDataset\tMAE\tMRAE\tKLD')
for dataset in qp.datasets.UCI_MULTICLASS_DATASETS:
# print(dataset)
local_result_path = global_result_path + '_' + dataset
if os.path.exists(local_result_path + '.dataframe'):
report = pd.read_csv(local_result_path+'.dataframe')
print(f'{method}\t{dataset}\t{report[optim].mean():.5f}')
table.add(benchmark=dataset, method=method, v=report[optim].values)
else:
print(dataset, 'not found for method', method)
table.latexPDF(f'./tables/{optim}.pdf', landscape=False)


@ -0,0 +1,94 @@
import pickle
import os
import sys
import pandas as pd
import quapy as qp
from quapy.model_selection import GridSearchQ
from quapy.protocol import UPP
from commons import METHODS, new_method, show_results
SEED = 1
if __name__ == '__main__':
qp.environ['SAMPLE_SIZE'] = 500
qp.environ['N_JOBS'] = -1
n_bags_val = 250
n_bags_test = 1000
for optim in ['mae', 'mrae']:
result_dir = f'results/ucimulti/{optim}'
os.makedirs(result_dir, exist_ok=True)
for method in METHODS:
print('Init method', method)
global_result_path = f'{result_dir}/{method}'
# show_results(global_result_path)
# sys.exit(0)
if not os.path.exists(global_result_path + '.csv'):
with open(global_result_path + '.csv', 'wt') as csv:
csv.write(f'Method\tDataset\tMAE\tMRAE\tKLD\n')
with open(global_result_path + '.csv', 'at') as csv:
for dataset in qp.datasets.UCI_MULTICLASS_DATASETS:
print('init', dataset)
local_result_path = global_result_path + '_' + dataset
if os.path.exists(local_result_path + '.dataframe'):
print(f'result file {local_result_path}.dataframe already exist; skipping')
report = pd.read_csv(local_result_path+'.dataframe')
print(report["mae"].mean())
# data = qp.datasets.fetch_UCIMulticlassDataset(dataset)
# csv.write(f'{method}\t{data.name}\t{report["mae"].mean():.5f}\t{report["mrae"].mean():.5f}\t{report["kld"].mean():.5f}\n')
continue
with qp.util.temp_seed(SEED):
param_grid, quantifier = new_method(method, max_iter=3000)
data = qp.datasets.fetch_UCIMulticlassDataset(dataset)
# model selection
train, test = data.train_test
train, val = train.split_stratified(random_state=SEED)
protocol = UPP(val, repeats=n_bags_val)
modsel = GridSearchQ(
quantifier, param_grid, protocol, refit=True, n_jobs=-1, verbose=True, error=optim
)
try:
modsel.fit(*train.Xy)
print(f'best params {modsel.best_params_}')
print(f'best score {modsel.best_score_}')
pickle.dump(
(modsel.best_params_, modsel.best_score_,),
open(f'{local_result_path}.hyper.pkl', 'wb'), pickle.HIGHEST_PROTOCOL)
quantifier = modsel.best_model()
except Exception:
print('something went wrong... trying to fit the default model')
quantifier.fit(*train.Xy)
protocol = UPP(test, repeats=n_bags_test)
report = qp.evaluation.evaluation_report(
quantifier, protocol, error_metrics=['mae', 'mrae', 'kld'], verbose=True
)
report.to_csv(f'{local_result_path}.dataframe')
print(f'{method}\t{data.name}\t{report["mae"].mean():.5f}\t{report["mrae"].mean():.5f}\t{report["kld"].mean():.5f}\n')
csv.write(f'{method}\t{data.name}\t{report["mae"].mean():.5f}\t{report["mrae"].mean():.5f}\t{report["kld"].mean():.5f}\n')
csv.flush()
show_results(global_result_path)


@ -604,7 +604,10 @@ estim_prevalence = model.predict(dataset.test.X)
_(New in v0.2.0!)_ Some quantification methods go beyond providing a single point estimate of class prevalence values and also produce confidence regions, which characterize the uncertainty around the point estimate. In QuaPy, two such methods are currently implemented:
* Aggregative Bootstrap: The Aggregative Bootstrap method extends any aggregative quantifier by generating confidence regions for class prevalence estimates through bootstrapping. Key features of this method include:
* Aggregative Bootstrap: The Aggregative Bootstrap method extends any aggregative quantifier by generating confidence regions for class prevalence estimates through bootstrapping. The method is described in the paper [Moreo, A., Salvati, N.
An Efficient Method for Deriving Confidence Intervals in Aggregative Quantification.
Learning to Quantify: Methods and Applications (LQ 2025), co-located with ECML-PKDD 2025,
pp. 12-33, Porto (Portugal)](https://lq-2025.github.io/proceedings/CompleteVolume.pdf). Key features of this method include (a usage sketch follows this list):
* Optimized Computation: The bootstrap is applied to pre-classified instances, significantly speeding up training and inference.
During training, bootstrap repetitions are performed only after training the classifier once. These repetitions are used to train multiple aggregation functions.
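A minimal usage sketch (hedged: the class name AggregativeBootstrap, its import path quapy.method.confidence, and the constructor arguments shown are assumptions made for illustration; only fit, predict_conf and the coverage utility reflect the interface documented in this changeset):

import quapy as qp
from sklearn.linear_model import LogisticRegression
from quapy.method.aggregative import PACC
from quapy.method.confidence import AggregativeBootstrap  # hypothetical import path

train, test = qp.datasets.fetch_reviews('imdb', tfidf=True, min_df=5).train_test
quantifier = AggregativeBootstrap(PACC(LogisticRegression()), confidence_level=0.95)  # illustrative arguments
quantifier.fit(*train.Xy)
point_estimate, conf_region = quantifier.predict_conf(test.X)
print(point_estimate)                            # estimated class prevalence values
print(conf_region.coverage(test.prevalence()))   # 1.0 if the true prevalence falls within the region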


@ -60,6 +60,14 @@ quapy.method.composable module
:undoc-members:
:show-inheritance:
quapy.method.confidence module
------------------------------
.. automodule:: quapy.method.confidence
:members:
:undoc-members:
:show-inheritance:
Module contents
---------------


@ -36,7 +36,7 @@ with qp.util.temp_seed(0):
true_prev = shifted_test.prevalence()
# by calling "quantify_conf", we obtain the point estimate and the confidence intervals around it
pred_prev, conf_intervals = pacc.quantify_conf(shifted_test.X)
pred_prev, conf_intervals = pacc.predict_conf(shifted_test.X)
# conf_intervals is an instance of ConfidenceRegionABC, which provides some useful utilities like:
# - coverage: a function which computes the fraction of true values that belong to the confidence region
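# a hedged illustration (names as in the snippet above): the region returned by predict_conf can be queried directly, e.g.
# covered = conf_intervals.coverage(true_prev)   # 1.0 if the true prevalence falls inside the region, 0.0 otherwise
# print(f'point estimate={pred_prev}, true={true_prev}, covered={covered}')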


@ -0,0 +1,60 @@
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import SelectKBest, chi2
import quapy as qp
from quapy.method.non_aggregative import ReadMe
import quapy.functional as F
from sklearn.pipeline import Pipeline
"""
This example showcases how to use the non-aggregative method ReadMe proposed by Hopkins and King.
This method is for text analysis, so let us first instantiate a dataset for sentiment quantification (we
use IMDb for this example). The method is quite computationally expensive, so we will restrict the training
set to 1000 documents only.
"""
reviews = qp.datasets.fetch_reviews('imdb').reduce(n_train=1000, random_state=0)
"""
We need to convert text to bag-of-words representations. Actually, ReadMe requires the representations to be
binary (i.e., storing a 1 whenever a document contains a certain word, and a 0 otherwise), so we will not use
TFIDF weighting. We will also retain the top 1000 most important features according to chi2.
"""
encode_0_1 = Pipeline([
('0_1_terms', CountVectorizer(min_df=5, binary=True)),
('feat_sel', SelectKBest(chi2, k=1000))
])
train, test = qp.data.preprocessing.instance_transformation(reviews, encode_0_1, inplace=True).train_test
"""
We now instantiate ReadMe, with the prob_model='full' (default behaviour, implementing the Hopkins and King original
idea). This method consists of estimating Q(Y) by solving:
Q(X) = \sum_i Q(X|Y=i) Q(Y=i)
without resorting to estimating the posteriors Q(Y=i|X), by solving a linear least-squares problem.
However, since Q(X) and Q(X|Y=i) are matrices of shape (2^K, 1) and (2^K, n), with K the number of features
and n the number of classes, their calculation becomes intractable. ReadMe instead performs bagging (i.e., it
samples small sets of features and averages the results) thus reducing K to a few terms. In our example we
set K (bagging_range) to 20, and the number of bagging_trials to 100.
ReadMe also computes confidence intervals via bootstrap. We set the number of bootstrap trials to 100.
"""
readme = ReadMe(prob_model='full', bootstrap_trials=100, bagging_trials=100, bagging_range=20, random_state=0, verbose=True)
readme.fit(*train.Xy) # <- there is actually nothing happening here (only bootstrap resampling); the method is "lazy"
# and postpones most of the calculations to the test phase.
# since the method is slow, we will only test 3 cases with different imbalances
few_negatives = [0.25, 0.75]
balanced = [0.5, 0.5]
few_positives = [0.75, 0.25]
for test_prev in [few_negatives, balanced, few_positives]:
sample = reviews.test.sampling(500, *test_prev, random_state=0) # draw sets of 500 documents with desired prevs
prev_estim, conf = readme.predict_conf(sample.X)
err = qp.error.mae(sample.prevalence(), prev_estim)
print(f'true-prevalence={F.strprev(sample.prevalence())},\n'
f'predicted-prevalence={F.strprev(prev_estim)}, with confidence intervals {conf},\n'
f'MAE={err:.4f}')


@ -0,0 +1,254 @@
from scipy.sparse import csc_matrix, csr_matrix
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import TfidfTransformer, TfidfVectorizer, CountVectorizer
import numpy as np
from joblib import Parallel, delayed
import sklearn
import math
from scipy.stats import t
class ContTable:
def __init__(self, tp=0, tn=0, fp=0, fn=0):
self.tp=tp
self.tn=tn
self.fp=fp
self.fn=fn
def get_d(self): return self.tp + self.tn + self.fp + self.fn
def get_c(self): return self.tp + self.fn
def get_not_c(self): return self.tn + self.fp
def get_f(self): return self.tp + self.fp
def get_not_f(self): return self.tn + self.fn
def p_c(self): return (1.0*self.get_c())/self.get_d()
def p_not_c(self): return 1.0-self.p_c()
def p_f(self): return (1.0*self.get_f())/self.get_d()
def p_not_f(self): return 1.0-self.p_f()
def p_tp(self): return (1.0*self.tp) / self.get_d()
def p_tn(self): return (1.0*self.tn) / self.get_d()
def p_fp(self): return (1.0*self.fp) / self.get_d()
def p_fn(self): return (1.0*self.fn) / self.get_d()
def tpr(self):
c = 1.0*self.get_c()
return self.tp / c if c > 0.0 else 0.0
def fpr(self):
_c = 1.0*self.get_not_c()
return self.fp / _c if _c > 0.0 else 0.0
def __ig_factor(p_tc, p_t, p_c):
den = p_t * p_c
if den != 0.0 and p_tc != 0:
return p_tc * math.log(p_tc / den, 2)
else:
return 0.0
def information_gain(cell):
return __ig_factor(cell.p_tp(), cell.p_f(), cell.p_c()) + \
__ig_factor(cell.p_fp(), cell.p_f(), cell.p_not_c()) +\
__ig_factor(cell.p_fn(), cell.p_not_f(), cell.p_c()) + \
__ig_factor(cell.p_tn(), cell.p_not_f(), cell.p_not_c())
def squared_information_gain(cell):
return information_gain(cell)**2
def posneg_information_gain(cell):
ig = information_gain(cell)
if cell.tpr() < cell.fpr():
return -ig
else:
return ig
def pos_information_gain(cell):
if cell.tpr() < cell.fpr():
return 0
else:
return information_gain(cell)
def pointwise_mutual_information(cell):
return __ig_factor(cell.p_tp(), cell.p_f(), cell.p_c())
def gss(cell):
return cell.p_tp()*cell.p_tn() - cell.p_fp()*cell.p_fn()
def chi_square(cell):
den = cell.p_f() * cell.p_not_f() * cell.p_c() * cell.p_not_c()
if den==0.0: return 0.0
num = gss(cell)**2
return num / den
def conf_interval(xt, n):
if n>30:
z2 = 3.84145882069 # norm.ppf(0.5+0.95/2.0)**2
else:
z2 = t.ppf(0.5 + 0.95 / 2.0, df=max(n-1,1)) ** 2
p = (xt + 0.5 * z2) / (n + z2)
amplitude = 0.5 * z2 * math.sqrt((p * (1.0 - p)) / (n + z2))
return p, amplitude
def strength(minPosRelFreq, minPos, maxNeg):
if minPos > maxNeg:
return math.log(2.0 * minPosRelFreq, 2.0)
else:
return 0.0
# set cancel_features=True to allow some features to be weighted as 0 (as in the original article);
# however, for some extremely imbalanced datasets this caused all documents to be weighted as 0
def conf_weight(cell, cancel_features=False):
c = cell.get_c()
not_c = cell.get_not_c()
tp = cell.tp
fp = cell.fp
pos_p, pos_amp = conf_interval(tp, c)
neg_p, neg_amp = conf_interval(fp, not_c)
min_pos = pos_p-pos_amp
max_neg = neg_p+neg_amp
den = (min_pos + max_neg)
minpos_relfreq = min_pos / (den if den != 0 else 1)
str_tplus = strength(minpos_relfreq, min_pos, max_neg)
if str_tplus == 0 and not cancel_features:
return 1e-20
return str_tplus
def get_tsr_matrix(cell_matrix, tsr_score_function):
nC = len(cell_matrix)
nF = len(cell_matrix[0])
tsr_matrix = [[tsr_score_function(cell_matrix[c,f]) for f in range(nF)] for c in range(nC)]
return np.array(tsr_matrix)
def feature_label_contingency_table(positive_document_indexes, feature_document_indexes, nD):
tp_ = len(positive_document_indexes & feature_document_indexes)
fp_ = len(feature_document_indexes - positive_document_indexes)
fn_ = len(positive_document_indexes - feature_document_indexes)
tn_ = nD - (tp_ + fp_ + fn_)
return ContTable(tp=tp_, tn=tn_, fp=fp_, fn=fn_)
def category_tables(feature_sets, category_sets, c, nD, nF):
return [feature_label_contingency_table(category_sets[c], feature_sets[f], nD) for f in range(nF)]
def get_supervised_matrix(coocurrence_matrix, label_matrix, n_jobs=-1):
"""
Computes the nC x nF supervised matrix M where Mcf is the 4-cell contingency table for feature f and class c.
Efficiency O(nF x nC x log(S)) where S is the sparse factor
"""
nD, nF = coocurrence_matrix.shape
nD2, nC = label_matrix.shape
if nD != nD2:
raise ValueError('Number of rows in coocurrence matrix shape %s and label matrix shape %s is not consistent' %
(coocurrence_matrix.shape,label_matrix.shape))
def nonzero_set(matrix, col):
return set(matrix[:, col].nonzero()[0])
if isinstance(coocurrence_matrix, csr_matrix):
coocurrence_matrix = csc_matrix(coocurrence_matrix)
feature_sets = [nonzero_set(coocurrence_matrix, f) for f in range(nF)]
category_sets = [nonzero_set(label_matrix, c) for c in range(nC)]
cell_matrix = Parallel(n_jobs=n_jobs, backend="threading")(
delayed(category_tables)(feature_sets, category_sets, c, nD, nF) for c in range(nC)
)
return np.array(cell_matrix)
class TSRweighting(BaseEstimator,TransformerMixin):
"""
Supervised Term Weighting function based on any Term Selection Reduction (TSR) function (e.g., information gain,
chi-square, etc.) or, more generally, on any function that could be computed on the 4-cell contingency table for
each category-feature pair.
The supervised_4cell_matrix is a `(n_classes, n_words)` matrix containing the 4-cell contingency tables
for each class-word pair, and can be pre-computed (e.g., during the feature selection phase) and passed as an
argument.
When `n_classes>1`, i.e., in multiclass scenarios, a global_policy is used in order to determine a
single feature-score which informs about its relevance. Accepted policies include "max" (takes the max score
across categories), "ave" and "wave" (take the average, or weighted average, across all categories -- weights
correspond to the class prevalence), and "sum" (which sums all category scores).
"""
def __init__(self, tsr_function, global_policy='max', supervised_4cell_matrix=None, sublinear_tf=True, norm='l2', min_df=3, n_jobs=-1):
if global_policy not in ['max', 'ave', 'wave', 'sum']: raise ValueError('Global policy should be in {"max", "ave", "wave", "sum"}')
self.tsr_function = tsr_function
self.global_policy = global_policy
self.supervised_4cell_matrix = supervised_4cell_matrix
self.sublinear_tf = sublinear_tf
self.norm = norm
self.min_df = min_df
self.n_jobs = n_jobs
def fit(self, X, y):
self.count_vectorizer = CountVectorizer(min_df=self.min_df)
X = self.count_vectorizer.fit_transform(X)
self.tf_vectorizer = TfidfTransformer(
norm=None, use_idf=False, smooth_idf=False, sublinear_tf=self.sublinear_tf
).fit(X)
if len(y.shape) == 1:
y = np.expand_dims(y, axis=1)
nD, nC = y.shape
nF = len(self.tf_vectorizer.get_feature_names_out())
if self.supervised_4cell_matrix is None:
self.supervised_4cell_matrix = get_supervised_matrix(X, y, n_jobs=self.n_jobs)
else:
if self.supervised_4cell_matrix.shape != (nC, nF):
raise ValueError("Shape of supervised information matrix is inconsistent with X and y")
tsr_matrix = get_tsr_matrix(self.supervised_4cell_matrix, self.tsr_function)
if self.global_policy == 'ave':
self.global_tsr_vector = np.average(tsr_matrix, axis=0)
elif self.global_policy == 'wave':
category_prevalences = [sum(y[:,c])*1.0/nD for c in range(nC)]
self.global_tsr_vector = np.average(tsr_matrix, axis=0, weights=category_prevalences)
elif self.global_policy == 'sum':
self.global_tsr_vector = np.sum(tsr_matrix, axis=0)
elif self.global_policy == 'max':
self.global_tsr_vector = np.amax(tsr_matrix, axis=0)
return self
def fit_transform(self, X, y):
return self.fit(X,y).transform(X)
def transform(self, X):
if not hasattr(self, 'global_tsr_vector'): raise NameError('TSRweighting: transform method called before fit.')
X = self.count_vectorizer.transform(X)
tf_X = self.tf_vectorizer.transform(X).toarray()
weighted_X = np.multiply(tf_X, self.global_tsr_vector)
if self.norm is not None and self.norm!='none':
weighted_X = sklearn.preprocessing.normalize(weighted_X, norm=self.norm, axis=1, copy=False)
return csr_matrix(weighted_X)
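# A hedged usage sketch (illustrative only): TSRweighting expects raw text documents and a label
# matrix of shape (n_docs, n_classes); min_df=1 is used here only because the toy corpus is tiny.
#   import numpy as np
#   docs = ['good great film', 'great plot', 'bad awful film', 'awful acting']
#   y = np.asarray([[1, 0], [1, 0], [0, 1], [0, 1]])   # one column per class
#   tsr = TSRweighting(tsr_function=information_gain, global_policy='max', min_df=1)
#   Xw = tsr.fit_transform(docs, y)                     # sparse matrix of TSR-weighted tf scores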


@ -0,0 +1,208 @@
from scipy.sparse import issparse
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
import quapy as qp
from data import LabelledCollection
import numpy as np
from experimental_non_aggregative.custom_vectorizers import *
from method._kdey import KDEBase
from protocol import APP
from quapy.method.aggregative import HDy, DistributionMatchingY
from quapy.method.base import BaseQuantifier
from scipy import optimize
import pandas as pd
import quapy.functional as F
# TODO: explore the bernoulli (term presence/absence) variant
# TODO: explore the multinomial (term frequency) variant
# TODO: explore the multinomial + length normalization variant
# TODO: consolidate the TSR variant (e.g., using information gain);
# - works better with the idf?
# - works better with length normalization?
# - etc
class DxS(BaseQuantifier):
def __init__(self, vectorizer=None, divergence='topsoe'):
self.vectorizer = vectorizer
self.divergence = divergence
# def __as_distribution(self, instances):
# return np.asarray(instances.sum(axis=0) / instances.sum()).flatten()
def __as_distribution(self, instances):
dist = instances.mean(axis=0)
return np.asarray(dist).flatten()
def fit(self, text_instances, labels):
classes = np.unique(labels)
if self.vectorizer is not None:
text_instances = self.vectorizer.fit_transform(text_instances, y=labels)
distributions = []
for class_i in classes:
distributions.append(self.__as_distribution(text_instances[labels == class_i]))
self.validation_distribution = np.asarray(distributions)
return self
def predict(self, text_instances):
if self.vectorizer is not None:
text_instances = self.vectorizer.transform(text_instances)
test_distribution = self.__as_distribution(text_instances)
divergence = qp.functional.get_divergence(self.divergence)
n_classes, n_feats = self.validation_distribution.shape
def match(prev):
prev = np.expand_dims(prev, axis=0)
mixture_distribution = (prev @ self.validation_distribution).flatten()
return divergence(test_distribution, mixture_distribution)
# the initial point is set as the uniform distribution
uniform_distribution = np.full(fill_value=1 / n_classes, shape=(n_classes,))
# solutions are bounded to those contained in the unit-simplex
bounds = tuple((0, 1) for x in range(n_classes)) # values in [0,1]
constraints = ({'type': 'eq', 'fun': lambda x: 1 - sum(x)}) # values summing up to 1
r = optimize.minimize(match, x0=uniform_distribution, method='SLSQP', bounds=bounds, constraints=constraints)
return r.x
class KDExML(BaseQuantifier, KDEBase):
def __init__(self, bandwidth=0.1, standardize=False):
self._check_bandwidth(bandwidth)
self.bandwidth = bandwidth
self.standardize = standardize
def fit(self, X, y):
classes = sorted(np.unique(y))
if self.standardize:
self.scaler = StandardScaler()
X = self.scaler.fit_transform(X)
if issparse(X):
X = X.toarray()
self.mix_densities = self.get_mixture_components(X, y, classes, self.bandwidth)
return self
def predict(self, X):
"""
Searches for the mixture model parameter (the sought prevalence values) that maximizes the likelihood
of the data (i.e., that minimizes the negative log-likelihood)
:param X: instances in the sample
:return: a vector of class prevalence estimates
"""
epsilon = 1e-10
if issparse(X):
X = X.toarray()
n_classes = len(self.mix_densities)
if self.standardize:
X = self.scaler.transform(X)
test_densities = [self.pdf(kde_i, X) for kde_i in self.mix_densities]
def neg_loglikelihood(prev):
test_mixture_likelihood = sum(prev_i * dens_i for prev_i, dens_i in zip (prev, test_densities))
test_loglikelihood = np.log(test_mixture_likelihood + epsilon)
return -np.sum(test_loglikelihood)
return F.optim_minimize(neg_loglikelihood, n_classes)
if __name__ == '__main__':
qp.environ['SAMPLE_SIZE'] = 250
qp.environ['N_JOBS'] = -1
min_df = 10
# dataset = 'imdb'
repeats = 10
error = 'mae'
div = 'topsoe'
# generates tuples (dataset, method, method_name)
# (the dataset is needed for methods that process the dataset differently)
def gen_methods():
for dataset in qp.datasets.REVIEWS_SENTIMENT_DATASETS:
data = qp.datasets.fetch_reviews(dataset, tfidf=False)
# bernoulli_vectorizer = CountVectorizer(min_df=min_df, binary=True)
# dxs = DxS(divergence=div, vectorizer=bernoulli_vectorizer)
# yield data, dxs, 'DxS-Bernoulli'
#
# multinomial_vectorizer = CountVectorizer(min_df=min_df, binary=False)
# dxs = DxS(divergence=div, vectorizer=multinomial_vectorizer)
# yield data, dxs, 'DxS-multinomial'
#
# tf_vectorizer = TfidfVectorizer(sublinear_tf=False, use_idf=False, min_df=min_df, norm=None)
# dxs = DxS(divergence=div, vectorizer=tf_vectorizer)
# yield data, dxs, 'DxS-TF'
#
# logtf_vectorizer = TfidfVectorizer(sublinear_tf=True, use_idf=False, min_df=min_df, norm=None)
# dxs = DxS(divergence=div, vectorizer=logtf_vectorizer)
# yield data, dxs, 'DxS-logTF'
#
# tfidf_vectorizer = TfidfVectorizer(use_idf=True, min_df=min_df, norm=None)
# dxs = DxS(divergence=div, vectorizer=tfidf_vectorizer)
# yield data, dxs, 'DxS-TFIDF'
#
# tfidf_vectorizer = TfidfVectorizer(use_idf=True, min_df=min_df, norm='l2')
# dxs = DxS(divergence=div, vectorizer=tfidf_vectorizer)
# yield data, dxs, 'DxS-TFIDF-l2'
tsr_vectorizer = TSRweighting(tsr_function=information_gain, min_df=min_df, norm='l2')
dxs = DxS(divergence=div, vectorizer=tsr_vectorizer)
yield data, dxs, 'DxS-TFTSR-l2'
data = qp.datasets.fetch_reviews(dataset, tfidf=True, min_df=min_df)
kdex = KDExML()
reduction = TruncatedSVD(n_components=100, random_state=0)
red_data = qp.data.preprocessing.instance_transformation(data, transformer=reduction, inplace=False)
yield red_data, kdex, 'KDEx'
hdy = HDy(LogisticRegression())
yield data, hdy, 'HDy'
# dm = DistributionMatchingY(LogisticRegression(), divergence=div, nbins=5)
# yield data, dm, 'DM-5b'
#
# dm = DistributionMatchingY(LogisticRegression(), divergence=div, nbins=10)
# yield data, dm, 'DM-10b'
result_path = 'results.csv'
with open(result_path, 'wt') as csv:
csv.write(f'Method\tDataset\tMAE\tMRAE\n')
for data, quantifier, quant_name in gen_methods():
quantifier.fit(*data.training.Xy)
report = qp.evaluation.evaluation_report(quantifier, APP(data.test, repeats=repeats), error_metrics=['mae','mrae'], verbose=True)
means = report.mean(numeric_only=True)
csv.write(f'{quant_name}\t{data.name}\t{means["mae"]:.5f}\t{means["mrae"]:.5f}\n')
df = pd.read_csv(result_path, sep='\t')
# print(df)
pv = df.pivot_table(index='Method', columns="Dataset", values=["MAE", "MRAE"])
print(pv)


@ -13,7 +13,7 @@ from . import model_selection
from . import classification
import os
__version__ = '0.2.0'
__version__ = '0.2.1'
def _default_cls():


@ -33,7 +33,6 @@ class LabelledCollection:
else:
self.instances = np.asarray(instances)
self.labels = np.asarray(labels)
n_docs = len(self)
if classes is None:
self.classes_ = F.classes_from_labels(self.labels)
else:
@ -41,7 +40,13 @@ class LabelledCollection:
self.classes_.sort()
if len(set(self.labels).difference(set(classes))) > 0:
raise ValueError(f'labels ({set(self.labels)}) contain values not included in classes_ ({set(classes)})')
self.index = {class_: np.arange(n_docs)[self.labels == class_] for class_ in self.classes_}
self._index = None
@property
def index(self):
if not hasattr(self, '_index') or self._index is None:
self._index = {class_: np.arange(len(self))[self.labels == class_] for class_ in self.classes_}
return self._index
@classmethod
def load(cls, path: str, loader_func: callable, classes=None, **loader_kwargs):


@ -10,6 +10,37 @@ from quapy.util import map_parallel
from .base import LabelledCollection
def instance_transformation(dataset:Dataset, transformer, inplace=False):
"""
Transforms a :class:`quapy.data.base.Dataset` applying the `fit_transform` and `transform` functions
of a (sklearn's) transformer.
:param dataset: a :class:`quapy.data.base.Dataset` where the instances of training and test collections are
lists of str
:param transformer: TransformerMixin implementing `fit_transform` and `transform` functions
:param inplace: whether or not to apply the transformation inplace (True), or to a new copy (False, default)
:return: a new :class:`quapy.data.base.Dataset` with transformed instances (if inplace=False) or a reference to the
current Dataset (if inplace=True) where the instances have been transformed
"""
training_transformed = transformer.fit_transform(*dataset.training.Xy)
test_transformed = transformer.transform(dataset.test.X)
orig_name = dataset.name
if inplace:
dataset.training = LabelledCollection(training_transformed, dataset.training.labels, dataset.classes_)
dataset.test = LabelledCollection(test_transformed, dataset.test.labels, dataset.classes_)
if hasattr(transformer, 'vocabulary_'):
dataset.vocabulary = transformer.vocabulary_
return dataset
else:
training = LabelledCollection(training_transformed, dataset.training.labels.copy(), dataset.classes_)
test = LabelledCollection(test_transformed, dataset.test.labels.copy(), dataset.classes_)
vocab = None
if hasattr(transformer, 'vocabulary_'):
vocab = transformer.vocabulary_
return Dataset(training, test, vocabulary=vocab, name=orig_name)
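# a hedged usage sketch: given a textual Dataset (e.g., quapy.datasets.fetch_reviews('imdb')),
#   from sklearn.feature_extraction.text import TfidfVectorizer
#   dataset = instance_transformation(dataset, TfidfVectorizer(min_df=5), inplace=False)
# returns a new Dataset whose instances are tf-idf matrices and whose vocabulary is that of the vectorizer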
def text2tfidf(dataset:Dataset, min_df=3, sublinear_tf=True, inplace=False, **kwargs):
"""
Transforms a :class:`quapy.data.base.Dataset` of textual instances into a :class:`quapy.data.base.Dataset` of
@ -29,18 +60,7 @@ def text2tfidf(dataset:Dataset, min_df=3, sublinear_tf=True, inplace=False, **kw
__check_type(dataset.test.instances, np.ndarray, str)
vectorizer = TfidfVectorizer(min_df=min_df, sublinear_tf=sublinear_tf, **kwargs)
training_documents = vectorizer.fit_transform(dataset.training.instances)
test_documents = vectorizer.transform(dataset.test.instances)
if inplace:
dataset.training = LabelledCollection(training_documents, dataset.training.labels, dataset.classes_)
dataset.test = LabelledCollection(test_documents, dataset.test.labels, dataset.classes_)
dataset.vocabulary = vectorizer.vocabulary_
return dataset
else:
training = LabelledCollection(training_documents, dataset.training.labels.copy(), dataset.classes_)
test = LabelledCollection(test_documents, dataset.test.labels.copy(), dataset.classes_)
return Dataset(training, test, vectorizer.vocabulary_)
return instance_transformation(dataset, vectorizer, inplace)
def reduce_columns(dataset: Dataset, min_df=5, inplace=False):


@ -3,6 +3,7 @@
import numpy as np
from sklearn.metrics import f1_score
import quapy as qp
from functional import CLRtransformation
def from_name(err_name):
@ -128,6 +129,59 @@ def se(prevs_true, prevs_hat):
return ((prevs_hat - prevs_true) ** 2).mean(axis=-1)
def sre(prevs_true, prevs_hat, prevs_train, eps=0.):
"""
The squared ratio error is defined as
:math:`SRE(p,\\hat{p},p^{tr})=\\frac{1}{|\\mathcal{Y}|}\\sum_{i \\in \\mathcal{Y}}(w_i-\\hat{w}_i)^2`,
where
:math:`w_i=\\frac{p_i}{p^{tr}_i}=\\frac{Q(Y=i)}{P(Y=i)}`,
and :math:`\\hat{w}_i` is the estimate obtained by replacing the true test prior with an estimate, and
:math:`\\mathcal{Y}` are the classes of interest
:param prevs_true: array-like, true prevalence values
:param prevs_hat: array-like, estimated prevalence values
:param prevs_train: array-like, training prevalence values
:param eps: float, for smoothing the prevalence values. It is 0 by default (no smoothing), meaning the
training prevalence is expected to be >0 everywhere.
:return: the squared ratio error
"""
prevs_true = np.asarray(prevs_true)
prevs_hat = np.asarray(prevs_hat)
prevs_train = np.asarray(prevs_train)
assert prevs_true.shape == prevs_hat.shape, f'wrong shape {prevs_true.shape=} vs {prevs_hat.shape=}'
assert prevs_true.shape[-1]==prevs_train.shape[-1], 'wrong shape for training prevalence'
if eps>0:
prevs_true = smooth(prevs_true, eps)
prevs_hat = smooth(prevs_hat, eps)
prevs_train = smooth(prevs_train, eps)
N = prevs_true.shape[-1]
w = prevs_true / prevs_train
w_hat = prevs_hat / prevs_train
return (1. / N) * ((w - w_hat) ** 2).sum(axis=-1)
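# a hedged worked example: with training prevalence p_tr=[.5,.5], true test prevalence p=[.2,.8]
# and estimate p_hat=[.3,.7], the ratios are w=[.4,1.6] and w_hat=[.6,1.4], hence
# sre = ((0.2)**2 + (0.2)**2) / 2 = 0.04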
def msre(prevs_true, prevs_hat, prevs_train, eps=0.):
"""
Returns the mean across all experiments. See :func:`sre`.
:param prevs_true: array-like, true prevalence values of shape (n_experiments, n_classes,) or (n_classes,)
:param prevs_hat: array-like, estimated prevalence values of shape equal to prevs_true
:param prevs_train: array-like, training prevalence values of (n_experiments, n_classes,) or (n_classes,)
:param eps: float, for smoothing the prevalence values. It is 0 by default (no smoothing), meaning the
training prevalence is expected to be >0 everywhere.
:return: the squared ratio error
"""
return np.mean(squared_ratio_error(prevs_true, prevs_hat, prevs_train, eps))
def dist_aitchison(prevs_true, prevs_hat):
clr = CLRtransformation()
return np.linalg.norm(clr(prevs_true) - clr(prevs_hat), axis=-1)
def mean_dist_aitchison(prevs_true, prevs_hat):
return np.mean(dist_aitchison(prevs_true, prevs_hat))
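# a hedged illustration: the Aitchison distance is the Euclidean distance between CLR-transformed
# prevalence vectors; e.g., dist_aitchison([0.2, 0.8], [0.3, 0.7]) evaluates to approximately 0.38
# (up to the effect of the EPSILON smoothing applied inside the CLR transformation)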
def mkld(prevs_true, prevs_hat, eps=None):
"""Computes the mean Kullback-Leibler divergence (see :meth:`quapy.error.kld`) across the
sample pairs. The distributions are smoothed using the `eps` factor
@ -374,8 +428,8 @@ def __check_eps(eps=None):
CLASSIFICATION_ERROR = {f1e, acce}
QUANTIFICATION_ERROR = {mae, mnae, mrae, mnrae, mse, mkld, mnkld}
QUANTIFICATION_ERROR_SINGLE = {ae, nae, rae, nrae, se, kld, nkld}
QUANTIFICATION_ERROR = {mae, mnae, mrae, mnrae, mse, mkld, mnkld, msre, mean_dist_aitchison}
QUANTIFICATION_ERROR_SINGLE = {ae, nae, rae, nrae, se, kld, nkld, sre, dist_aitchison}
QUANTIFICATION_ERROR_SMOOTH = {kld, nkld, rae, nrae, mkld, mnkld, mrae}
CLASSIFICATION_ERROR_NAMES = {func.__name__ for func in CLASSIFICATION_ERROR}
QUANTIFICATION_ERROR_NAMES = {func.__name__ for func in QUANTIFICATION_ERROR}
@ -387,6 +441,7 @@ ERROR_NAMES = \
f1_error = f1e
acc_error = acce
mean_absolute_error = mae
squared_ratio_error = sre
absolute_error = ae
mean_relative_absolute_error = mrae
relative_absolute_error = rae


@ -1,10 +1,15 @@
import warnings
from abc import ABC, abstractmethod
from collections import defaultdict
from functools import lru_cache
from typing import Literal, Union, Callable
from numpy.typing import ArrayLike
import scipy
import numpy as np
from scipy.special import softmax
import quapy as qp
# ------------------------------------------------------------------------------------------
@ -583,8 +588,8 @@ def solve_adjustment(
"""
Function that tries to solve for :math:`p` the equation :math:`q = M p`, where :math:`q` is the vector of
`unadjusted counts` (as estimated, e.g., via classify and count) with :math:`q_i` an estimate of
:math:`P(\hat{Y}=y_i)`, and where :math:`M` is the matrix of `class-conditional rates` with :math:`M_{ij}` an
estimate of :math:`P(\hat{Y}=y_i|Y=y_j)`.
:math:`P(\\hat{Y}=y_i)`, and where :math:`M` is the matrix of `class-conditional rates` with :math:`M_{ij}` an
estimate of :math:`P(\\hat{Y}=y_i|Y=y_j)`.
:param class_conditional_rates: array of shape `(n_classes, n_classes,)` with entry `(i,j)` being the estimate
of :math:`P(\hat{Y}=y_i|Y=y_j)`, that is, the probability that an instance that belongs to class :math:`y_j`
@ -649,3 +654,96 @@ def solve_adjustment(
raise ValueError(f'unknown {solver=}')
# ------------------------------------------------------------------------------------------
# Transformations from Compositional analysis
# ------------------------------------------------------------------------------------------
class CompositionalTransformation(ABC):
"""
Abstract class of transformations from compositional data.
Basically, callable functions with an "inverse" function.
"""
@abstractmethod
def __call__(self, X): ...
@abstractmethod
def inverse(self, Z): ...
EPSILON = 1e-6  # smoothing constant shared by the compositional transformations below
class CLRtransformation(CompositionalTransformation):
"""
Centered log-ratio (CLR), from compositional analysis
"""
def __call__(self, X):
"""
Applies the CLR function to X, thus mapping the instances, which are vectors of :math:`\\mathcal{R}^{n}` that
actually lie on the (n-1)-dimensional probability simplex, onto the unrestricted space :math:`\\mathcal{R}^{n}`.
Prevalence values are smoothed with the constant EPSILON before taking logarithms.
:param X: np.ndarray of (n_instances, n_dimensions) to be transformed
:return: np.ndarray of (n_instances, n_dimensions), the CLR-transformed points
"""
X = np.asarray(X)
X = qp.error.smooth(X, EPSILON)
G = np.exp(np.mean(np.log(X), axis=-1, keepdims=True)) # geometric mean
return np.log(X / G)
def inverse(self, Z):
"""
Inverse function. However, clr.inverse(clr(X)) does not exactly coincide with X due to smoothing.
:param Z: np.ndarray of (n_instances, n_dimensions) to be transformed
:return: np.ndarray of (n_instances, n_dimensions), the points mapped back onto the probability simplex
"""
return scipy.special.softmax(Z, axis=-1)
class ILRtransformation(CompositionalTransformation):
"""
Isometric log-ratio (ILR), from compositional analysis
"""
def __call__(self, X):
X = np.asarray(X)
X = qp.error.smooth(X, EPSILON)
k = X.shape[-1]
V = self.get_V(k) # (k-1, k)
logp = np.log(X)
return logp @ V.T
def inverse(self, Z):
Z = np.asarray(Z)
# get dimension
k_minus_1 = Z.shape[-1]
k = k_minus_1 + 1
V = self.get_V(k) # (k-1, k)
logp = Z @ V
p = np.exp(logp)
p = p / np.sum(p, axis=-1, keepdims=True)
return p
@lru_cache(maxsize=None)
def get_V(self, k):
def helmert_matrix(k):
"""
Returns the (k x k) Helmert matrix.
"""
H = np.zeros((k, k))
for i in range(1, k):
H[i, :i] = 1
H[i, i] = -(i)
H[i] = H[i] / np.sqrt(i * (i + 1))
# row 0 stays zeros; will be discarded
return H
def ilr_basis(k):
"""
Constructs an orthonormal ILR basis using the Helmert submatrix.
Output shape: (k-1, k)
"""
H = helmert_matrix(k)
V = H[1:, :] # remove first row of zeros
return V
return ilr_basis(k)


@ -29,6 +29,7 @@ AGGREGATIVE_METHODS = {
aggregative.KDEyHD,
# aggregative.OneVsAllAggregative,
confidence.BayesianCC,
confidence.PQ,
}
BINARY_METHODS = {
@ -40,6 +41,7 @@ BINARY_METHODS = {
aggregative.MAX,
aggregative.MS,
aggregative.MS2,
confidence.PQ,
}
MULTICLASS_METHODS = {


@ -1,13 +1,21 @@
"""
Utility functions for `Bayesian quantification <https://arxiv.org/abs/2302.09159>`_ methods.
"""
import contextlib
import os
import sys
import numpy as np
import importlib.resources
try:
import jax
import jax.numpy as jnp
import numpyro
import numpyro.distributions as dist
import stan
import logging
import stan.common
DEPENDENCIES_INSTALLED = True
except ImportError:
@ -15,6 +23,7 @@ except ImportError:
jnp = None
numpyro = None
dist = None
stan = None
DEPENDENCIES_INSTALLED = False
@ -77,3 +86,71 @@ def sample_posterior(
rng_key = jax.random.PRNGKey(seed)
mcmc.run(rng_key, n_c_unlabeled=n_c_unlabeled, n_y_and_c_labeled=n_y_and_c_labeled)
return mcmc.get_samples()
def load_stan_file():
return importlib.resources.files('quapy.method').joinpath('stan/pq.stan').read_text(encoding='utf-8')
@contextlib.contextmanager
def _suppress_stan_logging():
with open(os.devnull, "w") as devnull:
old_stderr = sys.stderr
sys.stderr = devnull
try:
yield
finally:
sys.stderr = old_stderr
def pq_stan(stan_code, n_bins, pos_hist, neg_hist, test_hist, number_of_samples, num_warmup, stan_seed):
"""
Perform Bayesian prevalence estimation using a Stan model for probabilistic quantification.
This function builds and samples from a Stan model that implements a bin-based Bayesian
quantifier. It uses the class-conditional histograms of the classifier
outputs for positive and negative examples, along with the test histogram, to estimate
the posterior distribution of prevalence in the test set.
Parameters
----------
stan_code : str
The Stan model code as a string.
n_bins : int
Number of bins used to build the histograms for positive and negative examples.
pos_hist : array-like of shape (n_bins,)
Histogram counts of the classifier outputs for the positive class.
neg_hist : array-like of shape (n_bins,)
Histogram counts of the classifier outputs for the negative class.
test_hist : array-like of shape (n_bins,)
Histogram counts of the classifier outputs for the test set, binned using the same bins.
number_of_samples : int
Number of post-warmup samples to draw from the Stan posterior.
num_warmup : int
Number of warmup iterations for the sampler.
stan_seed : int
Random seed for Stan model compilation and sampling, ensuring reproducibility.
Returns
-------
prev_samples : numpy.ndarray
An array of posterior samples of the prevalence (`prev`) in the test set.
Each element corresponds to one draw from the posterior distribution.
"""
logging.getLogger("stan.common").setLevel(logging.ERROR)
stan_data = {
'n_bucket': n_bins,
'train_neg': neg_hist.tolist(),
'train_pos': pos_hist.tolist(),
'test': test_hist.tolist(),
'posterior': 1
}
with _suppress_stan_logging():
stan_model = stan.build(stan_code, data=stan_data, random_seed=stan_seed)
fit = stan_model.sample(num_chains=1, num_samples=number_of_samples,num_warmup=num_warmup)
return fit['prev']
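# a hedged usage sketch (assumes pystan is installed and that pos_hist, neg_hist and test_hist are
# numpy arrays of counts over the same n_bins bins of classifier scores):
#   stan_code = load_stan_file()
#   prev_samples = pq_stan(stan_code, n_bins=10, pos_hist=pos_hist, neg_hist=neg_hist, test_hist=test_hist,
#                          number_of_samples=1000, num_warmup=500, stan_seed=0)
#   prev_point = np.median(prev_samples)                             # posterior point estimate
#   prev_low, prev_high = np.quantile(prev_samples, [0.025, 0.975])  # 95% credible interval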


@ -15,9 +15,11 @@ class KDEBase:
"""
BANDWIDTH_METHOD = ['scott', 'silverman']
KERNELS = ['gaussian', 'aitchison', 'ilr']
@classmethod
def _check_bandwidth(cls, bandwidth):
def _check_bandwidth(cls, bandwidth, kernel):
"""
Checks that the bandwidth parameter is correct
@ -27,32 +29,56 @@ class KDEBase:
assert bandwidth in KDEBase.BANDWIDTH_METHOD or isinstance(bandwidth, float), \
f'invalid bandwidth, valid ones are {KDEBase.BANDWIDTH_METHOD} or float values'
if isinstance(bandwidth, float):
assert 0 < bandwidth < 1, \
"the bandwith for KDEy should be in (0,1), since this method models the unit simplex"
assert kernel!='gaussian' or (0 < bandwidth < 1), \
("the bandwidth for a Gaussian kernel in KDEy should be in (0,1), "
"since this method models the unit simplex")
return bandwidth
def get_kde_function(self, X, bandwidth):
@classmethod
def _check_kernel(cls, kernel):
"""
Checks that the kernel parameter is correct
:param kernel: str
:return: the validated kernel
"""
assert kernel in KDEBase.KERNELS, f'unknown {kernel=}'
return kernel
def get_kde_function(self, X, bandwidth, kernel):
"""
Wraps the KDE function from scikit-learn.
:param X: data for which the density function is to be estimated
:param bandwidth: the bandwidth of the kernel
:param kernel: the kernel (valid ones are in KDEBase.KERNELS)
:return: a scikit-learn's KernelDensity object
"""
if kernel == 'aitchison':
X = self.clr_transform(X)
elif kernel == 'ilr':
X = self.ilr_transform(X)
return KernelDensity(bandwidth=bandwidth).fit(X)
def pdf(self, kde, X):
def pdf(self, kde, X, kernel):
"""
Wraps the density evaluation of scikit-learn's KDE. Scikit-learn returns log-scores (s), so this
function returns :math:`e^{s}`
:param kde: a previously fit KDE function
:param X: the data for which the density is to be estimated
:param kernel: the kernel (valid ones are in KDEBase.KERNELS)
:return: np.ndarray with the densities
"""
if kernel == 'aitchison':
X = self.clr_transform(X)
elif kernel == 'ilr':
X = self.ilr_transform(X)
return np.exp(kde.score_samples(X))
def get_mixture_components(self, X, y, classes, bandwidth):
def get_mixture_components(self, X, y, classes, bandwidth, kernel):
"""
Returns an array containing the mixture components, i.e., the KDE functions for each class.
@ -60,6 +86,7 @@ class KDEBase:
:param y: the class labels
:param n_classes: integer, the number of classes
:param bandwidth: float, the bandwidth of the kernel
:param kernel: the kernel (valid ones are in KDEBase.KERNELS)
:return: a list of KernelDensity objects, each fitted with the corresponding class-specific covariates
"""
class_cond_X = []
@ -67,8 +94,19 @@ class KDEBase:
selX = X[y==cat]
if selX.size==0:
selX = [F.uniform_prevalence(len(classes))]
class_cond_X.append(selX)
return [self.get_kde_function(X_cond_yi, bandwidth) for X_cond_yi in class_cond_X]
return [self.get_kde_function(X_cond_yi, bandwidth, kernel) for X_cond_yi in class_cond_X]
def clr_transform(self, X):
if not hasattr(self, 'clr'):
self.clr = F.CLRtransformation()
return self.clr(X)
def ilr_transform(self, X):
if not hasattr(self, 'ilr'):
self.ilr = F.ILRtransformation()
return self.ilr(X)
class KDEyML(AggregativeSoftQuantifier, KDEBase):
@ -107,17 +145,19 @@ class KDEyML(AggregativeSoftQuantifier, KDEBase):
are to be generated in a `k`-fold cross-validation manner (with this integer indicating the value
for `k`); or as a tuple (X,y) defining the specific set of data to use for validation.
:param bandwidth: float, the bandwidth of the Kernel
:param kernel: kernel of KDE, valid ones are in KDEBase.KERNELS
:param random_state: a seed to be set before fitting any base quantifier (default None)
"""
def __init__(self, classifier: BaseEstimator=None, fit_classifier=True, val_split=5, bandwidth=0.1,
def __init__(self, classifier: BaseEstimator=None, fit_classifier=True, val_split=5, bandwidth=0.1, kernel='gaussian',
random_state=None):
super().__init__(classifier, fit_classifier, val_split)
self.bandwidth = KDEBase._check_bandwidth(bandwidth)
self.bandwidth = KDEBase._check_bandwidth(bandwidth, kernel)
self.kernel = self._check_kernel(kernel)
self.random_state=random_state
def aggregation_fit(self, classif_predictions, labels):
self.mix_densities = self.get_mixture_components(classif_predictions, labels, self.classes_, self.bandwidth)
self.mix_densities = self.get_mixture_components(classif_predictions, labels, self.classes_, self.bandwidth, self.kernel)
return self
def aggregate(self, posteriors: np.ndarray):
@ -131,10 +171,11 @@ class KDEyML(AggregativeSoftQuantifier, KDEBase):
with qp.util.temp_seed(self.random_state):
epsilon = 1e-10
n_classes = len(self.mix_densities)
test_densities = [self.pdf(kde_i, posteriors) for kde_i in self.mix_densities]
test_densities = [self.pdf(kde_i, posteriors, self.kernel) for kde_i in self.mix_densities]
def neg_loglikelihood(prev):
test_mixture_likelihood = sum(prev_i * dens_i for prev_i, dens_i in zip (prev, test_densities))
# test_mixture_likelihood = sum(prev_i * dens_i for prev_i, dens_i in zip (prev, test_densities))
test_mixture_likelihood = prev @ test_densities
test_loglikelihood = np.log(test_mixture_likelihood + epsilon)
return -np.sum(test_loglikelihood)
@ -191,7 +232,7 @@ class KDEyHD(AggregativeSoftQuantifier, KDEBase):
super().__init__(classifier, fit_classifier, val_split)
self.divergence = divergence
self.bandwidth = KDEBase._check_bandwidth(bandwidth)
self.bandwidth = KDEBase._check_bandwidth(bandwidth, kernel='gaussian')
self.random_state=random_state
self.montecarlo_trials = montecarlo_trials
@ -278,7 +319,7 @@ class KDEyCS(AggregativeSoftQuantifier):
def __init__(self, classifier: BaseEstimator=None, fit_classifier=True, val_split=5, bandwidth=0.1):
super().__init__(classifier, fit_classifier, val_split)
self.bandwidth = KDEBase._check_bandwidth(bandwidth)
self.bandwidth = KDEBase._check_bandwidth(bandwidth, kernel='gaussian')
def gram_matrix_mix_sum(self, X, Y=None):
# this adapts the output of the rbf_kernel function (pairwise evaluations of Gaussian kernels k(x,y))


@ -673,7 +673,7 @@ class PACC(AggregativeSoftQuantifier):
class EMQ(AggregativeSoftQuantifier):
"""
`Expectation Maximization for Quantification <https://ieeexplore.ieee.org/abstract/document/6789744>`_ (EMQ),
aka `Saerens-Latinne-Decaestecker` (SLD) algorithm.
aka `Saerens-Latinne-Decaestecker` (SLD) algorithm, or `Maximum Likelihood Label Shift` (MLLS).
EMQ consists of using the well-known `Expectation Maximization algorithm` to iteratively update the posterior
probabilities generated by a probabilistic classifier and the class prevalence estimates obtained via
maximum-likelihood estimation, in a mutually recursive way, until convergence.


@ -1,19 +1,21 @@
import numpy as np
from joblib import Parallel, delayed
from sklearn.base import BaseEstimator
from sklearn.metrics import confusion_matrix
import quapy as qp
import quapy.functional as F
from quapy.functional import CompositionalTransformation, CLRtransformation, ILRtransformation
from quapy.method import _bayesian
from quapy.method.aggregative import AggregativeCrispQuantifier
from quapy.data import LabelledCollection
from quapy.method.aggregative import AggregativeQuantifier
from quapy.method.aggregative import AggregativeQuantifier, AggregativeCrispQuantifier, AggregativeSoftQuantifier, BinaryAggregativeQuantifier
from scipy.stats import chi2
from sklearn.utils import resample
from abc import ABC, abstractmethod
from scipy.special import softmax, factorial
from scipy.special import factorial
import copy
from functools import lru_cache
from tqdm import tqdm
"""
This module provides implementation of different types of confidence regions, and the implementation of Bootstrap
@ -80,6 +82,58 @@ class ConfidenceRegionABC(ABC):
proportion = np.clip(self.coverage(uniform_simplex), 0., 1.)
return proportion
@property
@abstractmethod
def samples(self):
""" Returns internal samples """
...
def __contains__(self, p):
"""
Overloads in operator, checks if `p` is contained in the region
:param p: array-like
:return: boolean
"""
p = np.asarray(p)
assert p.ndim==1, f'unexpected shape for point parameter'
return self.coverage(p)==1.
def closest_point_in_region(self, p, tol=1e-6, max_iter=30):
"""
Finds the closest point to p that belongs to the region. Assumes the region is convex.
:param p: array-like, the point
:param tol: float, error tolerance
:param max_iter: int, max number of iterations
:returns: array-like, the closest point to p that belongs to the region, found on the segment between p and
the center of the region
"""
p = np.asarray(p, dtype=float)
# if p in region, returns p itself
if p in self:
return p.copy()
# center of the region
c = self.point_estimate()
# binary search in [0,1], interpolation parameter
# low=closest to p, high=closest to c
low, high = 0.0, 1.0
for _ in range(max_iter):
mid = 0.5 * (low + high)
x = p*(1-mid) + c*mid
if x in self:
high = mid
else:
low = mid
if high - low < tol:
break
in_boundary = p*(1-high) + c*high
return in_boundary
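A self-contained sketch of the same bisection idea for a generic convex region, assuming only a membership test and a center point (illustrative helper, not part of the diff):
import numpy as np
def closest_point_in_convex_region(p, center, contains, tol=1e-6, max_iter=30):
    # bisection on the segment p -> center: the smallest interpolation factor whose
    # point lies inside the (convex) region approximates the boundary point
    p, center = np.asarray(p, float), np.asarray(center, float)
    if contains(p):
        return p.copy()
    low, high = 0.0, 1.0
    for _ in range(max_iter):
        mid = 0.5 * (low + high)
        if contains(p * (1 - mid) + center * mid):
            high = mid
        else:
            low = mid
        if high - low < tol:
            break
    return p * (1 - high) + center * high
# example: a unit ball centered at the origin
inside_ball = lambda x: np.linalg.norm(x) <= 1.0
print(closest_point_in_convex_region([2.0, 0.0], [0.0, 0.0], inside_ball))  # ~[1, 0]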
class WithConfidenceABC(ABC):
"""
@ -88,20 +142,32 @@ class WithConfidenceABC(ABC):
METHODS = ['intervals', 'ellipse', 'ellipse-clr']
@abstractmethod
def quantify_conf(self, instances, confidence_level=None) -> (np.ndarray, ConfidenceRegionABC):
def predict_conf(self, instances, confidence_level=0.95) -> (np.ndarray, ConfidenceRegionABC):
"""
Adds the method `quantify_conf` to the interface. This method returns not only the point-estimate, but
Adds the method `predict_conf` to the interface. This method returns not only the point-estimate, but
also the confidence region around it.
:param instances: a np.ndarray of shape (n_instances, n_features,)
:confidence_level: float in (0, 1)
:param confidence_level: float in (0, 1), default is 0.95
:return: a tuple (`point_estimate`, `conf_region`), where `point_estimate` is a np.ndarray of shape
(n_classes,) and `conf_region` is an object from :class:`ConfidenceRegionABC`
"""
...
def quantify_conf(self, instances, confidence_level=0.95) -> (np.ndarray, ConfidenceRegionABC):
"""
Alias to `predict_conf`. This method returns not only the point-estimate, but
also the confidence region around it.
:param instances: a np.ndarray of shape (n_instances, n_features,)
:param confidence_level: float in (0, 1), default is 0.95
:return: a tuple (`point_estimate`, `conf_region`), where `point_estimate` is a np.ndarray of shape
(n_classes,) and `conf_region` is an object from :class:`ConfidenceRegionABC`
"""
return self.predict_conf(instances=instances, confidence_level=confidence_level)
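A hypothetical usage sketch of the renamed `predict_conf` API, assuming the class paths and the `fit(X, y)` signature used elsewhere in this branch (synthetic data, illustrative only):
import numpy as np
from sklearn.linear_model import LogisticRegression
from quapy.method.aggregative import PACC
from quapy.method.confidence import AggregativeBootstrap
rng = np.random.default_rng(0)
X = rng.normal(size=(1000, 5))
y = (X[:, 0] + rng.normal(scale=0.5, size=1000) > 0).astype(int)
quant = AggregativeBootstrap(PACC(LogisticRegression()), n_test_samples=500, region='intervals')
quant.fit(X, y)
prev_hat, conf_region = quant.predict_conf(X)   # point estimate + ConfidenceRegionABC instance
print(prev_hat, conf_region)                    # quantify_conf(...) remains available as an alias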
@classmethod
def construct_region(cls, prev_estims, confidence_level=0.95, method='intervals'):
def construct_region(cls, prev_estims, confidence_level=0.95, method='intervals')->ConfidenceRegionABC:
"""
Construct a confidence region given many prevalence estimations.
@ -136,7 +202,7 @@ def simplex_volume(n):
return 1 / factorial(n)
def within_ellipse_prop(values, mean, prec_matrix, chi2_critical):
def within_ellipse_prop__(values, mean, prec_matrix, chi2_critical):
"""
Checks the proportion of values that belong to the ellipse with center `mean` and precision matrix `prec_matrix`
at a distance `chi2_critical`.
@ -169,110 +235,125 @@ def within_ellipse_prop(values, mean, prec_matrix, chi2_critical):
return within_elipse * 1.0
class ConfidenceEllipseSimplex(ConfidenceRegionABC):
def within_ellipse_prop(values, mean, prec_matrix, chi2_critical):
"""
Instantiates a Confidence Ellipse in the probability simplex.
Checks the proportion of values that belong to the ellipse with center `mean` and precision matrix `prec_matrix`
at a distance `chi2_critical`.
:param X: np.ndarray of shape (n_bootstrap_samples, n_classes)
:param confidence_level: float, the confidence level (default 0.95)
:param values: a np.ndarray of shape (n_dim,) or (n_values, n_dim,)
:param mean: a np.ndarray of shape (n_dim,) with the center of the ellipse
:param prec_matrix: a np.ndarray with the precision matrix (inverse of the
covariance matrix) of the ellipse. If this inverse cannot be computed
then None must be passed
:param chi2_critical: float, the chi2 critical value
:return: float in [0,1], the fraction of values that are contained in the ellipse
defined by the mean (center), the precision matrix (shape), and the chi2_critical value (distance).
If `values` is only one value, then either 0. (not contained) or 1. (contained) is returned.
"""
if prec_matrix is None:
return 0.
values = np.atleast_2d(values)
diff = values - mean
d_M_squared = np.sum(diff @ prec_matrix * diff, axis=-1)
within_ellipse = d_M_squared <= chi2_critical
if len(within_ellipse) == 1:
return float(within_ellipse[0])
else:
return float(np.mean(within_ellipse))
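A short numeric check of the Mahalanobis containment test above, on pseudo-bootstrap prevalence vectors (assumption: the samples are roughly Gaussian, so the empirical coverage lands near the nominal level):
import numpy as np
from scipy.stats import chi2
rng = np.random.default_rng(0)
samples = rng.dirichlet([5, 3, 2], size=1000)             # stand-in for bootstrap estimates
mean = samples.mean(axis=0)
prec = np.linalg.pinv(np.cov(samples, rowvar=False, ddof=1))
critical = chi2.ppf(0.95, df=samples.shape[1] - 1)        # ddof = n_classes - 1 on the simplex
diff = samples - mean
d2 = np.sum(diff @ prec * diff, axis=-1)                  # squared Mahalanobis distances
print(np.mean(d2 <= critical))                            # empirical coverage, close to 0.95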
def closest_point_on_ellipsoid(p, mean, cov, chi2_critical, tol=1e-9, max_iter=100):
"""
Computes the closest point on the ellipsoid defined by:
(x - mean)^T cov^{-1} (x - mean) = chi2_critical
"""
def __init__(self, X, confidence_level=0.95):
p = np.asarray(p)
mean = np.asarray(mean)
Sigma = np.asarray(cov)
assert 0. < confidence_level < 1., f'{confidence_level=} must be in range(0,1)'
# Precompute precision matrix
P = np.linalg.pinv(Sigma)
d = P.shape[0]
X = np.asarray(X)
# Define v = p - mean
v = p - mean
self.mean_ = X.mean(axis=0)
self.cov_ = np.cov(X, rowvar=False, ddof=1)
# If p is inside the ellipsoid, return p itself
M_dist = v @ P @ v
if M_dist <= chi2_critical:
return p.copy()
try:
self.precision_matrix_ = np.linalg.inv(self.cov_)
except:
self.precision_matrix_ = None
# Function to compute x(lambda)
def x_lambda(lmbda):
A = np.eye(d) + lmbda * P
return mean + np.linalg.solve(A, v)
self.dim = X.shape[-1]
self.ddof = self.dim - 1
# Function whose root we want: f(lambda) = Mahalanobis distance - chi2
def f(lmbda):
x = x_lambda(lmbda)
diff = x - mean
return diff @ P @ diff - chi2_critical
# critical chi-square value
self.confidence_level = confidence_level
self.chi2_critical_ = chi2.ppf(confidence_level, df=self.ddof)
# Bisection search over lambda >= 0
l_low, l_high = 0.0, 1.0
def point_estimate(self):
"""
Returns the point estimate, the center of the ellipse.
# Increase high until f(high) < 0
while f(l_high) > 0:
l_high *= 2
if l_high > 1e12:
raise RuntimeError("Failed to bracket the root.")
:return: np.ndarray of shape (n_classes,)
"""
return self.mean_
# Bisection
for _ in range(max_iter):
l_mid = 0.5 * (l_low + l_high)
fm = f(l_mid)
if abs(fm) < tol:
break
if fm > 0:
l_low = l_mid
else:
l_high = l_mid
def coverage(self, true_value):
"""
Checks whether a value, or a set of values, is contained in the confidence region. The method computes the
fraction of these that are contained in the region if more than one value is passed. If only one value is
passed, it returns either 1.0 or 0.0, indicating whether or not the value lies in the region.
:param true_value: a np.ndarray of shape (n_classes,) or shape (n_values, n_classes,)
:return: float in [0,1]
"""
return within_ellipse_prop(true_value, self.mean_, self.precision_matrix_, self.chi2_critical_)
class ConfidenceEllipseCLR(ConfidenceRegionABC):
"""
Instantiates a Confidence Ellipse in the Centered-Log Ratio (CLR) space.
:param X: np.ndarray of shape (n_bootstrap_samples, n_classes)
:param confidence_level: float, the confidence level (default 0.95)
"""
def __init__(self, X, confidence_level=0.95):
self.clr = CLRtransformation()
Z = self.clr(X)
self.mean_ = np.mean(X, axis=0)
self.conf_region_clr = ConfidenceEllipseSimplex(Z, confidence_level=confidence_level)
def point_estimate(self):
"""
Returns the point estimate, the center of the ellipse.
:return: np.ndarray of shape (n_classes,)
"""
# The inverse of the CLR does not coincide with the true mean, because the geometric mean
# requires smoothing the prevalence vectors and this affects the softmax (inverse);
# return self.clr.inverse(self.mean_) # <- does not coincide
return self.mean_
def coverage(self, true_value):
"""
Checks whether a value, or a set of values, is contained in the confidence region. The method computes the
fraction of these that are contained in the region if more than one value is passed. If only one value is
passed, it returns either 1.0 or 0.0, indicating whether or not the value lies in the region.
:param true_value: a np.ndarray of shape (n_classes,) or shape (n_values, n_classes,)
:return: float in [0,1]
"""
transformed_values = self.clr(true_value)
return self.conf_region_clr.coverage(transformed_values)
l_opt = l_mid
return x_lambda(l_opt)
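A quick sanity check of the projection helper added above (assuming closest_point_on_ellipsoid is in scope; numbers are illustrative): for a point outside the ellipsoid, the returned point should land on its boundary.
import numpy as np
from scipy.stats import chi2
mean = np.array([0.3, 0.3, 0.4])
cov = np.diag([0.010, 0.020, 0.015])
crit = chi2.ppf(0.95, df=2)
p = np.array([0.90, 0.05, 0.05])
x = closest_point_on_ellipsoid(p, mean, cov, crit)
d = x - mean
print(d @ np.linalg.pinv(cov) @ d, crit)   # Mahalanobis distance of x matches chi2_critical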
class ConfidenceIntervals(ConfidenceRegionABC):
"""
Instantiates a region based on (independent) Confidence Intervals.
:param X: np.ndarray of shape (n_bootstrap_samples, n_classes)
:param samples: np.ndarray of shape (n_bootstrap_samples, n_classes)
:param confidence_level: float, the confidence level (default 0.95)
:param bonferroni_correction: bool (default False), if True, a Bonferroni correction
is applied to the significance level (`alpha`) before computing confidence intervals.
The correction consists of replacing `alpha` with `alpha/n_classes`. When
`n_classes=2` the correction is not applied because there is only one verification test
since the other class is constrained. This is not necessarily true for n_classes>2.
"""
def __init__(self, X, confidence_level=0.95):
def __init__(self, samples, confidence_level=0.95, bonferroni_correction=False):
assert 0 < confidence_level < 1, f'{confidence_level=} must be in range(0,1)'
X = np.asarray(X)
samples = np.asarray(samples)
self.means_ = X.mean(axis=0)
self.means_ = samples.mean(axis=0)
alpha = 1-confidence_level
if bonferroni_correction:
n_classes = samples.shape[-1]
if n_classes>2:
alpha = alpha/n_classes
low_perc = (alpha/2.)*100
high_perc = (1-alpha/2.)*100
self.I_low, self.I_high = np.percentile(X, q=[low_perc, high_perc], axis=0)
self.I_low, self.I_high = np.percentile(samples, q=[low_perc, high_perc], axis=0)
self._samples = samples
self.alpha = alpha
@property
def samples(self):
return self._samples
def point_estimate(self):
"""
@ -297,33 +378,174 @@ class ConfidenceIntervals(ConfidenceRegionABC):
return proportion
def __repr__(self):
return '['+', '.join(f'({low:.4f}, {high:.4f})' for (low,high) in zip(self.I_low, self.I_high))+']'
class CLRtransformation:
@property
def n_dim(self):
return len(self.I_low)
def winkler_scores(self, true_prev):
true_prev = np.asarray(true_prev)
assert true_prev.ndim == 1, 'unexpected dimensionality for true_prev'
assert len(true_prev)==self.n_dim, \
f'unexpected number of values; found {len(true_prev)}, expected {self.n_dim}'
def winkler_score(low, high, true_val, alpha):
amp = high-low
scale_cost = 1./alpha
cost = np.max([0, low-true_val], axis=0) + np.max([0, true_val-high], axis=0)
return amp + scale_cost*cost
return np.asarray(
[winkler_score(low_i, high_i, true_v, self.alpha)
for (low_i, high_i, true_v) in zip(self.I_low, self.I_high, true_prev)]
)
def mean_winkler_score(self, true_prev):
return np.mean(self.winkler_scores(true_prev))
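A worked example of the Winkler (interval) score computed above: the score equals the interval width when the true value is covered, and grows by the miss divided by alpha otherwise (numbers are illustrative):
low, high, alpha = 0.20, 0.40, 0.05
for true_val in (0.30, 0.45):
    cost = max(0.0, low - true_val) + max(0.0, true_val - high)
    print(true_val, (high - low) + cost / alpha)   # 0.30 -> 0.20 ; 0.45 -> 0.20 + 0.05/0.05 = 1.20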
class ConfidenceEllipseSimplex(ConfidenceRegionABC):
"""
Centered log-ratio, from component analysis
Instantiates a Confidence Ellipse in the probability simplex.
:param samples: np.ndarray of shape (n_bootstrap_samples, n_classes)
:param confidence_level: float, the confidence level (default 0.95)
"""
def __call__(self, X, epsilon=1e-6):
"""
Applies the CLR function to X thus mapping the instances, which are contained in `\\mathcal{R}^{n}` but
actually lie on a `\\mathcal{R}^{n-1}` simplex, onto an unrestricted space in :math:`\\mathcal{R}^{n}`
:param X: np.ndarray of (n_instances, n_dimensions) to be transformed
:param epsilon: small float for prevalence smoothing
:return: np.ndarray of (n_instances, n_dimensions), the CLR-transformed points
"""
X = np.asarray(X)
X = qp.error.smooth(X, epsilon)
G = np.exp(np.mean(np.log(X), axis=-1, keepdims=True)) # geometric mean
return np.log(X / G)
def __init__(self, samples, confidence_level=0.95):
def inverse(self, X):
"""
Inverse function. However, clr.inverse(clr(X)) does not exactly coincide with X due to smoothing.
assert 0. < confidence_level < 1., f'{confidence_level=} must be in range(0,1)'
:param X: np.ndarray of (n_instances, n_dimensions) to be transformed
:return: np.ndarray of (n_instances, n_dimensions), the CLR-transformed points
samples = np.asarray(samples)
self.mean_ = samples.mean(axis=0)
self.cov_ = np.cov(samples, rowvar=False, ddof=1)
try:
self.precision_matrix_ = np.linalg.pinv(self.cov_)
except:
self.precision_matrix_ = None
self.dim = samples.shape[-1]
self.ddof = self.dim - 1
# critical chi-square value
self.confidence_level = confidence_level
self.chi2_critical_ = chi2.ppf(confidence_level, df=self.ddof)
self._samples = samples
self.alpha = 1.-confidence_level
@property
def samples(self):
return self._samples
def point_estimate(self):
"""
return softmax(X, axis=-1)
Returns the point estimate, the center of the ellipse.
:return: np.ndarray of shape (n_classes,)
"""
return self.mean_
def coverage(self, true_value):
"""
Checks whether a value, or a set of values, is contained in the confidence region. The method computes the
fraction of these that are contained in the region if more than one value is passed. If only one value is
passed, it returns either 1.0 or 0.0, indicating whether or not the value lies in the region.
:param true_value: a np.ndarray of shape (n_classes,) or shape (n_values, n_classes,)
:return: float in [0,1]
"""
return within_ellipse_prop(true_value, self.mean_, self.precision_matrix_, self.chi2_critical_)
def closest_point_in_region(self, p, tol=1e-6, max_iter=30):
return closest_point_on_ellipsoid(
p,
mean=self.mean_,
cov=self.cov_,
chi2_critical=self.chi2_critical_
)
class ConfidenceEllipseTransformed(ConfidenceRegionABC):
"""
Instantiates a Confidence Ellipse in a transformed space.
:param samples: np.ndarray of shape (n_bootstrap_samples, n_classes)
:param confidence_level: float, the confidence level (default 0.95)
"""
def __init__(self, samples, transformation: CompositionalTransformation, confidence_level=0.95):
samples = np.asarray(samples)
self.transformation = transformation
Z = self.transformation(samples)
# self.mean_ = np.mean(samples, axis=0)
self.mean_ = self.transformation.inverse(np.mean(Z, axis=0))
self.conf_region_z = ConfidenceEllipseSimplex(Z, confidence_level=confidence_level)
self._samples = samples
self.alpha = 1.-confidence_level
@property
def samples(self):
return self._samples
def point_estimate(self):
"""
Returns the point estimate, the center of the ellipse.
:return: np.ndarray of shape (n_classes,)
"""
# The inverse of the CLR does not coincide with the true mean, because the geometric mean
# requires smoothing the prevalence vectors and this affects the softmax (inverse);
# return self.clr.inverse(self.mean_) # <- does not coincide
return self.mean_
def coverage(self, true_value):
"""
Checks whether a value, or a set of values, is contained in the confidence region. The method computes the
fraction of these that are contained in the region if more than one value is passed. If only one value is
passed, it returns either 1.0 or 0.0, indicating whether or not the value lies in the region.
:param true_value: a np.ndarray of shape (n_classes,) or shape (n_values, n_classes,)
:return: float in [0,1]
"""
transformed_values = self.transformation(true_value)
return self.conf_region_z.coverage(transformed_values)
def closest_point_in_region(self, p, tol=1e-6, max_iter=30):
p_prime = self.transformation(p)
b_prime = self.conf_region_z.closest_point_in_region(p_prime, tol=tol, max_iter=max_iter)
b = self.transformation.inverse(b_prime)
return b
class ConfidenceEllipseCLR(ConfidenceEllipseTransformed):
"""
Instantiates a Confidence Ellipse in the Centered-Log Ratio (CLR) space.
:param samples: np.ndarray of shape (n_bootstrap_samples, n_classes)
:param confidence_level: float, the confidence level (default 0.95)
"""
def __init__(self, samples, confidence_level=0.95):
super().__init__(samples, CLRtransformation(), confidence_level=confidence_level)
class ConfidenceEllipseILR(ConfidenceEllipseTransformed):
"""
Instantiates a Confidence Ellipse in the Isometric Log-Ratio (ILR) space.
:param samples: np.ndarray of shape (n_bootstrap_samples, n_classes)
:param confidence_level: float, the confidence level (default 0.95)
"""
def __init__(self, samples, confidence_level=0.95):
super().__init__(samples, ILRtransformation(), confidence_level=confidence_level)
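A small sketch comparing the region types on the same pseudo-bootstrap samples, assuming the classes live in quapy.method.confidence as imported elsewhere in this branch:
import numpy as np
from quapy.method.confidence import (ConfidenceIntervals, ConfidenceEllipseSimplex,
                                     ConfidenceEllipseCLR, ConfidenceEllipseILR)
rng = np.random.default_rng(0)
prevs = rng.dirichlet([8, 5, 2], size=500)   # stand-in for bootstrap prevalence estimates
for region in (ConfidenceIntervals(prevs),
               ConfidenceEllipseSimplex(prevs),
               ConfidenceEllipseCLR(prevs),
               ConfidenceEllipseILR(prevs)):
    print(type(region).__name__, region.point_estimate(), region.coverage(prevs))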
class AggregativeBootstrap(WithConfidenceABC, AggregativeQuantifier):
@ -339,6 +561,12 @@ class AggregativeBootstrap(WithConfidenceABC, AggregativeQuantifier):
During inference, the bootstrap repetitions are applied to the pre-classified test instances.
See
`Moreo, A., Salvati, N.
An Efficient Method for Deriving Confidence Intervals in Aggregative Quantification.
Learning to Quantify: Methods and Applications (LQ 2025), co-located at ECML-PKDD 2025.
pp 12-33 <https://lq-2025.github.io/proceedings/CompleteVolume.pdf>`_
:param quantifier: an aggregative quantifier
:param n_train_samples: int, the number of training resamplings (defaults to 1, set to > 1 to activate a
model-based bootstrap approach)
@ -357,7 +585,8 @@ class AggregativeBootstrap(WithConfidenceABC, AggregativeQuantifier):
n_test_samples=500,
confidence_level=0.95,
region='intervals',
random_state=None):
random_state=None,
verbose=False):
assert isinstance(quantifier, AggregativeQuantifier), \
f'base quantifier does not seem to be an instance of {AggregativeQuantifier.__name__}'
@ -374,6 +603,7 @@ class AggregativeBootstrap(WithConfidenceABC, AggregativeQuantifier):
self.confidence_level = confidence_level
self.region = region
self.random_state = random_state
self.verbose = verbose
def aggregation_fit(self, classif_predictions, labels):
data = LabelledCollection(classif_predictions, labels, classes=self.classes_)
@ -399,6 +629,24 @@ class AggregativeBootstrap(WithConfidenceABC, AggregativeQuantifier):
prev_mean, self.confidence = self.aggregate_conf(classif_predictions)
return prev_mean
def aggregate_conf_sequential__(self, classif_predictions: np.ndarray, confidence_level=None):
if confidence_level is None:
confidence_level = self.confidence_level
n_samples = classif_predictions.shape[0]
prevs = []
with qp.util.temp_seed(self.random_state):
for quantifier in self.quantifiers:
for i in tqdm(range(self.n_test_samples), desc='resampling', total=self.n_test_samples, disable=not self.verbose):
sample_i = resample(classif_predictions, n_samples=n_samples)
prev_i = quantifier.aggregate(sample_i)
prevs.append(prev_i)
conf = WithConfidenceABC.construct_region(prevs, confidence_level, method=self.region)
prev_estim = conf.point_estimate()
return prev_estim, conf
def aggregate_conf(self, classif_predictions: np.ndarray, confidence_level=None):
if confidence_level is None:
confidence_level = self.confidence_level
@ -407,10 +655,15 @@ class AggregativeBootstrap(WithConfidenceABC, AggregativeQuantifier):
prevs = []
with qp.util.temp_seed(self.random_state):
for quantifier in self.quantifiers:
for i in range(self.n_test_samples):
sample_i = resample(classif_predictions, n_samples=n_samples)
prev_i = quantifier.aggregate(sample_i)
prevs.append(prev_i)
results = Parallel(n_jobs=-1)(
delayed(bootstrap_once)(i, classif_predictions, quantifier, n_samples)
for i in range(self.n_test_samples)
)
prevs.extend(results)
# for i in tqdm(range(self.n_test_samples), desc='resampling', total=self.n_test_samples, disable=not self.verbose):
# sample_i = resample(classif_predictions, n_samples=n_samples)
# prev_i = quantifier.aggregate(sample_i)
# prevs.append(prev_i)
conf = WithConfidenceABC.construct_region(prevs, confidence_level, method=self.region)
prev_estim = conf.point_estimate()
@ -423,7 +676,7 @@ class AggregativeBootstrap(WithConfidenceABC, AggregativeQuantifier):
self.aggregation_fit(classif_predictions, labels)
return self
def quantify_conf(self, instances, confidence_level=None) -> (np.ndarray, ConfidenceRegionABC):
def predict_conf(self, instances, confidence_level=None) -> (np.ndarray, ConfidenceRegionABC):
predictions = self.quantifier.classify(instances)
return self.aggregate_conf(predictions, confidence_level=confidence_level)
@ -435,9 +688,16 @@ class AggregativeBootstrap(WithConfidenceABC, AggregativeQuantifier):
return self.quantifier._classifier_method()
def bootstrap_once(i, classif_predictions, quantifier, n_samples):
idx = np.random.randint(0, len(classif_predictions), n_samples)
sample = classif_predictions[idx]
prev = quantifier.aggregate(sample)
return prev
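The parallel path above swaps the sequential resampling loop for joblib; a generic sketch of that pattern (note: each job below is seeded explicitly for reproducibility, which is an assumption, not something the diff does):
import numpy as np
from joblib import Parallel, delayed
def one_resample(i, predictions, n_samples, base_seed=0):
    rng = np.random.default_rng(base_seed + i)    # per-job seeding
    idx = rng.integers(0, len(predictions), n_samples)
    return predictions[idx].mean(axis=0)          # stand-in for quantifier.aggregate(sample)
predictions = np.random.default_rng(0).dirichlet([1, 1, 1], size=200)
prevs = Parallel(n_jobs=-1)(
    delayed(one_resample)(i, predictions, len(predictions)) for i in range(100)
)
print(np.mean(prevs, axis=0))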
class BayesianCC(AggregativeCrispQuantifier, WithConfidenceABC):
"""
`Bayesian quantification <https://arxiv.org/abs/2302.09159>`_ method,
`Bayesian quantification <https://arxiv.org/abs/2302.09159>`_ method (by Albert Ziegler and Paweł Czyż),
which is a variant of :class:`ACC` that calculates the posterior probability distribution
over the prevalence vectors, rather than providing a point estimate obtained
by matrix inversion.
@ -543,9 +803,115 @@ class BayesianCC(AggregativeCrispQuantifier, WithConfidenceABC):
samples = self.sample_from_posterior(classif_predictions)[_bayesian.P_TEST_Y]
return np.asarray(samples.mean(axis=0), dtype=float)
def quantify_conf(self, instances, confidence_level=None) -> (np.ndarray, ConfidenceRegionABC):
def predict_conf(self, instances, confidence_level=None) -> (np.ndarray, ConfidenceRegionABC):
if confidence_level is None:
confidence_level = self.confidence_level
classif_predictions = self.classify(instances)
point_estimate = self.aggregate(classif_predictions)
samples = self.get_prevalence_samples() # available after calling "aggregate" function
region = WithConfidenceABC.construct_region(samples, confidence_level=self.confidence_level, method=self.region)
region = WithConfidenceABC.construct_region(samples, confidence_level=confidence_level, method=self.region)
return point_estimate, region
class PQ(AggregativeSoftQuantifier, BinaryAggregativeQuantifier):
"""
`Precise Quantifier: Bayesian distribution matching quantifier <https://arxiv.org/abs/2507.06061>`_,
which is a variant of :class:`HDy` that calculates the posterior probability distribution
over the prevalence vectors, rather than providing a point estimate.
This method relies on extra dependencies, which have to be installed via:
`$ pip install quapy[bayes]`
:param classifier: a scikit-learn's BaseEstimator, or None, in which case the classifier is taken to be
the one indicated in `qp.environ['DEFAULT_CLS']`
:param val_split: specifies the data used for generating classifier predictions. This specification
can be made as float in (0, 1) indicating the proportion of stratified held-out validation set to
be extracted from the training set; or as an integer (default 5), indicating that the predictions
are to be generated in a `k`-fold cross-validation manner (with this integer indicating the value
for `k`); or as a tuple `(X,y)` defining the specific set of data to use for validation. Set to
None when the method does not require any validation data, in order to avoid that some portion of
the training data be wasted.
:param nbins: int, number of bins used to build the histogram representation of the posteriors (default 4)
:param fixed_bins: bool, if True the bin limits are uniformly spaced in [0,1]; if False (default) they are
set at the quantiles of the validation posteriors
:param num_warmup: number of warmup iterations for the STAN sampler (default 500)
:param num_samples: number of samples to draw from the posterior (default 1000)
:param stan_seed: random seed for the STAN sampler (default 0)
:param confidence_level: float, a value in (0,1) reflecting the desired confidence level (default 0.95)
:param region: string, set to `intervals` for constructing confidence intervals (default), or to
`ellipse` for constructing an ellipse in the probability simplex, or to `ellipse-clr` for
constructing an ellipse in the Centered-Log Ratio (CLR) unconstrained space.
"""
def __init__(self,
classifier: BaseEstimator=None,
fit_classifier=True,
val_split: int = 5,
nbins: int = 4,
fixed_bins: bool = False,
num_warmup: int = 500,
num_samples: int = 1_000,
stan_seed: int = 0,
confidence_level: float = 0.95,
region: str = 'intervals'):
if num_warmup <= 0:
raise ValueError(f'parameter {num_warmup=} must be a positive integer')
if num_samples <= 0:
raise ValueError(f'parameter {num_samples=} must be a positive integer')
if not _bayesian.DEPENDENCIES_INSTALLED:
raise ImportError("Auxiliary dependencies are required. "
"Run `$ pip install quapy[bayes]` to install them.")
super().__init__(classifier, fit_classifier, val_split)
self.nbins = nbins
self.fixed_bins = fixed_bins
self.num_warmup = num_warmup
self.num_samples = num_samples
self.stan_seed = stan_seed
self.stan_code = _bayesian.load_stan_file()
self.confidence_level = confidence_level
self.region = region
def aggregation_fit(self, classif_predictions, labels):
y_pred = classif_predictions[:, self.pos_label]
# Compute bin limits
if self.fixed_bins:
# Uniform bins in [0,1]
self.bin_limits = np.linspace(0, 1, self.nbins + 1)
else:
# Quantile bins
self.bin_limits = np.quantile(y_pred, np.linspace(0, 1, self.nbins + 1))
# Assign each prediction to a bin
bin_indices = np.digitize(y_pred, self.bin_limits[1:-1], right=True)
# Positive and negative masks
pos_mask = (labels == self.pos_label)
neg_mask = ~pos_mask
# Count positives and negatives per bin
self.pos_hist = np.bincount(bin_indices[pos_mask], minlength=self.nbins)
self.neg_hist = np.bincount(bin_indices[neg_mask], minlength=self.nbins)
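A compact sketch of the quantile binning performed in aggregation_fit (synthetic posteriors and labels, nbins=4):
import numpy as np
rng = np.random.default_rng(0)
y_pred = rng.random(1000)                                  # stand-in for P(y=+|x) on validation data
labels = (rng.random(1000) < y_pred).astype(int)
bin_limits = np.quantile(y_pred, np.linspace(0, 1, 5))     # 4 quantile bins
bin_indices = np.digitize(y_pred, bin_limits[1:-1], right=True)
pos_hist = np.bincount(bin_indices[labels == 1], minlength=4)
neg_hist = np.bincount(bin_indices[labels == 0], minlength=4)
print(pos_hist, neg_hist)                                  # per-bin counts passed to the Stan model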
def aggregate(self, classif_predictions):
Px_test = classif_predictions[:, self.pos_label]
test_hist, _ = np.histogram(Px_test, bins=self.bin_limits)
prevs = _bayesian.pq_stan(
self.stan_code, self.nbins, self.pos_hist, self.neg_hist, test_hist,
self.num_samples, self.num_warmup, self.stan_seed
).flatten()
self.prev_distribution = np.vstack([1-prevs, prevs]).T
return self.prev_distribution.mean(axis=0)
def aggregate_conf(self, predictions, confidence_level=None):
if confidence_level is None:
confidence_level = self.confidence_level
point_estimate = self.aggregate(predictions)
samples = self.prev_distribution
region = WithConfidenceABC.construct_region(samples, confidence_level=confidence_level, method=self.region)
return point_estimate, region
def predict_conf(self, instances, confidence_level=None) -> (np.ndarray, ConfidenceRegionABC):
predictions = self.classify(instances)
return self.aggregate_conf(predictions, confidence_level=confidence_level)


@ -1,11 +1,17 @@
from typing import Union, Callable
from itertools import product
from tqdm import tqdm
from typing import Union, Callable, Counter
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.utils import resample
from sklearn.preprocessing import normalize
from quapy.method.confidence import WithConfidenceABC, ConfidenceRegionABC
from quapy.functional import get_divergence
from quapy.data import LabelledCollection
from quapy.method.base import BaseQuantifier, BinaryQuantifier
import quapy.functional as F
from scipy.optimize import lsq_linear
from scipy import sparse
class MaximumLikelihoodPrevalenceEstimation(BaseQuantifier):
@ -149,53 +155,164 @@ class DMx(BaseQuantifier):
return F.argmin_prevalence(loss, n_classes, method=self.search)
# class ReadMe(BaseQuantifier):
#
# def __init__(self, bootstrap_trials=100, bootstrap_range=100, bagging_trials=100, bagging_range=25, **vectorizer_kwargs):
# raise NotImplementedError('under development ...')
# self.bootstrap_trials = bootstrap_trials
# self.bootstrap_range = bootstrap_range
# self.bagging_trials = bagging_trials
# self.bagging_range = bagging_range
# self.vectorizer_kwargs = vectorizer_kwargs
#
# def fit(self, data: LabelledCollection):
# X, y = data.Xy
# self.vectorizer = CountVectorizer(binary=True, **self.vectorizer_kwargs)
# X = self.vectorizer.fit_transform(X)
# self.class_conditional_X = {i: X[y==i] for i in range(data.classes_)}
#
# def predict(self, X):
# X = self.vectorizer.transform(X)
#
# # number of features
# num_docs, num_feats = X.shape
#
# # bootstrap
# p_boots = []
# for _ in range(self.bootstrap_trials):
# docs_idx = np.random.choice(num_docs, size=self.bootstra_range, replace=False)
# class_conditional_X = {i: X[docs_idx] for i, X in self.class_conditional_X.items()}
# Xboot = X[docs_idx]
#
# # bagging
# p_bags = []
# for _ in range(self.bagging_trials):
# feat_idx = np.random.choice(num_feats, size=self.bagging_range, replace=False)
# class_conditional_Xbag = {i: X[:, feat_idx] for i, X in class_conditional_X.items()}
# Xbag = Xboot[:,feat_idx]
# p = self.std_constrained_linear_ls(Xbag, class_conditional_Xbag)
# p_bags.append(p)
# p_boots.append(np.mean(p_bags, axis=0))
#
# p_mean = np.mean(p_boots, axis=0)
# p_std = np.std(p_bags, axis=0)
#
# return p_mean
#
#
# def std_constrained_linear_ls(self, X, class_cond_X: dict):
# pass
class ReadMe(BaseQuantifier, WithConfidenceABC):
"""
ReadMe is a non-aggregative quantification system proposed by
`Daniel Hopkins and Gary King, 2007. A method of automated nonparametric content analysis for
social science. American Journal of Political Science, 54(1):229-247.
<https://onlinelibrary.wiley.com/doi/abs/10.1111/j.1540-5907.2009.00428.x>`_.
The idea is to estimate `Q(Y=i)` directly from:
:math:`Q(X)=\\sum_{i=1}^{n} Q(X|Y=i) Q(Y=i)`
via least-squares regression, i.e., without incurring the cost of computing posterior probabilities.
However, this poses a very difficult estimation problem, in which the vector `Q(X)` and the matrix `Q(X|Y=i)`
can be of very high dimensionality. In order to render the problem tractable, ReadMe performs bagging in
the feature space. ReadMe also combines bagging with bootstrap in order to derive confidence intervals
around the point estimates.
We use the same default parameters as in the official
`R implementation <https://github.com/iqss-research/ReadMeV1/blob/master/R/prototype.R>`_.
:param prob_model: str ('naive', or 'full'), selects the modality in which the probabilities `Q(X)` and
`Q(X|Y)` are to be modelled. Options include "full", which corresponds to the original formulation of
ReadMe, in which X is constrained to be a binary matrix (e.g., of term presence/absence) and in which
`Q(X)` and `Q(X|Y)` are modelled, respectively, as matrices of `(2^K, 1)` and `(2^K, n)` values, where
`K` is the number of columns in the data matrix (i.e., `bagging_range`), and `n` is the number of classes.
Of course, this approach is computationally prohibitive for large `K`, so the computation is restricted to data
matrices with `K<=25` (although we recommend even smaller values of `K`). A much faster model is "naive", which
considers `Q(X)` and `Q(X|Y)` to be multinomial distributions under the `bag-of-words` perspective. In this
case, `bagging_range` can be set to much larger values. Default is "full" (i.e., original ReadMe behavior).
:param bootstrap_trials: int, number of bootstrap trials (default 300)
:param bagging_trials: int, number of bagging trials (default 300)
:param bagging_range: int, number of features to keep for each bagging trial (default 15)
:param confidence_level: float, a value in (0,1) reflecting the desired confidence level (default 0.95)
:param region: str in 'intervals', 'ellipse', 'ellipse-clr'; indicates the preferred method for
defining the confidence region (see :class:`WithConfidenceABC`)
:param random_state: int or None, allows replicability (default None)
:param verbose: bool, whether to display information during the process (default False)
"""
MAX_FEATURES_FOR_EMPIRICAL_ESTIMATION = 25
PROBABILISTIC_MODELS = ["naive", "full"]
def __init__(self,
prob_model="full",
bootstrap_trials=300,
bagging_trials=300,
bagging_range=15,
confidence_level=0.95,
region='intervals',
random_state=None,
verbose=False):
assert prob_model in ReadMe.PROBABILISTIC_MODELS, \
f'unknown {prob_model=}, valid ones are {ReadMe.PROBABILISTIC_MODELS=}'
self.prob_model = prob_model
self.bootstrap_trials = bootstrap_trials
self.bagging_trials = bagging_trials
self.bagging_range = bagging_range
self.confidence_level = confidence_level
self.region = region
self.random_state = random_state
self.verbose = verbose
def fit(self, X, y):
self._check_matrix(X)
self.rng = np.random.default_rng(self.random_state)
self.classes_ = np.unique(y)
Xsize = X.shape[0]
# Bootstrap loop
self.Xboots, self.yboots = [], []
for _ in range(self.bootstrap_trials):
idx = self.rng.choice(Xsize, size=Xsize, replace=True)
self.Xboots.append(X[idx])
self.yboots.append(y[idx])
return self
def predict_conf(self, X, confidence_level=0.95) -> (np.ndarray, ConfidenceRegionABC):
self._check_matrix(X)
n_features = X.shape[1]
boots_prevalences = []
for Xboots, yboots in tqdm(
zip(self.Xboots, self.yboots),
desc='bootstrap predictions', total=self.bootstrap_trials, disable=not self.verbose
):
bagging_estimates = []
for _ in range(self.bagging_trials):
feat_idx = self.rng.choice(n_features, size=self.bagging_range, replace=False)
Xboots_bagging = Xboots[:, feat_idx]
X_boots_bagging = X[:, feat_idx]
bagging_prev = self._quantify_iteration(Xboots_bagging, yboots, X_boots_bagging)
bagging_estimates.append(bagging_prev)
boots_prevalences.append(np.mean(bagging_estimates, axis=0))
conf = WithConfidenceABC.construct_region(boots_prevalences, confidence_level, method=self.region)
prev_estim = conf.point_estimate()
return prev_estim, conf
def predict(self, X):
prev_estim, _ = self.predict_conf(X)
return prev_estim
def _quantify_iteration(self, Xtr, ytr, Xte):
"""Single ReadMe estimate."""
PX_given_Y = np.asarray([self._compute_P(Xtr[ytr == c]) for i,c in enumerate(self.classes_)])
PX = self._compute_P(Xte)
res = lsq_linear(A=PX_given_Y.T, b=PX, bounds=(0, 1))
pY = np.maximum(res.x, 0)
return pY / pY.sum()
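The least-squares step above is the core of a single ReadMe estimate; a toy example (made-up distributions) showing that lsq_linear recovers the mixing prevalences:
import numpy as np
from scipy.optimize import lsq_linear
PX_given_Y = np.array([[0.6, 0.3, 0.1],    # class 0: distribution over 3 feature profiles
                       [0.2, 0.3, 0.5]])   # class 1
true_p = np.array([0.7, 0.3])
PX = true_p @ PX_given_Y                   # observed unconditional distribution
res = lsq_linear(A=PX_given_Y.T, b=PX, bounds=(0, 1))
p = np.maximum(res.x, 0)
print(p / p.sum())                         # recovers ~[0.7, 0.3]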
def _check_matrix(self, X):
"""the "full" model requires estimating empirical distributions; due to the high computational cost,
this function is only made available for binary matrices"""
if self.prob_model == 'full' and not self._is_binary_matrix(X):
raise ValueError('the empirical distribution can only be computed efficiently on binary matrices')
def _is_binary_matrix(self, X):
data = X.data if sparse.issparse(X) else X
return np.all((data == 0) | (data == 1))
def _compute_P(self, X):
if self.prob_model == 'naive':
return self._multinomial_distribution(X)
elif self.prob_model == 'full':
return self._empirical_distribution(X)
else:
raise ValueError(f'unknown {self.prob_model}; valid ones are {ReadMe.PROBABILISTIC_MODELS=}')
def _empirical_distribution(self, X):
if X.shape[1] > self.MAX_FEATURES_FOR_EMPIRICAL_ESTIMATION:
raise ValueError(f'the empirical distribution can only be computed efficiently for dimensions '
f'less or equal than {self.MAX_FEATURES_FOR_EMPIRICAL_ESTIMATION}')
# we convert every binary row (e.g., 0 0 1 0 1) into the equivalent number (e.g., 5)
K = X.shape[1]
binary_powers = 1 << np.arange(K-1, -1, -1) # (2^(K-1), ..., 8, 4, 2, 1)
X_as_binary_numbers = X @ binary_powers
# count occurrences and compute probs
counts = np.bincount(X_as_binary_numbers, minlength=2 ** K).astype(float)
probs = counts / counts.sum()
return probs
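A tiny worked example of the binary-row encoding used above (K = 3 features, illustrative data):
import numpy as np
X = np.array([[0, 0, 1],
              [0, 1, 0],
              [0, 0, 1]])
K = X.shape[1]
binary_powers = 1 << np.arange(K - 1, -1, -1)    # [4, 2, 1]
codes = X @ binary_powers                        # [1, 2, 1]
counts = np.bincount(codes, minlength=2 ** K).astype(float)
print(counts / counts.sum())                     # empirical distribution over the 2^K profiles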
def _multinomial_distribution(self, X):
PX = np.asarray(X.sum(axis=0))
PX = normalize(PX, norm='l1', axis=1)
return PX.ravel()
def _get_features_range(X):

quapy/method/stan/pq.stan (new file, 39 lines)

@ -0,0 +1,39 @@
data {
int<lower=0> n_bucket;
array[n_bucket] int<lower=0> train_pos;
array[n_bucket] int<lower=0> train_neg;
array[n_bucket] int<lower=0> test;
int<lower=0,upper=1> posterior;
}
transformed data{
row_vector<lower=0>[n_bucket] train_pos_rv;
row_vector<lower=0>[n_bucket] train_neg_rv;
row_vector<lower=0>[n_bucket] test_rv;
real n_test;
train_pos_rv = to_row_vector( train_pos );
train_neg_rv = to_row_vector( train_neg );
test_rv = to_row_vector( test );
n_test = sum( test );
}
parameters {
simplex[n_bucket] p_neg;
simplex[n_bucket] p_pos;
real<lower=0,upper=1> prev_prior;
}
model {
if( posterior ) {
target += train_neg_rv * log( p_neg );
target += train_pos_rv * log( p_pos );
target += test_rv * log( p_neg * ( 1 - prev_prior) + p_pos * prev_prior );
}
}
generated quantities {
real<lower=0,upper=1> prev;
prev = sum( binomial_rng(test, 1 / ( 1 + (p_neg./p_pos) *(1-prev_prior)/prev_prior ) ) ) / n_test;
}


@ -410,7 +410,7 @@ def group_params(param_grid: dict):
"""
classifier_params, quantifier_params = {}, {}
for key, values in param_grid.items():
if key.startswith('classifier__') or key == 'val_split':
if 'classifier__' in key or key == 'val_split':
classifier_params[key] = values
else:
quantifier_params[key] = values
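The relaxed check above also routes nested keys such as `quantifier__classifier__C` to the classifier grid, not only keys starting with `classifier__`; a quick illustration on a hypothetical grid:
param_grid = {
    'classifier__C': [1, 10],
    'quantifier__classifier__C': [1, 10],
    'bandwidth': [0.05, 0.1],
}
classifier_params = {k: v for k, v in param_grid.items() if 'classifier__' in k or k == 'val_split'}
quantifier_params = {k: v for k, v in param_grid.items() if k not in classifier_params}
print(classifier_params)   # both classifier keys
print(quantifier_params)   # {'bandwidth': [0.05, 0.1]}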


@ -80,7 +80,7 @@ def binary_diagonal(method_names, true_prevs, estim_prevs, pos_class=1, title=No
train_prev = train_prev[pos_class]
ax.scatter(train_prev, train_prev, c='c', label='tr-prev', linewidth=2, edgecolor='k', s=100, zorder=3)
ax.set(xlabel='true prevalence', ylabel='estimated prevalence', title=title)
ax.set(xlabel='true frequency', ylabel='estimated frequency', title=title)
ax.set_ylim(0, 1)
ax.set_xlim(0, 1)
@ -241,6 +241,7 @@ def error_by_drift(method_names, true_prevs, estim_prevs, tr_prevs,
title=None,
vlines=None,
method_order=None,
fontsize=18,
savepath=None):
"""
Plots the error (along the x-axis, as measured in terms of `error_name`) as a function of the train-test shift
@ -276,6 +277,8 @@ def error_by_drift(method_names, true_prevs, estim_prevs, tr_prevs,
:return: returns (fig, ax) matplotlib objects for eventual customisation
"""
plt.rcParams['font.size'] = fontsize
fig, ax = plt.subplots()
ax.grid()
@ -311,7 +314,7 @@ def error_by_drift(method_names, true_prevs, estim_prevs, tr_prevs,
for p,ind in enumerate(range(len(bins))):
selected = inds==ind
if selected.sum() > 0:
xs.append(ind*binwidth-binwidth/2)
xs.append(ind*binwidth)
ys.append(np.mean(method_drifts[selected]))
ystds.append(np.std(method_drifts[selected]))
npoints[p] += len(method_drifts[selected])
@ -320,6 +323,9 @@ def error_by_drift(method_names, true_prevs, estim_prevs, tr_prevs,
ys = np.asarray(ys)
ystds = np.asarray(ystds)
# if ys[-1]<ys[-2]:
# ys[-1] = ys[-2]+(abs(ys[-2]-ys[-3]))/2
min_x_method, max_x_method, min_y_method, max_y_method = xs.min(), xs.max(), ys.min(), ys.max()
min_x = min_x_method if min_x is None or min_x_method < min_x else min_x
max_x = max_x_method if max_x is None or max_x_method > max_x else max_x
@ -342,8 +348,8 @@ def error_by_drift(method_names, true_prevs, estim_prevs, tr_prevs,
ax2.spines['right'].set_color('g')
ax2.tick_params(axis='y', colors='g')
ax.set(xlabel=f'Prior shift between training set and test sample',
ylabel=f'{error_name.upper()} (true prev, predicted prev)',
ax.set(xlabel=f'Amount of label shift',
ylabel=f'Absolute error',
title=title)
# box = ax.get_position()
# ax.set_position([box.x0, box.y0, box.width * 0.8, box.height])
@ -357,9 +363,10 @@ def error_by_drift(method_names, true_prevs, estim_prevs, tr_prevs,
ax.set_ylim(0,10 ** math.ceil(math.log10(max_y)))
if show_legend:
fig.legend(loc='center left',
bbox_to_anchor=(1, 0.5),
ncol=1)
ax.legend(loc='center right', bbox_to_anchor=(1.31, 0.5))
# fig.legend(loc='lower center',
# bbox_to_anchor=(1, 0.5),
# ncol=(len(method_names)+1)//2)
_save_or_show(savepath)


@ -111,6 +111,12 @@ setup(
#
packages=find_packages(include=['quapy', 'quapy.*']), # Required
package_data={
# For the 'quapy.method' package, include all files
# in the 'stan' subdirectory that end with .stan
'quapy.method': ['stan/*.stan']
},
python_requires='>=3.8, <4',
install_requires=['scikit-learn', 'pandas', 'tqdm', 'matplotlib', 'joblib', 'xlrd', 'abstention', 'ucimlrepo', 'certifi'],
@ -124,7 +130,7 @@ setup(
# Similar to `install_requires` above, these must be valid existing
# projects.
extras_require={ # Optional
'bayes': ['jax', 'jaxlib', 'numpyro'],
'bayes': ['jax', 'jaxlib', 'numpyro', 'pystan'],
'neural': ['torch'],
'tests': ['certifi'],
'docs' : ['sphinx-rtd-theme', 'myst-parser'],