diff --git a/BayesianKDEy/TODO.txt b/BayesianKDEy/TODO.txt new file mode 100644 index 0000000..9aea717 --- /dev/null +++ b/BayesianKDEy/TODO.txt @@ -0,0 +1,7 @@ +- Add other methods that natively provide uncertainty quantification? +- MPIW (Mean Prediction Interval Width): is the average of the amplitudes (w/o aggregating coverage whatsoever) +- Implement Interval Score or Winkler Score +- analyze across shift +- add Bayesian EM +- optimize also C and class_weight? + diff --git a/BayesianKDEy/_bayeisan_kdey.py b/BayesianKDEy/_bayeisan_kdey.py new file mode 100644 index 0000000..e20db79 --- /dev/null +++ b/BayesianKDEy/_bayeisan_kdey.py @@ -0,0 +1,215 @@ +from sklearn.base import BaseEstimator +import numpy as np + +import quapy.util +from quapy.method._kdey import KDEBase +from quapy.method.confidence import WithConfidenceABC, ConfidenceRegionABC +from quapy.functional import CLRtransformation, ILRtransformation +from quapy.method.aggregative import AggregativeSoftQuantifier +from tqdm import tqdm +import quapy.functional as F +import emcee + + + +class BayesianKDEy(AggregativeSoftQuantifier, KDEBase, WithConfidenceABC): + """ + Bayesian version of KDEy. Instead of returning a single maximum-likelihood prevalence estimate, this method draws MCMC samples from the posterior distribution of prevalence vectors under a KDE mixture likelihood, from which confidence regions can be constructed. + + :param classifier: a scikit-learn's BaseEstimator, or None, in which case the classifier is taken to be + the one indicated in `qp.environ['DEFAULT_CLS']` + :param fit_classifier: whether to train the classifier (default True) + :param val_split: specifies the data used for generating classifier predictions. This specification + can be made as float in (0, 1) indicating the proportion of stratified held-out validation set to + be extracted from the training set; or as an integer (default 5), indicating that the predictions + are to be generated in a `k`-fold cross-validation manner (with this integer indicating the value + for `k`); or as a tuple `(X,y)` defining the specific set of data to use for validation. Set to + None when the method does not require any validation data, in order to avoid that some portion of + the training data be wasted. + :param kernel: str, the kernel used by KDE; valid ones are 'gaussian' (default), 'aitchison', and 'ilr' + :param bandwidth: float, the bandwidth of the kernel (default 0.1) + :param num_warmup: number of warmup iterations for the MCMC sampler (default 500) + :param num_samples: number of samples to draw from the posterior (default 1000) + :param mcmc_seed: random seed for the MCMC sampler (default 0) + :param confidence_level: float in [0,1] to construct a confidence region around the point estimate (default 0.95) + :param region: string, set to `intervals` for constructing confidence intervals (default), or to + `ellipse` for constructing an ellipse in the probability simplex, or to `ellipse-clr` for + constructing an ellipse in the Centered-Log Ratio (CLR) unconstrained space. + :param explore: str, the space in which random-walk proposals are generated; valid ones are 'simplex' (default), 'clr', and 'ilr' + :param step_size: float, the scale of the random-walk proposals (default 0.05) + :param temperature: float, strictly positive tempering factor applied to the log-likelihood (default 1)
+ :param verbose: bool, whether to display progress bar + """ + def __init__(self, + classifier: BaseEstimator=None, + fit_classifier=True, + val_split: int = 5, + kernel='gaussian', + bandwidth=0.1, + num_warmup: int = 500, + num_samples: int = 1_000, + mcmc_seed: int = 0, + confidence_level: float = 0.95, + region: str = 'intervals', + explore='simplex', + step_size=0.05, + temperature=1., + verbose: bool = False): + + if num_warmup <= 0: + raise ValueError(f'parameter {num_warmup=} must be a positive integer') + if num_samples <= 0: + raise ValueError(f'parameter {num_samples=} must be a positive integer') + assert explore in ['simplex', 'clr', 'ilr'], \ + f'unexpected value for param {explore=}; valid ones are "simplex", "clr", and "ilr"' + assert temperature>0., f'temperature must be >0' + + super().__init__(classifier, fit_classifier, val_split) + self.bandwidth = KDEBase._check_bandwidth(bandwidth, kernel) + self.kernel = self._check_kernel(kernel) + self.num_warmup = num_warmup + self.num_samples = num_samples + self.mcmc_seed = mcmc_seed + self.confidence_level = confidence_level + self.region = region + self.explore = explore + self.step_size = step_size + self.temperature = temperature + self.verbose = verbose + + def aggregation_fit(self, classif_predictions, labels): + self.mix_densities = self.get_mixture_components(classif_predictions, labels, self.classes_, self.bandwidth, self.kernel) + return self + + def aggregate(self, classif_predictions): + # self.prevalence_samples = self._bayesian_kde(classif_predictions, init=None, verbose=self.verbose) + self.prevalence_samples = self._bayesian_emcee(classif_predictions) + return self.prevalence_samples.mean(axis=0) + + def predict_conf(self, instances, confidence_level=None) -> (np.ndarray, ConfidenceRegionABC): + if confidence_level is None: + confidence_level = self.confidence_level + classif_predictions = self.classify(instances) + point_estimate = self.aggregate(classif_predictions) + samples = self.prevalence_samples # available after calling "aggregate" function + region = WithConfidenceABC.construct_region(samples, confidence_level=confidence_level, method=self.region) + return point_estimate, region + + def _bayesian_kde(self, X_probs, init=None, verbose=False): + """ + Bayes: + P(prev|data) = P(data|prev) P(prev) / P(data) + i.e., + posterior = likelihood * prior / evidence + we assume the likelihood be: + prev @ [kde_i_likelihood(data) 1..i..n] + prior be uniform in simplex + """ + + rng = np.random.default_rng(self.mcmc_seed) + kdes = self.mix_densities + test_densities = np.asarray([self.pdf(kde_i, X_probs, self.kernel) for kde_i in kdes]) + + def log_likelihood(prev, epsilon=1e-10): + test_likelihoods = prev @ test_densities + test_loglikelihood = np.log(test_likelihoods + epsilon) + return (1./self.temperature) * np.sum(test_loglikelihood) + + # def log_prior(prev): + # todo: adapt to arbitrary prior knowledge (e.g., something around training prevalence) + # return 1/np.sum((prev-init)**2) # it is not 1 but we assume uniform, son anyway is an useless constant + + # def log_prior(prev, alpha_scale=1000): + # alpha = np.array(init) * alpha_scale + # return dirichlet.logpdf(prev, alpha) + + def log_prior(prev): + return 0 + + def sample_neighbour(prev, step_size): + # random-walk Metropolis-Hastings + d = len(prev) + neighbour = None + if self.explore=='simplex': + dir_noise = rng.normal(scale=step_size/np.sqrt(d), size=d) + neighbour = F.normalize_prevalence(prev + dir_noise, method='mapsimplex') + elif 
self.explore=='clr': + clr = CLRtransformation() + clr_point = clr(prev) + dir_noise = rng.normal(scale=step_size, size=d) + clr_neighbour = clr_point+dir_noise + neighbour = clr.inverse(clr_neighbour) + assert in_simplex(neighbour), 'wrong CLR transformation' + elif self.explore=='ilr': + ilr = ILRtransformation() + ilr_point = ilr(prev) + dir_noise = rng.normal(scale=step_size, size=d-1) + ilr_neighbour = ilr_point + dir_noise + neighbour = ilr.inverse(ilr_neighbour) + assert in_simplex(neighbour), 'wrong ILR transformation' + return neighbour + + n_classes = X_probs.shape[1] + current_prev = F.uniform_prevalence(n_classes) if init is None else init + current_likelihood = log_likelihood(current_prev) + log_prior(current_prev) + + # Metropolis-Hastings with adaptive rate + step_size = self.step_size + target_acceptance = 0.3 + adapt_rate = 0.05 + acceptance_history = [] + + samples = [] + total_steps = self.num_samples + self.num_warmup + for i in tqdm(range(total_steps), total=total_steps, disable=not verbose): + proposed_prev = sample_neighbour(current_prev, step_size) + + # probability of acceptance + proposed_likelihood = log_likelihood(proposed_prev) + log_prior(proposed_prev) + acceptance = proposed_likelihood - current_likelihood + + # decide acceptance + accepted = np.log(rng.random()) < acceptance + if accepted: + current_prev = proposed_prev + current_likelihood = proposed_likelihood + + samples.append(current_prev) + acceptance_history.append(1. if accepted else 0.) + + # if i < self.num_warmup and i%10==0 and len(acceptance_history)>=100: + if i % 10 == 0 and len(acceptance_history) >= 100: + recent_accept_rate = np.mean(acceptance_history[-100:]) + step_size *= np.exp(adapt_rate * (recent_accept_rate - target_acceptance)) + # step_size = float(np.clip(step_size, min_step, max_step)) + if i %100==0: + print(f'acceptance-rate={recent_accept_rate*100:.3f}%, step-size={step_size:.5f}') + + # remove "warmup" initial iterations + samples = np.asarray(samples[self.num_warmup:]) + return samples + + def _bayesian_emcee(self, X_probs): + ndim = X_probs.shape[1] + nwalkers = 32 + + f = CLRtransformation() + + def log_likelihood(unconstrained, test_densities, epsilon=1e-10): + prev = f.inverse(unconstrained) + test_likelihoods = prev @ test_densities + test_loglikelihood = np.log(test_likelihoods + epsilon) + return np.sum(test_loglikelihood) + + kdes = self.mix_densities + test_densities = np.asarray([self.pdf(kde_i, X_probs, self.kernel) for kde_i in kdes]) + + # p0 = np.random.normal(nwalkers, ndim) + p0 = F.uniform_prevalence_sampling(ndim, nwalkers) + p0 = f(p0) + sampler = emcee.EnsembleSampler(nwalkers, ndim, log_likelihood, args=[test_densities]) + + state = sampler.run_mcmc(p0, self.num_warmup, skip_initial_state_check=True) + sampler.reset() + sampler.run_mcmc(state, self.num_samples, skip_initial_state_check=True) + samples = sampler.get_chain(flat=True) + samples = f.inverse(samples) + return samples + +def in_simplex(x): + return np.all(x >= 0) and np.isclose(x.sum(), 1) diff --git a/BayesianKDEy/full_experiments.py b/BayesianKDEy/full_experiments.py new file mode 100644 index 0000000..0a1dad2 --- /dev/null +++ b/BayesianKDEy/full_experiments.py @@ -0,0 +1,196 @@ +import os +import warnings +from os.path import join +from pathlib import Path + +from sklearn.calibration import CalibratedClassifierCV +from sklearn.linear_model import LogisticRegression as LR +from sklearn.model_selection import GridSearchCV, StratifiedKFold +from copy import deepcopy as cp +import quapy as qp 
+from BayesianKDEy._bayeisan_kdey import BayesianKDEy +from build.lib.quapy.data import LabelledCollection +from quapy.method.aggregative import DistributionMatchingY as DMy, AggregativeQuantifier, EMQ +from quapy.method.base import BinaryQuantifier, BaseQuantifier +from quapy.model_selection import GridSearchQ +from quapy.data import Dataset +# from BayesianKDEy.plot_simplex import plot_prev_points, plot_prev_points_matplot +from quapy.method.confidence import ConfidenceIntervals, BayesianCC, PQ, WithConfidenceABC, AggregativeBootstrap +from quapy.functional import strprev +from quapy.method.aggregative import KDEyML, ACC +from quapy.protocol import UPP +import quapy.functional as F +import numpy as np +from tqdm import tqdm +from scipy.stats import dirichlet +from collections import defaultdict +from time import time +from sklearn.base import clone, BaseEstimator + + +class KDEyCLR(KDEyML): + def __init__(self, classifier: BaseEstimator=None, fit_classifier=True, val_split=5, bandwidth=1., random_state=None): + super().__init__( + classifier=classifier, fit_classifier=fit_classifier, val_split=val_split, bandwidth=bandwidth, + random_state=random_state, kernel='aitchison' + ) + + +class KDEyILR(KDEyML): + def __init__(self, classifier: BaseEstimator=None, fit_classifier=True, val_split=5, bandwidth=1., random_state=None): + super().__init__( + classifier=classifier, fit_classifier=fit_classifier, val_split=val_split, bandwidth=bandwidth, + random_state=random_state, kernel='ilr' + ) + + +def methods(): + """ + Returns a tuple (name, quantifier, hyperparams, bayesian/bootstrap_constructor), where: + - name: is a str representing the name of the method (e.g., 'BayesianKDEy') + - quantifier: is the base model (e.g., KDEyML()) + - hyperparams: is a dictionary for the quantifier (e.g., {'bandwidth': [0.001, 0.005, 0.01, 0.05, 0.1, 0.2]}) + - bayesian/bootstrap_constructor: is a function that instantiates the bayesian o bootstrap method with the + quantifier with optimized hyperparameters + """ + acc_hyper = {} + emq_hyper = {'calib': ['nbvs', 'bcts', 'ts', 'vs']} + hdy_hyper = {'nbins': [3,4,5,8,16,32]} + kdey_hyper = {'bandwidth': [0.001, 0.005, 0.01, 0.05, 0.1, 0.2]} + kdey_hyper_clr = {'bandwidth': [0.05, 0.1, 0.5, 1., 2., 5.]} + + multiclass_method = 'multiclass' + only_binary = 'only_binary' + only_multiclass = 'only_multiclass' + + # yield 'BootstrapACC', ACC(LR()), acc_hyper, lambda hyper: AggregativeBootstrap(ACC(LR()), n_test_samples=1000, random_state=0), multiclass_method + # yield 'BayesianACC', ACC(LR()), acc_hyper, lambda hyper: BayesianCC(LR(), mcmc_seed=0), multiclass_method + + yield 'BootstrapEMQ', EMQ(LR(), on_calib_error='backup', val_split=5), emq_hyper, lambda hyper: AggregativeBootstrap(EMQ(LR(), on_calib_error='backup', calib=hyper['calib'], val_split=5), n_test_samples=1000, random_state=0), multiclass_method + + # yield 'BootstrapHDy', DMy(LR()), hdy_hyper, lambda hyper: AggregativeBootstrap(DMy(LR(), **hyper), n_test_samples=1000, random_state=0), multiclass_method + # yield 'BayesianHDy', DMy(LR()), hdy_hyper, lambda hyper: PQ(LR(), stan_seed=0, **hyper), only_binary + # + # yield 'BootstrapKDEy', KDEyML(LR()), kdey_hyper, lambda hyper: AggregativeBootstrap(KDEyML(LR(), **hyper), n_test_samples=1000, random_state=0, verbose=True), multiclass_method + # yield 'BayesianKDEy', KDEyML(LR()), kdey_hyper, lambda hyper: BayesianKDEy(mcmc_seed=0, **hyper), multiclass_method + # yield 'BayesianKDEy*', KDEyCLR(LR()), kdey_hyper_clr, lambda hyper: 
BayesianKDEy(kernel='aitchison', mcmc_seed=0, **hyper), multiclass_method + # yield 'BayKDEy*CLR', KDEyCLR(LR()), kdey_hyper_clr, lambda hyper: BayesianKDEy(kernel='aitchison', mcmc_seed=0, explore='clr', step_size=.15, **hyper), multiclass_method + # yield 'BayKDEy*CLR2', KDEyCLR(LR()), kdey_hyper_clr, lambda hyper: BayesianKDEy(kernel='aitchison', mcmc_seed=0, explore='clr', step_size=.05, **hyper), multiclass_method + # yield 'BayKDEy*ILR', KDEyCLR(LR()), kdey_hyper_clr, lambda hyper: BayesianKDEy(kernel='aitchison', mcmc_seed=0, explore='ilr', step_size=.15, **hyper), only_multiclass + # yield 'BayKDEy*ILR2', KDEyILR(LR()), kdey_hyper_clr, lambda hyper: BayesianKDEy(kernel='ilr', mcmc_seed=0, explore='ilr', step_size=.1, **hyper), only_multiclass + + +def model_selection(train: LabelledCollection, point_quantifier: AggregativeQuantifier, grid: dict): + with qp.util.temp_seed(0): + print(f'performing model selection for {point_quantifier.__class__.__name__} with grid {grid}') + # model selection + if len(grid)>0: + train, val = train.split_stratified(train_prop=0.6, random_state=0) + mod_sel = GridSearchQ( + model=point_quantifier, + param_grid=grid, + protocol=qp.protocol.UPP(val, repeats=250, random_state=0), + refit=False, + n_jobs=-1, + verbose=True + ).fit(*train.Xy) + best_params = mod_sel.best_params_ + else: + best_params = {} + + return best_params + + +def experiment(dataset: Dataset, point_quantifier: AggregativeQuantifier, method_name:str, grid: dict, withconf_constructor, hyper_choice_path: Path): + with qp.util.temp_seed(0): + + training, test = dataset.train_test + + # model selection + best_hyperparams = qp.util.pickled_resource( + hyper_choice_path, model_selection, training, cp(point_quantifier), grid + ) + + t_init = time() + withconf_quantifier = withconf_constructor(best_hyperparams).fit(*training.Xy) + tr_time = time() - t_init + + # test + train_prevalence = training.prevalence() + results = defaultdict(list) + test_generator = UPP(test, repeats=100, random_state=0) + for i, (sample_X, true_prevalence) in tqdm(enumerate(test_generator()), total=test_generator.total(), desc=f'{method_name} predictions'): + t_init = time() + point_estimate, region = withconf_quantifier.predict_conf(sample_X) + ttime = time()-t_init + results['true-prevs'].append(true_prevalence) + results['point-estim'].append(point_estimate) + results['shift'].append(qp.error.ae(true_prevalence, train_prevalence)) + results['ae'].append(qp.error.ae(prevs_true=true_prevalence, prevs_hat=point_estimate)) + results['rae'].append(qp.error.rae(prevs_true=true_prevalence, prevs_hat=point_estimate)) + results['coverage'].append(region.coverage(true_prevalence)) + results['amplitude'].append(region.montecarlo_proportion(n_trials=50_000)) + results['test-time'].append(ttime) + results['samples'].append(region.samples) + + report = { + 'optim_hyper': best_hyperparams, + 'train_time': tr_time, + 'train-prev': train_prevalence, + 'results': {k:np.asarray(v) for k,v in results.items()} + } + + return report + + +def experiment_path(dir:Path, dataset_name:str, method_name:str): + os.makedirs(dir, exist_ok=True) + return dir/f'{dataset_name}__{method_name}.pkl' + + +if __name__ == '__main__': + + binary = { + 'datasets': qp.datasets.UCI_BINARY_DATASETS, + 'fetch_fn': qp.datasets.fetch_UCIBinaryDataset, + 'sample_size': 500 + } + + multiclass = { + 'datasets': qp.datasets.UCI_MULTICLASS_DATASETS, + 'fetch_fn': qp.datasets.fetch_UCIMulticlassDataset, + 'sample_size': 1000 + } + + result_dir = Path('./results') + + 
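+    # Results and model-selection choices are cached to disk via qp.util.pickled_resource,
+    # so re-running the script skips any (dataset, method) combination already computed.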
for setup in [binary, multiclass]: # [binary, multiclass]: + qp.environ['SAMPLE_SIZE'] = setup['sample_size'] + for data_name in setup['datasets']: + print(f'dataset={data_name}') + # if data_name=='breast-cancer' or data_name.startswith("cmc") or data_name.startswith("ctg"): + # print(f'skipping dataset: {data_name}') + # continue + data = setup['fetch_fn'](data_name) + is_binary = data.n_classes==2 + result_subdir = result_dir / ('binary' if is_binary else 'multiclass') + hyper_subdir = result_dir / 'hyperparams' / ('binary' if is_binary else 'multiclass') + for method_name, method, hyper_params, withconf_constructor, method_scope in methods(): + if method_scope == 'only_binary' and not is_binary: + continue + if method_scope == 'only_multiclass' and is_binary: + continue + result_path = experiment_path(result_subdir, data_name, method_name) + hyper_path = experiment_path(hyper_subdir, data_name, method.__class__.__name__) + report = qp.util.pickled_resource( + result_path, experiment, data, method, method_name, hyper_params, withconf_constructor, hyper_path + ) + print(f'dataset={data_name}, ' + f'method={method_name}: ' + f'mae={report["results"]["ae"].mean():.3f}, ' + f'coverage={report["results"]["coverage"].mean():.5f}, ' + f'amplitude={report["results"]["amplitude"].mean():.5f}, ') + + + + diff --git a/BayesianKDEy/generate_results.py b/BayesianKDEy/generate_results.py new file mode 100644 index 0000000..04dedd9 --- /dev/null +++ b/BayesianKDEy/generate_results.py @@ -0,0 +1,170 @@ +import pickle +from collections import defaultdict + +from joblib import Parallel, delayed +from tqdm import tqdm +import pandas as pd +from glob import glob +from pathlib import Path +import quapy as qp +from error import dist_aitchison +from quapy.method.confidence import ConfidenceIntervals +from quapy.method.confidence import ConfidenceEllipseSimplex, ConfidenceEllipseCLR, ConfidenceEllipseILR, ConfidenceIntervals, ConfidenceRegionABC + +pd.set_option('display.max_columns', None) +pd.set_option('display.width', 2000) +pd.set_option('display.max_rows', None) +pd.set_option("display.expand_frame_repr", False) +pd.set_option("display.precision", 4) +pd.set_option("display.float_format", "{:.4f}".format) + + +def region_score(true_prev, region: ConfidenceRegionABC): + amp = region.montecarlo_proportion(50_000) + if true_prev in region: + cost = 0 + else: + scale_cost = 1/region.alpha + cost = scale_cost * dist_aitchison(true_prev, region.closest_point_in_region(true_prev)) + return amp + cost + + + +def compute_coverage_amplitude(region_constructor, **kwargs): + all_samples = results['samples'] + all_true_prevs = results['true-prevs'] + + def process_one(samples, true_prevs): + region = region_constructor(samples, **kwargs) + if isinstance(region, ConfidenceIntervals): + winkler = region.mean_winkler_score(true_prevs) + else: + winkler = None + return region.coverage(true_prevs), region.montecarlo_proportion(), winkler + + out = Parallel(n_jobs=3)( + delayed(process_one)(samples, true_prevs) + for samples, true_prevs in tqdm( + zip(all_samples, all_true_prevs), + total=len(all_samples), + desc='constructing ellipses' + ) + ) + + # unzip results + coverage, amplitude, winkler = zip(*out) + return list(coverage), list(amplitude), list(winkler) + + +def update_pickle(report, pickle_path, updated_dict:dict): + for k,v in updated_dict.items(): + report[k]=v + pickle.dump(report, open(pickle_path, 'wb'), protocol=pickle.HIGHEST_PROTOCOL) + + +def update_pickle_with_region(report, file, conf_name, 
conf_region_class, **kwargs): + if f'coverage-{conf_name}' not in report: + covs, amps, winkler = compute_coverage_amplitude(conf_region_class, **kwargs) + + update_fields = { + f'coverage-{conf_name}': covs, + f'amplitude-{conf_name}': amps, + f'winkler-{conf_name}': winkler + } + + update_pickle(report, file, update_fields) + +methods = None # show all methods +# methods = ['BayesianACC', 'BayesianKDEy'] + +for setup in ['multiclass']: + path = f'./results/{setup}/*.pkl' + table = defaultdict(list) + for file in tqdm(glob(path), desc='processing results', total=len(glob(path))): + file = Path(file) + dataset, method = file.name.replace('.pkl', '').split('__') + if methods is not None and method not in methods: + continue + report = pickle.load(open(file, 'rb')) + results = report['results'] + n_samples = len(results['ae']) + table['method'].extend([method.replace('Bayesian','Ba').replace('Bootstrap', 'Bo')] * n_samples) + table['dataset'].extend([dataset] * n_samples) + table['ae'].extend(results['ae']) + table['rae'].extend(results['rae']) + # table['c-CI'].extend(results['coverage']) + # table['a-CI'].extend(results['amplitude']) + + update_pickle_with_region(report, file, conf_name='CI', conf_region_class=ConfidenceIntervals, bonferroni_correction=True) + update_pickle_with_region(report, file, conf_name='CE', conf_region_class=ConfidenceEllipseSimplex) + update_pickle_with_region(report, file, conf_name='CLR', conf_region_class=ConfidenceEllipseCLR) + update_pickle_with_region(report, file, conf_name='ILR', conf_region_class=ConfidenceEllipseILR) + + table['c-CI'].extend(report['coverage-CI']) + table['a-CI'].extend(report['amplitude-CI']) + table['w-CI'].extend(report['winkler-CI']) + + table['c-CE'].extend(report['coverage-CE']) + table['a-CE'].extend(report['amplitude-CE']) + + table['c-CLR'].extend(report['coverage-CLR']) + table['a-CLR'].extend(report['amplitude-CLR']) + + table['c-ILR'].extend(report['coverage-ILR']) + table['a-ILR'].extend(report['amplitude-ILR']) + + table['aitch'].extend(qp.error.dist_aitchison(results['true-prevs'], results['point-estim'])) + # table['aitch-well'].extend(qp.error.dist_aitchison(results['true-prevs'], [ConfidenceEllipseILR(samples).mean_ for samples in results['samples']])) + # table['aitch'].extend() + table['reg-score-ILR'].extend( + [region_score(true_prev, ConfidenceEllipseILR(samples)) for true_prev, samples in zip(results['true-prevs'], results['samples'])] + ) + + + + df = pd.DataFrame(table) + + n_classes = {} + tr_size = {} + for dataset in df['dataset'].unique(): + fetch_fn = { + 'binary': qp.datasets.fetch_UCIBinaryDataset, + 'multiclass': qp.datasets.fetch_UCIMulticlassDataset + }[setup] + data = fetch_fn(dataset) + n_classes[dataset] = data.n_classes + tr_size[dataset] = len(data.training) + + # remove datasets with more than max_classes classes + # max_classes = 30 + # min_train = 1000 + # for data_name, n in n_classes.items(): + # if n > max_classes: + # df = df[df["dataset"] != data_name] + # for data_name, n in tr_size.items(): + # if n < min_train: + # df = df[df["dataset"] != data_name] + + for region in ['ILR']: # , 'CI', 'CE', 'CLR', 'ILR']: + if setup == 'binary' and region=='ILR': + continue + # pv = pd.pivot_table( + # df, index='dataset', columns='method', values=['ae', f'c-{region}', f'a-{region}'], margins=True + # ) + pv = pd.pivot_table( + df, index='dataset', columns='method', values=[ + #f'w-{region}', + # 'ae', + # 'rae', + # f'aitch', + # f'aitch-well' + 'reg-score-ILR', + ], margins=True + ) + 
pv['n_classes'] = pv.index.map(n_classes).astype('Int64') + pv['tr_size'] = pv.index.map(tr_size).astype('Int64') + pv = pv.drop(columns=[col for col in pv.columns if col[-1] == "All"]) + print(f'{setup=}') + print(pv) + print('-'*80) + diff --git a/BayesianKDEy/plot_simplex.py b/BayesianKDEy/plot_simplex.py new file mode 100644 index 0000000..d0128ba --- /dev/null +++ b/BayesianKDEy/plot_simplex.py @@ -0,0 +1,258 @@ +import os +import pickle +from pathlib import Path + +import numpy as np +import matplotlib.pyplot as plt +from matplotlib.colors import ListedColormap +from scipy.stats import gaussian_kde + +from quapy.method.confidence import (ConfidenceIntervals as CI, + ConfidenceEllipseSimplex as CE, + ConfidenceEllipseCLR as CLR, + ConfidenceEllipseILR as ILR) + + + +def get_region_colormap(name="blue", alpha=0.40): + name = name.lower() + if name == "blue": + base = (76/255, 114/255, 176/255) + elif name == "orange": + base = (221/255, 132/255, 82/255) + elif name == "violet": + base = (129/255, 114/255, 178/255) + else: + raise ValueError(f"Unknown palette name: {name}") + + cmap = ListedColormap([ + (1, 1, 1, 0), # 0: transparent white + (base[0], base[1], base[2], alpha) # 1: color + ]) + + return cmap + + +def plot_prev_points(samples=None, + show_samples=True, + true_prev=None, + point_estim=None, train_prev=None, show_mean=True, show_legend=True, + region=None, + region_resolution=1000, + confine_region_in_simplex=False, + color='blue', + save_path=None): + + plt.rcParams.update({ + 'font.size': 10, # base size for all text + 'axes.titlesize': 12, # axes title + 'axes.labelsize': 10, # axes labels + 'xtick.labelsize': 8, # tick labels + 'ytick.labelsize': 8, + 'legend.fontsize': 9, # legend + }) + + def cartesian(p): + dim = p.shape[-1] + p = p.reshape(-1,dim) + x = p[:, 1] + p[:, 2] * 0.5 + y = p[:, 2] * np.sqrt(3) / 2 + return x, y + + def barycentric_from_xy(x, y): + """ + Given cartesian coordinates (x,y) within the simplex, returns the barycentric coordinates (p1,p2,p3).
+ """ + p3 = 2 * y / np.sqrt(3) + p2 = x - 0.5 * p3 + p1 = 1 - p2 - p3 + return np.stack([p1, p2, p3], axis=-1) + + # simplex coordinates + v1 = np.array([0, 0]) + v2 = np.array([1, 0]) + v3 = np.array([0.5, np.sqrt(3)/2]) + + # Plot + fig, ax = plt.subplots(figsize=(6, 6)) + + if region is not None: + if callable(region): + region_list = [("region", region)] + else: + region_list = region # list of (name, fn) pairs + + if region is not None: + # rectangular mesh + x_min, x_max = -0.2, 1.2 + y_min, y_max = -0.2, np.sqrt(3) / 2 + 0.2 + + xs = np.linspace(x_min, x_max, region_resolution) + ys = np.linspace(y_min, y_max, region_resolution) + grid_x, grid_y = np.meshgrid(xs, ys) + + # barycentric + pts_bary = barycentric_from_xy(grid_x, grid_y) + + # mask within simplex + if confine_region_in_simplex: + in_simplex = np.all(pts_bary >= 0, axis=-1) + else: + in_simplex = np.full(shape=(region_resolution, region_resolution), fill_value=True, dtype=bool) + + # --- Colormap: 0 → transparent white, 1 → semi-transparent region color --- + + # iterate over all the regions + for (rname, rfun) in region_list: + mask = np.zeros_like(in_simplex, dtype=float) + valid_pts = pts_bary[in_simplex] + mask_vals = np.array([float(rfun(p)) for p in valid_pts]) + mask[in_simplex] = mask_vals + + ax.pcolormesh( + xs, ys, mask, + shading='auto', + cmap=get_region_colormap(color), + alpha=0.3, + ) + + if samples is not None: + if show_samples: + ax.scatter(*cartesian(samples), s=15, alpha=0.5, edgecolors='none', label='samples', color='black', linewidth=0.5) + if show_mean is not None and show_mean is not False: + if isinstance(show_mean, np.ndarray): + ax.scatter(*cartesian(show_mean), s=10, alpha=1, label='sample-mean', edgecolors='black') + elif show_mean==True and samples is not None: + ax.scatter(*cartesian(samples.mean(axis=0)), s=10, alpha=1, label='sample-mean', edgecolors='black') + else: + raise ValueError(f'show_mean should either be a boolean (if True, then samples must be provided) or ' + f'the mean point itself') + if true_prev is not None: + ax.scatter(*cartesian(true_prev), s=10, alpha=1, label='true-prev', edgecolors='black') + if point_estim is not None: + ax.scatter(*cartesian(point_estim), s=10, alpha=1, label='KDEy-estim', edgecolors='black') + if train_prev is not None: + ax.scatter(*cartesian(train_prev), s=10, alpha=1, label='train-prev', edgecolors='black') + + # edges + triangle = np.array([v1, v2, v3, v1]) + ax.plot(triangle[:, 0], triangle[:, 1], color='black') + + # vertex labels + ax.text(-0.05, -0.05, "Y=1", ha='right', va='top') + ax.text(1.05, -0.05, "Y=2", ha='left', va='top') + ax.text(0.5, np.sqrt(3)/2 + 0.05, "Y=3", ha='center', va='bottom') + + ax.set_aspect('equal') + ax.axis('off') + if show_legend: + plt.legend( + loc='center left', + bbox_to_anchor=(1.05, 0.5), + ) + plt.tight_layout() + if save_path is None: + plt.show() + else: + os.makedirs(Path(save_path).parent, exist_ok=True) + plt.savefig(save_path) + + +def plot_prev_points_matplot(points): + + # project 2D + v1 = np.array([0, 0]) + v2 = np.array([1, 0]) + v3 = np.array([0.5, np.sqrt(3) / 2]) + x = points[:, 1] + points[:, 2] * 0.5 + y = points[:, 2] * np.sqrt(3) / 2 + + # kde + xy = np.vstack([x, y]) + kde = gaussian_kde(xy, bw_method=0.25) + xmin, xmax = 0, 1 + ymin, ymax = 0, np.sqrt(3) / 2 + + # grid + xx, yy = np.mgrid[xmin:xmax:200j, ymin:ymax:200j] + positions = np.vstack([xx.ravel(), yy.ravel()]) + zz = np.reshape(kde(positions).T, xx.shape) + + # mask points in simplex + def in_triangle(x, y): + return (y >= 0) & (y <= np.sqrt(3) * np.minimum(x, 1 - x)) + + 
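+    # mask out KDE values whose grid points fall outside the simplex triangle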
mask = in_triangle(xx, yy) + zz_masked = np.ma.array(zz, mask=~mask) + + # plot + fig, ax = plt.subplots(figsize=(6, 6)) + ax.imshow( + np.rot90(zz_masked), + cmap=plt.cm.viridis, + extent=[xmin, xmax, ymin, ymax], + alpha=0.8, + ) + + # triangle edges + triangle = np.array([v1, v2, v3, v1]) + ax.plot(triangle[:, 0], triangle[:, 1], color='black', lw=2) + + # points (optional) + ax.scatter(x, y, s=5, c='white', alpha=0.3) + + # vertex labels + ax.text(-0.05, -0.05, "A (1,0,0)", ha='right', va='top') + ax.text(1.05, -0.05, "B (0,1,0)", ha='left', va='top') + ax.text(0.5, np.sqrt(3) / 2 + 0.05, "C (0,0,1)", ha='center', va='bottom') + + ax.set_aspect('equal') + ax.axis('off') + plt.show() + +if __name__ == '__main__': + np.random.seed(1) + + n = 1000 + # alpha = [3,5,10] + alpha = [10,1,1] + prevs = np.random.dirichlet(alpha, size=n) + + def regions(): + confs = [0.99, 0.95, 0.90] + yield 'CI', [(f'{int(c*100)}%', CI(prevs, confidence_level=c).coverage) for c in confs] + # yield 'CI-b', [(f'{int(c * 100)}%', CI(prevs, confidence_level=c, bonferroni_correction=True).coverage) for c in confs] + # yield 'CE', [(f'{int(c*100)}%', CE(prevs, confidence_level=c).coverage) for c in confs] + # yield 'CLR', [(f'{int(c*100)}%', CLR(prevs, confidence_level=c).coverage) for c in confs] + # yield 'ILR', [(f'{int(c*100)}%', ILR(prevs, confidence_level=c).coverage) for c in confs] + + # resolution = 1000 + # alpha_str = ','.join([f'{str(i)}' for i in alpha]) + # for crname, cr in regions(): + # plot_prev_points(prevs, show_mean=True, show_legend=False, region=cr, region_resolution=resolution, + # color='blue', + # save_path=f'./plots/simplex_{crname}_alpha{alpha_str}_res{resolution}.png', + # ) + + + resolution = 1000 + alpha_str = ','.join([f'{str(i)}' for i in alpha]) + region = ILR(prevs, confidence_level=.99) + p = np.asarray([0.1, 0.8, 0.1]) + plot_prev_points(prevs, show_samples=False, + show_mean=region.mean_, + # show_mean=prevs.mean(axis=0), + show_legend=False, region=[('', region.coverage)], region_resolution=resolution, + color='blue', + true_prev=p, + train_prev=region.closest_point_in_region(p), + save_path=f'./plots3/simplex_ilr.png', + ) diff --git a/BayesianKDEy/quick_experiment_bayesian_kdey.py b/BayesianKDEy/quick_experiment_bayesian_kdey.py new file mode 100644 index 0000000..b63665c --- /dev/null +++ b/BayesianKDEy/quick_experiment_bayesian_kdey.py @@ -0,0 +1,56 @@ +import warnings + +from sklearn.linear_model import LogisticRegression +import quapy as qp +from BayesianKDEy._bayeisan_kdey import BayesianKDEy +from BayesianKDEy.plot_simplex import plot_prev_points, plot_prev_points_matplot +from quapy.method.confidence import ConfidenceIntervals +from quapy.functional import strprev +from quapy.method.aggregative import KDEyML +from quapy.protocol import UPP +import quapy.functional as F +import numpy as np +from tqdm import tqdm +from scipy.stats import dirichlet + + + + +if __name__ == '__main__': + qp.environ["SAMPLE_SIZE"] = 500 + cls = 
LogisticRegression() + bayes_kdey = BayesianKDEy(cls, bandwidth=.3, kernel='aitchison', mcmc_seed=0) + + datasets = qp.datasets.UCI_BINARY_DATASETS + train, test = qp.datasets.fetch_UCIBinaryDataset(datasets[0]).train_test + + # train, test = qp.datasets.fetch_UCIMulticlassDataset('academic-success', standardize=True).train_test + + with qp.util.temp_seed(0): + print('fitting KDEy') + bayes_kdey.fit(*train.Xy) + + shifted = test.sampling(500, *[0.2, 0.8]) + # shifted = test.sampling(500, *test.prevalence()[::-1]) + # shifted = test.sampling(500, *F.uniform_prevalence_sampling(train.n_classes)) + prev_hat = bayes_kdey.predict(shifted.X) + mae = qp.error.mae(shifted.prevalence(), prev_hat) + print(f'true_prev={strprev(shifted.prevalence())}') + print(f'prev_hat={strprev(prev_hat)}, {mae=:.4f}') + + prev_hat, conf_interval = bayes_kdey.predict_conf(shifted.X) + + mae = qp.error.mae(shifted.prevalence(), prev_hat) + print(f'mean posterior {strprev(prev_hat)}, {mae=:.4f}') + print(f'CI={conf_interval}') + print(f'\tcontains true={conf_interval.coverage(true_value=shifted.prevalence())==1}') + print(f'\tamplitude={conf_interval.montecarlo_proportion(50_000)*100.:.3f}%') + + if train.n_classes == 3: + plot_prev_points(bayes_kdey.prevalence_samples, true_prev=shifted.prevalence(), point_estim=prev_hat, train_prev=train.prevalence()) + # plot_prev_points_matplot(samples) + + # report = qp.evaluation.evaluation_report(kdey, protocol=UPP(test), verbose=True) + # print(report.mean(numeric_only=True)) + + diff --git a/BayesianKDEy/single_experiment_debug.py b/BayesianKDEy/single_experiment_debug.py new file mode 100644 index 0000000..a33df67 --- /dev/null +++ b/BayesianKDEy/single_experiment_debug.py @@ -0,0 +1,91 @@ +import os +import warnings +from os.path import join +from pathlib import Path + +from sklearn.calibration import CalibratedClassifierCV +from sklearn.linear_model import LogisticRegression as LR +from sklearn.model_selection import GridSearchCV, StratifiedKFold +from copy import deepcopy as cp +import quapy as qp +from BayesianKDEy._bayeisan_kdey import BayesianKDEy +from BayesianKDEy.full_experiments import experiment, experiment_path, KDEyCLR +from build.lib.quapy.data import LabelledCollection +from quapy.method.aggregative import DistributionMatchingY as DMy, AggregativeQuantifier +from quapy.method.base import BinaryQuantifier, BaseQuantifier +from quapy.model_selection import GridSearchQ +from quapy.data import Dataset +# from BayesianKDEy.plot_simplex import plot_prev_points, plot_prev_points_matplot +from quapy.method.confidence import ConfidenceIntervals, BayesianCC, PQ, WithConfidenceABC, AggregativeBootstrap +from quapy.functional import strprev +from quapy.method.aggregative import KDEyML, ACC +from quapy.protocol import UPP +import quapy.functional as F +import numpy as np +from tqdm import tqdm +from scipy.stats import dirichlet +from collections import defaultdict +from time import time +from sklearn.base import clone, BaseEstimator + + +def methods(): + """ + Returns a tuple (name, quantifier, hyperparams, bayesian/bootstrap_constructor), where: + - name: is a str representing the name of the method (e.g., 'BayesianKDEy') + - quantifier: is the base model (e.g., KDEyML()) + - hyperparams: is a dictionary for the quantifier (e.g., {'bandwidth': [0.001, 0.005, 0.01, 0.05, 0.1, 0.2]}) + - bayesian/bootstrap_constructor: is a function that instantiates the bayesian o bootstrap method with the + quantifier with optimized hyperparameters + """ + acc_hyper = {} + hdy_hyper = 
{'nbins': [3,4,5,8,16,32]} + kdey_hyper = {'bandwidth': [0.001, 0.005, 0.01, 0.05, 0.1, 0.2]} + kdey_hyper_clr = {'bandwidth': [0.05, 0.1, 0.5, 1., 2., 5.]} + + wrap_hyper = lambda dic: {f'quantifier__{k}':v for k,v in dic.items()} + + # yield 'BootstrapKDEy', KDEyML(LR()), kdey_hyper, lambda hyper: AggregativeBootstrap(KDEyML(LR(), **hyper), n_test_samples=1000, random_state=0, verbose=True), + # yield 'BayesianKDEy', KDEyML(LR()), kdey_hyper, lambda hyper: BayesianKDEy(mcmc_seed=0, **hyper), + # for T in [10, 20, 50, 100., 500]: + # yield f'BaKDE-CLR-T{T}', KDEyCLR(LR()), kdey_hyper_clr, lambda hyper: BayesianKDEy(kernel='aitchison', explore='ilr', mcmc_seed=0, temperature=T, num_warmup=3000, num_samples=1000, step_size=.1, **hyper), + + yield f'BaKDE-emcee', KDEyML(LR()), kdey_hyper, lambda hyper: BayesianKDEy(mcmc_seed=0, num_warmup=100, num_samples=100, step_size=.1, **hyper), + + + +if __name__ == '__main__': + + binary = { + 'fetch_fn': qp.datasets.fetch_UCIBinaryDataset, + 'sample_size': 500 + } + + multiclass = { + 'fetch_fn': qp.datasets.fetch_UCIMulticlassDataset, + 'sample_size': 1000 + } + + setup = multiclass + # data_name = 'isolet' + # data_name = 'cmc' + data_name = 'abalone' + + qp.environ['SAMPLE_SIZE'] = setup['sample_size'] + print(f'dataset={data_name}') + data = setup['fetch_fn'](data_name) + is_binary = data.n_classes==2 + hyper_subdir = Path('./results') / 'hyperparams' / ('binary' if is_binary else 'multiclass') + for method_name, method, hyper_params, withconf_constructor in methods(): + hyper_path = experiment_path(hyper_subdir, data_name, method.__class__.__name__) + report = experiment(data, method, method_name, hyper_params, withconf_constructor, hyper_path) + + print(f'dataset={data_name}, ' + f'method={method_name}: ' + f'mae={report["results"]["ae"].mean():.3f}, ' + f'coverage={report["results"]["coverage"].mean():.5f}, ' + f'amplitude={report["results"]["amplitude"].mean():.5f}, ') + + + + diff --git a/CHANGE_LOG.txt b/CHANGE_LOG.txt index b6066b9..9761c29 100644 --- a/CHANGE_LOG.txt +++ b/CHANGE_LOG.txt @@ -1,9 +1,13 @@ Change Log 0.2.1 ----------------- +- Added squared ratio error. +- Improved efficiency of confidence regions coverage functions +- Added Precise Quantifier to WithConfidence methods (a Bayesian adaptation of HDy) - Improved documentation of confidence regions. - Added ReadMe method by Daniel Hopkins and Gary King - Internal index in LabelledCollection is now "lazy", and is only constructed if required. 
+- I have added dist_aitchison and mean_dist_aitchison as a new error evaluation metric Change Log 0.2.0 ----------------- diff --git a/KDEyAitchison/commons.py b/KDEyAitchison/commons.py new file mode 100644 index 0000000..5e3db50 --- /dev/null +++ b/KDEyAitchison/commons.py @@ -0,0 +1,56 @@ +import numpy as np +import pandas as pd + +from quapy.method.aggregative import EMQ, KDEyML, PACC +from sklearn.linear_model import LogisticRegression + +METHODS = ['PACC', + 'EMQ', + 'KDEy-ML', + 'KDEy-MLA' + ] + + +# common hyperparameterss +hyper_LR = { + 'classifier__C': np.logspace(-3, 3, 7), + 'classifier__class_weight': ['balanced', None] +} + +hyper_kde = { + 'bandwidth': np.linspace(0.001, 0.5, 100) +} + +hyper_kde_aitchison = { + 'bandwidth': np.linspace(0.01, 2, 100) +} + +# instances a new quantifier based on a string name +def new_method(method, **lr_kwargs): + lr = LogisticRegression(**lr_kwargs) + + if method == 'KDEy-ML': + param_grid = {**hyper_kde, **hyper_LR} + quantifier = KDEyML(lr, kernel='gaussian') + elif method == 'KDEy-MLA': + param_grid = {**hyper_kde_aitchison, **hyper_LR} + quantifier = KDEyML(lr, kernel='aitchison') + elif method == 'EMQ': + param_grid = hyper_LR + quantifier = EMQ(lr) + elif method == 'PACC': + param_grid = hyper_LR + quantifier = PACC(lr) + else: + raise NotImplementedError('unknown method', method) + + return param_grid, quantifier + + +def show_results(result_path): + df = pd.read_csv(result_path+'.csv', sep='\t') + + pd.set_option('display.max_columns', None) + pd.set_option('display.max_rows', None) + pv = df.pivot_table(index='Dataset', columns="Method", values=["MAE", "MRAE"]) + print(pv) diff --git a/KDEyAitchison/show_results.py b/KDEyAitchison/show_results.py new file mode 100644 index 0000000..d5e867c --- /dev/null +++ b/KDEyAitchison/show_results.py @@ -0,0 +1,38 @@ +import pickle +import os +import sys + +import pandas as pd + +import quapy as qp +from quapy.model_selection import GridSearchQ +from quapy.protocol import UPP +from commons import METHODS, new_method, show_results +from new_table import LatexTable + + +SEED = 1 + + + +if __name__ == '__main__': + print(qp.datasets.UCI_MULTICLASS_DATASETS) + for optim in ['mae', 'mrae']: + table = LatexTable() + result_dir = f'results/ucimulti/{optim}' + + for method in METHODS: + print() + global_result_path = f'{result_dir}/{method}' + print(f'Method\tDataset\tMAE\tMRAE\tKLD') + for dataset in qp.datasets.UCI_MULTICLASS_DATASETS: + # print(dataset) + local_result_path = global_result_path + '_' + dataset + if os.path.exists(local_result_path + '.dataframe'): + report = pd.read_csv(local_result_path+'.dataframe') + print(f'{method}\t{dataset}\t{report[optim].mean():.5f}') + table.add(benchmark=dataset, method=method, v=report[optim].values) + else: + print(dataset, 'not found for method', method) + table.latexPDF(f'./tables/{optim}.pdf', landscape=False) + diff --git a/KDEyAitchison/ucimulti_experiments.py b/KDEyAitchison/ucimulti_experiments.py new file mode 100644 index 0000000..0852edb --- /dev/null +++ b/KDEyAitchison/ucimulti_experiments.py @@ -0,0 +1,94 @@ +import pickle +import os +import sys + +import pandas as pd + +import quapy as qp +from quapy.model_selection import GridSearchQ +from quapy.protocol import UPP +from commons import METHODS, new_method, show_results + + +SEED = 1 + + + +if __name__ == '__main__': + + qp.environ['SAMPLE_SIZE'] = 500 + qp.environ['N_JOBS'] = -1 + n_bags_val = 250 + n_bags_test = 1000 + for optim in ['mae', 'mrae']: + result_dir = 
f'results/ucimulti/{optim}' + + os.makedirs(result_dir, exist_ok=True) + + for method in METHODS: + + print('Init method', method) + + global_result_path = f'{result_dir}/{method}' + # show_results(global_result_path) + # sys.exit(0) + + if not os.path.exists(global_result_path + '.csv'): + with open(global_result_path + '.csv', 'wt') as csv: + csv.write(f'Method\tDataset\tMAE\tMRAE\tKLD\n') + + with open(global_result_path + '.csv', 'at') as csv: + + for dataset in qp.datasets.UCI_MULTICLASS_DATASETS: + + print('init', dataset) + + local_result_path = global_result_path + '_' + dataset + if os.path.exists(local_result_path + '.dataframe'): + print(f'result file {local_result_path}.dataframe already exist; skipping') + report = pd.read_csv(local_result_path+'.dataframe') + print(report["mae"].mean()) + # data = qp.datasets.fetch_UCIMulticlassDataset(dataset) + # csv.write(f'{method}\t{data.name}\t{report["mae"].mean():.5f}\t{report["mrae"].mean():.5f}\t{report["kld"].mean():.5f}\n') + continue + + with qp.util.temp_seed(SEED): + + param_grid, quantifier = new_method(method, max_iter=3000) + + data = qp.datasets.fetch_UCIMulticlassDataset(dataset) + + # model selection + train, test = data.train_test + train, val = train.split_stratified(random_state=SEED) + + protocol = UPP(val, repeats=n_bags_val) + modsel = GridSearchQ( + quantifier, param_grid, protocol, refit=True, n_jobs=-1, verbose=True, error=optim + ) + + try: + modsel.fit(*train.Xy) + + print(f'best params {modsel.best_params_}') + print(f'best score {modsel.best_score_}') + pickle.dump( + (modsel.best_params_, modsel.best_score_,), + open(f'{local_result_path}.hyper.pkl', 'wb'), pickle.HIGHEST_PROTOCOL) + + quantifier = modsel.best_model() + except: + print('something went wrong... trying to fit the default model') + quantifier.fit(*train.Xy) + + + protocol = UPP(test, repeats=n_bags_test) + report = qp.evaluation.evaluation_report( + quantifier, protocol, error_metrics=['mae', 'mrae', 'kld'], verbose=True + ) + report.to_csv(f'{local_result_path}.dataframe') + print(f'{method}\t{data.name}\t{report["mae"].mean():.5f}\t{report["mrae"].mean():.5f}\t{report["kld"].mean():.5f}\n') + csv.write(f'{method}\t{data.name}\t{report["mae"].mean():.5f}\t{report["mrae"].mean():.5f}\t{report["kld"].mean():.5f}\n') + csv.flush() + + show_results(global_result_path) \ No newline at end of file diff --git a/quapy/error.py b/quapy/error.py index eb42cd6..2407eb0 100644 --- a/quapy/error.py +++ b/quapy/error.py @@ -3,6 +3,7 @@ import numpy as np from sklearn.metrics import f1_score import quapy as qp +from functional import CLRtransformation def from_name(err_name): @@ -128,6 +129,59 @@ def se(prevs_true, prevs_hat): return ((prevs_hat - prevs_true) ** 2).mean(axis=-1) +def sre(prevs_true, prevs_hat, prevs_train, eps=0.): + """ + The squared ratio error is defined as + :math:`SRE(p,\\hat{p},p^{tr})=\\frac{1}{|\\mathcal{Y}|}\\sum_{i \\in \\mathcal{Y}}(w_i-\\hat{w}_i)^2`, + where + :math:`w_i=\\frac{p_i}{p^{tr}_i}=\\frac{Q(Y=i)}{P(Y=i)}`, + and `\\hat{w}_i` is the estimate obtained by replacing the true test prior with an estimate, and + :math:`\\mathcal{Y}` are the classes of interest + :param prevs_true: array-like, true prevalence values + :param prevs_hat: array-like, estimated prevalence values + :param prevs_train: array-like, training prevalence values + :param eps: float, for smoothing the prevalence values. It is 0 by default (no smoothing), meaning the + training prevalence is expected to be >0 everywhere. 
+ :return: the squared ratio error + """ + prevs_true = np.asarray(prevs_true) + prevs_hat = np.asarray(prevs_hat) + prevs_train = np.asarray(prevs_train) + assert prevs_true.shape == prevs_hat.shape, f'wrong shape {prevs_true.shape=} vs {prevs_hat.shape=}' + assert prevs_true.shape[-1]==prevs_train.shape[-1], 'wrong shape for training prevalence' + if eps>0: + prevs_true = smooth(prevs_true, eps) + prevs_hat = smooth(prevs_hat, eps) + prevs_train = smooth(prevs_train, eps) + + w = prevs_true / prevs_train + w_hat = prevs_hat / prevs_train + return ((w - w_hat) ** 2).mean(axis=-1) + + +def msre(prevs_true, prevs_hat, prevs_train, eps=0.): + """ + Returns the mean across all experiments. See :func:`sre`. + :param prevs_true: array-like, true prevalence values of shape (n_experiments, n_classes,) or (n_classes,) + :param prevs_hat: array-like, estimated prevalence values of shape equal to prevs_true + :param prevs_train: array-like, training prevalence values of (n_experiments, n_classes,) or (n_classes,) + :param eps: float, for smoothing the prevalence values. It is 0 by default (no smoothing), meaning the + training prevalence is expected to be >0 everywhere. + :return: the mean squared ratio error + """ + return np.mean(sre(prevs_true, prevs_hat, prevs_train, eps)) + + +def dist_aitchison(prevs_true, prevs_hat): + """ + Computes the Aitchison distance, i.e., the Euclidean distance between the CLR-transformed prevalence vectors. + """ + clr = CLRtransformation() + return np.linalg.norm(clr(prevs_true) - clr(prevs_hat), axis=-1) + + +def mean_dist_aitchison(prevs_true, prevs_hat): + """ + Returns the mean Aitchison distance (see :func:`dist_aitchison`) across the sample pairs. + """ + return np.mean(dist_aitchison(prevs_true, prevs_hat)) + + def mkld(prevs_true, prevs_hat, eps=None): """Computes the mean Kullback-Leibler divergence (see :meth:`quapy.error.kld`) across the sample pairs. The distributions are smoothed using the `eps` factor @@ -374,8 +428,8 @@ def __check_eps(eps=None): CLASSIFICATION_ERROR = {f1e, acce} -QUANTIFICATION_ERROR = {mae, mnae, mrae, mnrae, mse, mkld, mnkld} -QUANTIFICATION_ERROR_SINGLE = {ae, nae, rae, nrae, se, kld, nkld} +QUANTIFICATION_ERROR = {mae, mnae, mrae, mnrae, mse, mkld, mnkld, msre, mean_dist_aitchison} +QUANTIFICATION_ERROR_SINGLE = {ae, nae, rae, nrae, se, kld, nkld, sre, dist_aitchison} QUANTIFICATION_ERROR_SMOOTH = {kld, nkld, rae, nrae, mkld, mnkld, mrae} CLASSIFICATION_ERROR_NAMES = {func.__name__ for func in CLASSIFICATION_ERROR} QUANTIFICATION_ERROR_NAMES = {func.__name__ for func in QUANTIFICATION_ERROR} @@ -387,6 +441,7 @@ ERROR_NAMES = \ f1_error = f1e acc_error = acce mean_absolute_error = mae +squared_ratio_error = sre absolute_error = ae mean_relative_absolute_error = mrae relative_absolute_error = rae diff --git a/quapy/functional.py b/quapy/functional.py index 408c62a..29fe137 100644 --- a/quapy/functional.py +++ b/quapy/functional.py @@ -1,10 +1,15 @@ import warnings +from abc import ABC, abstractmethod from collections import defaultdict +from functools import lru_cache from typing import Literal, Union, Callable from numpy.typing import ArrayLike import scipy import numpy as np +from scipy.special import softmax + +import quapy as qp # ------------------------------------------------------------------------------------------ @@ -583,8 +588,8 @@ def solve_adjustment( """ Function that tries to solve for :math:`p` the equation :math:`q = M p`, where :math:`q` is the vector of `unadjusted counts` (as estimated, e.g., via classify and count) with :math:`q_i` an estimate of - :math:`P(\hat{Y}=y_i)`, and where :math:`M` is the matrix of `class-conditional rates` with :math:`M_{ij}` an - estimate of :math:`P(\hat{Y}=y_i|Y=y_j)`. 
+ :math:`P(\\hat{Y}=y_i)`, and where :math:`M` is the matrix of `class-conditional rates` with :math:`M_{ij}` an + estimate of :math:`P(\\hat{Y}=y_i|Y=y_j)`. :param class_conditional_rates: array of shape `(n_classes, n_classes,)` with entry `(i,j)` being the estimate of :math:`P(\hat{Y}=y_i|Y=y_j)`, that is, the probability that an instance that belongs to class :math:`y_j` @@ -649,3 +654,96 @@ def solve_adjustment( raise ValueError(f'unknown {solver=}') +# ------------------------------------------------------------------------------------------ +# Transformations from Compositional analysis +# ------------------------------------------------------------------------------------------ + +class CompositionalTransformation(ABC): + """ + Abstract class of transformations from compositional data. + Basically, callable functions with an "inverse" function. + """ + @abstractmethod + def __call__(self, X): ... + + @abstractmethod + def inverse(self, Z): ... + + EPSILON=1e-6 + + +class CLRtransformation(CompositionalTransformation): + """ + Centered log-ratio (CLR), from compositional analysis + """ + def __call__(self, X): + """ + Applies the CLR function to X thus mapping the instances, which are contained in `\\mathcal{R}^{n}` but + actually lie on a `\\mathcal{R}^{n-1}` simplex, onto an unrestricted space in :math:`\\mathcal{R}^{n}` + + :param X: np.ndarray of (n_instances, n_dimensions) to be transformed + :param epsilon: small float for prevalence smoothing + :return: np.ndarray of (n_instances, n_dimensions), the CLR-transformed points + """ + X = np.asarray(X) + X = qp.error.smooth(X, self.EPSILON) + G = np.exp(np.mean(np.log(X), axis=-1, keepdims=True)) # geometric mean + return np.log(X / G) + + def inverse(self, Z): + """ + Inverse function. However, clr.inverse(clr(X)) does not exactly coincide with X due to smoothing. + + :param Z: np.ndarray of (n_instances, n_dimensions) to be transformed + :return: np.ndarray of (n_instances, n_dimensions), the CLR-transformed points + """ + return scipy.special.softmax(Z, axis=-1) + + +class ILRtransformation(CompositionalTransformation): + """ + Isometric log-ratio (ILR), from compositional analysis + """ + def __call__(self, X): + X = np.asarray(X) + X = qp.error.smooth(X, self.EPSILON) + k = X.shape[-1] + V = self.get_V(k) # (k-1, k) + logp = np.log(X) + return logp @ V.T + + def inverse(self, Z): + Z = np.asarray(Z) + # get dimension + k_minus_1 = Z.shape[-1] + k = k_minus_1 + 1 + V = self.get_V(k) # (k-1, k) + logp = Z @ V + p = np.exp(logp) + p = p / np.sum(p, axis=-1, keepdims=True) + return p + + @lru_cache(maxsize=None) + def get_V(self, k): + def helmert_matrix(k): + """ + Returns the (k x k) Helmert matrix. + """ + H = np.zeros((k, k)) + for i in range(1, k): + H[i, :i] = 1 + H[i, i] = -(i) + H[i] = H[i] / np.sqrt(i * (i + 1)) + # row 0 stays zeros; will be discarded + return H + + def ilr_basis(k): + """ + Constructs an orthonormal ILR basis using the Helmert submatrix. + Output shape: (k-1, k) + """ + H = helmert_matrix(k) + V = H[1:, :] # remove first row of zeros + return V + + return ilr_basis(k) diff --git a/quapy/method/_bayesian.py b/quapy/method/_bayesian.py index da65eed..23507f7 100644 --- a/quapy/method/_bayesian.py +++ b/quapy/method/_bayesian.py @@ -1,6 +1,10 @@ """ Utility functions for `Bayesian quantification `_ methods. 
""" +import contextlib +import os +import sys + import numpy as np import importlib.resources @@ -10,6 +14,8 @@ try: import numpyro import numpyro.distributions as dist import stan + import logging + import stan.common DEPENDENCIES_INSTALLED = True except ImportError: @@ -86,6 +92,18 @@ def sample_posterior( def load_stan_file(): return importlib.resources.files('quapy.method').joinpath('stan/pq.stan').read_text(encoding='utf-8') + +@contextlib.contextmanager +def _suppress_stan_logging(): + with open(os.devnull, "w") as devnull: + old_stderr = sys.stderr + sys.stderr = devnull + try: + yield + finally: + sys.stderr = old_stderr + + def pq_stan(stan_code, n_bins, pos_hist, neg_hist, test_hist, number_of_samples, num_warmup, stan_seed): """ Perform Bayesian prevalence estimation using a Stan model for probabilistic quantification. @@ -121,6 +139,8 @@ def pq_stan(stan_code, n_bins, pos_hist, neg_hist, test_hist, number_of_samples, Each element corresponds to one draw from the posterior distribution. """ + logging.getLogger("stan.common").setLevel(logging.ERROR) + stan_data = { 'n_bucket': n_bins, 'train_neg': neg_hist.tolist(), @@ -129,7 +149,8 @@ def pq_stan(stan_code, n_bins, pos_hist, neg_hist, test_hist, number_of_samples, 'posterior': 1 } - stan_model = stan.build(stan_code, data=stan_data, random_seed=stan_seed) - fit = stan_model.sample(num_chains=1, num_samples=number_of_samples,num_warmup=num_warmup) + with _suppress_stan_logging(): + stan_model = stan.build(stan_code, data=stan_data, random_seed=stan_seed) + fit = stan_model.sample(num_chains=1, num_samples=number_of_samples,num_warmup=num_warmup) return fit['prev'] diff --git a/quapy/method/_kdey.py b/quapy/method/_kdey.py index f004c1a..2f12eb8 100644 --- a/quapy/method/_kdey.py +++ b/quapy/method/_kdey.py @@ -15,9 +15,11 @@ class KDEBase: """ BANDWIDTH_METHOD = ['scott', 'silverman'] + KERNELS = ['gaussian', 'aitchison', 'ilr'] + @classmethod - def _check_bandwidth(cls, bandwidth): + def _check_bandwidth(cls, bandwidth, kernel): """ Checks that the bandwidth parameter is correct @@ -27,32 +29,56 @@ class KDEBase: assert bandwidth in KDEBase.BANDWIDTH_METHOD or isinstance(bandwidth, float), \ f'invalid bandwidth, valid ones are {KDEBase.BANDWIDTH_METHOD} or float values' if isinstance(bandwidth, float): - assert 0 < bandwidth < 1, \ - "the bandwith for KDEy should be in (0,1), since this method models the unit simplex" + assert kernel!='gaussian' or (0 < bandwidth < 1), \ + ("the bandwidth for a Gaussian kernel in KDEy should be in (0,1), " + "since this method models the unit simplex") return bandwidth - def get_kde_function(self, X, bandwidth): + @classmethod + def _check_kernel(cls, kernel): + """ + Checks that the kernel parameter is correct + + :param kernel: str + :return: the validated kernel + """ + assert kernel in KDEBase.KERNELS, f'unknown {kernel=}' + return kernel + + def get_kde_function(self, X, bandwidth, kernel): """ Wraps the KDE function from scikit-learn. :param X: data for which the density function is to be estimated :param bandwidth: the bandwidth of the kernel + :param kernel: the kernel (valid ones are in KDEBase.KERNELS) :return: a scikit-learn's KernelDensity object """ + if kernel == 'aitchison': + X = self.clr_transform(X) + elif kernel == 'ilr': + X = self.ilr_transform(X) + return KernelDensity(bandwidth=bandwidth).fit(X) - def pdf(self, kde, X): + def pdf(self, kde, X, kernel): """ Wraps the density evalution of scikit-learn's KDE. 
Scikit-learn returns log-scores (s), so this function returns :math:`e^{s}` :param kde: a previously fit KDE function :param X: the data for which the density is to be estimated + :param kernel: the kernel (valid ones are in KDEBase.KERNELS) :return: np.ndarray with the densities """ + if kernel == 'aitchison': + X = self.clr_transform(X) + elif kernel == 'ilr': + X = self.ilr_transform(X) + return np.exp(kde.score_samples(X)) - def get_mixture_components(self, X, y, classes, bandwidth): + def get_mixture_components(self, X, y, classes, bandwidth, kernel): """ Returns an array containing the mixture components, i.e., the KDE functions for each class. @@ -60,6 +86,7 @@ class KDEBase: :param y: the class labels :param n_classes: integer, the number of classes :param bandwidth: float, the bandwidth of the kernel + :param kernel: the kernel (valid ones are in KDEBase.KERNELS) :return: a list of KernelDensity objects, each fitted with the corresponding class-specific covariates """ class_cond_X = [] @@ -67,8 +94,19 @@ class KDEBase: selX = X[y==cat] if selX.size==0: selX = [F.uniform_prevalence(len(classes))] + class_cond_X.append(selX) - return [self.get_kde_function(X_cond_yi, bandwidth) for X_cond_yi in class_cond_X] + return [self.get_kde_function(X_cond_yi, bandwidth, kernel) for X_cond_yi in class_cond_X] + + def clr_transform(self, X): + if not hasattr(self, 'clr'): + self.clr = F.CLRtransformation() + return self.clr(X) + + def ilr_transform(self, X): + if not hasattr(self, 'ilr'): + self.ilr = F.ILRtransformation() + return self.ilr(X) class KDEyML(AggregativeSoftQuantifier, KDEBase): @@ -107,17 +145,19 @@ class KDEyML(AggregativeSoftQuantifier, KDEBase): are to be generated in a `k`-fold cross-validation manner (with this integer indicating the value for `k`); or as a tuple (X,y) defining the specific set of data to use for validation. 
:param bandwidth: float, the bandwidth of the Kernel + :param kernel: kernel of KDE, valid ones are in KDEBase.KERNELS :param random_state: a seed to be set before fitting any base quantifier (default None) """ - def __init__(self, classifier: BaseEstimator=None, fit_classifier=True, val_split=5, bandwidth=0.1, + def __init__(self, classifier: BaseEstimator=None, fit_classifier=True, val_split=5, bandwidth=0.1, kernel='gaussian', random_state=None): super().__init__(classifier, fit_classifier, val_split) - self.bandwidth = KDEBase._check_bandwidth(bandwidth) + self.bandwidth = KDEBase._check_bandwidth(bandwidth, kernel) + self.kernel = self._check_kernel(kernel) self.random_state=random_state def aggregation_fit(self, classif_predictions, labels): - self.mix_densities = self.get_mixture_components(classif_predictions, labels, self.classes_, self.bandwidth) + self.mix_densities = self.get_mixture_components(classif_predictions, labels, self.classes_, self.bandwidth, self.kernel) return self def aggregate(self, posteriors: np.ndarray): @@ -131,7 +171,7 @@ class KDEyML(AggregativeSoftQuantifier, KDEBase): with qp.util.temp_seed(self.random_state): epsilon = 1e-10 n_classes = len(self.mix_densities) - test_densities = [self.pdf(kde_i, posteriors) for kde_i in self.mix_densities] + test_densities = [self.pdf(kde_i, posteriors, self.kernel) for kde_i in self.mix_densities] def neg_loglikelihood(prev): # test_mixture_likelihood = sum(prev_i * dens_i for prev_i, dens_i in zip (prev, test_densities)) @@ -192,7 +232,7 @@ class KDEyHD(AggregativeSoftQuantifier, KDEBase): super().__init__(classifier, fit_classifier, val_split) self.divergence = divergence - self.bandwidth = KDEBase._check_bandwidth(bandwidth) + self.bandwidth = KDEBase._check_bandwidth(bandwidth, kernel='gaussian') self.random_state=random_state self.montecarlo_trials = montecarlo_trials @@ -279,7 +319,7 @@ class KDEyCS(AggregativeSoftQuantifier): def __init__(self, classifier: BaseEstimator=None, fit_classifier=True, val_split=5, bandwidth=0.1): super().__init__(classifier, fit_classifier, val_split) - self.bandwidth = KDEBase._check_bandwidth(bandwidth) + self.bandwidth = KDEBase._check_bandwidth(bandwidth, kernel='gaussian') def gram_matrix_mix_sum(self, X, Y=None): # this adapts the output of the rbf_kernel function (pairwise evaluations of Gaussian kernels k(x,y)) diff --git a/quapy/method/aggregative.py b/quapy/method/aggregative.py index 25fc1ef..d565140 100644 --- a/quapy/method/aggregative.py +++ b/quapy/method/aggregative.py @@ -673,7 +673,7 @@ class PACC(AggregativeSoftQuantifier): class EMQ(AggregativeSoftQuantifier): """ `Expectation Maximization for Quantification `_ (EMQ), - aka `Saerens-Latinne-Decaestecker` (SLD) algorithm. + aka `Saerens-Latinne-Decaestecker` (SLD) algorithm, or `Maximum Likelihood Label Shift` (MLLS). EMQ consists of using the well-known `Expectation Maximization algorithm` to iteratively update the posterior probabilities generated by a probabilistic classifier and the class prevalence estimates obtained via maximum-likelihood estimation, in a mutually recursive way, until convergence.
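A minimal usage sketch of the new `kernel` parameter ('gaussian', 'aitchison', 'ilr') threaded through KDEBase and KDEyML in the hunks above. This is illustrative only: it assumes the import path `quapy.method._kdey` (the file modified above) and the fit(X, y) / predict(X) entry points suggested by the surrounding code; the toy data is hypothetical.

import numpy as np
from sklearn.linear_model import LogisticRegression
from quapy.method._kdey import KDEyML  # module modified in this patch

# hypothetical toy covariates and labels for three classes
X = np.random.rand(1000, 5)
y = np.random.randint(0, 3, size=1000)

# 'aitchison' fits the KDE on CLR-transformed posteriors, 'ilr' on ILR-transformed ones;
# 'gaussian' keeps the previous behaviour of modelling the raw simplex
quantifier = KDEyML(classifier=LogisticRegression(), bandwidth=0.1, kernel='aitchison')
quantifier.fit(X, y)
print(quantifier.predict(X))  # estimated class prevalence vector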
diff --git a/quapy/method/confidence.py b/quapy/method/confidence.py index fd8c84d..914b814 100644 --- a/quapy/method/confidence.py +++ b/quapy/method/confidence.py @@ -1,18 +1,21 @@ import numpy as np +from joblib import Parallel, delayed from sklearn.base import BaseEstimator from sklearn.metrics import confusion_matrix import quapy as qp import quapy.functional as F +from quapy.functional import CompositionalTransformation, CLRtransformation, ILRtransformation from quapy.method import _bayesian from quapy.data import LabelledCollection from quapy.method.aggregative import AggregativeQuantifier, AggregativeCrispQuantifier, AggregativeSoftQuantifier, BinaryAggregativeQuantifier from scipy.stats import chi2 from sklearn.utils import resample from abc import ABC, abstractmethod -from scipy.special import softmax, factorial +from scipy.special import factorial import copy from functools import lru_cache +from tqdm import tqdm """ This module provides implementation of different types of confidence regions, and the implementation of Bootstrap @@ -79,6 +82,58 @@ class ConfidenceRegionABC(ABC): proportion = np.clip(self.coverage(uniform_simplex), 0., 1.) return proportion + @property + @abstractmethod + def samples(self): + """ Returns internal samples """ + ... + + def __contains__(self, p): + """ + Overloads the `in` operator; checks whether `p` is contained in the region + + :param p: array-like + :return: boolean + """ + p = np.asarray(p) + assert p.ndim==1, f'unexpected shape for point parameter' + return self.coverage(p)==1. + + def closest_point_in_region(self, p, tol=1e-6, max_iter=30): + """ + Finds the closest point to p that belongs to the region. Assumes the region is convex. + + :param p: array-like, the point + :param tol: float, error tolerance + :param max_iter: int, max number of iterations + :returns: array-like, the closest point to p in the segment between p and the center of the region, that + belongs to the region + """ + p = np.asarray(p, dtype=float) + + # if p in region, returns p itself + if p in self: + return p.copy() + + # center of the region + c = self.point_estimate() + + # binary search in [0,1], interpolation parameter + # low=closest to p, high=closest to c + low, high = 0.0, 1.0 + for _ in range(max_iter): + mid = 0.5 * (low + high) + x = p*(1-mid) + c*mid + if x in self: + high = mid + else: + low = mid + if high - low < tol: + break + + in_boundary = p*(1-high) + c*high + return in_boundary + class WithConfidenceABC(ABC): """ @@ -112,7 +167,7 @@ class WithConfidenceABC(ABC): return self.predict_conf(instances=instances, confidence_level=confidence_level) @classmethod - def construct_region(cls, prev_estims, confidence_level=0.95, method='intervals'): + def construct_region(cls, prev_estims, confidence_level=0.95, method='intervals')->ConfidenceRegionABC: """ Construct a confidence region given many prevalence estimations. @@ -147,7 +202,7 @@ def simplex_volume(n): return 1 / factorial(n) -def within_ellipse_prop(values, mean, prec_matrix, chi2_critical): +def within_ellipse_prop__(values, mean, prec_matrix, chi2_critical): """ Checks the proportion of values that belong to the ellipse with center `mean` and precision matrix `prec_matrix` at a distance `chi2_critical`. @@ -180,111 +235,125 @@ def within_ellipse_prop(values, mean, prec_matrix, chi2_critical): return within_elipse * 1.0 -class ConfidenceEllipseSimplex(ConfidenceRegionABC): """ - Instantiates a Confidence Ellipse in the probability simplex.
+ Checks the proportion of values that belong to the ellipse with center `mean` and precision matrix `prec_matrix` + at a distance `chi2_critical`. - :param X: np.ndarray of shape (n_bootstrap_samples, n_classes) - :param confidence_level: float, the confidence level (default 0.95) + :param values: a np.ndarray of shape (n_dim,) or (n_values, n_dim,) + :param mean: a np.ndarray of shape (n_dim,) with the center of the ellipse + :param prec_matrix: a np.ndarray with the precision matrix (inverse of the + covariance matrix) of the ellipse. If this inverse cannot be computed + then None must be passed + :param chi2_critical: float, the chi2 critical value + + :return: float in [0,1], the fraction of values that are contained in the ellipse + defined by the mean (center), the precision matrix (shape), and the chi2_critical value (distance). + If `values` is only one value, then either 0. (not contained) or 1. (contained) is returned. + """ + if prec_matrix is None: + return 0. + + values = np.atleast_2d(values) + diff = values - mean + d_M_squared = np.sum(diff @ prec_matrix * diff, axis=-1) + within_ellipse = d_M_squared <= chi2_critical + + if len(within_ellipse) == 1: + return float(within_ellipse[0]) + else: + return float(np.mean(within_ellipse)) + + +def closest_point_on_ellipsoid(p, mean, cov, chi2_critical, tol=1e-9, max_iter=100): + """ + Computes the closest point on the ellipsoid defined by: + (x - mean)^T cov^{-1} (x - mean) = chi2_critical """ - def __init__(self, X, confidence_level=0.95): + p = np.asarray(p) + mean = np.asarray(mean) + Sigma = np.asarray(cov) - assert 0. < confidence_level < 1., f'{confidence_level=} must be in range(0,1)' + # Precompute precision matrix + P = np.linalg.pinv(Sigma) + d = P.shape[0] - X = np.asarray(X) + # Define v = p - mean + v = p - mean - self.mean_ = X.mean(axis=0) - self.cov_ = np.cov(X, rowvar=False, ddof=1) + # If p is inside the ellipsoid, return p itself + M_dist = v @ P @ v + if M_dist <= chi2_critical: + return p.copy() - try: - self.precision_matrix_ = np.linalg.inv(self.cov_) - except: - self.precision_matrix_ = None + # Function to compute x(lambda) + def x_lambda(lmbda): + A = np.eye(d) + lmbda * P + return mean + np.linalg.solve(A, v) - self.dim = X.shape[-1] - self.ddof = self.dim - 1 + # Function whose root we want: f(lambda) = Mahalanobis distance - chi2 + def f(lmbda): + x = x_lambda(lmbda) + diff = x - mean + return diff @ P @ diff - chi2_critical - # critical chi-square value - self.confidence_level = confidence_level - self.chi2_critical_ = chi2.ppf(confidence_level, df=self.ddof) + # Bisection search over lambda >= 0 + l_low, l_high = 0.0, 1.0 - def point_estimate(self): - """ - Returns the point estimate, the center of the ellipse. + # Increase high until f(high) < 0 + while f(l_high) > 0: + l_high *= 2 + if l_high > 1e12: + raise RuntimeError("Failed to bracket the root.") - :return: np.ndarray of shape (n_classes,) - """ - return self.mean_ + # Bisection + for _ in range(max_iter): + l_mid = 0.5 * (l_low + l_high) + fm = f(l_mid) + if abs(fm) < tol: + break + if fm > 0: + l_low = l_mid + else: + l_high = l_mid - def coverage(self, true_value): - """ - Checks whether a value, or a sets of values, are contained in the confidence region. The method computes the - fraction of these that are contained in the region, if more than one value is passed. If only one value is - passed, then it either returns 1.0 or 0.0, for indicating the value is in the region or not, respectively. 
- - :param true_value: a np.ndarray of shape (n_classes,) or shape (n_values, n_classes,) - :return: float in [0,1] - """ - return within_ellipse_prop(true_value, self.mean_, self.precision_matrix_, self.chi2_critical_) - - -class ConfidenceEllipseCLR(ConfidenceRegionABC): - """ - Instantiates a Confidence Ellipse in the Centered-Log Ratio (CLR) space. - - :param X: np.ndarray of shape (n_bootstrap_samples, n_classes) - :param confidence_level: float, the confidence level (default 0.95) - """ - - def __init__(self, X, confidence_level=0.95): - X = np.asarray(X) - self.clr = CLRtransformation() - Z = self.clr(X) - self.mean_ = np.mean(X, axis=0) - self.conf_region_clr = ConfidenceEllipseSimplex(Z, confidence_level=confidence_level) - - def point_estimate(self): - """ - Returns the point estimate, the center of the ellipse. - - :return: np.ndarray of shape (n_classes,) - """ - # The inverse of the CLR does not coincide with the true mean, because the geometric mean - # requires smoothing the prevalence vectors and this affects the softmax (inverse); - # return self.clr.inverse(self.mean_) # <- does not coincide - return self.mean_ - - def coverage(self, true_value): - """ - Checks whether a value, or a sets of values, are contained in the confidence region. The method computes the - fraction of these that are contained in the region, if more than one value is passed. If only one value is - passed, then it either returns 1.0 or 0.0, for indicating the value is in the region or not, respectively. - - :param true_value: a np.ndarray of shape (n_classes,) or shape (n_values, n_classes,) - :return: float in [0,1] - """ - transformed_values = self.clr(true_value) - return self.conf_region_clr.coverage(transformed_values) + l_opt = l_mid + return x_lambda(l_opt) class ConfidenceIntervals(ConfidenceRegionABC): """ Instantiates a region based on (independent) Confidence Intervals. - :param X: np.ndarray of shape (n_bootstrap_samples, n_classes) + :param samples: np.ndarray of shape (n_bootstrap_samples, n_classes) :param confidence_level: float, the confidence level (default 0.95) + :param bonferroni_correction: bool (default False), if True, a Bonferroni correction + is applied to the significance level (`alpha`) before computing confidence intervals. + The correction consists of replacing `alpha` with `alpha/n_classes`. When + `n_classes=2` the correction is not applied because there is only one verification test + since the other class is constrained. This is not necessarily true for n_classes>2. 
""" - def __init__(self, X, confidence_level=0.95): + def __init__(self, samples, confidence_level=0.95, bonferroni_correction=False): assert 0 < confidence_level < 1, f'{confidence_level=} must be in range(0,1)' - X = np.asarray(X) + samples = np.asarray(samples) - self.means_ = X.mean(axis=0) + self.means_ = samples.mean(axis=0) alpha = 1-confidence_level + if bonferroni_correction: + n_classes = samples.shape[-1] + if n_classes>2: + alpha = alpha/n_classes low_perc = (alpha/2.)*100 high_perc = (1-alpha/2.)*100 - self.I_low, self.I_high = np.percentile(X, q=[low_perc, high_perc], axis=0) + self.I_low, self.I_high = np.percentile(samples, q=[low_perc, high_perc], axis=0) + self._samples = samples + self.alpha = alpha + + @property + def samples(self): + return self._samples def point_estimate(self): """ @@ -312,33 +381,171 @@ class ConfidenceIntervals(ConfidenceRegionABC): def __repr__(self): return '['+', '.join(f'({low:.4f}, {high:.4f})' for (low,high) in zip(self.I_low, self.I_high))+']' + @property + def n_dim(self): + return len(self.I_low) -class CLRtransformation: + def winkler_scores(self, true_prev): + true_prev = np.asarray(true_prev) + assert true_prev.ndim == 1, 'unexpected dimensionality for true_prev' + assert len(true_prev)==self.n_dim, \ + f'unexpected number of dimensions; found {true_prev.ndim}, expected {self.n_dim}' + + def winkler_score(low, high, true_val, alpha): + amp = high-low + scale_cost = 1./alpha + cost = np.max([0, low-true_val], axis=0) + np.max([0, true_val-high], axis=0) + return amp + scale_cost*cost + + return np.asarray( + [winkler_score(low_i, high_i, true_v, self.alpha) + for (low_i, high_i, true_v) in zip(self.I_low, self.I_high, true_prev)] + ) + + def mean_winkler_score(self, true_prev): + return np.mean(self.winkler_scores(true_prev)) + + +class ConfidenceEllipseSimplex(ConfidenceRegionABC): """ - Centered log-ratio, from component analysis + Instantiates a Confidence Ellipse in the probability simplex. + + :param samples: np.ndarray of shape (n_bootstrap_samples, n_classes) + :param confidence_level: float, the confidence level (default 0.95) """ - def __call__(self, X, epsilon=1e-6): - """ - Applies the CLR function to X thus mapping the instances, which are contained in `\\mathcal{R}^{n}` but - actually lie on a `\\mathcal{R}^{n-1}` simplex, onto an unrestricted space in :math:`\\mathcal{R}^{n}` - :param X: np.ndarray of (n_instances, n_dimensions) to be transformed - :param epsilon: small float for prevalence smoothing - :return: np.ndarray of (n_instances, n_dimensions), the CLR-transformed points - """ - X = np.asarray(X) - X = qp.error.smooth(X, epsilon) - G = np.exp(np.mean(np.log(X), axis=-1, keepdims=True)) # geometric mean - return np.log(X / G) + def __init__(self, samples, confidence_level=0.95): - def inverse(self, X): - """ - Inverse function. However, clr.inverse(clr(X)) does not exactly coincide with X due to smoothing. + assert 0. 
< confidence_level < 1., f'{confidence_level=} must be in range(0,1)' - :param X: np.ndarray of (n_instances, n_dimensions) to be transformed - :return: np.ndarray of (n_instances, n_dimensions), the CLR-transformed points + samples = np.asarray(samples) + + self.mean_ = samples.mean(axis=0) + self.cov_ = np.cov(samples, rowvar=False, ddof=1) + + try: + self.precision_matrix_ = np.linalg.pinv(self.cov_) + except np.linalg.LinAlgError: + self.precision_matrix_ = None + + self.dim = samples.shape[-1] + self.ddof = self.dim - 1 + + # critical chi-square value + self.confidence_level = confidence_level + self.chi2_critical_ = chi2.ppf(confidence_level, df=self.ddof) + self._samples = samples + self.alpha = 1.-confidence_level + + @property + def samples(self): + return self._samples + + def point_estimate(self): """ - return softmax(X, axis=-1) + Returns the point estimate, the center of the ellipse. + + :return: np.ndarray of shape (n_classes,) + """ + return self.mean_ + + def coverage(self, true_value): + """ + Checks whether a value, or a set of values, is contained in the confidence region. The method computes the + fraction of these that are contained in the region, if more than one value is passed. If only one value is + passed, then it either returns 1.0 or 0.0, for indicating the value is in the region or not, respectively. + + :param true_value: a np.ndarray of shape (n_classes,) or shape (n_values, n_classes,) + :return: float in [0,1] + """ + return within_ellipse_prop(true_value, self.mean_, self.precision_matrix_, self.chi2_critical_) + + def closest_point_in_region(self, p, tol=1e-6, max_iter=30): + return closest_point_on_ellipsoid( + p, + mean=self.mean_, + cov=self.cov_, + chi2_critical=self.chi2_critical_ + ) + + +class ConfidenceEllipseTransformed(ConfidenceRegionABC): + """ + Instantiates a Confidence Ellipse in a transformed space. + + :param samples: np.ndarray of shape (n_bootstrap_samples, n_classes) + :param confidence_level: float, the confidence level (default 0.95) + """ + + def __init__(self, samples, transformation: CompositionalTransformation, confidence_level=0.95): + samples = np.asarray(samples) + self.transformation = transformation + Z = self.transformation(samples) + # self.mean_ = np.mean(samples, axis=0) + self.mean_ = self.transformation.inverse(np.mean(Z, axis=0)) + self.conf_region_z = ConfidenceEllipseSimplex(Z, confidence_level=confidence_level) + self._samples = samples + self.alpha = 1.-confidence_level + + @property + def samples(self): + return self._samples + + def point_estimate(self): + """ + Returns the point estimate, the center of the ellipse. + + :return: np.ndarray of shape (n_classes,) + """ + # The inverse of the CLR does not coincide with the true mean, because the geometric mean + # requires smoothing the prevalence vectors and this affects the softmax (inverse); + # return self.clr.inverse(self.mean_) # <- does not coincide + return self.mean_ + + def coverage(self, true_value): + """ + Checks whether a value, or a set of values, is contained in the confidence region. The method computes the + fraction of these that are contained in the region, if more than one value is passed. If only one value is + passed, then it either returns 1.0 or 0.0, for indicating the value is in the region or not, respectively.
+ + :param true_value: a np.ndarray of shape (n_classes,) or shape (n_values, n_classes,) + :return: float in [0,1] + """ + transformed_values = self.transformation(true_value) + return self.conf_region_z.coverage(transformed_values) + + def closest_point_in_region(self, p, tol=1e-6, max_iter=30): + p_prime = self.transformation(p) + b_prime = self.conf_region_z.closest_point_in_region(p_prime, tol=tol, max_iter=max_iter) + b = self.transformation.inverse(b_prime) + return b + + +class ConfidenceEllipseCLR(ConfidenceEllipseTransformed): + """ + Instantiates a Confidence Ellipse in the Centered-Log Ratio (CLR) space. + + :param samples: np.ndarray of shape (n_bootstrap_samples, n_classes) + :param confidence_level: float, the confidence level (default 0.95) + """ + def __init__(self, samples, confidence_level=0.95): + super().__init__(samples, CLRtransformation(), confidence_level=confidence_level) + + +class ConfidenceEllipseILR(ConfidenceEllipseTransformed): + """ + Instantiates a Confidence Ellipse in the Isometric-Log Ratio (ILR) space. + + :param samples: np.ndarray of shape (n_bootstrap_samples, n_classes) + :param confidence_level: float, the confidence level (default 0.95) + """ + def __init__(self, samples, confidence_level=0.95): + super().__init__(samples, ILRtransformation(), confidence_level=confidence_level) + + + + + class AggregativeBootstrap(WithConfidenceABC, AggregativeQuantifier): @@ -378,7 +585,8 @@ class AggregativeBootstrap(WithConfidenceABC, AggregativeQuantifier): n_test_samples=500, confidence_level=0.95, region='intervals', - random_state=None): + random_state=None, + verbose=False): assert isinstance(quantifier, AggregativeQuantifier), \ f'base quantifier does not seem to be an instance of {AggregativeQuantifier.__name__}' @@ -395,6 +603,7 @@ class AggregativeBootstrap(WithConfidenceABC, AggregativeQuantifier): self.confidence_level = confidence_level self.region = region self.random_state = random_state + self.verbose = verbose def aggregation_fit(self, classif_predictions, labels): data = LabelledCollection(classif_predictions, labels, classes=self.classes_) @@ -420,6 +629,24 @@ class AggregativeBootstrap(WithConfidenceABC, AggregativeQuantifier): prev_mean, self.confidence = self.aggregate_conf(classif_predictions) return prev_mean + def aggregate_conf_sequential__(self, classif_predictions: np.ndarray, confidence_level=None): + if confidence_level is None: + confidence_level = self.confidence_level + + n_samples = classif_predictions.shape[0] + prevs = [] + with qp.util.temp_seed(self.random_state): + for quantifier in self.quantifiers: + for i in tqdm(range(self.n_test_samples), desc='resampling', total=self.n_test_samples, disable=not self.verbose): + sample_i = resample(classif_predictions, n_samples=n_samples) + prev_i = quantifier.aggregate(sample_i) + prevs.append(prev_i) + + conf = WithConfidenceABC.construct_region(prevs, confidence_level, method=self.region) + prev_estim = conf.point_estimate() + + return prev_estim, conf + def aggregate_conf(self, classif_predictions: np.ndarray, confidence_level=None): if confidence_level is None: confidence_level = self.confidence_level @@ -428,10 +655,15 @@ class AggregativeBootstrap(WithConfidenceABC, AggregativeQuantifier): prevs = [] with qp.util.temp_seed(self.random_state): for quantifier in self.quantifiers: - for i in range(self.n_test_samples): - sample_i = resample(classif_predictions, n_samples=n_samples) - prev_i = quantifier.aggregate(sample_i) - prevs.append(prev_i) + results = Parallel(n_jobs=-1)( + 
delayed(bootstrap_once)(i, classif_predictions, quantifier, n_samples) + for i in range(self.n_test_samples) + ) + prevs.extend(results) + # for i in tqdm(range(self.n_test_samples), desc='resampling', total=self.n_test_samples, disable=not self.verbose): + # sample_i = resample(classif_predictions, n_samples=n_samples) + # prev_i = quantifier.aggregate(sample_i) + # prevs.append(prev_i) conf = WithConfidenceABC.construct_region(prevs, confidence_level, method=self.region) prev_estim = conf.point_estimate() @@ -456,6 +688,13 @@ class AggregativeBootstrap(WithConfidenceABC, AggregativeQuantifier): return self.quantifier._classifier_method() +def bootstrap_once(i, classif_predictions, quantifier, n_samples): + idx = np.random.randint(0, len(classif_predictions), n_samples) + sample = classif_predictions[idx] + prev = quantifier.aggregate(sample) + return prev + + class BayesianCC(AggregativeCrispQuantifier, WithConfidenceABC): """ `Bayesian quantification `_ method (by Albert Ziegler and Paweł Czyż), @@ -603,7 +842,7 @@ class PQ(AggregativeSoftQuantifier, BinaryAggregativeQuantifier): classifier: BaseEstimator=None, fit_classifier=True, val_split: int = 5, - n_bins: int = 4, + nbins: int = 4, fixed_bins: bool = False, num_warmup: int = 500, num_samples: int = 1_000, @@ -621,7 +860,8 @@ class PQ(AggregativeSoftQuantifier, BinaryAggregativeQuantifier): "Run `$ pip install quapy[bayes]` to install them.") super().__init__(classifier, fit_classifier, val_split) - self.n_bins = n_bins + + self.nbins = nbins self.fixed_bins = fixed_bins self.num_warmup = num_warmup self.num_samples = num_samples @@ -636,10 +876,10 @@ class PQ(AggregativeSoftQuantifier, BinaryAggregativeQuantifier): # Compute bin limits if self.fixed_bins: # Uniform bins in [0,1] - self.bin_limits = np.linspace(0, 1, self.n_bins + 1) + self.bin_limits = np.linspace(0, 1, self.nbins + 1) else: # Quantile bins - self.bin_limits = np.quantile(y_pred, np.linspace(0, 1, self.n_bins + 1)) + self.bin_limits = np.quantile(y_pred, np.linspace(0, 1, self.nbins + 1)) # Assign each prediction to a bin bin_indices = np.digitize(y_pred, self.bin_limits[1:-1], right=True) @@ -649,14 +889,14 @@ class PQ(AggregativeSoftQuantifier, BinaryAggregativeQuantifier): neg_mask = ~pos_mask # Count positives and negatives per bin - self.pos_hist = np.bincount(bin_indices[pos_mask], minlength=self.n_bins) - self.neg_hist = np.bincount(bin_indices[neg_mask], minlength=self.n_bins) + self.pos_hist = np.bincount(bin_indices[pos_mask], minlength=self.nbins) + self.neg_hist = np.bincount(bin_indices[neg_mask], minlength=self.nbins) def aggregate(self, classif_predictions): Px_test = classif_predictions[:, self.pos_label] test_hist, _ = np.histogram(Px_test, bins=self.bin_limits) prevs = _bayesian.pq_stan( - self.stan_code, self.n_bins, self.pos_hist, self.neg_hist, test_hist, + self.stan_code, self.nbins, self.pos_hist, self.neg_hist, test_hist, self.num_samples, self.num_warmup, self.stan_seed ).flatten() self.prev_distribution = np.vstack([1-prevs, prevs]).T diff --git a/quapy/model_selection.py b/quapy/model_selection.py index 0937fa8..c357082 100644 --- a/quapy/model_selection.py +++ b/quapy/model_selection.py @@ -410,7 +410,7 @@ def group_params(param_grid: dict): """ classifier_params, quantifier_params = {}, {} for key, values in param_grid.items(): - if key.startswith('classifier__') or key == 'val_split': + if 'classifier__' in key or key == 'val_split': classifier_params[key] = values else: quantifier_params[key] = values
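A minimal end-to-end sketch of the confidence-region API extended in this patch: independent intervals with the optional Bonferroni correction, the Winkler (interval) score, membership testing via the new __contains__, and the ILR ellipse. Only the classes and methods shown in the hunks above are assumed to exist; the Dirichlet draws are a hypothetical stand-in for bootstrap prevalence estimates.

import numpy as np
from quapy.method.confidence import ConfidenceIntervals, ConfidenceEllipseILR  # module modified in this patch

rng = np.random.default_rng(0)
samples = rng.dirichlet([5, 3, 2], size=500)   # stand-in for bootstrap prevalence estimates (3 classes)

ci = ConfidenceIntervals(samples, confidence_level=0.95, bonferroni_correction=True)
print(ci)                                      # per-class (low, high) intervals
print(ci.point_estimate())                     # mean of the samples
print([0.5, 0.3, 0.2] in ci)                   # membership test via the new __contains__
print(ci.mean_winkler_score([0.5, 0.3, 0.2]))  # average interval (Winkler) score against a true prevalence

ell = ConfidenceEllipseILR(samples, confidence_level=0.95)
p = np.array([0.8, 0.1, 0.1])
print(ell.coverage(p))                         # 1.0 if p falls inside the ellipse, else 0.0
print(ell.closest_point_in_region(p))          # projection of p onto the region, computed in ILR space

Note that, per the docstring added above, the Bonferroni correction is skipped when n_classes=2, since the binary case involves a single effective interval.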