From dcfd61ae5b777a7d42dcf69240afd61c51e17239 Mon Sep 17 00:00:00 2001
From: Alejandro Moreo
Date: Wed, 22 Nov 2023 12:27:15 +0100
Subject: [PATCH] launching new KDEy-HD with importance sampling

---
 distribution_matching/commons.py     |   9 +-
 distribution_matching/method_kdey.py | 144 ++++++++++++---------------
 quapy/functional.py                  |  20 ++++
 3 files changed, 93 insertions(+), 80 deletions(-)

diff --git a/distribution_matching/commons.py b/distribution_matching/commons.py
index bc9c833..75475e2 100644
--- a/distribution_matching/commons.py
+++ b/distribution_matching/commons.py
@@ -8,7 +8,7 @@ from distribution_matching.method_dirichlety import DIRy
 from sklearn.linear_model import LogisticRegression
 from method_kdey_closed_efficient import KDEyclosed_efficient
 
-METHODS = ['ACC', 'PACC', 'HDy-OvA', 'DM-T', 'DM-HD', 'KDEy-DMhd3', 'DM-CS', 'KDEy-closed++', 'DIR', 'EMQ', 'KDEy-ML'] #['ACC', 'PACC', 'HDy-OvA', 'DIR', 'DM', 'KDEy-DMhd3', 'KDEy-closed++', 'EMQ', 'KDEy-ML'] #, 'KDEy-DMhd2'] #, 'KDEy-DMhd2', 'DM-HD'] 'KDEy-DMjs', 'KDEy-DM', 'KDEy-ML+', 'KDEy-DMhd3+', 'EMQ-C',
+METHODS = ['ACC', 'PACC', 'HDy-OvA', 'DM-T', 'DM-HD', 'KDEy-DMhd3', 'KDEy-DMhd4', 'DM-CS', 'KDEy-closed++', 'DIR', 'EMQ', 'KDEy-ML'] #['ACC', 'PACC', 'HDy-OvA', 'DIR', 'DM', 'KDEy-DMhd3', 'KDEy-closed++', 'EMQ', 'KDEy-ML'] #, 'KDEy-DMhd2'] #, 'KDEy-DMhd2', 'DM-HD'] 'KDEy-DMjs', 'KDEy-DM', 'KDEy-ML+', 'KDEy-DMhd3+', 'EMQ-C',
 BIN_METHODS = [x.replace('-OvA', '') for x in METHODS]
 
@@ -129,6 +129,13 @@ def new_method(method, **lr_kwargs):
         method_params = {'bandwidth': np.linspace(0.01, 0.2, 20)}
         param_grid = {**method_params, **hyper_LR}
         quantifier = KDEy(lr, target='min_divergence', divergence='HD', montecarlo_trials=5000, val_split=10)
+    elif method in ['KDEy-DMhd4']:
+        # This is the new version in which we apply importance sampling, i.e., we compute:
+        #   D(p_a||q) = 1/N sum_x f(p(x)/q(x)) * (q(x)/r(x))
+        # where x ~iid r, with r = p_u, and u = (1/n, 1/n, ..., 1/n) the uniform vector
+        method_params = {'bandwidth': np.linspace(0.01, 0.2, 20)}
+        param_grid = {**method_params, **hyper_LR}
+        quantifier = KDEy(lr, target='min_divergence', divergence='HD', montecarlo_trials=5000, val_split=10)
     elif method == 'DM-HD':
         method_params = {
             'nbins': [4,8,16,32],
diff --git a/distribution_matching/method_kdey.py b/distribution_matching/method_kdey.py
index 7f66179..485b4aa 100644
--- a/distribution_matching/method_kdey.py
+++ b/distribution_matching/method_kdey.py
@@ -14,9 +14,10 @@ from quapy.data import LabelledCollection
 from quapy.protocol import APP, UPP
 from quapy.method.aggregative import AggregativeProbabilisticQuantifier, _training_helper, cross_generate_predictions, \
     DistributionMatching, _get_divergence
+import quapy.functional as F
 import scipy
 from scipy import optimize
-from statsmodels.nonparametric.kernel_density import KDEMultivariateConditional
+#from statsmodels.nonparametric.kernel_density import KDEMultivariateConditional
 
 
 # TODO: optimize the bandwidth automatically
@@ -97,11 +98,7 @@ class KDEy(AggregativeProbabilisticQuantifier):
         if self.engine == 'scipy':
             return kde(posteriors[:, :-1].T)
         elif self.engine == 'sklearn':
-            #print('pdf...')
-            densities = np.exp(kde.score_samples(posteriors))
-            #print('[pdf done]')
-            return densities
-            #return np.exp(kde.score_samples(posteriors))
+            return np.exp(kde.score_samples(posteriors))
 
     def fit(self, data: LabelledCollection, fit_classifier=True, val_split: Union[float, LabelledCollection] = None):
         """
@@ -130,6 +127,12 @@ class KDEy(AggregativeProbabilisticQuantifier):
             self.samples = qp.functional.uniform_prevalence_sampling(n_classes=data.n_classes, size=self.montecarlo_trials)
             self.sample_densities = [self.pdf(kde_i, self.samples) for kde_i in self.val_densities]
         elif self.target == 'min_divergence':
+            N = self.montecarlo_trials
+            rs = self.random_state
+            self.reference_samples = np.vstack([kde_i.sample(N, random_state=rs) for kde_i in self.val_densities])
+            self.reference_classwise_densities = np.asarray([self.pdf(kde_j, self.reference_samples) for kde_j in self.val_densities])
+            self.reference_density = np.mean(self.reference_classwise_densities, axis=0)  # equiv. to (uniform @ self.reference_classwise_densities)
+        elif self.target == 'min_divergence_deprecated':  # the version of the first draft, with n*N presampled, then alpha*N chosen per class
             self.class_samples = [kde_i.sample(self.montecarlo_trials, random_state=self.random_state) for kde_i in self.val_densities]
             self.class_sample_densities = {}
             for ci, samples_i in enumerate(self.class_samples):
@@ -148,66 +151,40 @@ class KDEy(AggregativeProbabilisticQuantifier):
             raise ValueError('unknown target')
 
-    # this is the variant I have in the current results, which I think is bugged
-    # def _target_divergence_depr(self, posteriors):
-    #     # in this variant we evaluate the divergence using a Montecarlo approach
-    #     n_classes = len(self.val_densities)
-    #
-    #     test_kde = self.get_kde_function(posteriors)
-    #     test_likelihood = self.pdf(test_kde, self.samples)
-    #
-    #     divergence = _get_divergence(self.divergence)
-    #
-    #     def match(prev):
-    #         val_likelihood = sum(prev_i * dens_i for prev_i, dens_i in zip (prev, self.sample_densities))
-    #         return divergence(val_likelihood, test_likelihood)
-    #
-    #     # the initial point is set as the uniform distribution
-    #     uniform_distribution = np.full(fill_value=1 / n_classes, shape=(n_classes,))
-    #
-    #     # solutions are bounded to those contained in the unit-simplex
-    #     bounds = tuple((0, 1) for _ in range(n_classes))  # values in [0,1]
-    #     constraints = ({'type': 'eq', 'fun': lambda x: 1 - sum(x)})  # values summing up to 1
-    #     r = optimize.minimize(match, x0=uniform_distribution, method='SLSQP', bounds=bounds, constraints=constraints)
-    #     return r.x
-
-    def _target_divergence_uniform(self, posteriors):
+    # new version in which we retain all n*N examples (sampled from a mixture with uniform parameter), and then
+    # apply importance sampling (IS). In this version we compute D(p_alpha||q) with IS, and not D(q||p_alpha) as
+    # in the first draft
+    def _target_divergence(self, posteriors):
         # in this variant we evaluate the divergence using a Montecarlo approach
         n_classes = len(self.val_densities)
 
         test_kde = self.get_kde_function(posteriors)
-        test_likelihood = self.pdf(test_kde, self.samples)
+        test_densities = self.pdf(test_kde, self.reference_samples)
 
-        def f_squared_hellinger(t):
-            return (np.sqrt(t) - 1)**2
-
-        def f_jensen_shannon(t):
-            return -(t+1)*np.log((t+1)/2) + t*np.log(t)
-
-        def fdivergence(pi, qi, f, eps=1e-10):
-            spi = pi+eps
-            sqi = qi+eps
-            return np.mean(f(spi/sqi)*sqi)
+        def f_squared_hellinger(u):
+            return (np.sqrt(u)-1)**2
 
+        # todo: this will fail when self.divergence is a callable, and is not the right place to do it anyway
         if self.divergence.lower() == 'hd':
             f = f_squared_hellinger
-        elif self.divergence.lower() == 'js':
-            f = f_jensen_shannon
+        else:
+            raise ValueError('only squared HD is currently implemented')
 
-        def match(prev):
-            val_likelihood = sum(prev_i * dens_i for prev_i, dens_i in zip (prev, self.sample_densities))
-            return fdivergence(val_likelihood, test_likelihood, f)
-
-        # the initial point is set as the uniform distribution
-        uniform_distribution = np.full(fill_value=1 / n_classes, shape=(n_classes,))
+        epsilon = 1e-10
+        qs = test_densities + epsilon
+        rs = self.reference_density + epsilon
 
-        # solutions are bounded to those contained in the unit-simplex
-        bounds = tuple((0, 1) for _ in range(n_classes))  # values in [0,1]
-        constraints = ({'type': 'eq', 'fun': lambda x: 1 - sum(x)})  # values summing up to 1
-        r = optimize.minimize(match, x0=uniform_distribution, method='SLSQP', bounds=bounds, constraints=constraints)
-        return r.x
-
-    def _target_divergence(self, posteriors):
+        def divergence(prev):
+            ps = prev @ self.reference_classwise_densities + epsilon
+            return np.mean( f(ps/qs) * (qs/rs) )
+
+        return F.optim_minimize(divergence, n_classes)
+
+
+    # the first version we explain in the draft, choosing alpha*N from a pool of N for each class and w/o importance sampling
+    def _target_divergence_deprecated(self, posteriors):
         # in this variant we evaluate the divergence using a Montecarlo approach
         n_classes = len(self.val_densities)
@@ -233,7 +210,7 @@ class KDEy(AggregativeProbabilisticQuantifier):
 
         fdivergence = squared_hellinger
 
-        def match(prev):
+        def dissimilarity(prev):
             # choose the samples according to the prevalence vector
             # e.g., prev = [0.5, 0.3, 0.2] will draw 50% from KDE_0, 30% from KDE_1, and 20% from KDE_2
             # the points are already pre-sampled and the densities are pre-computed, so that now all that remains
@@ -255,14 +232,38 @@ class KDEy(AggregativeProbabilisticQuantifier):
             # then I am computing D(Test||Val), so it should be E_{x ~ Val}[f(Test(x)/Val(x))]
             return fdivergence(test_likelihood, val_likelihood)
 
-        # the initial point is set as the uniform distribution
-        uniform_distribution = np.full(fill_value=1 / n_classes, shape=(n_classes,))
+        return F.optim_minimize(dissimilarity, n_classes)
 
-        # solutions are bounded to those contained in the unit-simplex
-        bounds = tuple((0, 1) for _ in range(n_classes))  # values in [0,1]
-        constraints = ({'type': 'eq', 'fun': lambda x: 1 - sum(x)})  # values summing up to 1
-        r = optimize.minimize(match, x0=uniform_distribution, method='SLSQP', bounds=bounds, constraints=constraints)
-        return r.x
+    # this version explores the entire simplex, and then applies importance sampling. We have not really tested it in depth, but
+    # it does not seem promising
+    def _target_divergence_uniform(self, posteriors):
+        # in this variant we evaluate the divergence using a Montecarlo approach
+        n_classes = len(self.val_densities)
+
+        test_kde = self.get_kde_function(posteriors)
+        test_likelihood = self.pdf(test_kde, self.samples)
+
+        def f_squared_hellinger(t):
+            return (np.sqrt(t) - 1)**2
+
+        def f_jensen_shannon(t):
+            return -(t+1)*np.log((t+1)/2) + t*np.log(t)
+
+        def fdivergence(pi, qi, f, eps=1e-10):
+            spi = pi+eps
+            sqi = qi+eps
+            return np.mean(f(spi/sqi)*sqi)
+
+        if self.divergence.lower() == 'hd':
+            f = f_squared_hellinger
+        elif self.divergence.lower() == 'js':
+            f = f_jensen_shannon
+
+        def dissimilarity(prev):
+            val_likelihood = sum(prev_i * dens_i for prev_i, dens_i in zip (prev, self.sample_densities))
+            return fdivergence(val_likelihood, test_likelihood, f)
+
+        return F.optim_minimize(dissimilarity, n_classes)
 
     def _target_likelihood(self, posteriors, eps=0.000001):
         """
@@ -278,24 +279,9 @@ class KDEy(AggregativeProbabilisticQuantifier):
         #return lambda posteriors: sum(prev_i * self.pdf(kde_i, posteriors) for kde_i, prev_i in zip(self.val_densities, prev))
 
         def neg_loglikelihood(prev):
-            #print('-neg_likelihood')
-            #val_pdf = self.val_pdf(prev)
-            #test_likelihood = val_pdf(posteriors)
             test_mixture_likelihood = sum(prev_i * dens_i for prev_i, dens_i in zip (prev, test_densities))
             test_loglikelihood = np.log(test_mixture_likelihood + eps)
-            neg_log_likelihood = -np.sum(test_loglikelihood)
-            #print('-neg_likelihood [done!]')
-            return neg_log_likelihood
-            #return -np.prod(test_likelihood)
+            return -np.sum(test_loglikelihood)
 
-        # the initial point is set as the uniform distribution
-        uniform_distribution = np.full(fill_value=1 / n_classes, shape=(n_classes,))
-
-        # solutions are bounded to those contained in the unit-simplex
-        bounds = tuple((0, 1) for _ in range(n_classes))  # values in [0,1]
-        constraints = ({'type': 'eq', 'fun': lambda x: 1 - sum(x)})  # values summing up to 1
-        #print('searching for alpha')
-        r = optimize.minimize(neg_loglikelihood, x0=uniform_distribution, method='SLSQP', bounds=bounds, constraints=constraints)
-        #print('[optimization ended]')
-        return r.x
+        return F.optim_minimize(neg_loglikelihood, n_classes)
diff --git a/quapy/functional.py b/quapy/functional.py
index 843a450..4221ed3 100644
--- a/quapy/functional.py
+++ b/quapy/functional.py
@@ -298,3 +298,23 @@ def check_prevalence_vector(p, raise_exception=False, toleranze=1e-08):
     return True
 
 
+def optim_minimize(loss, n_classes):
+    """
+    Searches for the optimal prevalence values, i.e., an `n_classes`-dimensional vector of the (`n_classes`-1)-simplex
+    that yields the smallest loss. This optimization is carried out by means of a constrained search using scipy's
+    SLSQP routine.
+
+    :param loss: (callable) the function to minimize
+    :param n_classes: (int) the number of classes, i.e., the dimensionality of the prevalence vector
+    :return: (ndarray) the best prevalence vector found
+    """
+    from scipy import optimize
+
+    # the initial point is set as the uniform distribution
+    uniform_distribution = np.full(fill_value=1 / n_classes, shape=(n_classes,))
+
+    # solutions are bounded to those contained in the unit-simplex
+    bounds = tuple((0, 1) for _ in range(n_classes))  # values in [0,1]
+    constraints = ({'type': 'eq', 'fun': lambda x: 1 - sum(x)})  # values summing up to 1
+    r = optimize.minimize(loss, x0=uniform_distribution, method='SLSQP', bounds=bounds, constraints=constraints)
+    return r.x
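
Usage note (not part of the patch): the snippet below is a minimal, self-contained sketch of the importance-sampling estimator that the new _target_divergence implements, i.e., D_f(p_alpha||q) ~ mean_x[ f(p_alpha(x)/q(x)) * q(x)/r(x) ] with x drawn from the uniform mixture r of the class-wise KDEs, using f the squared-Hellinger generator. It works directly with sklearn's KernelDensity on toy 2D data; all names (classwise_densities, reference_density, true_prev, ...) are illustrative and do not belong to QuaPy.

import numpy as np
from sklearn.neighbors import KernelDensity

rng = np.random.RandomState(0)
n_classes, N, eps = 3, 1000, 1e-10

def f_squared_hellinger(t):
    # generator function of the squared Hellinger divergence
    return (np.sqrt(t) - 1)**2

# toy 2D class-conditional data and their KDEs (stand-ins for the per-class validation posteriors)
class_data = [rng.normal(loc=i, scale=0.5, size=(200, 2)) for i in range(n_classes)]
kdes = [KernelDensity(bandwidth=0.2).fit(X) for X in class_data]

# reference pool: N draws per class, i.e., n*N draws from the uniform mixture r
reference_samples = np.vstack([kde.sample(N, random_state=0) for kde in kdes])
classwise_densities = np.asarray([np.exp(kde.score_samples(reference_samples)) for kde in kdes])
reference_density = classwise_densities.mean(axis=0)  # r(x), the uniform mixture density

# a "test" sample drawn with prevalence [0.2, 0.5, 0.3], and its KDE q
true_prev = np.array([0.2, 0.5, 0.3])
test_data = np.vstack([rng.normal(loc=i, scale=0.5, size=(int(1000*p), 2)) for i, p in enumerate(true_prev)])
qs = np.exp(KernelDensity(bandwidth=0.2).fit(test_data).score_samples(reference_samples)) + eps
rs = reference_density + eps

def divergence(prev):
    # importance-sampling Monte Carlo estimate of D_f(p_prev || q) with proposal r
    ps = prev @ classwise_densities + eps
    return np.mean(f_squared_hellinger(ps / qs) * (qs / rs))

# the estimate should be smaller at the true prevalence than at the uniform one
print(divergence(true_prev), divergence(np.ones(n_classes) / n_classes))

In the patch, this same estimate is then minimized over the probability simplex through the new quapy.functional.optim_minimize helper, which wraps scipy's SLSQP constrained search.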