diff --git a/BayesianKDEy/bayesian_kdey.py b/BayesianKDEy/bayesian_kdey.py
index 228f547..419b974 100644
--- a/BayesianKDEy/bayesian_kdey.py
+++ b/BayesianKDEy/bayesian_kdey.py
@@ -7,9 +7,10 @@ from quapy.protocol import UPP
 import quapy.functional as F
 import numpy as np
 from tqdm import tqdm
+from scipy.stats import dirichlet
 
 
-def bayesian(kdes, data, prob_classifier, init=None, MAX_ITER=100_000, warmup=3_000):
+def bayesian(kdes, data, probabilistic_classifier, init=None, MAX_ITER=100_000, warmup=3_000):
     """
     Bayes:
     P(prev|data) = P(data|prev) P(prev) / P(data)
@@ -23,26 +24,32 @@
         # todo: remove exp, since we are then doing the log every time? (not sure)
         return np.exp(kde.score_samples(X))
 
-    X = prob_classifier.predict_proba(data)
-    test_densities = [pdf(kde_i, X) for kde_i in kdes]
+    X = probabilistic_classifier.predict_proba(data)
+    test_densities = np.asarray([pdf(kde_i, X) for kde_i in kdes])
 
-    def log_likelihood(prev, epsilon=1e-8):
-        # test_likelihoods = sum(prev_i*dens_i for prev_i, dens_i in zip (prev, test_densities))
+    def log_likelihood(prev, epsilon=1e-10):
         test_likelihoods = prev @ test_densities
         test_loglikelihood = np.log(test_likelihoods + epsilon)
         return np.sum(test_loglikelihood)
 
-    def log_prior(prev):
+    # def log_prior(prev):
         # todo: adapt to arbitrary prior knowledge (e.g., something around training prevalence)
-        return 1 # it is not 1 but we assume uniform, son anyway is an useless constant
+    #     return 1/np.sum((prev-init)**2)  # it is not 1, but we assume uniform, so it is a useless constant anyway
+
+    # def log_prior(prev, alpha_scale=1000):
+    #     alpha = np.array(init) * alpha_scale
+    #     return dirichlet.logpdf(prev, alpha)
+
+    def log_prior(prev):
+        return 0
 
     def sample_neighbour(prev):
-        rand_prev = F.uniform_prevalence_sampling(n_classes=len(prev), size=1)
-        rand_direction = rand_prev - prev
-        neighbour = F.normalize_prevalence(prev + rand_direction*0.05)
+        dir_noise = np.random.normal(scale=0.05, size=len(prev))
+        # neighbour = F.normalize_prevalence(prev + dir_noise, method='clip')
+        neighbour = F.normalize_prevalence(prev + dir_noise, method='mapsimplex')
         return neighbour
 
-    n_classes = len(prob_classifier.classes_)
+    n_classes = len(probabilistic_classifier.classes_)
     current_prev = F.uniform_prevalence(n_classes) if init is None else init
     current_likelihood = log_likelihood(current_prev) + log_prior(current_prev)
 
@@ -74,23 +81,25 @@ if __name__ == '__main__':
     kdey = KDEyML(cls)
 
     train, test = qp.datasets.fetch_UCIMulticlassDataset('waveform-v1', standardize=True).train_test
+    # train, test = qp.datasets.fetch_UCIMulticlassDataset('phishing', standardize=True).train_test
 
-    with qp.util.temp_seed(0):
+    with qp.util.temp_seed(2):
         print('fitting KDEy')
         kdey.fit(*train.Xy)
 
-        shifted = test.sampling(500, *[0.1, 0.1, 0.8])
+        shifted = test.sampling(500, *[0.7, 0.1, 0.2])
         prev_hat = kdey.predict(shifted.X)
         mae = qp.error.mae(shifted.prevalence(), prev_hat)
         print(f'true_prev={strprev(shifted.prevalence())}, prev_hat={strprev(prev_hat)}, {mae=:.4f}')
 
         kdes = kdey.mix_densities
         h = kdey.classifier
-        samples = bayesian(kdes, shifted.X, h, init=prev_hat, MAX_ITER=100_000, warmup=0)
+        samples = bayesian(kdes, shifted.X, h, init=None, MAX_ITER=5_000, warmup=1_000)
         print(f'mean posterior {strprev(samples.mean(axis=0))}')
-        plot_prev_points(samples, true_prev=shifted.prevalence())
+        plot_prev_points(samples, true_prev=shifted.prevalence(), point_estim=prev_hat, train_prev=train.prevalence())
+        # plot_prev_points_matplot(samples)
 
     # report = qp.evaluation.evaluation_report(kdey, protocol=UPP(test), verbose=True)
diff --git a/BayesianKDEy/plot_simplex.py b/BayesianKDEy/plot_simplex.py
index ca94ae1..4d3044b 100644
--- a/BayesianKDEy/plot_simplex.py
+++ b/BayesianKDEy/plot_simplex.py
@@ -3,7 +3,15 @@ import matplotlib.pyplot as plt
 from scipy.stats import gaussian_kde
 
 
-def plot_prev_points(prevs, true_prev):
+def plot_prev_points(prevs, true_prev, point_estim, train_prev):
+    plt.rcParams.update({
+        'font.size': 10,        # base size for all text
+        'axes.titlesize': 12,   # axes title
+        'axes.labelsize': 10,   # axes labels
+        'xtick.labelsize': 8,   # tick labels
+        'ytick.labelsize': 8,
+        'legend.fontsize': 9,   # legend
+    })
 
     def cartesian(p):
         dim = p.shape[-1]
@@ -17,13 +25,13 @@
     v2 = np.array([1, 0])
     v3 = np.array([0.5, np.sqrt(3)/2])
 
-    # transform (a,b,c) -> Cartesian coordinates
-    x, y = cartesian(prevs)
-
     # Plot
     fig, ax = plt.subplots(figsize=(6, 6))
-    ax.scatter(x, y, s=50, alpha=0.05, edgecolors='none')
-    ax.scatter(*cartesian(true_prev), s=5, alpha=1)
+    ax.scatter(*cartesian(prevs), s=50, alpha=0.05, edgecolors='none', label='samples')
+    ax.scatter(*cartesian(prevs.mean(axis=0)), s=10, alpha=1, label='sample-mean', edgecolors='black')
+    ax.scatter(*cartesian(true_prev), s=10, alpha=1, label='true-prev', edgecolors='black')
+    ax.scatter(*cartesian(point_estim), s=10, alpha=1, label='KDEy-estim', edgecolors='black')
+    ax.scatter(*cartesian(train_prev), s=10, alpha=1, label='train-prev', edgecolors='black')
 
     # edges
     triangle = np.array([v1, v2, v3, v1])
@@ -36,6 +44,13 @@
     ax.set_aspect('equal')
     ax.axis('off')
+    plt.legend(
+        loc='center left',
+        bbox_to_anchor=(1.05, 0.5),
+        # ncol=3,
+        # frameon=False
+    )
+    plt.tight_layout()
 
     plt.show()
 
 
@@ -50,7 +65,7 @@
 
     # kde
     xy = np.vstack([x, y])
-    kde = gaussian_kde(xy)
+    kde = gaussian_kde(xy, bw_method=0.25)
 
     xmin, xmax = 0, 1
     ymin, ymax = 0, np.sqrt(3) / 2
@@ -91,4 +106,7 @@
     ax.axis('off')
 
     plt.show()
-
+if __name__ == '__main__':
+    n = 1000
+    points = np.random.dirichlet([2, 3, 4], size=n)
+    plot_prev_points_matplot(points)
diff --git a/quapy/method/_kdey.py b/quapy/method/_kdey.py
index 5932c06..f941e30 100644
--- a/quapy/method/_kdey.py
+++ b/quapy/method/_kdey.py
@@ -190,7 +190,8 @@ class KDEyML(AggregativeSoftQuantifier, KDEBase):
         test_densities = [self.pdf(kde_i, posteriors, self.kernel) for kde_i in self.mix_densities]
 
         def neg_loglikelihood(prev):
-            test_mixture_likelihood = sum(prev_i * dens_i for prev_i, dens_i in zip (prev, test_densities))
+            # test_mixture_likelihood = sum(prev_i * dens_i for prev_i, dens_i in zip (prev, test_densities))
+            test_mixture_likelihood = prev @ test_densities
             test_loglikelihood = np.log(test_mixture_likelihood + epsilon)
             return -np.sum(test_loglikelihood)
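
Note (not part of the diff above): bayesian(...) returns an array of prevalence vectors sampled from the posterior, which the __main__ block summarizes with samples.mean(axis=0) and plots with plot_prev_points. A minimal sketch of how such samples could also be turned into per-class credible intervals; the Dirichlet draw only stands in for the real output so the snippet runs on its own, and the 95% level is an arbitrary choice:

    import numpy as np

    # stand-in for the (n_samples, n_classes) array returned by bayesian(...)
    samples = np.random.dirichlet([5, 2, 3], size=5_000)

    posterior_mean = samples.mean(axis=0)                        # posterior mean prevalence
    lower, upper = np.quantile(samples, [0.025, 0.975], axis=0)  # 95% credible interval per class

    for c in range(samples.shape[1]):
        print(f'class {c}: mean={posterior_mean[c]:.3f}, 95% CI=[{lower[c]:.3f}, {upper[c]:.3f}]')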