Add a 'likelihood+' target to KDEyMLauto2 that picks the bandwidth with
scipy's SLSQP optimizer instead of scanning the fixed grid, register it in
METHODS as 'KDEy-ML-autoLike+', and make the debug script cache its
per-dataset results and draw per-sample plus averaged plots.

Review notes on this revision of the patch:
  1. kdey_devel.py: `tuple(0, 1)` raises TypeError (tuple() takes at most one
     argument); the bounds entry is now the literal pair `(0, 1)`.
  2. kdey_devel.py: the 'likelihood+' branch never assigned best_loss_val,
     which the print after the if/else formats; it is now taken from r.fun.
  3. kdey_devel.py: scipy hands the objective a length-1 array; it is unwrapped
     to a float before being used as a bandwidth.
  4. Comments on added lines are in English; context and removed lines are
     kept verbatim because they must match the target files.
  5. NOTE(review): line breaks, indentation and the position of blank context
     lines were reconstructed from a whitespace-collapsed copy of this patch.
     Hunk line counts are self-consistent, but confirm with
     `git apply --check` against the target tree. The post-image hashes on
     the `index` lines predate the changes listed above.

diff --git a/KDEy/kdey_devel.py b/KDEy/kdey_devel.py
index ef16d2a..bc90478 100644
--- a/KDEy/kdey_devel.py
+++ b/KDEy/kdey_devel.py
@@ -271,7 +271,7 @@ class KDEyMLauto2(KDEyML):
         self.reduction = reduction
         self.max_reduced = max_reduced
         self.random_state = random_state
-        assert target == 'likelihood' or target in qp.error.QUANTIFICATION_ERROR_NAMES, 'unknown target for auto'
+        assert target in ['likelihood', 'likelihood+'] or target in qp.error.QUANTIFICATION_ERROR_NAMES, 'unknown target for auto'
         self.target = target
 
     def aggregation_fit(self, classif_predictions: LabelledCollection, data: LabelledCollection):
@@ -293,34 +293,66 @@ class KDEyMLauto2(KDEyML):
         if len(train) > tr_length:
             train = train.sampling(tr_length)
 
-        best_band = None
-        best_loss_val = None
         init_prev = np.full(fill_value=1 / n_classes, shape=(n_classes,))
-        for bandwidth in np.logspace(-4, np.log10(0.2), 20):
-            mix_densities = self.get_mixture_components(*train.Xy, train.classes_, bandwidth)
+        repeats = 25
+        prot = UPP(val, sample_size=self.reduction, repeats=repeats, random_state=self.random_state)
 
-            repeats = 25
-            loss_accum = 0
-            prot = UPP(val, sample_size=self.reduction, repeats=repeats, random_state=self.random_state)
-            for (sample, prev) in tqdm(prot(), total=repeats):
-                test_densities = [self.pdf(kde_i, sample) for kde_i in mix_densities]
+        # 'likelihood+' tunes the bandwidth with a continuous optimizer; every other target scans a fixed grid
+        if self.target == 'likelihood+':
+            def neg_loglikelihood_band_(bandwidth):
+                """Accumulate, over the samples drawn by prot, the loss returned by optim_minimize for this bandwidth."""
+                bandwidth = float(np.ravel(bandwidth)[0])  # scipy passes the variable as a length-1 array
+                mix_densities = self.get_mixture_components(*train.Xy, train.classes_, bandwidth)
+                loss_accum = 0
 
-                def neg_loglikelihood_prev_(prev):
-                    test_mixture_likelihood = sum(prev_i * dens_i for prev_i, dens_i in zip(prev, test_densities))
-                    test_loglikelihood = np.log(test_mixture_likelihood + epsilon)
-                    return -np.sum(test_loglikelihood)
+                for (sample, prev) in tqdm(prot(), total=repeats):
+                    test_densities = [self.pdf(kde_i, sample) for kde_i in mix_densities]
 
-                if self.target == 'likelihood':
-                    loss_fn = neg_loglikelihood_prev_
-                else:
-                    loss_fn = lambda prev_hat: qp.error.from_name(self.target)(prev, prev_hat)
+                    def neg_loglikelihood_prev_(prev):
+                        test_mixture_likelihood = sum(prev_i * dens_i for prev_i, dens_i in zip(prev, test_densities))
+                        test_loglikelihood = np.log(test_mixture_likelihood + epsilon)
+                        return -np.sum(test_loglikelihood)
 
-                pred_prev, loss_val = optim_minimize(loss_fn, init_prev, return_loss=True)
-                loss_accum += loss_val
+                    pred_prev, loss_val = optim_minimize(neg_loglikelihood_prev_, init_prev, return_loss=True)
+                    loss_accum += loss_val
 
-            if best_loss_val is None or loss_accum < best_loss_val:
-                best_loss_val = loss_accum
-                best_band = bandwidth
+                return loss_accum
+
+            # bounds takes one (min, max) pair per optimized variable; tuple(0, 1) raised TypeError
+            # NOTE(review): a lower bound of 0 lets SLSQP evaluate bandwidth=0 -- confirm the KDE accepts it
+            bounds = [(0, 1)]
+            init_bandwidth = 0.1
+            r = optimize.minimize(neg_loglikelihood_band_, x0=[init_bandwidth], method='SLSQP', bounds=bounds)
+            # r.fun is the objective value at r.x; the print after this branch formats best_loss_val
+            best_band, best_loss_val = r.x[0], r.fun
+
+        else:
+            best_band = None
+            best_loss_val = None
+            init_prev = np.full(fill_value=1 / n_classes, shape=(n_classes,))
+            for bandwidth in np.logspace(-4, np.log10(0.2), 20):
+                mix_densities = self.get_mixture_components(*train.Xy, train.classes_, bandwidth)
+
+                loss_accum = 0
+                for (sample, prev) in tqdm(prot(), total=repeats):
+                    test_densities = [self.pdf(kde_i, sample) for kde_i in mix_densities]
+
+                    def neg_loglikelihood_prev_(prev):
+                        test_mixture_likelihood = sum(prev_i * dens_i for prev_i, dens_i in zip(prev, test_densities))
+                        test_loglikelihood = np.log(test_mixture_likelihood + epsilon)
+                        return -np.sum(test_loglikelihood)
+
+                    if self.target == 'likelihood':
+                        loss_fn = neg_loglikelihood_prev_
+                    else:
+                        loss_fn = lambda prev_hat: qp.error.from_name(self.target)(prev, prev_hat)
+
+                    pred_prev, loss_val = optim_minimize(loss_fn, init_prev, return_loss=True)
+                    loss_accum += loss_val
+
+                if best_loss_val is None or loss_accum < best_loss_val:
+                    best_loss_val = loss_accum
+                    best_band = bandwidth
 
         print(f'found bandwidth={best_band:.4f} (loss_val={best_loss_val:.5f})')
         self.bandwidth_ = best_band
diff --git a/KDEy/quantification_evaluation.py b/KDEy/quantification_evaluation.py
index fd98dbd..4ad3176 100644
--- a/KDEy/quantification_evaluation.py
+++ b/KDEy/quantification_evaluation.py
@@ -38,6 +38,7 @@ METHODS = [
     ('KDEy-ML-scott', KDEyML(newLR(), bandwidth='scott'), wrap_hyper(logreg_grid)),
     ('KDEy-ML-silver', KDEyML(newLR(), bandwidth='silverman'), wrap_hyper(logreg_grid)),
     ('KDEy-ML-autoLike', KDEyMLauto2(newLR(), bandwidth='auto', target='likelihood'), wrap_hyper(logreg_grid)),
+    ('KDEy-ML-autoLike+', KDEyMLauto2(newLR(), bandwidth='auto', target='likelihood+'), wrap_hyper(logreg_grid)),
     ('KDEy-ML-autoAE', KDEyMLauto2(newLR(), bandwidth='auto', target='mae'), wrap_hyper(logreg_grid)),
     ('KDEy-ML-autoRAE', KDEyMLauto2(newLR(), bandwidth='auto', target='mrae'), wrap_hyper(logreg_grid)),
 ]
diff --git a/KDEy/quantification_evaluation_debug.py b/KDEy/quantification_evaluation_debug.py
index ed2e438..a2f5e69 100644
--- a/KDEy/quantification_evaluation_debug.py
+++ b/KDEy/quantification_evaluation_debug.py
@@ -14,6 +14,8 @@ from quapy.model_selection import GridSearchQ
 from quapy.protocol import UPP
 from pathlib import Path
 from quapy import functional as F
+import matplotlib.pyplot as plt
+
 
 SEED = 1
 
@@ -24,84 +26,97 @@ def newLR():
 SAMPLE_SIZE=150
 qp.environ['SAMPLE_SIZE'] = SAMPLE_SIZE
 
+show_ae = True
+show_rae = True
+show_mse = False
+show_kld = True
+
 epsilon = 1e-10
 
 # n_bags_test = 2
 # DATASETS = [qp.datasets.UCI_MULTICLASS_DATASETS[21]]
 DATASETS = qp.datasets.UCI_MULTICLASS_DATASETS
 for i, dataset in enumerate(DATASETS):
-    data = qp.datasets.fetch_UCIMulticlassDataset(dataset)
-    n_classes = data.n_classes
-    print(f'{i=}')
-    print(f'{dataset=}')
-    print(f'{n_classes=}')
-    print(len(data.training))
-    print(len(data.test))
 
-    train, test = data.train_test
-    train_prev = train.prevalence()
-    test_prev = test.prevalence()
+    def generate_data():
+        """Fit KDEyMLauto on the training split and, per UPP test sample, collect the errors and the optim_minimize loss over the bandwidth grid."""
+        data = qp.datasets.fetch_UCIMulticlassDataset(dataset)
+        n_classes = data.n_classes
+        print(f'{i=}')
+        print(f'{dataset=}')
+        print(f'{n_classes=}')
+        print(len(data.training))
+        print(len(data.test))
 
-    print(f'train-prev = {F.strprev(train_prev)}')
-    print(f'test-prev = {F.strprev(test_prev)}')
+        train, test = data.train_test
+        train_prev = train.prevalence()
+        test_prev = test.prevalence()
 
-    # protocol = UPP(test, repeats=n_bags_test)
-    #
-    # for sample, prev in protocol():
-    #     print(f'sample-prev = {F.strprev(prev)}')
+        print(f'train-prev = {F.strprev(train_prev)}')
+        print(f'test-prev = {F.strprev(test_prev)}')
 
-    # prev = np.asarray([0.2, 0.3, 0.5])
-    # prev = np.asarray([0.33, 0.33, 0.34])
-    # prev = train_prev
+        repeats = 10
+        prot = UPP(test, sample_size=SAMPLE_SIZE, repeats=repeats)
+        kde = KDEyMLauto(newLR())
+        kde.fit(train)
+        AE_error, RAE_error, MSE_error, KLD_error, LIKE_value = [], [], [], [], []
+        tr_posteriors, tr_y = kde.classif_predictions.Xy
+        for it, (sample, prev) in tqdm(enumerate(prot()), total=repeats):
+            te_posteriors = kde.classifier.predict_proba(sample)
+            classes = train.classes_
 
-    # sample = test.sampling(SAMPLE_SIZE, *prev, random_state=1)
-    # print(f'sample-prev = {F.strprev(prev)}')
+            xaxis = []
+            ae_error = []
+            rae_error = []
+            mse_error = []
+            kld_error = []
+            likelihood_value = []
 
-    repeats = 10
-    prot = UPP(test, sample_size=SAMPLE_SIZE, repeats=repeats)
-    kde = KDEyMLauto(newLR())
-    kde.fit(train)
-    tr_posteriors, tr_y = kde.classif_predictions.Xy
-    for it, (sample, prev) in tqdm(enumerate(prot()), total=repeats):
-        te_posteriors = kde.classifier.predict_proba(sample)
-        classes = train.classes_
+            # for bandwidth in np.linspace(0.01, 0.2, 50):
+            for bandwidth in np.logspace(-5, 0.5, 50):
+                mix_densities = kde.get_mixture_components(tr_posteriors, tr_y, classes, bandwidth)
+                test_densities = [kde.pdf(kde_i, te_posteriors) for kde_i in mix_densities]
 
-        xaxis = []
-        ae_error = []
-        rae_error = []
-        mse_error = []
-        kld_error = []
-        likelihood_val = []
+                def neg_loglikelihood_prev(prev):
+                    test_mixture_likelihood = sum(prev_i * dens_i for prev_i, dens_i in zip(prev, test_densities))
+                    test_loglikelihood = np.log(test_mixture_likelihood + epsilon)
+                    return -np.sum(test_loglikelihood)
 
-        # for bandwidth in np.linspace(0.01, 0.2, 50):
-        for bandwidth in np.logspace(-3, 0.5, 50):
-            mix_densities = kde.get_mixture_components(tr_posteriors, tr_y, classes, bandwidth)
-            test_densities = [kde.pdf(kde_i, te_posteriors) for kde_i in mix_densities]
+                init_prev = np.full(fill_value=1 / n_classes, shape=(n_classes,))
+                pred_prev, likelihood = optim_minimize(neg_loglikelihood_prev, init_prev, return_loss=True)
 
-            def neg_loglikelihood_prev(prev):
-                test_mixture_likelihood = sum(prev_i * dens_i for prev_i, dens_i in zip(prev, test_densities))
-                test_loglikelihood = np.log(test_mixture_likelihood + epsilon)
-                return -np.sum(test_loglikelihood)
+                xaxis.append(bandwidth)
+                ae_error.append(qp.error.ae(prev, pred_prev))
+                rae_error.append(qp.error.rae(prev, pred_prev))
+                mse_error.append(qp.error.mse(prev, pred_prev))
+                kld_error.append(qp.error.kld(prev, pred_prev))
+                likelihood_value.append(likelihood)
 
-            init_prev = np.full(fill_value=1 / n_classes, shape=(n_classes,))
-            pred_prev, likelihood = optim_minimize(neg_loglikelihood_prev, init_prev, return_loss=True)
+            AE_error.append(ae_error)
+            RAE_error.append(rae_error)
+            MSE_error.append(mse_error)
+            KLD_error.append(kld_error)
+            LIKE_value.append(likelihood_value)
 
-            xaxis.append(bandwidth)
-            ae_error.append(qp.error.ae(prev, pred_prev))
-            rae_error.append(qp.error.rae(prev, pred_prev))
-            mse_error.append(qp.error.mse(prev, pred_prev))
-            kld_error.append(qp.error.kld(prev, pred_prev))
-            likelihood_val.append(likelihood)
+        return xaxis, AE_error, RAE_error, MSE_error, KLD_error, LIKE_value
 
-        import matplotlib.pyplot as plt
+    xaxis, AE_error, RAE_error, MSE_error, KLD_error, LIKE_value = qp.util.pickled_resource(
+        f'./plots/likelihood/pickles/{dataset}.pkl', generate_data)
+
+    for row in range(len(AE_error)):
         # Crear la figura
+        # ----------------------------------------------------------------------------------------------------
         fig, ax1 = plt.subplots(figsize=(8, 6))
 
         # Pintar las series ae_error, rae_error, y kld_error en el primer eje Y
-        ax1.plot(xaxis, ae_error, label='AE Error', marker='o', color='b')
-        # ax1.plot(xaxis, rae_error, label='RAE Error', marker='s', color='g')
-        # ax1.plot(xaxis, kld_error, label='KLD Error', marker='^', color='r')
-        ax1.plot(xaxis, mse_error, label='MSE Error', marker='^', color='c')
+        if show_ae:
+            ax1.plot(xaxis, AE_error[row], label='AE', marker='o', color='b')
+        if show_rae:
+            ax1.plot(xaxis, RAE_error[row], label='RAE', marker='s', color='g')
+        if show_kld:
+            ax1.plot(xaxis, KLD_error[row], label='KLD', marker='^', color='r')
+        if show_mse:
+            ax1.plot(xaxis, MSE_error[row], label='MSE', marker='^', color='c')
         ax1.set_xscale('log')
 
         # Configurar etiquetas para el primer eje Y
@@ -114,7 +129,7 @@ for i, dataset in enumerate(DATASETS):
         ax2 = ax1.twinx()
 
         # Pintar likelihood_val en el segundo eje Y
-        ax2.plot(xaxis, likelihood_val, label='(neg)Likelihood', marker='x', color='purple')
+        ax2.plot(xaxis, LIKE_value[row], label='(neg)Likelihood', marker='x', color='purple')
 
         # Configurar etiquetas para el segundo eje Y
         ax2.set_ylabel('Likelihood Value')
@@ -124,9 +139,51 @@ for i, dataset in enumerate(DATASETS):
         plt.title('Error Metrics vs Bandwidth')
         # plt.show()
         os.makedirs('./plots/likelihood/', exist_ok=True)
-        plt.savefig(f'./plots/likelihood/{dataset}-fig{it}.png')
+        plt.savefig(f'./plots/likelihood/{dataset}-fig{row}.png')
         plt.close()
 
 
 
 
+    # Create the figure with the means
+    # ----------------------------------------------------------------------------------------------------
+    fig, ax1 = plt.subplots(figsize=(8, 6))
+
+    def add_plot(ax, vals_error, name, color, marker, show):
+        """Plot the mean of vals_error across rows against xaxis on ax, shading one standard deviation; do nothing when show is false."""
+        if not show:
+            return
+        vals_error = np.asarray(vals_error)
+        vals_ave = np.mean(vals_error, axis=0)
+        vals_std = np.std(vals_error, axis=0)
+        ax.plot(xaxis, vals_ave, label=name, marker=marker, color=color)
+        ax.fill_between(xaxis, vals_ave - vals_std, vals_ave + vals_std, color=color, alpha=0.2)
+
+    add_plot(ax1, AE_error, 'AE', color='b', marker='o', show=show_ae)
+    add_plot(ax1, RAE_error, 'RAE', color='g', marker='s', show=show_rae)
+    add_plot(ax1, KLD_error, 'KLD', color='r', marker='^', show=show_kld)
+    add_plot(ax1, MSE_error, 'MSE', color='c', marker='^', show=show_mse)
+    ax1.set_xscale('log')
+
+    # Set the labels for the first Y axis
+    ax1.set_xlabel('Bandwidth')
+    ax1.set_ylabel('Error Value')
+    ax1.grid(True)
+    ax1.legend(loc='upper left')
+
+    # Create a second Y axis that shares the X axis
+    ax2 = ax1.twinx()
+
+    # Plot the likelihood values on the second Y axis
+    add_plot(ax2, LIKE_value, '(neg)Likelihood', color='purple', marker='x', show=True)
+
+    # Set the labels for the second Y axis
+    ax2.set_ylabel('Likelihood Value')
+    ax2.legend(loc='upper right')
+
+    # Show the plot
+    plt.title('Error Metrics vs Bandwidth')
+    # plt.show()
+    os.makedirs('./plots/likelihood/', exist_ok=True)
+    plt.savefig(f'./plots/likelihood/{dataset}-figAve.png')
+    plt.close()