From 84f5799219a77d9dd8ba3d1cdd0f723db98c45d1 Mon Sep 17 00:00:00 2001 From: Alejandro Moreo Date: Mon, 23 Sep 2024 22:23:41 +0200 Subject: [PATCH] another type of search for the bandwidth based in likelihood --- KDEy/kdey_devel.py | 155 ++++++++++++++++++------ KDEy/quantification_evaluation.py | 84 ++++++------- KDEy/quantification_evaluation_debug.py | 133 ++++++++++++++++++++ 3 files changed, 294 insertions(+), 78 deletions(-) create mode 100644 KDEy/quantification_evaluation_debug.py diff --git a/KDEy/kdey_devel.py b/KDEy/kdey_devel.py index 4cc93cb..0c8baae 100644 --- a/KDEy/kdey_devel.py +++ b/KDEy/kdey_devel.py @@ -12,7 +12,7 @@ from sklearn.metrics.pairwise import rbf_kernel from scipy import optimize - +epsilon = 1e-10 class KDEyMLauto(KDEyML): def __init__(self, classifier: BaseEstimator = None, val_split=5, random_state=None, optim='two_steps'): @@ -37,36 +37,45 @@ class KDEyMLauto(KDEyML): current_bandwidth = np.full(fill_value=current_bandwidth, shape=(n_classes,)) current_prevalence = np.full(fill_value=1 / n_classes, shape=(n_classes,)) - iterations = 0 - convergence = False - with qp.util.temp_seed(self.random_state): + if self.optim == 'max_likelihood': + current_prevalence, current_bandwidth = self.optim_minimize_like(tr_posteriors, tr_y, te_posteriors, classes, grid=True) + elif self.optim == 'max_likelihood2': + current_prevalence, current_bandwidth = self.optim_minimize_like(tr_posteriors, tr_y, te_posteriors, classes, grid=False) + else: - while not convergence: - previous_bandwidth = current_bandwidth - previous_prevalence = current_prevalence + iterations = 0 + convergence = False + with qp.util.temp_seed(self.random_state): - iterations += 1 - print(f'{iterations}:') + while not convergence: + previous_bandwidth = current_bandwidth + previous_prevalence = current_prevalence - if self.optim == 'two_steps': - current_prevalence = self.optim_minimize_prevalence(current_bandwidth, current_prevalence, tr_posteriors, tr_y, te_posteriors, classes) - print(f'\testim-prev={F.strprev(current_prevalence)}') + iterations += 1 + print(f'{iterations}:') - current_bandwidth = self.optim_minimize_bandwidth(current_bandwidth, current_prevalence, tr_posteriors, tr_y, te_posteriors, classes) - print(f'\tbandwidth={current_bandwidth}') - if np.isclose(previous_bandwidth, current_bandwidth, atol=0.0001) and all( - np.isclose(previous_prevalence, current_prevalence, atol=0.0001)): - convergence = True - elif self.optim == 'both': - current_prevalence, current_bandwidth = self.optim_minimize_both(current_bandwidth, current_prevalence, tr_posteriors, tr_y, te_posteriors, classes) - if np.isclose(previous_bandwidth, current_bandwidth, atol=0.0001) and all(np.isclose(previous_prevalence, current_prevalence, atol=0.0001)): - convergence = True - elif self.optim == 'both_fine': - current_prevalence, current_bandwidth = self.optim_minimize_both_fine(current_bandwidth, current_prevalence, tr_posteriors, tr_y, - te_posteriors, classes) + if self.optim == 'two_steps': + current_prevalence = self.optim_minimize_prevalence(current_bandwidth, current_prevalence, tr_posteriors, tr_y, te_posteriors, classes) + print(f'\testim-prev={F.strprev(current_prevalence)}') + current_bandwidth = self.optim_minimize_bandwidth(current_bandwidth, current_prevalence, tr_posteriors, tr_y, te_posteriors, classes) + print(f'\tbandwidth={current_bandwidth}') + elif self.optim == 'both': + current_prevalence, current_bandwidth = self.optim_minimize_both(current_bandwidth, current_prevalence, tr_posteriors, tr_y, te_posteriors, classes) + elif self.optim == 'both_fine': + current_prevalence, current_bandwidth = self.optim_minimize_both_fine(current_bandwidth, current_prevalence, tr_posteriors, tr_y, te_posteriors, classes) + elif self.optim == 'both_fine': + current_prevalence, current_bandwidth = self.optim_minimize_both_fine(current_bandwidth, current_prevalence, tr_posteriors, tr_y, te_posteriors, classes) + # elif self.optim == 'max_likelihood': + # current_prevalence, current_bandwidth = self.optim_minimize_like(current_bandwidth, current_prevalence, tr_posteriors, tr_y, te_posteriors, classes) - if all(np.isclose(previous_bandwidth, current_bandwidth, atol=0.0001)) and all(np.isclose(previous_prevalence, current_prevalence, atol=0.0001)): - convergence = True + # check converngece + prev_convergence = all(np.isclose(previous_prevalence, current_prevalence, atol=0.0001)) + if isinstance(current_bandwidth, float): + band_convergence = np.isclose(previous_bandwidth, current_bandwidth, atol=0.0001) + else: + band_convergence = all(np.isclose(previous_bandwidth, current_bandwidth, atol=0.0001)) + + convergence = band_convergence and prev_convergence self.bandwidth = current_bandwidth print('bandwidth=', current_bandwidth) @@ -74,7 +83,6 @@ class KDEyMLauto(KDEyML): return current_prevalence def optim_minimize_prevalence(self, current_bandwidth, current_prev, tr_posteriors, tr_y, te_posteriors, classes): - epsilon = 1e-10 mix_densities = self.get_mixture_components(tr_posteriors, tr_y, classes, current_bandwidth) test_densities = [self.pdf(kde_i, te_posteriors) for kde_i in mix_densities] @@ -86,7 +94,6 @@ class KDEyMLauto(KDEyML): return optim_minimize(neg_loglikelihood_prev, current_prev) def optim_minimize_bandwidth(self, current_bandwidth, current_prev, tr_posteriors, tr_y, te_posteriors, classes): - epsilon = 1e-10 def neg_loglikelihood_bandwidth(bandwidth): mix_densities = self.get_mixture_components(tr_posteriors, tr_y, classes, bandwidth[0]) test_densities = [self.pdf(kde_i, te_posteriors) for kde_i in mix_densities] @@ -100,7 +107,6 @@ class KDEyMLauto(KDEyML): return r.x[0] def optim_minimize_both(self, current_bandwidth, current_prev, tr_posteriors, tr_y, te_posteriors, classes): - epsilon = 1e-10 n_classes = len(current_prev) def neg_loglikelihood_bandwidth(prevalence_bandwidth): bandwidth = prevalence_bandwidth[-1] @@ -122,7 +128,6 @@ class KDEyMLauto(KDEyML): return current_prevalence, current_bandwidth def optim_minimize_both_fine(self, current_bandwidth, current_prev, tr_posteriors, tr_y, te_posteriors, classes): - epsilon = 1e-10 n_classes = len(current_bandwidth) def neg_loglikelihood_bandwidth(prevalence_bandwidth): prevalence = prevalence_bandwidth[:n_classes] @@ -143,6 +148,35 @@ class KDEyMLauto(KDEyML): current_bandwidth = prev_band[n_classes:] return current_prevalence, current_bandwidth + def optim_minimize_like(self, tr_posteriors, tr_y, te_posteriors, classes, reduction=100, grid=True): + n_classes = len(classes) + + # reduce samples to speed up computation + posteriors_subsample = LabelledCollection(tr_posteriors, tr_y) + posteriors_subsample = posteriors_subsample.sampling(reduction*n_classes) + n_test = te_posteriors.shape[0] + subsample_index = np.random.choice(np.arange(n_test), size=reduction) + te_posterior_subsample = te_posteriors[subsample_index] + + if grid: + _, best_band = self.choose_bandwidth_maxlikelihood_grid(*posteriors_subsample.Xy, te_posterior_subsample, classes) + else: + best_band = self.choose_bandwidth_maxlikelihood_search(*posteriors_subsample.Xy, te_posterior_subsample, classes) + + mix_densities = self.get_mixture_components(tr_posteriors, tr_y, classes, best_band) + test_densities = [self.pdf(kde_i, te_posteriors) for kde_i in mix_densities] + + def neg_loglikelihood_prev(prev): + test_mixture_likelihood = sum(prev_i * dens_i for prev_i, dens_i in zip(prev, test_densities)) + test_loglikelihood = np.log(test_mixture_likelihood + epsilon) + return -np.sum(test_loglikelihood) + + init_prev = np.full(fill_value=1 / n_classes, shape=(n_classes,)) + pred_prev, neglikelihood = optim_minimize(neg_loglikelihood_prev, init_prev, return_loss=True) + + return pred_prev, best_band + + def aggregation_fit(self, classif_predictions: LabelledCollection, data: LabelledCollection): self.classif_predictions = classif_predictions return self @@ -150,8 +184,58 @@ class KDEyMLauto(KDEyML): def aggregate(self, posteriors: np.ndarray): return self.transduce(self.classif_predictions, posteriors) + def choose_bandwidth_maxlikelihood_grid(self, tr_posteriors, tr_y, te_posteriors, classes): + n_classes = len(classes) + best_band = None + best_like = None + best_prev = None + init_prev = np.full(fill_value=1 / n_classes, shape=(n_classes,)) + for bandwidth in np.logspace(-4, 0.5, 50): + mix_densities = self.get_mixture_components(tr_posteriors, tr_y, classes, bandwidth) + test_densities = [self.pdf(kde_i, te_posteriors) for kde_i in mix_densities] -def optim_minimize(loss: Callable, init_prev: np.ndarray): + def neg_loglikelihood_prev(prev): + test_mixture_likelihood = sum(prev_i * dens_i for prev_i, dens_i in zip(prev, test_densities)) + test_loglikelihood = np.log(test_mixture_likelihood + epsilon) + return -np.sum(test_loglikelihood) + + pred_prev, neglikelihood = optim_minimize(neg_loglikelihood_prev, init_prev, return_loss=True) + + if best_like is None or neglikelihood < best_like: + best_like = neglikelihood + best_band = bandwidth + best_prev = pred_prev + + print(f'best-like={best_like:.4f}') + print(f'best-band={best_band:.4f}') + return best_prev, best_band + + def choose_bandwidth_maxlikelihood_search(self, tr_posteriors, tr_y, te_posteriors, classes): + n_classes = len(classes) + init_prev = np.full(fill_value=1 / n_classes, shape=(n_classes,)) + + def neglikelihood_band(bandwidth): + mix_densities = self.get_mixture_components(tr_posteriors, tr_y, classes, bandwidth) + test_densities = [self.pdf(kde_i, te_posteriors) for kde_i in mix_densities] + + def neg_loglikelihood_prev(prev): + test_mixture_likelihood = sum(prev_i * dens_i for prev_i, dens_i in zip(prev, test_densities)) + test_loglikelihood = np.log(test_mixture_likelihood + epsilon) + return -np.sum(test_loglikelihood) + + pred_prev, neglikelihood = optim_minimize(neg_loglikelihood_prev, init_prev, return_loss=True) + + return neglikelihood + + bounds = [(0.0001, 1)] + r = optimize.minimize(neglikelihood_band, x0=[0.001], method='SLSQP', bounds=bounds) + + best_band = r.x[0] + print(f'solved in nit={r.nit}') + return best_band + + +def optim_minimize(loss: Callable, init_prev: np.ndarray, return_loss=False): """ Searches for the optimal prevalence values, i.e., an `n_classes`-dimensional vector of the (`n_classes`-1)-simplex that yields the smallest lost. This optimization is carried out by means of a constrained search using scipy's @@ -165,7 +249,10 @@ def optim_minimize(loss: Callable, init_prev: np.ndarray): # solutions are bounded to those contained in the unit-simplex bounds = tuple((0, 1) for _ in range(n_classes)) # values in [0,1] constraints = ({'type': 'eq', 'fun': lambda x: 1 - sum(x)}) # values summing up to 1 - r = optimize.minimize(loss, x0=init_prev, method='SLSQP', bounds=bounds, constraints=constraints) - print(f'iterations-prevalence={r.nit}') - return r.x + r = optimize.minimize(loss, x0=init_prev, method='SLSQP', bounds=bounds, constraints=constraints, tol=1e-10) + # print(f'iterations-prevalence={r.nit}') + if return_loss: + return r.x, r.fun + else: + return r.x diff --git a/KDEy/quantification_evaluation.py b/KDEy/quantification_evaluation.py index 6bc2c43..ecdde23 100644 --- a/KDEy/quantification_evaluation.py +++ b/KDEy/quantification_evaluation.py @@ -34,7 +34,7 @@ def wrap_hyper(classifier_hyper_grid: dict): METHODS = [ ('PACC', PACC(newLR()), wrap_hyper(logreg_grid)), ('EMQ', EMQ(newLR()), wrap_hyper(logreg_grid)), - ('KDEy-ML', KDEyML(newLR()), {**wrap_hyper(logreg_grid), **{'bandwidth': np.linspace(0.01, 0.2, 20)}}), + ('KDEy-ML', KDEyML(newLR()), {**wrap_hyper(logreg_grid), **{'bandwidth': np.logspace(-3, 0.5, 50)}}), ] @@ -46,9 +46,11 @@ TKDEyML4 es como ML2 pero max 5 iteraciones por optimización """ TRANSDUCTIVE_METHODS = [ #('TKDEy-ML', KDEyMLauto(newLR()), None), - ('TKDEy-MLboth', KDEyMLauto(newLR(), optim='both'), None), - ('TKDEy-MLbothfine', KDEyMLauto(newLR(), optim='both_fine'), None), - ('TKDEy-ML2', KDEyMLauto(newLR()), None), + # ('TKDEy-MLboth', KDEyMLauto(newLR(), optim='both'), None), + # ('TKDEy-MLbothfine', KDEyMLauto(newLR(), optim='both_fine'), None), + # ('TKDEy-ML2', KDEyMLauto(newLR()), None), + ('TKDEy-MLike', KDEyMLauto(newLR(), optim='max_likelihood'), None), + ('TKDEy-MLike2', KDEyMLauto(newLR(), optim='max_likelihood2'), None), #('TKDEy-ML3', KDEyMLauto(newLR()), None), #('TKDEy-ML4', KDEyMLauto(newLR()), None), ] @@ -58,18 +60,17 @@ def show_results(result_path): df = pd.read_csv(result_path + '.csv', sep='\t') pd.set_option('display.max_columns', None) pd.set_option('display.max_rows', None) - pv = df.pivot_table(index='Dataset', columns="Method", values=["MAE", "MRAE", "t_train"], margins=True) + pd.set_option('display.width', 1000) # Ajustar el ancho máximo + pv = df.pivot_table(index='Dataset', columns="Method", values=["MAE"], margins=True) + print(pv) + pv = df.pivot_table(index='Dataset', columns="Method", values=["MRAE"], margins=True) + print(pv) + pv = df.pivot_table(index='Dataset', columns="Method", values=["KLD"], margins=True) + print(pv) + pv = df.pivot_table(index='Dataset', columns="Method", values=["TR-TIME"], margins=True) + print(pv) + pv = df.pivot_table(index='Dataset', columns="Method", values=["TE-TIME"], margins=True) print(pv) - - -def load_timings(result_path): - import pandas as pd - timings = defaultdict(lambda: {}) - if not Path(result_path + '.csv').exists(): - return timings - - df = pd.read_csv(result_path + '.csv', sep='\t') - return timings | df.pivot_table(index='Dataset', columns='Method', values='t_train').to_dict() if __name__ == '__main__': @@ -83,20 +84,19 @@ if __name__ == '__main__': os.makedirs(result_dir, exist_ok=True) global_result_path = f'{result_dir}/allmethods' - timings = load_timings(global_result_path) with open(global_result_path + '.csv', 'wt') as csv: - csv.write(f'Method\tDataset\tMAE\tMRAE\tt_train\n') + csv.write(f'Method\tDataset\tMAE\tMRAE\tKLD\tTR-TIME\tTE-TIME\n') for method_name, quantifier, param_grid in METHODS + TRANSDUCTIVE_METHODS: print('Init method', method_name) with open(global_result_path + '.csv', 'at') as csv: - for dataset in qp.datasets.UCI_MULTICLASS_DATASETS[:4]: + for dataset in qp.datasets.UCI_MULTICLASS_DATASETS: print('init', dataset) - local_result_path = os.path.join(Path(global_result_path).parent, - method_name + '_' + dataset + '.dataframe') + # run_experiment(global_result_path, method_name, quantifier, param_grid, dataset) + local_result_path = os.path.join(Path(global_result_path).parent, method_name + '_' + dataset + '.dataframe') if os.path.exists(local_result_path): print(f'result file {local_result_path} already exist; skipping') @@ -106,51 +106,47 @@ if __name__ == '__main__': with qp.util.temp_seed(SEED): data = qp.datasets.fetch_UCIMulticlassDataset(dataset, verbose=True) + train, test = data.train_test - if not method_name.startswith("TKDEy-ML"): - # model selection - train, test = data.train_test + transductive_names = [name for (name, *_) in TRANSDUCTIVE_METHODS] + + if method_name not in transductive_names: + # model selection (train) train, val = train.split_stratified(random_state=SEED) - protocol = UPP(val, repeats=n_bags_val) modsel = GridSearchQ( quantifier, param_grid, protocol, refit=True, n_jobs=-1, verbose=1, error='mae' ) - t_init = time() try: modsel.fit(train) - print(f'best params {modsel.best_params_}') print(f'best score {modsel.best_score_}') - quantifier = modsel.best_model() except: print('something went wrong... trying to fit the default model') quantifier.fit(train) - timings[method_name][dataset] = time() - t_init + train_time = time() - t_init - protocol = UPP(test, repeats=n_bags_test) - report = qp.evaluation.evaluation_report( - quantifier, protocol, error_metrics=['mae', 'mrae'], verbose=True - ) - report.to_csv(local_result_path) else: - # model selection - train, test = data.train_test + # transductive t_init = time() - quantifier.fit(train) - timings[method_name][dataset] = time() - t_init + quantifier.fit(train) # <-- nothing actually (proyects the X into posteriors only) + train_time = time() - t_init - protocol = UPP(test, repeats=n_bags_test) - report = qp.evaluation.evaluation_report( - quantifier, protocol, error_metrics=['mae', 'mrae'], verbose=True - ) - report.to_csv(local_result_path) + # test + t_init = time() + protocol = UPP(test, repeats=n_bags_test) + report = qp.evaluation.evaluation_report( + quantifier, protocol, error_metrics=['mae', 'mrae', 'kld'], verbose=True + ) + test_time = time() - t_init + report['tr_time'] = train_time + report['te_time'] = test_time + report.to_csv(local_result_path) means = report.mean(numeric_only=True) - csv.write( - f'{method_name}\t{dataset}\t{means["mae"]:.5f}\t{means["mrae"]:.5f}\n')#\t{timings[method_name][dataset]:.3f}\n') + csv.write(f'{method_name}\t{dataset}\t{means["mae"]:.5f}\t{means["mrae"]:.5f}\t{means["kld"]:.5f}\t{means["tr_time"]:.3f}\t{means["te_time"]:.3f}\n') csv.flush() show_results(global_result_path) \ No newline at end of file diff --git a/KDEy/quantification_evaluation_debug.py b/KDEy/quantification_evaluation_debug.py new file mode 100644 index 0000000..3b4e917 --- /dev/null +++ b/KDEy/quantification_evaluation_debug.py @@ -0,0 +1,133 @@ +import pickle +import os +from time import time +from collections import defaultdict +from tqdm import tqdm +import numpy as np +from sklearn.linear_model import LogisticRegression + +import quapy as qp +from KDEy.kdey_devel import KDEyMLauto, optim_minimize +from method._kdey import KDEBase +from quapy.method.aggregative import PACC, EMQ, KDEyML +from quapy.model_selection import GridSearchQ +from quapy.protocol import UPP +from pathlib import Path +from quapy import functional as F + +SEED = 1 + + +def newLR(): + return LogisticRegression(max_iter=1000)#, C=1, class_weight='balanced') + +SAMPLE_SIZE=150 +qp.environ['SAMPLE_SIZE'] = SAMPLE_SIZE + +epsilon = 1e-10 +# n_bags_test = 2 +DATASETS = [qp.datasets.UCI_MULTICLASS_DATASETS[21]] +# DATASETS = qp.datasets.UCI_MULTICLASS_DATASETS +for i, dataset in enumerate(DATASETS): + data = qp.datasets.fetch_UCIMulticlassDataset(dataset) + n_classes = data.n_classes + print(f'{i=}') + print(f'{dataset=}') + print(f'{n_classes=}') + print(len(data.training)) + print(len(data.test)) + + train, test = data.train_test + train_prev = train.prevalence() + test_prev = test.prevalence() + + print(f'train-prev = {F.strprev(train_prev)}') + print(f'test-prev = {F.strprev(test_prev)}') + + # protocol = UPP(test, repeats=n_bags_test) + # + # for sample, prev in protocol(): + # print(f'sample-prev = {F.strprev(prev)}') + + # prev = np.asarray([0.2, 0.3, 0.5]) + # prev = np.asarray([0.33, 0.33, 0.34]) + # prev = train_prev + + # sample = test.sampling(SAMPLE_SIZE, *prev, random_state=1) + # print(f'sample-prev = {F.strprev(prev)}') + + repeats = 10 + prot = UPP(test, sample_size=SAMPLE_SIZE, repeats=repeats) + kde = KDEyMLauto(newLR()) + kde.fit(train) + tr_posteriors, tr_y = kde.classif_predictions.Xy + for it, (sample, prev) in tqdm(enumerate(prot()), total=repeats): + te_posteriors = kde.classifier.predict_proba(sample) + classes = train.classes_ + + xaxis = [] + ae_error = [] + rae_error = [] + mse_error = [] + kld_error = [] + likelihood_val = [] + + # for bandwidth in np.linspace(0.01, 0.2, 50): + for bandwidth in np.logspace(-3, 0.5, 50): + mix_densities = kde.get_mixture_components(tr_posteriors, tr_y, classes, bandwidth) + test_densities = [kde.pdf(kde_i, te_posteriors) for kde_i in mix_densities] + + def neg_loglikelihood_prev(prev): + test_mixture_likelihood = sum(prev_i * dens_i for prev_i, dens_i in zip(prev, test_densities)) + test_loglikelihood = np.log(test_mixture_likelihood + epsilon) + return -np.sum(test_loglikelihood) + + init_prev = np.full(fill_value=1 / n_classes, shape=(n_classes,)) + pred_prev, likelihood = optim_minimize(neg_loglikelihood_prev, init_prev, return_loss=True) + + xaxis.append(bandwidth) + ae_error.append(qp.error.ae(prev, pred_prev)) + rae_error.append(qp.error.rae(prev, pred_prev)) + mse_error.append(qp.error.mse(prev, pred_prev)) + kld_error.append(qp.error.kld(prev, pred_prev)) + likelihood_val.append(likelihood) + + import matplotlib.pyplot as plt + + # Crear la figura + fig, ax1 = plt.subplots(figsize=(8, 6)) + + # Pintar las series ae_error, rae_error, y kld_error en el primer eje Y + ax1.plot(xaxis, ae_error, label='AE Error', marker='o', color='b') + ax1.plot(xaxis, rae_error, label='RAE Error', marker='s', color='g') + ax1.plot(xaxis, kld_error, label='KLD Error', marker='^', color='r') + ax1.plot(xaxis, mse_error, label='MSE Error', marker='^', color='c') + ax1.set_xscale('log') + + # Configurar etiquetas para el primer eje Y + ax1.set_xlabel('Bandwidth') + ax1.set_ylabel('Error Value') + ax1.grid(True) + ax1.legend(loc='upper left') + + # Crear un segundo eje Y que comparte el eje X + ax2 = ax1.twinx() + + # Pintar likelihood_val en el segundo eje Y + ax2.plot(xaxis, likelihood_val, label='(neg)Likelihood', marker='x', color='purple') + + # Configurar etiquetas para el segundo eje Y + ax2.set_ylabel('Likelihood Value') + ax2.legend(loc='upper right') + + # Mostrar el gráfico + plt.title('Error Metrics vs Bandwidth') + # plt.show() + os.makedirs('./plots/likelihood/', exist_ok=True) + plt.savefig(f'./plots/likelihood/fig{it}.png') + plt.close() + + + + +