From 9fb208fe4cbd7e980fd7a01ab006b6d75b1d8cea Mon Sep 17 00:00:00 2001 From: Alejandro Moreo Date: Wed, 18 Sep 2024 10:33:58 +0200 Subject: [PATCH] switch --- KDEy/experiments.py | 136 +++++++++++++++++++++++--------------------- KDEy/utils.py | 13 +++-- 2 files changed, 80 insertions(+), 69 deletions(-) diff --git a/KDEy/experiments.py b/KDEy/experiments.py index b3c270e..524e7a3 100644 --- a/KDEy/experiments.py +++ b/KDEy/experiments.py @@ -1,16 +1,16 @@ import os import pickle - +import shutil import numpy as np from sklearn.linear_model import LogisticRegression from os.path import join import quapy as qp from quapy.protocol import UPP from kdey_devel import KDEyML +from utils import measuretime - -DEBUG = False +DEBUG = True qp.environ["SAMPLE_SIZE"] = 100 if DEBUG else 500 val_repeats = 100 if DEBUG else 500 @@ -23,20 +23,24 @@ val_choice = {} bandwidth_range = np.linspace(0.01, 0.20, 20) if DEBUG: - bandwidth_range = np.linspace(0.01, 0.20, 10) + bandwidth_range = np.linspace(0.01, 0.20, 5) def datasets(): - for dataset_name in qp.datasets.UCI_MULTICLASS_DATASETS: + dataset_list = qp.datasets.UCI_MULTICLASS_DATASETS + if DEBUG: + dataset_list = dataset_list[:4] + for dataset_name in dataset_list: dataset = qp.datasets.fetch_UCIMulticlassDataset(dataset_name) if DEBUG: dataset = dataset.reduce(random_state=0) yield dataset -def predict_b_modsel(train): - tinit = 0 +@measuretime +def predict_b_modsel(dataset): # bandwidth chosen during model selection in validation + train = dataset.training train_tr, train_va = train.split_stratified(random_state=0) kdey = KDEyML(random_state=0) modsel = qp.model_selection.GridSearchQ( @@ -49,74 +53,73 @@ def predict_b_modsel(train): ).fit(train_tr) chosen_bandwidth = modsel.best_params_['bandwidth'] modsel_choice = float(chosen_bandwidth) - tend = + # kdey.set_params(bandwidth=chosen_bandwidth) + # kdey.fit(train) + # kdey.qua return modsel_choice -def experiment_dataset(dataset): + +def in_test_search(dataset, n_jobs=-1): train, test = dataset.train_test - test_gen = UPP(test, repeats=test_repeats) - # bandwidth chosen during model selection in validation - train_tr, train_va = train.split_stratified(random_state=0) - kdey = KDEyML(random_state=0) - modsel = qp.model_selection.GridSearchQ( - model=kdey, - param_grid={'bandwidth': bandwidth_range}, - protocol=UPP(train_va, repeats=val_repeats), - refit=False, - n_jobs=-1, - verbose=True - ).fit(train_tr) - chosen_bandwidth = modsel.best_params_['bandwidth'] - modsel_choice = float(chosen_bandwidth) - - # results in test print(f"testing KDEy in {dataset.name}") - dataset_results = [] - for b in bandwidth_range: - kdey = KDEyML(bandwidth=b, random_state=0) + + def experiment_job(bandwidth): + kdey = KDEyML(bandwidth=bandwidth, random_state=0) kdey.fit(train) - + test_gen = UPP(test, repeats=test_repeats) mae = qp.evaluation.evaluate(kdey, protocol=test_gen, error_metric='mae', verbose=True) - print(f'bandwidth={b}: {mae:.5f}') - dataset_results.append((float(b), float(mae))) + print(f'{bandwidth=}: {mae:.5f}') + return float(mae) - return modsel_choice, dataset_results + dataset_results = qp.util.parallel(experiment_job, bandwidth_range, n_jobs=n_jobs) + return dataset_results, bandwidth_range -def plot_bandwidth(val_choice, test_results): - for dataset_name in val_choice.keys(): - import matplotlib.pyplot as plt - bandwidths, results = zip(*test_results[dataset_name]) +def plot_bandwidth(dataset_name, test_results, bandwidths, triplet_list_results): + import matplotlib.pyplot as plt - print(dataset_name) - print(bandwidths) - print(results) + print("PLOT", dataset_name) + print(dataset_name) - # Crear la gráfica - plt.figure(figsize=(8, 6)) + plt.figure(figsize=(8, 6)) - # Graficar los puntos de datos - plt.plot(bandwidths, results, marker='o') + # show test results + plt.plot(bandwidths, test_results, marker='o') - # Agregar la línea vertical en bandwidth_chosen - plt.axvline(x=val_choice[dataset_name], color='r', linestyle='--', label=f'bandwidth mod-sel: {val_choice[dataset_name]}') + for (method_name, method_choice, method_time) in triplet_list_results: + plt.axvline(x=method_choice, linestyle='--', label=method_name) - # Agregar etiquetas y título - plt.xlabel('Bandwidth') - plt.ylabel('MAE') - plt.title(dataset_name) + # Agregar etiquetas y título + plt.xlabel('Bandwidth') + plt.ylabel('MAE') + plt.title(dataset_name) - # Mostrar la leyenda - plt.legend() + # Mostrar la leyenda + plt.legend(loc='center left', bbox_to_anchor=(1, 0.5)) - # Mostrar la gráfica - plt.grid(True) - # plt.show() - os.makedirs('./plots', exist_ok=True) - plt.savefig(f'./plots/{dataset_name}.png') - plt.close() + # Mostrar la gráfica + plt.grid(True) + plotdir = './plots' + if DEBUG: + plotdir = './plots_debug' + os.makedirs(plotdir, exist_ok=True) + plt.tight_layout() + plt.savefig(f'{plotdir}/{dataset_name}.png') + plt.close() + +def error_table(dataset_name, test_results, bandwidth_range, triplet_list_results): + best_bandwidth = bandwidth_range[np.argmin(test_results)] + print(f'Method\tChoice\tAE\tTime') + for method_name, method_choice, took in triplet_list_results: + if method_choice in bandwidth_range: + index = np.where(bandwidth_range == method_choice)[0][0] + method_score = test_results[index] + else: + method_score = 1 + error = np.abs(best_bandwidth-method_score) + print(f'{method_name}\t{method_choice}\t{error}\t{took:.3}s') for dataset in datasets(): @@ -124,20 +127,25 @@ for dataset in datasets(): print(len(dataset.training)) print(len(dataset.test)) + result_path = f'./results/{dataset.name}/' if DEBUG: - result_path = f'./results/debug/{dataset.name}.pkl' - else: - result_path = f'./results/{dataset.name}.pkl' + result_path = result_path.replace('results', 'results_debug') + if os.path.exists(result_path): + shutil.rmtree(result_path) - modsel_choice, dataset_results = qp.util.pickled_resource(result_path, experiment_dataset, dataset) - val_choice[dataset.name] = modsel_choice - test_results[dataset.name] = dataset_results + dataset_results, bandwidth_range = qp.util.pickled_resource(join(result_path, 'test.pkl'), in_test_search, dataset) + + triplet_list_results = [] + modsel_choice, modsel_time = qp.util.pickled_resource(join(result_path, 'modsel.pkl'), predict_b_modsel, dataset) + triplet_list_results.append(('modsel', modsel_choice, modsel_time,)) print(f'Dataset = {dataset.name}') print(modsel_choice) print(dataset_results) -plot_bandwidth(val_choice, test_results) + plot_bandwidth(dataset.name, dataset_results, bandwidth_range, triplet_list_results) + error_table(dataset.name, dataset_results, bandwidth_range, triplet_list_results) + # time_table(dataset.name, dataset_results, bandwidth_range, triplet_list_results) diff --git a/KDEy/utils.py b/KDEy/utils.py index 0e09f9a..c673378 100644 --- a/KDEy/utils.py +++ b/KDEy/utils.py @@ -4,9 +4,12 @@ from functools import wraps def measuretime(func): @wraps(func) def wrapper(*args, **kwargs): - start_time = time.time() # inicia el contador de tiempo - result = func(*args, **kwargs) # ejecuta la función original - end_time = time.time() # finaliza el contador de tiempo - time_it_took = end_time - start_time # calcula el tiempo total - return result, time_it_took # devuelve el resultado y el tiempo + start_time = time.time() + result = func(*args, **kwargs) + end_time = time.time() + time_it_took = end_time - start_time + if isinstance(result, tuple): + return (*result, time_it_took) + else: + return result, time_it_took return wrapper \ No newline at end of file