trying optimizing both prev and band at the same time, and per-class bandwidth

switch
2024-09-22 22:47:07 +02:00 · 2024-09-18 10:33:58 +02:00 · 2024-09-17 10:57:46 +02:00 · 2024-09-17 10:02:08 +02:00
10 changed files with 533 additions and 60 deletions
--- a/.gitmodules
+++ b/.gitmodules
@ -0,0 +1,3 @@
 [submodule "result_table"]
 	path = result_table
 	url = gitea@gitea-s2i2s.isti.cnr.it:moreo/result_table.git
--- a/KDEy/constants.py
+++ b/KDEy/constants.py
@ -0,0 +1,2 @@
 DEBUG = False
--- a/KDEy/experiments.py
+++ b/KDEy/experiments.py
@ -1,13 +1,17 @@
 import os
-
+import pickle
 import shutil
 import numpy as np
 from sklearn.linear_model import LogisticRegression
 from os.path import join
 import quapy as qp
 from quapy.protocol import UPP
 from quapy.method.aggregative import KDEyML
 from quapy.protocol import UPP
 from kdey_devel import KDEyMLauto
 from utils import *
 from constants import *
 import quapy.functional as F
 DEBUG = False
 qp.environ["SAMPLE_SIZE"] = 100 if DEBUG else 500
 val_repeats  = 100 if DEBUG else 500
@ -20,21 +24,24 @@ val_choice = {}
 bandwidth_range = np.linspace(0.01, 0.20, 20)
 if DEBUG:
-    bandwidth_range = np.linspace(0.01, 0.20, 10)
+    bandwidth_range = np.linspace(0.01, 0.20, 5)
 def datasets():
-    for dataset_name in qp.datasets.UCI_MULTICLASS_DATASETS:
+    dataset_list = qp.datasets.UCI_MULTICLASS_DATASETS[:4]
    if DEBUG:
        dataset_list = dataset_list[:4]
    for dataset_name in dataset_list:
        dataset = qp.datasets.fetch_UCIMulticlassDataset(dataset_name)
        if DEBUG:
            dataset = dataset.reduce(random_state=0)
        yield dataset
-def experiment_dataset(dataset):
+@measuretime
-    train, test = dataset.train_test
+def predict_b_modsel(dataset):
    test_gen = UPP(test, repeats=test_repeats)
    # bandwidth chosen during model selection in validation
    train = dataset.training
    train_tr, train_va = train.split_stratified(random_state=0)
    kdey = KDEyML(random_state=0)
    modsel = qp.model_selection.GridSearchQ(
@ -47,66 +54,69 @@ def experiment_dataset(dataset):
    ).fit(train_tr)
    chosen_bandwidth = modsel.best_params_['bandwidth']
    modsel_choice = float(chosen_bandwidth)
    # kdey.set_params(bandwidth=chosen_bandwidth)
    # kdey.fit(train)
    # kdey.qua
    return modsel_choice
-    # results in test
+@measuretime
-    print(f"testing KDEy in {dataset.name}")
+def predict_b_kdeymlauto(dataset):
-    dataset_results = []
+    # bandwidth chosen during model selection in validation
-    for b in bandwidth_range:
+    train, test = dataset.train_test
-        kdey = KDEyML(bandwidth=b, random_state=0)
+    kdey = KDEyMLauto(random_state=0)
    print(f'true-prevalence: {F.strprev(test.prevalence())}')
    chosen_bandwidth, _ = kdey.chose_bandwidth(train, test.X)
    auto_bandwidth = float(chosen_bandwidth)
    return auto_bandwidth
 def in_test_search(dataset, n_jobs=-1):
    train, test = dataset.train_test
    print(f"generating true tests scores using KDEy in {dataset.name}")
    def experiment_job(bandwidth):
        kdey = KDEyML(bandwidth=bandwidth, random_state=0)
        kdey.fit(train)
-
+        test_gen = UPP(test, repeats=test_repeats)
        mae = qp.evaluation.evaluate(kdey, protocol=test_gen, error_metric='mae', verbose=True)
-        print(f'bandwidth={b}: {mae:.5f}')
+        print(f'{bandwidth=}: {mae:.5f}')
-        dataset_results.append((float(b), float(mae)))
+        return float(mae)
-    return modsel_choice, dataset_results
+    dataset_results = qp.util.parallel(experiment_job, bandwidth_range, n_jobs=n_jobs)
    return dataset_results, bandwidth_range
 def plot_bandwidth(val_choice, test_results):
    """
    Saves, for every dataset in `val_choice`, a plot of the test results as a function of the bandwidth,
    with a vertical line marking the bandwidth stored in `val_choice` for that dataset.

    NOTE(review): this definition appears to shadow the 4-argument `plot_bandwidth` that `from utils import *`
    brings in, while the call further down in this file passes four arguments -- confirm which one is intended.

    :param val_choice: dict mapping each dataset name to the chosen bandwidth
    :param test_results: dict mapping each dataset name to an iterable of (bandwidth, result) pairs
    """
    for dataset_name in val_choice.keys():
        import matplotlib.pyplot as plt
        bandwidths, results = zip(*test_results[dataset_name])
        # create the figure
        plt.figure(figsize=(8, 6))
        # plot the data points
        plt.plot(bandwidths, results, marker='o')
        # add the vertical line at the chosen bandwidth
        plt.axvline(x=val_choice[dataset_name], color='r', linestyle='--', label=f'Bandwidth elegido: {val_choice[dataset_name]}')
        # add axis labels and title
        plt.xlabel('Bandwidth')
        plt.ylabel('Resultado')
        plt.title('Gráfica de Bandwidth vs Resultado')
        # show the legend
        plt.legend()
        # show the grid (the figure is written to disk rather than displayed)
        plt.grid(True)
        # plt.show()
        os.makedirs('./plots', exist_ok=True)
        plt.savefig(f'./plots/{dataset_name}.png')
 for dataset in datasets():
-    if DEBUG:
+    print('NAME', dataset.name)
-        result_path = f'./results/debug/{dataset.name}.pkl'
+    print(len(dataset.training))
-    else:
+    print(len(dataset.test))
        result_path = f'./results/{dataset.name}.pkl'
-    modsel_choice, dataset_results = qp.util.pickled_resource(result_path, experiment_dataset, dataset)
+    result_path = f'./results/{dataset.name}/'
-    val_choice[dataset.name] = modsel_choice
+    if DEBUG:
-    test_results[dataset.name] = dataset_results
+        result_path = result_path.replace('results', 'results_debug')
        if os.path.exists(result_path):
            shutil.rmtree(result_path)
    dataset_results, bandwidth_range = qp.util.pickled_resource(join(result_path, 'test.pkl'), in_test_search, dataset)
    triplet_list_results = []
    modsel_choice, modsel_time = qp.util.pickled_resource(join(result_path, 'modsel.pkl'), predict_b_modsel, dataset)
    triplet_list_results.append(('modsel', modsel_choice, modsel_time,))
    auto_choice, auto_time = qp.util.pickled_resource(join(result_path, 'auto.pkl'), predict_b_kdeymlauto, dataset)
    triplet_list_results.append(('auto', auto_choice, auto_time,))
    print(f'Dataset = {dataset.name}')
    print(modsel_choice)
    print(dataset_results)
-plot_bandwidth(val_choice, test_results)
+    plot_bandwidth(dataset.name, dataset_results, bandwidth_range, triplet_list_results)
    error_table(dataset.name, dataset_results, bandwidth_range, triplet_list_results)
    # time_table(dataset.name, dataset_results, bandwidth_range, triplet_list_results)
--- a/KDEy/kdey_devel.py
+++ b/KDEy/kdey_devel.py
@ -0,0 +1,171 @@
 from typing import Union, Callable
 import numpy as np
 from sklearn.base import BaseEstimator
 from sklearn.neighbors import KernelDensity
 import quapy as qp
 from quapy.data import LabelledCollection
 from quapy.method.aggregative import AggregativeSoftQuantifier, KDEyML
 import quapy.functional as F
 from sklearn.metrics.pairwise import rbf_kernel
 from scipy import optimize
 class KDEyMLauto(KDEyML):
    """
    Transductive variant of :class:`KDEyML` in which the bandwidth is not a hyperparameter but is
    estimated, together with the class prevalence values, by minimizing the negative log-likelihood of
    the test posteriors under the mixture of class-conditional KDEs.

    :param classifier: the underlying classifier (resolved through `qp._get_classifier`)
    :param val_split: specifies how the classifier predictions on training data are generated
        (passed as `predict_on` to `classifier_fit_predict`)
    :param random_state: seed installed (via `qp.util.temp_seed`) while optimizing
    :param optim: optimization strategy; one of

        - `'two_steps'`: alternate between optimizing the prevalence (bandwidth fixed) and optimizing a
          single bandwidth (prevalence fixed) until both stabilize
        - `'both'`: optimize the prevalence and a single bandwidth jointly
        - `'both_fine'`: optimize the prevalence and one bandwidth per class jointly
    """

    # optimization strategies understood by `transduce`
    _OPTIM_MODES = ('two_steps', 'both', 'both_fine')
    # absolute tolerance below which two successive estimates are considered equal
    _CONVERGENCE_ATOL = 0.0001
    # constant added to the mixture likelihood before taking logarithms, so that log(0) never occurs
    _EPSILON = 1e-10

    def __init__(self, classifier: BaseEstimator = None, val_split=5, random_state=None, optim='two_steps'):
        self.classifier = qp._get_classifier(classifier)
        self.val_split = val_split
        self.bandwidth = None
        self.random_state = random_state
        self.optim = optim

    def chose_bandwidth(self, train, test_instances):
        """
        Fits the classifier on `train` and runs the transductive optimization against `test_instances`.

        :param train: the labelled training collection
        :param test_instances: the (unlabelled) test instances
        :return: a tuple `(bandwidth, prevalence)` with the bandwidth found (an array with one value per
            class when `optim='both_fine'`) and the estimated prevalence vector
        """
        classif_predictions = self.classifier_fit_predict(train, fit_classifier=True, predict_on=self.val_split)
        te_posteriors = self.classify(test_instances)
        prevalence = self.transduce(classif_predictions, te_posteriors)
        # `transduce` only returns the prevalence; the bandwidth it found is left in `self.bandwidth`
        return self.bandwidth, prevalence

    def transduce(self, classif_predictions, te_posteriors):
        """
        Estimates prevalence and bandwidth for the given test posteriors, repeating the optimization
        selected by `self.optim` until two successive rounds agree (up to an absolute tolerance of 1e-4)
        in both the bandwidth and the prevalence. The search starts from a bandwidth of 0.05 and the
        uniform prevalence. The bandwidth found is stored in `self.bandwidth`.

        :param classif_predictions: labelled collection of training posteriors
        :param te_posteriors: posterior probabilities of the test instances
        :return: the estimated prevalence vector
        :raises ValueError: if `self.optim` is not one of the supported strategies
        """
        # without this check an unknown strategy would never update the estimates nor flag
        # convergence, and the loop below would run forever
        if self.optim not in self._OPTIM_MODES:
            raise ValueError(f'unknown optim={self.optim!r}; valid ones are {self._OPTIM_MODES}')

        tr_posteriors, tr_y = classif_predictions.Xy
        classes = classif_predictions.classes_
        n_classes = len(classes)

        current_bandwidth = 0.05
        if self.optim == 'both_fine':
            # one bandwidth per class
            current_bandwidth = np.full(fill_value=current_bandwidth, shape=(n_classes,))
        current_prevalence = np.full(fill_value=1 / n_classes, shape=(n_classes,))

        data = (tr_posteriors, tr_y, te_posteriors, classes)
        iterations = 0
        convergence = False
        with qp.util.temp_seed(self.random_state):
            while not convergence:
                previous_bandwidth = current_bandwidth
                previous_prevalence = current_prevalence

                iterations += 1
                print(f'{iterations}:')

                if self.optim == 'two_steps':
                    current_prevalence = self.optim_minimize_prevalence(current_bandwidth, current_prevalence, *data)
                    print(f'\testim-prev={F.strprev(current_prevalence)}')
                    current_bandwidth = self.optim_minimize_bandwidth(current_bandwidth, current_prevalence, *data)
                    print(f'\tbandwidth={current_bandwidth}')
                elif self.optim == 'both':
                    current_prevalence, current_bandwidth = self.optim_minimize_both(
                        current_bandwidth, current_prevalence, *data)
                else:  # 'both_fine'
                    current_prevalence, current_bandwidth = self.optim_minimize_both_fine(
                        current_bandwidth, current_prevalence, *data)

                # np.allclose handles the scalar and the per-class bandwidth alike
                convergence = (
                    np.allclose(previous_bandwidth, current_bandwidth, atol=self._CONVERGENCE_ATOL)
                    and np.allclose(previous_prevalence, current_prevalence, atol=self._CONVERGENCE_ATOL)
                )

        self.bandwidth = current_bandwidth
        print('bandwidth=', current_bandwidth)
        print('prevalence=', current_prevalence)
        return current_prevalence

    def _test_densities(self, bandwidth, tr_posteriors, tr_y, te_posteriors, classes):
        """Returns, for each class, the values `self.pdf` yields on the test posteriors for the
        class-conditional KDE built with the given bandwidth."""
        mix_densities = self.get_mixture_components(tr_posteriors, tr_y, classes, bandwidth)
        return [self.pdf(kde_i, te_posteriors) for kde_i in mix_densities]

    def _neg_loglikelihood(self, prevalence, test_densities):
        """Returns the negative log-likelihood of the prevalence-weighted mixture of `test_densities`."""
        test_mixture_likelihood = sum(prev_i * dens_i for prev_i, dens_i in zip(prevalence, test_densities))
        test_loglikelihood = np.log(test_mixture_likelihood + self._EPSILON)
        return -np.sum(test_loglikelihood)

    def optim_minimize_prevalence(self, current_bandwidth, current_prev, tr_posteriors, tr_y, te_posteriors, classes):
        """
        Optimizes the prevalence vector while keeping the bandwidth fixed.

        :return: the prevalence vector found by `optim_minimize`, started from `current_prev`
        """
        # the bandwidth is fixed, so the densities are computed once, outside the loss
        test_densities = self._test_densities(current_bandwidth, tr_posteriors, tr_y, te_posteriors, classes)

        def neg_loglikelihood_prev(prev):
            return self._neg_loglikelihood(prev, test_densities)

        return optim_minimize(neg_loglikelihood_prev, current_prev)

    def optim_minimize_bandwidth(self, current_bandwidth, current_prev, tr_posteriors, tr_y, te_posteriors, classes):
        """
        Optimizes a single bandwidth, bounded to [1e-5, 1], while keeping the prevalence fixed.

        :return: the bandwidth found by SLSQP, started from `current_bandwidth`
        """
        def neg_loglikelihood_bandwidth(bandwidth):
            # scipy hands over a 1-element array; the KDEs take the scalar
            test_densities = self._test_densities(bandwidth[0], tr_posteriors, tr_y, te_posteriors, classes)
            return self._neg_loglikelihood(current_prev, test_densities)

        bounds = [(0.00001, 1)]
        r = optimize.minimize(neg_loglikelihood_bandwidth, x0=[current_bandwidth], method='SLSQP', bounds=bounds)
        print(f'iterations-bandwidth={r.nit}')
        return r.x[0]

    def optim_minimize_both(self, current_bandwidth, current_prev, tr_posteriors, tr_y, te_posteriors, classes):
        """
        Optimizes the prevalence vector and a single bandwidth jointly. The search vector is the
        prevalence followed by the bandwidth; prevalence values are bounded to [0, 1] and constrained to
        sum up to 1, and the bandwidth is bounded to [1e-5, 1].

        :return: a tuple `(prevalence, bandwidth)`
        """
        n_classes = len(current_prev)

        def neg_loglikelihood_bandwidth(prevalence_bandwidth):
            prevalence = prevalence_bandwidth[:-1]
            bandwidth = prevalence_bandwidth[-1]
            test_densities = self._test_densities(bandwidth, tr_posteriors, tr_y, te_posteriors, classes)
            return self._neg_loglikelihood(prevalence, test_densities)

        bounds = [(0, 1) for _ in range(n_classes)] + [(0.00001, 1)]
        constraints = ({'type': 'eq', 'fun': lambda x: 1 - sum(x[:n_classes])})
        prevalence_bandwidth = np.append(current_prev, current_bandwidth)
        r = optimize.minimize(neg_loglikelihood_bandwidth, x0=prevalence_bandwidth, method='SLSQP', bounds=bounds, constraints=constraints)
        print(f'iterations-both={r.nit}')
        prev_band = r.x
        return prev_band[:-1], prev_band[-1]

    def optim_minimize_both_fine(self, current_bandwidth, current_prev, tr_posteriors, tr_y, te_posteriors, classes):
        """
        Optimizes the prevalence vector and one bandwidth per class jointly. The search vector is the
        prevalence followed by the per-class bandwidths; prevalence values are bounded to [0, 1] and
        constrained to sum up to 1, and every bandwidth is bounded to [1e-5, 1].

        :return: a tuple `(prevalence, bandwidths)`
        """
        n_classes = len(current_bandwidth)

        def neg_loglikelihood_bandwidth(prevalence_bandwidth):
            prevalence = prevalence_bandwidth[:n_classes]
            bandwidth = prevalence_bandwidth[n_classes:]
            test_densities = self._test_densities(bandwidth, tr_posteriors, tr_y, te_posteriors, classes)
            return self._neg_loglikelihood(prevalence, test_densities)

        bounds = [(0, 1) for _ in range(n_classes)] + [(0.00001, 1) for _ in range(n_classes)]
        constraints = ({'type': 'eq', 'fun': lambda x: 1 - sum(x[:n_classes])})
        prevalence_bandwidth = np.concatenate((current_prev, current_bandwidth))
        r = optimize.minimize(neg_loglikelihood_bandwidth, x0=prevalence_bandwidth, method='SLSQP', bounds=bounds, constraints=constraints)
        print(f'iterations-both-fine={r.nit}')
        prev_band = r.x
        return prev_band[:n_classes], prev_band[n_classes:]

    def aggregation_fit(self, classif_predictions: LabelledCollection, data: LabelledCollection):
        """
        Stores the training posteriors; all the optimization is deferred to `aggregate`, since it needs
        the test posteriors.

        :param classif_predictions: labelled collection of training posteriors
        :param data: the training data (unused here)
        :return: self
        """
        self.classif_predictions = classif_predictions
        return self

    def aggregate(self, posteriors: np.ndarray):
        """
        Estimates the prevalence of the sample described by `posteriors`.

        :param posteriors: posterior probabilities of the test instances
        :return: the prevalence vector returned by `transduce`
        """
        return self.transduce(self.classif_predictions, posteriors)
 def optim_minimize(loss: Callable, init_prev: np.ndarray):
    """
    Looks for the prevalence vector that minimizes `loss`. Candidates are `n_classes`-dimensional vectors
    with every component bounded to [0, 1] and constrained to sum up to 1, and the search is delegated
    to scipy's SLSQP routine.

    :param loss: (callable) the function to minimize; it receives a candidate prevalence vector
    :param init_prev: (ndarray) the prevalence vector from which the search starts
    :return: (ndarray) the best prevalence vector found
    """
    dimensions = len(init_prev)

    def sums_to_one(candidate):
        # equality constraint: the residual is zero when the components add up to 1
        return 1 - sum(candidate)

    solution = optimize.minimize(
        loss,
        x0=init_prev,
        method='SLSQP',
        bounds=[(0, 1)] * dimensions,  # every component in [0,1]
        constraints={'type': 'eq', 'fun': sums_to_one},
    )
    print(f'iterations-prevalence={solution.nit}')
    return solution.x
--- a/KDEy/quantification_evaluation.py
+++ b/KDEy/quantification_evaluation.py
@ -0,0 +1,156 @@
 import pickle
 import os
 from time import time
 from collections import defaultdict
 import numpy as np
 from sklearn.linear_model import LogisticRegression
 import quapy as qp
 from KDEy.kdey_devel import KDEyMLauto
 from quapy.method.aggregative import PACC, EMQ, KDEyML
 from quapy.model_selection import GridSearchQ
 from quapy.protocol import UPP
 from pathlib import Path
 SEED = 1
 def newLR():
    """Returns a new `LogisticRegression` instance configured with `max_iter=3000`."""
    classifier = LogisticRegression(max_iter=3000)
    return classifier
 # typical hyperparameters explored for Logistic Regression
 logreg_grid = {
    'C': [1],
    'class_weight': [None]
 }
 def wrap_hyper(classifier_hyper_grid: dict):
    """
    Prefixes every key of a hyperparameter grid with `classifier__`.

    :param classifier_hyper_grid: dict mapping hyperparameter names to the values to explore
    :return: a new dict with the same values, in the same order, under the prefixed names
    """
    wrapped = {}
    for name, values in classifier_hyper_grid.items():
        wrapped['classifier__' + name] = values
    return wrapped
 METHODS = [
    ('PACC', PACC(newLR()), wrap_hyper(logreg_grid)),
    ('EMQ', EMQ(newLR()), wrap_hyper(logreg_grid)),
    ('KDEy-ML',  KDEyML(newLR()), {**wrap_hyper(logreg_grid), **{'bandwidth': np.linspace(0.01, 0.2, 20)}}),
 ]
 """
 TKDEyML era primero bandwidth (init 0.05) y luego prevalence (init uniform)
 TKDEyML2 era primero prevalence (init uniform) y luego bandwidth (init 0.05)
 TKDEyML3 era primero prevalence (init uniform) y luego bandwidth (init 0.1)
 TKDEyML4 es como ML2 pero max 5 iteraciones por optimización 
 """
 TRANSDUCTIVE_METHODS = [
    #('TKDEy-ML',  KDEyMLauto(newLR()), None),
    ('TKDEy-MLboth',  KDEyMLauto(newLR(), optim='both'), None),
    ('TKDEy-MLbothfine',  KDEyMLauto(newLR(), optim='both_fine'), None),
    ('TKDEy-ML2',  KDEyMLauto(newLR()), None),
    #('TKDEy-ML3',  KDEyMLauto(newLR()), None),
    #('TKDEy-ML4',  KDEyMLauto(newLR()), None),
 ]
 def show_results(result_path):
    """
    Prints a pivot table (datasets as rows, methods as columns, margins included) with the MAE, MRAE
    and t_train values read from the tab-separated file `<result_path>.csv`.

    :param result_path: path to the results file, without the `.csv` extension
    """
    import pandas as pd

    results = pd.read_csv(result_path + '.csv', sep='\t')
    # lift pandas' display limits so that the table is printed in full
    for option in ('display.max_columns', 'display.max_rows'):
        pd.set_option(option, None)
    print(results.pivot_table(index='Dataset', columns="Method", values=["MAE", "MRAE", "t_train"], margins=True))
 def load_timings(result_path):
    """
    Loads the training times recorded in the tab-separated file `<result_path>.csv`.

    :param result_path: path to the results file, without the `.csv` extension
    :return: a defaultdict indexed as `timings[method][dataset]`; methods that are not in the file
        (or every method, if the file does not exist) map to an empty dict
    """
    import pandas as pd

    timings = defaultdict(dict)
    csv_name = result_path + '.csv'
    if Path(csv_name).exists():
        table = pd.read_csv(csv_name, sep='\t')
        per_method = table.pivot_table(index='Dataset', columns='Method', values='t_train')
        # to_dict() is keyed by column (method) first, then by index (dataset)
        timings = timings | per_method.to_dict()
    return timings
 if __name__ == '__main__':
    # Evaluates every method in METHODS and TRANSDUCTIVE_METHODS on the first four UCI multiclass
    # datasets. Each (method, dataset) report is cached in its own file, and a summary row
    # (Method, Dataset, MAE, MRAE, t_train) is appended to a global tab-separated file.

    qp.environ['SAMPLE_SIZE'] = 500
    qp.environ['N_JOBS'] = -1
    n_bags_val = 25
    n_bags_test = 100
    result_dir = 'results_quantification/ucimulti'

    os.makedirs(result_dir, exist_ok=True)

    global_result_path = f'{result_dir}/allmethods'
    # timings of previous runs must be recovered *before* the summary file is truncated below
    timings = load_timings(global_result_path)
    with open(global_result_path + '.csv', 'wt') as csv:
        csv.write('Method\tDataset\tMAE\tMRAE\tt_train\n')

    for method_name, quantifier, param_grid in METHODS + TRANSDUCTIVE_METHODS:
        print('Init method', method_name)
        # transductive methods are fit directly, with no model selection
        transductive = method_name.startswith("TKDEy-ML")

        with open(global_result_path + '.csv', 'at') as csv:
            for dataset in qp.datasets.UCI_MULTICLASS_DATASETS[:4]:
                print('init', dataset)

                local_result_path = os.path.join(Path(global_result_path).parent,
                                                 method_name + '_' + dataset + '.dataframe')

                if os.path.exists(local_result_path):
                    print(f'result file {local_result_path} already exist; skipping')
                    report = qp.util.load_report(local_result_path)
                else:
                    with qp.util.temp_seed(SEED):
                        data = qp.datasets.fetch_UCIMulticlassDataset(dataset, verbose=True)
                        train, test = data.train_test

                        # the model to evaluate is kept in a local name: rebinding `quantifier` would
                        # carry the model tuned for this dataset over to the following datasets
                        model = quantifier

                        if transductive:
                            t_init = time()
                            model.fit(train)
                        else:
                            # model selection
                            train, val = train.split_stratified(random_state=SEED)
                            protocol = UPP(val, repeats=n_bags_val)
                            modsel = GridSearchQ(
                                quantifier, param_grid, protocol, refit=True, n_jobs=-1, verbose=1, error='mae'
                            )
                            t_init = time()
                            try:
                                modsel.fit(train)
                                print(f'best params {modsel.best_params_}')
                                print(f'best score {modsel.best_score_}')
                                model = modsel.best_model()
                            except Exception as e:
                                # best-effort fallback; KeyboardInterrupt/SystemExit are no longer swallowed
                                print('something went wrong... trying to fit the default model')
                                print(f'\tcause: {e!r}')
                                model.fit(train)
                        timings[method_name][dataset] = time() - t_init

                        protocol = UPP(test, repeats=n_bags_test)
                        report = qp.evaluation.evaluation_report(
                            model, protocol, error_metrics=['mae', 'mrae'], verbose=True
                        )
                        report.to_csv(local_result_path)

                means = report.mean(numeric_only=True)
                # the training time is unknown when the report comes from the cache and no previous
                # summary recorded it; NaN keeps the row aligned with the 5-column header
                t_train = timings.get(method_name, {}).get(dataset, float('nan'))
                csv.write(
                    f'{method_name}\t{dataset}\t{means["mae"]:.5f}\t{means["mrae"]:.5f}\t{t_train:.3f}\n')
                csv.flush()

    show_results(global_result_path)
--- a/KDEy/utils.py
+++ b/KDEy/utils.py
@ -0,0 +1,81 @@
 import time
 from functools import wraps
 import os
 from os.path import join
 from result_table.src.table import Table
 import numpy as np
 from constants import *
 def measuretime(func):
    """
    Decorator that appends the elapsed time (in seconds) to the value returned by `func`.

    If `func` returns a tuple, the elapsed time is added as its last element; otherwise the decorated
    function returns the pair `(result, elapsed_time)`.

    :param func: the function to time
    :return: the decorated function
    """
    @wraps(func)
    def wrapper(*args, **kwargs):
        # perf_counter is monotonic, so the measurement is not affected by system clock adjustments
        start_time = time.perf_counter()
        result = func(*args, **kwargs)
        time_it_took = time.perf_counter() - start_time
        if isinstance(result, tuple):
            return (*result, time_it_took)
        return result, time_it_took
    return wrapper
 def plot_bandwidth(dataset_name, test_results, bandwidths, triplet_list_results):
    """
    Plots the test results as a function of the bandwidth and marks, with a vertical dashed line, the
    bandwidth chosen by each method. The figure is saved as `<dataset_name>.png` in `./plots`
    (`./plots_debug` when DEBUG is set).

    :param dataset_name: name of the dataset; used as the title and as the file name
    :param test_results: the values to plot against `bandwidths` (labelled MAE in the figure)
    :param bandwidths: the bandwidth values explored
    :param triplet_list_results: list of triplets (method name, chosen bandwidth, time); the time is
        not used here
    """
    import matplotlib.pyplot as plt

    print("PLOT", dataset_name)
    print(dataset_name)

    plt.figure(figsize=(8, 6))

    # curve with the test results
    plt.plot(bandwidths, test_results, marker='o', color='k')

    # one vertical line per method, each one in a different color
    palette = plt.cm.tab10(np.linspace(0, 1, len(triplet_list_results)))
    for (method_name, method_choice, _), line_color in zip(triplet_list_results, palette):
        plt.axvline(x=method_choice, linestyle='--', label=method_name, color=line_color)

    # axis labels and title
    plt.xlabel('Bandwidth')
    plt.ylabel('MAE')
    plt.title(dataset_name)

    # legend outside the axes, to the right
    plt.legend(loc='center left', bbox_to_anchor=(1, 0.5))
    plt.grid(True)

    plotdir = './plots_debug' if DEBUG else './plots'
    os.makedirs(plotdir, exist_ok=True)

    plt.tight_layout()
    plt.savefig(f'{plotdir}/{dataset_name}.png')
    plt.close()
 def error_table(dataset_name, test_results, bandwidth_range, triplet_list_results):
    """
    Builds a table that compares, for every method, the bandwidth it chose against the bandwidth with
    the lowest test result, and renders it as `<dataset_name>.pdf` in `./tables` (`./tables_debug`
    when DEBUG is set).

    :param dataset_name: name of the dataset; used as the table name and as the file name
    :param test_results: test results, aligned with `bandwidth_range`
    :param bandwidth_range: the bandwidth values explored
    :param triplet_list_results: list of triplets (method name, chosen bandwidth, time)
    """
    best_score = np.min(test_results)
    best_bandwidth = bandwidth_range[np.argmin(test_results)]

    print('Method\tChoice\tAE\tTime')

    table = Table(name=dataset_name)
    table.format.with_mean = False
    table.format.with_rank_mean = False
    table.format.show_std = False

    for method_name, method_choice, took in triplet_list_results:
        # a choice that was not explored in the grid has no test result; a score of 1 is used instead
        if method_choice in bandwidth_range:
            position = np.where(bandwidth_range == method_choice)[0][0]
            method_score = test_results[position]
        else:
            method_score = 1
        error = np.abs(best_score - method_score)

        row = (
            ('Choice', method_choice),
            ('ScoreChoice', method_score),
            ('Best', best_bandwidth),
            ('ScoreBest', best_score),
            ('AE', error),
            ('Time', took),
        )
        for benchmark, value in row:
            table.add(benchmark=benchmark, method=method_name, v=value)

    outpath = './tables_debug' if DEBUG else './tables'
    table.latexPDF(join(outpath, dataset_name + '.pdf'), transpose=True)
--- a/quapy/init.py
+++ b/quapy/init.py
@ -14,7 +14,7 @@ from . import model_selection
 from . import classification
 import os
-__version__ = '0.1.9'
+__version__ = '0.1.10'
 environ = {
    'SAMPLE_SIZE': None,
--- a/quapy/data/datasets.py
+++ b/quapy/data/datasets.py
@ -3,6 +3,7 @@ from contextlib import contextmanager
 import zipfile
 from os.path import join
 import pandas as pd
 import sklearn.datasets
 from ucimlrepo import fetch_ucirepo
 from quapy.data.base import Dataset, LabelledCollection
 from quapy.data.preprocessing import text2tfidf, reduce_columns
@ -1004,3 +1005,49 @@ def fetch_IFCB(single_sample_train=True, for_model_selection=False, data_home=No
        return train, test_gen
    else:
        return train_gen, test_gen
 def syntheticUniformLabelledCollection(n_samples, n_features, n_classes, n_clusters_per_class=1, **kwargs):
    """
    Creates a synthetic labelled collection with uniform priors, containing `n_samples` instances
    described by `n_features` features and labelled with one out of `n_classes` classes.
    The data is generated by `sklearn.datasets.make_classification`; any additional option accepted
    by that function can be given through `kwargs` (see the `scikit-learn documentation
    <https://scikit-learn.org/stable/modules/generated/sklearn.datasets.make_classification.html>`_
    for the full list).

    :param n_samples: number of instances
    :param n_features: number of features
    :param n_classes: number of classes
    :param n_clusters_per_class: number of clusters per class (default 1)
    :return: a `LabelledCollection` with the generated instances and labels
    """
    generator_options = {
        'n_samples': n_samples,
        'n_features': n_features,
        'n_classes': n_classes,
        'n_clusters_per_class': n_clusters_per_class,
        **kwargs,
    }
    instances, labels = sklearn.datasets.make_classification(**generator_options)
    return LabelledCollection(instances, labels)
 def syntheticUniformDataset(n_samples, n_features, n_classes, test_split=0.3, **kwargs):
    """
    Creates a synthetic Dataset with approximately uniform priors, containing `n_samples` instances
    described by `n_features` features and labelled with one out of `n_classes` classes.
    The data is generated by `sklearn.datasets.make_classification`; any additional option accepted
    by that function can be given through `kwargs` (see the `scikit-learn documentation
    <https://scikit-learn.org/stable/modules/generated/sklearn.datasets.make_classification.html>`_
    for the full list). If `random_state` is among them, it also seeds the train/test split.

    :param n_samples: number of instances
    :param n_features: number of features
    :param n_classes: number of classes
    :param test_split: proportion of test instances
    :return: a `Dataset` with a stratified training/test partition of the generated data
    """
    assert 0. < test_split < 1., "invalid proportion of test instances; the value must be in (0, 1)"
    split_seed = kwargs.get('random_state', None)
    collection = syntheticUniformLabelledCollection(n_samples, n_features, n_classes, **kwargs)
    training, test = collection.split_stratified(train_prop=1-test_split, random_state=split_seed)
    return Dataset(training=training, test=test, name=f'synthetic(nF={n_features},nC={n_classes})')
--- a/quapy/method/_kdey.py
+++ b/quapy/method/_kdey.py
@ -66,11 +66,13 @@ class KDEBase:
        """
        class_cond_X = []
        for cat in classes:
-            selX = X[y==cat]
+            selX = X[y == cat]
-            if selX.size==0:
+            if selX.size == 0:
                selX = [F.uniform_prevalence(len(classes))]
            class_cond_X.append(selX)
-        return [self.get_kde_function(X_cond_yi, bandwidth) for X_cond_yi in class_cond_X]
+        if isinstance(bandwidth, float):
            bandwidth = np.full(fill_value=bandwidth, shape=(len(classes),))
        return [self.get_kde_function(X_cond_yi, band_i) for X_cond_yi, band_i in zip(class_cond_X, bandwidth)]
 class KDEyML(AggregativeSoftQuantifier, KDEBase):
@ -188,7 +190,7 @@ class KDEyHD(AggregativeSoftQuantifier, KDEBase):
    def __init__(self, classifier: BaseEstimator=None, val_split=5, divergence: str='HD',
                 bandwidth=0.1, random_state=None, montecarlo_trials=10000):
-        
+
        self.classifier = qp._get_classifier(classifier)
        self.val_split = val_split
        self.divergence = divergence
@ -218,7 +220,7 @@ class KDEyHD(AggregativeSoftQuantifier, KDEBase):
        def f_squared_hellinger(u):
            return (np.sqrt(u)-1)**2
-        
+
        # todo: this will fail when self.divergence is a callable, and is not the right place to do it anyway
        if self.divergence.lower() == 'hd':
            f = f_squared_hellinger
@ -283,7 +285,7 @@ class KDEyCS(AggregativeSoftQuantifier):
    def gram_matrix_mix_sum(self, X, Y=None):
        # this adapts the output of the rbf_kernel function (pairwise evaluations of Gaussian kernels k(x,y))
-        # to contain pairwise evaluations of N(x|mu,Sigma1+Sigma2) with mu=y and Sigma1 and Sigma2 are 
+        # to contain pairwise evaluations of N(x|mu,Sigma1+Sigma2) with mu=y and Sigma1 and Sigma2 are
        # two "scalar matrices" (h^2)*I each, so Sigma1+Sigma2 has scalar 2(h^2) (h is the bandwidth)
        h = self.bandwidth
        variance = 2 * (h**2)
@ -342,7 +344,7 @@ class KDEyCS(AggregativeSoftQuantifier):
        # at each iteration of the optimization phase)
        tr_te_sums = np.zeros(shape=n, dtype=float)
        for i in range(n):
-            tr_te_sums[i] = self.gram_matrix_mix_sum(Ptr[y==i], Pte) 
+            tr_te_sums[i] = self.gram_matrix_mix_sum(Ptr[y==i], Pte)
        def divergence(alpha):
            # called \overline{r} in the paper
--- a/1
+++ b/1
@ -0,0 +1 @@
 Subproject commit c223c9f1fe3c9708e8c5a5c56e438cdaaa857be4
Author	SHA1	Message	Date
Alejandro Moreo Fernandez	5f9dad4644	trying optimizing both prev and band at the same time, and per-class bandwidth	2024-09-22 22:47:07 +02:00
Alejandro Moreo Fernandez	9fb208fe4c	switch	2024-09-18 10:33:58 +02:00
Alejandro Moreo Fernandez	6ce5eea4f2	switch	2024-09-17 10:57:46 +02:00
Alejandro Moreo Fernandez	f30c6ceaa1	switch	2024-09-17 10:02:08 +02:00
		`@ -0,0 +1 @@`
							`Subproject commit c223c9f1fe3c9708e8c5a5c56e438cdaaa857be4`