Compare commits

...

2 Commits

18 changed files with 879 additions and 198 deletions

View File

@@ -16,68 +16,68 @@ if __name__ == '__main__':
    qp.environ['N_JOBS'] = -1
    n_bags_val = 250
    n_bags_test = 1000
    for optim in ['mae', 'mrae']:
        result_dir = f'results/binary/{optim}'
        os.makedirs(result_dir, exist_ok=True)

        for method in BIN_METHODS:

            print('Init method', method)

            global_result_path = f'{result_dir}/{method}'

            if not os.path.exists(global_result_path + '.csv'):
                with open(global_result_path + '.csv', 'wt') as csv:
                    csv.write(f'Method\tDataset\tMAE\tMRAE\tKLD\n')

            with open(global_result_path + '.csv', 'at') as csv:
                for dataset in qp.datasets.UCI_DATASETS:
                    if dataset in ['acute.a', 'acute.b', 'iris.1']: continue  # also skipped before: 'pageblocks.5', 'spambase', 'wdbc'

                    print('init', dataset)

                    local_result_path = global_result_path + '_' + dataset
                    if os.path.exists(local_result_path + '.dataframe'):
                        print(f'result file {local_result_path}.dataframe already exists; skipping')
                        continue

                    with qp.util.temp_seed(SEED):
                        param_grid, quantifier = new_method(method, max_iter=3000)
                        data = qp.datasets.fetch_UCIDataset(dataset)

                        # model selection
                        train, test = data.train_test
                        train, val = train.split_stratified(random_state=SEED)

                        protocol = UPP(val, repeats=n_bags_val)
                        modsel = GridSearchQ(
                            quantifier, param_grid, protocol, refit=True, n_jobs=-1, verbose=1, error=optim
                        )

                        try:
                            modsel.fit(train)

                            print(f'best params {modsel.best_params_}')
                            print(f'best score {modsel.best_score_}')
                            pickle.dump(
                                (modsel.best_params_, modsel.best_score_,),
                                open(f'{local_result_path}.hyper.pkl', 'wb'), pickle.HIGHEST_PROTOCOL)

                            quantifier = modsel.best_model()
                        except:
                            print('something went wrong... reporting CC')
                            quantifier = qp.method.aggregative.CC(LR()).fit(train)

                        protocol = UPP(test, repeats=n_bags_test)
                        report = qp.evaluation.evaluation_report(quantifier, protocol, error_metrics=['mae', 'mrae', 'kld'],
                                                                 verbose=True)
                        report.to_csv(f'{local_result_path}.dataframe')
                        means = report.mean()
                        csv.write(f'{method}\t{data.name}\t{means["mae"]:.5f}\t{means["mrae"]:.5f}\t{means["kld"]:.5f}\n')
                        csv.flush()

            show_results(global_result_path)

View File

@@ -1,19 +1,21 @@
import numpy as np
import pandas as pd
from distribution_matching.method_kdey import KDEy
from distribution_matching.method_kdey_closed import KDEyclosed
from distribution_matching.method_kdey_closed_efficient_correct import KDEyclosed_efficient_corr
from quapy.method.aggregative import EMQ, CC, PCC, DistributionMatching, PACC, HDy, OneVsAllAggregative, ACC
from distribution_matching.method_dirichlety import DIRy
from sklearn.linear_model import LogisticRegression
from method_kdey_closed_efficient import KDEyclosed_efficient

METHODS = ['KDEy-closed++', 'KDEy-closed+', 'KDEy-closed', 'ACC', 'PACC', 'HDy-OvA', 'DIR', 'DM', 'KDEy-DMhd3', 'EMQ', 'KDEy-ML']  # other variants tried: 'KDEy-DMhd2', 'DM-HD', 'KDEy-DMjs', 'KDEy-DM', 'KDEy-ML+', 'KDEy-DMhd3+'
BIN_METHODS = [x.replace('-OvA', '') for x in METHODS]

hyper_LR = {
    'classifier__C': np.logspace(-3,3,7),
    'classifier__class_weight': ['balanced', None]
}

def new_method(method, **lr_kwargs):
@@ -32,9 +34,21 @@ def new_method(method, **lr_kwargs):
        param_grid = hyper_LR
        quantifier = PACC(lr)
    elif method == 'KDEy-ML':
        method_params = {'bandwidth': np.linspace(0.01, 0.3, 30)}
        param_grid = {**method_params, **hyper_LR}
        quantifier = KDEy(lr, target='max_likelihood', val_split=10)
    elif method == 'KDEy-closed':
        method_params = {'bandwidth': np.linspace(0.01, 0.2, 20)}
        param_grid = {**method_params, **hyper_LR}
        quantifier = KDEyclosed(lr, val_split=10)
    elif method == 'KDEy-closed+':
        method_params = {'bandwidth': np.linspace(0.01, 0.2, 20)}
        param_grid = {**method_params, **hyper_LR}
        quantifier = KDEyclosed_efficient(lr, val_split=10)
    elif method == 'KDEy-closed++':
        method_params = {'bandwidth': np.linspace(0.01, 0.2, 20)}
        param_grid = {**method_params, **hyper_LR}
        quantifier = KDEyclosed_efficient_corr(lr, val_split=10)
    elif method in ['KDEy-DM']:
        method_params = {'bandwidth': np.linspace(0.01, 0.2, 20)}
        param_grid = {**method_params, **hyper_LR}
@@ -57,10 +71,40 @@ def new_method(method, **lr_kwargs):
        param_grid = {**method_params, **hyper_LR}
        quantifier = DistributionMatching(lr)
    # experimental
    elif method in ['KDEy-DMkld']:
        method_params = {'bandwidth': np.linspace(0.01, 0.2, 20)}
        param_grid = {**method_params, **hyper_LR}
        quantifier = KDEy(lr, target='min_divergence', divergence='KLD', montecarlo_trials=5000, val_split=10)
    # elif method in ['KDEy-DMhd']:
    #     the code to reproduce this run is commented out in the min_divergence target; I think it was incorrect...
    #     method_params = {'bandwidth': np.linspace(0.01, 0.2, 20)}
    #     param_grid = {**method_params, **hyper_LR}
    #     quantifier = KDEy(lr, target='min_divergence', divergence='HD', montecarlo_trials=5000, val_split=10)
    elif method in ['KDEy-DMhd2']:
        method_params = {'bandwidth': np.linspace(0.01, 0.2, 20)}
        param_grid = {**method_params, **hyper_LR}
        quantifier = KDEy(lr, target='min_divergence_uniform', divergence='HD', montecarlo_trials=5000, val_split=10)
    elif method in ['KDEy-DMjs']:
        method_params = {'bandwidth': np.linspace(0.01, 0.2, 20)}
        param_grid = {**method_params, **hyper_LR}
        quantifier = KDEy(lr, target='min_divergence_uniform', divergence='JS', montecarlo_trials=5000, val_split=10)
    elif method in ['KDEy-DMhd3']:
        # Note: there was an error here. We sample from the validation distribution (V), not from the test
        # distribution (T), because the validation samples can be drawn once in fit and their densities
        # pre-computed and stored. This means the reference distribution is V, not T. Since an f-divergence
        # is defined as D(p||q) = \int_{R^n} q(x) f(p(x)/q(x)) dx = E_{x~q}[f(p(x)/q(x))], sampling from
        # V means we are computing D(T||V) (and not D(V||T) as initially thought).
        method_params = {'bandwidth': np.linspace(0.01, 0.3, 30)}
        param_grid = {**method_params, **hyper_LR}
        quantifier = KDEy(lr, target='min_divergence', divergence='HD', montecarlo_trials=5000, val_split=10)
    elif method == 'DM-HD':
        method_params = {
            'nbins': [4,8,16,32],
            'val_split': [10, 0.4],
        }
        param_grid = {**method_params, **hyper_LR}
        quantifier = DistributionMatching(lr, divergence='HD')
    else:
        raise NotImplementedError('unknown method', method)
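
For reference, the sampling-direction subtlety noted in the 'KDEy-DMhd3' comment can be checked with a tiny Monte Carlo sketch (illustrative only; two Gaussians stand in for the test KDE T and the validation mixture V):

import numpy as np
from scipy.stats import norm

# f-generator of the squared Hellinger distance: f(t) = (sqrt(t) - 1)^2
f = lambda t: (np.sqrt(t) - 1) ** 2

T = norm(loc=0.0, scale=1.0)  # stand-in for the test density T
V = norm(loc=0.5, scale=1.0)  # stand-in for the validation mixture V

# drawing x ~ V and averaging f(T(x)/V(x)) estimates D(T||V) = E_{x~V}[f(T(x)/V(x))]
x = V.rvs(size=5000, random_state=0)
print(np.mean(f(T.pdf(x) / V.pdf(x))))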

View File

@@ -0,0 +1,29 @@
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

"""
This script generates plots of sensitivity to the number of classes,
reporting results for MAE and MRAE.
The hyperparameters were set as:
quantifier.set_params(classifier__C=0.01, classifier__class_weight='balanced', bandwidth=0.2)
"""

methods = ['DM', 'KDEy-ML', 'EMQ']
optim = 'mae'

dfs = [pd.read_csv(f'../results/lequa/nclasses/{optim}/{method}.csv', sep='\t') for method in methods]
df = pd.concat(dfs)

for err in ['MAE', 'MRAE']:
    piv = df.pivot_table(index='nClasses', columns='Method', values=err)
    g = sns.lineplot(data=piv, markers=True, dashes=False)
    g.set(xlim=(1, 28))
    g.legend(loc="center left", bbox_to_anchor=(1, 0.5))
    g.set_ylabel(err)
    g.set_xticks(np.linspace(1, 28, 28))
    plt.xticks(rotation=90)
    plt.grid()
    plt.savefig(f'./nclasses_{err}.pdf', bbox_inches='tight')
    plt.clf()

View File

@@ -9,8 +9,8 @@ Plots results for MAE, MRAE, and KLD
The remaining hyperparameters were set to their default values
"""

df_tweet = pd.read_csv('../results/tweet/sensibility/KDEy-ML.csv', sep='\t')
df_lequa = pd.read_csv('../results/lequa/sensibility/KDEy-ML.csv', sep='\t')
df = pd.concat([df_tweet, df_lequa])

for err in ['MAE', 'MRAE', 'KLD']:

View File

@@ -13,48 +13,53 @@ if __name__ == '__main__':
    qp.environ['SAMPLE_SIZE'] = qp.datasets.LEQUA2022_SAMPLE_SIZE['T1B']
    qp.environ['N_JOBS'] = -1

    for optim in ['mae', 'mrae']:

        result_dir = f'results/lequa/{optim}'
        os.makedirs(result_dir, exist_ok=True)

        for method in METHODS:

            print('Init method', method)

            result_path = f'{result_dir}/{method}'

            if os.path.exists(result_path+'.csv'):
                print(f'file {result_path}.csv already exists; skipping')
                continue

            with open(result_path+'.csv', 'wt') as csv:
                csv.write(f'Method\tDataset\tMAE\tMRAE\tKLD\n')

                dataset = 'T1B'
                train, val_gen, test_gen = qp.datasets.fetch_lequa2022(dataset)
                print(f'init {dataset} #instances: {len(train)}')

                param_grid, quantifier = new_method(method)

                if param_grid is not None:
                    modsel = GridSearchQ(quantifier, param_grid, protocol=val_gen, refit=False, n_jobs=-1, verbose=1, error=optim)

                    modsel.fit(train)
                    print(f'best params {modsel.best_params_}')
                    print(f'best score {modsel.best_score_}')
                    pickle.dump(
                        (modsel.best_params_, modsel.best_score_,),
                        open(f'{result_path}.hyper.pkl', 'wb'), pickle.HIGHEST_PROTOCOL)

                    quantifier = modsel.best_model()
                else:
                    print('debug mode... skipping model selection')

                quantifier.fit(train)
                report = qp.evaluation.evaluation_report(
                    quantifier, protocol=test_gen, error_metrics=['mae', 'mrae', 'kld'],
                    verbose=True, verbose_error=optim[1:], n_jobs=-1
                )
                means = report.mean()
                report.to_csv(result_path+'.dataframe')
                csv.write(f'{method}\tLeQua-T1B\t{means["mae"]:.5f}\t{means["mrae"]:.5f}\t{means["kld"]:.5f}\n')
                csv.flush()
                print(means)

            show_results(result_path)

View File

@@ -0,0 +1,74 @@
import pickle
import numpy as np
import os
from os.path import join
import pandas as pd
from quapy.protocol import UPP
from quapy.data import LabelledCollection
from distribution_matching.commons import METHODS, new_method, show_results

import quapy as qp

SEED = 1


def extract_classes(data: LabelledCollection, classes):
    X, y = data.Xy
    counts = data.counts()
    Xs, ys = [], []
    for class_i in classes:
        Xs.append(X[y == class_i])
        ys.append([class_i] * counts[class_i])
    Xs = np.concatenate(Xs)
    ys = np.concatenate(ys)
    return LabelledCollection(Xs, ys, classes=classes)


def task(nclasses):
    in_classes = np.arange(0, nclasses)
    train = extract_classes(train_pool, classes=in_classes)
    test = extract_classes(test_pool, classes=in_classes)
    with qp.util.temp_seed(SEED):
        hyper, quantifier = new_method(method)
        quantifier.set_params(classifier__C=1, classifier__class_weight='balanced')
        hyper = {h: v for h, v in hyper.items() if not h.startswith('classifier__')}
        tr, va = train.split_stratified(random_state=SEED)
        quantifier = qp.model_selection.GridSearchQ(quantifier, hyper, UPP(va), optim).fit(tr)
        report = qp.evaluation.evaluation_report(quantifier, protocol=UPP(test), error_metrics=['mae', 'mrae', 'kld'], verbose=True)
        return report


# only the quantifier-dependent hyperparameters are explored; the classifier is an LR with default parameters
if __name__ == '__main__':

    qp.environ['SAMPLE_SIZE'] = qp.datasets.LEQUA2022_SAMPLE_SIZE['T1B']
    qp.environ['N_JOBS'] = -1

    for optim in ['mae']:  # , 'mrae']:
        result_dir = f'results/lequa/nclasses/{optim}'
        os.makedirs(result_dir, exist_ok=True)

        for method in ['DM', 'EMQ', 'KDEy-ML']:  # also tried: 'KDEy-DMhd3'
            result_path = join(result_dir, f'{method}.csv')
            if os.path.exists(result_path): continue

            train_orig, _, _ = qp.datasets.fetch_lequa2022('T1B')
            train_pool, test_pool = train_orig.split_stratified(0.5, random_state=SEED)
            arange_classes = np.arange(2, train_orig.n_classes + 1)
            reports = qp.util.parallel(task, arange_classes, n_jobs=-1)

            with open(result_path, 'at') as csv:
                csv.write(f'Method\tDataset\tnClasses\tMAE\tMRAE\tKLD\n')
                for num_classes, report in zip(arange_classes, reports):
                    means = report.mean()
                    print(means)
                    report_result_path = join(result_dir, f'{method}_{num_classes}') + '.dataframe'
                    report.to_csv(report_result_path)
                    csv.write(f'{method}\tLeQua-T1B\t{num_classes}\t{means["mae"]:.5f}\t{means["mrae"]:.5f}\t{means["kld"]:.5f}\n')
                    csv.flush()

View File

@@ -27,7 +27,7 @@ class KDEy(AggregativeProbabilisticQuantifier):

    BANDWIDTH_METHOD = ['auto', 'scott', 'silverman']
    ENGINE = ['scipy', 'sklearn', 'statsmodels']
    TARGET = ['min_divergence', 'min_divergence_uniform', 'max_likelihood']

    def __init__(self, classifier: BaseEstimator, val_split=0.4, divergence: Union[str, Callable]='L2',
                 bandwidth='scott', engine='sklearn', target='min_divergence', n_jobs=None, random_state=0, montecarlo_trials=1000):
@@ -35,7 +35,8 @@ class KDEy(AggregativeProbabilisticQuantifier):
            f'unknown bandwidth_method, valid ones are {KDEy.BANDWIDTH_METHOD}'
        assert engine in KDEy.ENGINE, f'unknown engine, valid ones are {KDEy.ENGINE}'
        assert target in KDEy.TARGET, f'unknown target, valid ones are {KDEy.TARGET}'
        assert target == 'max_likelihood' or divergence in ['KLD', 'HD', 'JS'], \
            'this version only allows KLD, squared HD, or JS as the divergence'
        self.classifier = classifier
        self.val_split = val_split
        self.divergence = divergence
@@ -118,7 +119,6 @@ class KDEy(AggregativeProbabilisticQuantifier):
        self.classifier, y, posteriors, classes, class_count = cross_generate_predictions(
            data, self.classifier, val_split, probabilistic=True, fit_classifier=fit_classifier, n_jobs=self.n_jobs
        )

        if self.bandwidth == 'auto':
            self.bandwidth = self.search_bandwidth_maxlikelihood(posteriors, y)
@@ -126,21 +126,22 @@ class KDEy(AggregativeProbabilisticQuantifier):
        self.val_densities = [self.get_kde_function(posteriors[y == cat]) for cat in range(data.n_classes)]
        self.val_posteriors = posteriors

        if self.target == 'min_divergence_uniform':
            self.samples = qp.functional.uniform_prevalence_sampling(n_classes=data.n_classes, size=self.montecarlo_trials)
            self.sample_densities = [self.pdf(kde_i, self.samples) for kde_i in self.val_densities]
        elif self.target == 'min_divergence':
            self.class_samples = [kde_i.sample(self.montecarlo_trials, random_state=self.random_state) for kde_i in self.val_densities]
            self.class_sample_densities = {}
            for ci, samples_i in enumerate(self.class_samples):
                self.class_sample_densities[ci] = np.asarray([self.pdf(kde_j, samples_i) for kde_j in self.val_densities]).T

        return self

    def aggregate(self, posteriors: np.ndarray):
        if self.target == 'min_divergence':
            return self._target_divergence(posteriors)
        elif self.target == 'min_divergence_uniform':
            return self._target_divergence_uniform(posteriors)
        elif self.target == 'max_likelihood':
            return self._target_likelihood(posteriors)
        else:
@@ -170,6 +171,42 @@ class KDEy(AggregativeProbabilisticQuantifier):
    #     r = optimize.minimize(match, x0=uniform_distribution, method='SLSQP', bounds=bounds, constraints=constraints)
    #     return r.x

    def _target_divergence_uniform(self, posteriors):
        # in this variant we evaluate the divergence using a Monte Carlo approach
        n_classes = len(self.val_densities)

        test_kde = self.get_kde_function(posteriors)
        test_likelihood = self.pdf(test_kde, self.samples)

        def f_squared_hellinger(t):
            return (np.sqrt(t) - 1)**2

        def f_jensen_shannon(t):
            return -(t+1)*np.log((t+1)/2) + t*np.log(t)

        def fdivergence(pi, qi, f, eps=1e-10):
            spi = pi + eps
            sqi = qi + eps
            return np.mean(f(spi/sqi)*sqi)

        if self.divergence.lower() == 'hd':
            f = f_squared_hellinger
        elif self.divergence.lower() == 'js':
            f = f_jensen_shannon

        def match(prev):
            val_likelihood = sum(prev_i * dens_i for prev_i, dens_i in zip(prev, self.sample_densities))
            return fdivergence(val_likelihood, test_likelihood, f)

        # the initial point is set as the uniform distribution
        uniform_distribution = np.full(fill_value=1 / n_classes, shape=(n_classes,))

        # solutions are bounded to those contained in the unit-simplex
        bounds = tuple((0, 1) for _ in range(n_classes))  # values in [0,1]
        constraints = ({'type': 'eq', 'fun': lambda x: 1 - sum(x)})  # values summing up to 1
        r = optimize.minimize(match, x0=uniform_distribution, method='SLSQP', bounds=bounds, constraints=constraints)
        return r.x

    def _target_divergence(self, posteriors):
        # in this variant we evaluate the divergence using a Monte Carlo approach
        n_classes = len(self.val_densities)
@@ -184,6 +221,18 @@ class KDEy(AggregativeProbabilisticQuantifier):
            smooth_qi = qi + eps
            return np.mean(np.log(smooth_pi / smooth_qi))

        def squared_hellinger(pi, qi, eps=1e-8):
            smooth_pi = pi + eps
            smooth_qi = qi + eps
            return np.mean((np.sqrt(smooth_pi/smooth_qi) - 1)**2)

        # todo: this will fail when self.divergence is a callable (and this is not the right place to resolve it anyway)
        if self.divergence.lower() == 'kld':
            fdivergence = kld_monte
        elif self.divergence.lower() == 'hd':
            fdivergence = squared_hellinger

        def match(prev):
            # choose the samples according to the prevalence vector
            # e.g., prev = [0.5, 0.3, 0.2] will draw 50% from KDE_0, 30% from KDE_1, and 20% from KDE_2
@@ -202,7 +251,9 @@ class KDEy(AggregativeProbabilisticQuantifier):
            test_likelihood = np.concatenate(
                [samples_i[:num_i] for samples_i, num_i in zip(test_densities_per_class, num_variates_per_class)]
            )
            # since we sample from the validation distribution, we are computing D(Test||Val),
            # i.e., E_{x~Val}[f(Test(x)/Val(x))]; hence the argument order (test_likelihood, val_likelihood)
            return fdivergence(test_likelihood, val_likelihood)

        # the initial point is set as the uniform distribution
        uniform_distribution = np.full(fill_value=1 / n_classes, shape=(n_classes,))
@@ -246,4 +297,5 @@ class KDEy(AggregativeProbabilisticQuantifier):
        #print('searching for alpha')
        r = optimize.minimize(neg_loglikelihood, x0=uniform_distribution, method='SLSQP', bounds=bounds, constraints=constraints)
        #print('[optimization ended]')
        return r.x
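
As a sanity check on f_jensen_shannon above: with f(t) = t*log(t) - (t+1)*log((t+1)/2), the estimate E_{x~q}[f(p(x)/q(x))] equals KL(p||m) + KL(q||m) with m = (p+q)/2, i.e., the Topsoe distance, which is twice the Jensen-Shannon divergence. A minimal numeric check on a discrete support (illustrative values only):

import numpy as np

p = np.array([0.2, 0.5, 0.3])
q = np.array([0.4, 0.4, 0.2])
m = (p + q) / 2

f_js = lambda t: -(t + 1) * np.log((t + 1) / 2) + t * np.log(t)

fdiv = np.sum(q * f_js(p / q))  # E_{x~q}[f(p(x)/q(x))] on a discrete support
topsoe = np.sum(p * np.log(p / m)) + np.sum(q * np.log(q / m))  # KL(p||m) + KL(q||m)
print(np.isclose(fdiv, topsoe))  # True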

View File

@@ -0,0 +1,174 @@
import os
import sys
from typing import Union, Callable
import numpy as np
from sklearn.base import BaseEstimator
from sklearn.linear_model import LogisticRegression
import pandas as pd
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KernelDensity
from scipy.stats import multivariate_normal
import quapy as qp
from quapy.data import LabelledCollection
from quapy.protocol import APP, UPP
from quapy.method.aggregative import AggregativeProbabilisticQuantifier, _training_helper, cross_generate_predictions, \
    DistributionMatching, _get_divergence
import scipy
from scipy import optimize
from statsmodels.nonparametric.kernel_density import KDEMultivariateConditional
from time import time
from sklearn.metrics.pairwise import rbf_kernel


def gram_matrix_mix(bandwidth, X, Y=None):
    # this adapts the output of the rbf_kernel function (pairwise evaluations of Gaussian kernels k(x,y))
    # to contain pairwise evaluations of N(x|mu,Sigma1+Sigma2), with mu=y and Sigma1, Sigma2 both
    # "scalar matrices" (h^2)I, so that Sigma1+Sigma2 has scalar 2(h^2) (h is the bandwidth)
    variance = 2 * (bandwidth**2)
    nD = X.shape[1]
    gamma = 1/(2*variance)
    gram = rbf_kernel(X, Y, gamma=gamma)
    norm_factor = 1/np.sqrt(((2*np.pi)**nD) * (variance**(nD)))
    gram *= norm_factor
    print('GRAM SUM:', gram.sum())
    return gram


def weighted_prod(pi, tau, G):
    return pi[:,np.newaxis] * G * tau


def tril_weighted_prod(pi, G):
    M = weighted_prod(pi, pi, G)
    return np.triu(M,1)


class KDEyclosed(AggregativeProbabilisticQuantifier):

    def __init__(self, classifier: BaseEstimator, val_split=0.4, bandwidth=0.1, n_jobs=None, random_state=0):
        self.classifier = classifier
        self.val_split = val_split
        self.bandwidth = bandwidth
        self.n_jobs = n_jobs
        self.random_state = random_state

    def fit(self, data: LabelledCollection, fit_classifier=True, val_split: Union[float, LabelledCollection] = None):
        """
        :param data: the training set
        :param fit_classifier: set to False to bypass the training (the learner is assumed to be already fit)
        :param val_split: either a float in (0,1) indicating the proportion of training instances to use for
            validation (e.g., 0.3 for using 30% of the training set as validation data), or a LabelledCollection
            indicating the validation set itself, or an int indicating the number k of folds to be used in kFCV
            to estimate the parameters
        """
        if val_split is None:
            val_split = self.val_split

        self.classifier, y, posteriors, classes, class_count = cross_generate_predictions(
            data, self.classifier, val_split, probabilistic=True, fit_classifier=fit_classifier, n_jobs=self.n_jobs
        )

        n = data.n_classes
        D = n
        h = self.bandwidth

        self.gram_tr_tr = gram_matrix_mix(h, posteriors)

        # li_inv keeps track of the relative weight of each datapoint within its class
        # (i.e., the weight in its KDE model)
        counts_inv = 1/(data.counts())
        self.li_inv = counts_inv[y]

        self.n = n
        self.C = ((2 * np.pi) ** (-D / 2)) * h ** (-D)
        print('C:', self.C)
        self.Ptr = posteriors
        self.ytr = y

        assert all(sorted(np.unique(y)) == np.arange(data.n_classes)), 'label name gaps not allowed in current implementation'

        return self

    def aggregate(self, posteriors: np.ndarray):
        Ptr = self.Ptr
        Pte = posteriors

        gram_te_te = gram_matrix_mix(self.bandwidth, Pte, Pte)
        gram_tr_te = gram_matrix_mix(self.bandwidth, Ptr, Pte)

        K = Pte.shape[0]
        tau = np.full(shape=K, fill_value=1/K, dtype=float)
        h = self.bandwidth
        D = Ptr.shape[1]
        C = self.C

        partC = 0.5 * np.log(C/K + 2 * tril_weighted_prod(tau, gram_te_te).sum())

        def match(alpha):
            pi = alpha[self.ytr] * self.li_inv
            partA = -np.log(weighted_prod(pi, tau, gram_tr_te).sum())
            partB = 0.5 * np.log(C*(pi**2).sum() + 2*tril_weighted_prod(pi, self.gram_tr_tr).sum())
            Dcs = partA + partB + partC
            return Dcs

        # the initial point is set as the uniform distribution
        uniform_distribution = np.full(fill_value=1 / self.n, shape=(self.n,))

        # solutions are bounded to those contained in the unit-simplex
        bounds = tuple((0, 1) for _ in range(self.n))  # values in [0,1]
        constraints = ({'type': 'eq', 'fun': lambda x: 1 - sum(x)})  # values summing up to 1
        r = optimize.minimize(match, x0=uniform_distribution, method='SLSQP', bounds=bounds, constraints=constraints)
        return r.x
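
A quick check of the claim in gram_matrix_mix, namely that the rescaled RBF kernel evaluates N(x|mu=y, 2h^2 I), the convolution of two Gaussian kernels of bandwidth h (a minimal sketch with arbitrary test values):

import numpy as np
from scipy.stats import multivariate_normal
from sklearn.metrics.pairwise import rbf_kernel

h, nD = 0.1, 3
rng = np.random.default_rng(0)
x, y = rng.random((1, nD)), rng.random((1, nD))

variance = 2 * (h ** 2)  # scalar of Sigma1+Sigma2 = 2(h^2) I
gram = rbf_kernel(x, y, gamma=1 / (2 * variance)) / np.sqrt(((2 * np.pi) ** nD) * (variance ** nD))
dens = multivariate_normal(mean=y[0], cov=variance * np.eye(nD)).pdf(x[0])
print(np.isclose(gram[0, 0], dens))  # True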

View File

@@ -0,0 +1,145 @@
import os
import sys
from typing import Union, Callable
import numpy as np
from sklearn.base import BaseEstimator
from sklearn.linear_model import LogisticRegression
import pandas as pd
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KernelDensity
from scipy.stats import multivariate_normal
import quapy as qp
from quapy.data import LabelledCollection
from quapy.protocol import APP, UPP
from quapy.method.aggregative import AggregativeProbabilisticQuantifier, _training_helper, cross_generate_predictions, \
    DistributionMatching, _get_divergence
import scipy
from scipy import optimize
from statsmodels.nonparametric.kernel_density import KDEMultivariateConditional
from time import time
from sklearn.metrics.pairwise import rbf_kernel


def gram_matrix_mix_sum(bandwidth, X, Y=None, reduce=True):
    # this adapts the output of the rbf_kernel function (pairwise evaluations of Gaussian kernels k(x,y))
    # to contain pairwise evaluations of N(x|mu,Sigma1+Sigma2), with mu=y and Sigma1, Sigma2 both
    # "scalar matrices" (h^2)I, so that Sigma1+Sigma2 has scalar 2(h^2) (h is the bandwidth)
    variance = 2 * (bandwidth**2)
    nRows, nD = X.shape
    gamma = 1/(2*variance)
    gram = rbf_kernel(X, Y, gamma=gamma)
    norm_factor = 1/np.sqrt(((2*np.pi)**nD) * (variance**(nD)))
    gram *= norm_factor
    if Y is None:
        # the diagonal (self-pairs) is ignored; the upper triangle is doubled to account for symmetry
        aggr = (2 * np.triu(gram, 1)).sum()
    else:
        aggr = gram.sum()
    return aggr


class KDEyclosed_efficient(AggregativeProbabilisticQuantifier):

    def __init__(self, classifier: BaseEstimator, val_split=0.4, bandwidth=0.1, n_jobs=None, random_state=0):
        self.classifier = classifier
        self.val_split = val_split
        self.bandwidth = bandwidth
        self.n_jobs = n_jobs
        self.random_state = random_state

    def fit(self, data: LabelledCollection, fit_classifier=True, val_split: Union[float, LabelledCollection] = None):
        """
        :param data: the training set
        :param fit_classifier: set to False to bypass the training (the learner is assumed to be already fit)
        :param val_split: either a float in (0,1) indicating the proportion of training instances to use for
            validation (e.g., 0.3 for using 30% of the training set as validation data), or a LabelledCollection
            indicating the validation set itself, or an int indicating the number k of folds to be used in kFCV
            to estimate the parameters
        """
        if val_split is None:
            val_split = self.val_split

        self.classifier, y, posteriors, classes, class_count = cross_generate_predictions(
            data, self.classifier, val_split, probabilistic=True, fit_classifier=fit_classifier, n_jobs=self.n_jobs
        )

        assert all(sorted(np.unique(y)) == np.arange(data.n_classes)), \
            'label name gaps not allowed in current implementation'

        n = data.n_classes
        h = self.bandwidth
        P = posteriors
        counts_inv = 1 / (data.counts())

        nD = P.shape[1]
        C = ((2 * np.pi) ** (-nD / 2)) * (self.bandwidth ** (-nD))

        # precompute the class-class block sums of the training gram matrix
        tr_tr_sums = np.zeros(shape=(n, n), dtype=float)
        self.tr_C = []
        for i in range(n):
            for j in range(n):
                if i > j:
                    tr_tr_sums[i, j] = tr_tr_sums[j, i]
                else:
                    if i == j:
                        tr_tr_sums[i, j] = gram_matrix_mix_sum(h, P[y == i])
                        self.tr_C.append(C * sum(y == i))
                    else:
                        block = gram_matrix_mix_sum(h, P[y == i], P[y == j])
                        tr_tr_sums[i, j] = block

        self.tr_C = np.asarray(self.tr_C)
        self.Ptr = posteriors
        self.ytr = y
        self.tr_tr_sums = tr_tr_sums
        self.counts_inv = counts_inv

        return self

    def aggregate(self, posteriors: np.ndarray):
        Ptr = self.Ptr
        Pte = posteriors

        K, nD = Pte.shape
        Kinv = (1/K)
        h = self.bandwidth
        n = Ptr.shape[1]
        y = self.ytr
        tr_tr_sums = self.tr_tr_sums

        C = ((2 * np.pi) ** (-nD / 2)) * (self.bandwidth ** (-nD))
        partC = 0.5*np.log(gram_matrix_mix_sum(h, Pte) * Kinv * Kinv + C*Kinv)

        tr_te_sums = np.zeros(shape=n, dtype=float)
        for i in range(n):
            tr_te_sums[i] = gram_matrix_mix_sum(h, Ptr[y == i], Pte) * self.counts_inv[i] * Kinv

        def match(alpha):
            partA = -np.log((alpha * tr_te_sums).sum())
            alpha_l = alpha * self.counts_inv
            partB = 0.5 * np.log((alpha_l[:, np.newaxis] * tr_tr_sums * alpha_l).sum() + (self.tr_C*(alpha_l**2)).sum())
            return partA + partB + partC

        # the initial point is set as the uniform distribution
        uniform_distribution = np.full(fill_value=1 / n, shape=(n,))

        # solutions are bounded to those contained in the unit-simplex
        bounds = tuple((0, 1) for _ in range(n))  # values in [0,1]
        constraints = ({'type': 'eq', 'fun': lambda x: 1 - sum(x)})  # values summing up to 1
        r = optimize.minimize(match, x0=uniform_distribution, method='SLSQP', bounds=bounds, constraints=constraints)
        return r.x
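
A quick check of the counts_inv weighting used by both efficient variants: with per-datapoint weights pi = alpha[y] / counts[y], each class contributes a total mass of alpha_c, so the weights sum to 1 whenever alpha lies on the simplex (a minimal sketch with toy labels):

import numpy as np

y = np.array([0, 0, 0, 1, 1, 2])   # toy class labels
alpha = np.array([0.5, 0.3, 0.2])  # a candidate prevalence vector on the simplex
counts = np.bincount(y)            # number of datapoints per class

pi = alpha[y] / counts[y]          # weight of each datapoint within the KDE mixture
print(np.isclose(pi.sum(), 1.0))   # True: class c contributes alpha_c in total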

View File

@@ -0,0 +1,133 @@
import os
import sys
from typing import Union, Callable
import numpy as np
from sklearn.base import BaseEstimator
from sklearn.linear_model import LogisticRegression
import pandas as pd
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KernelDensity
from scipy.stats import multivariate_normal
import quapy as qp
from quapy.data import LabelledCollection
from quapy.protocol import APP, UPP
from quapy.method.aggregative import AggregativeProbabilisticQuantifier, _training_helper, cross_generate_predictions, \
    DistributionMatching, _get_divergence
import scipy
from scipy import optimize
from statsmodels.nonparametric.kernel_density import KDEMultivariateConditional
from time import time
from sklearn.metrics.pairwise import rbf_kernel


def gram_matrix_mix_sum(bandwidth, X, Y=None, reduce=True):
    # this adapts the output of the rbf_kernel function (pairwise evaluations of Gaussian kernels k(x,y))
    # to contain pairwise evaluations of N(x|mu,Sigma1+Sigma2), with mu=y and Sigma1, Sigma2 both
    # "scalar matrices" (h^2)I, so that Sigma1+Sigma2 has scalar 2(h^2) (h is the bandwidth)
    variance = 2 * (bandwidth**2)
    nRows, nD = X.shape
    gamma = 1/(2*variance)
    norm_factor = 1/np.sqrt(((2*np.pi)**nD) * (variance**(nD)))
    gram = norm_factor * rbf_kernel(X, Y, gamma=gamma)
    return gram.sum()


class KDEyclosed_efficient_corr(AggregativeProbabilisticQuantifier):

    def __init__(self, classifier: BaseEstimator, val_split=0.4, bandwidth=0.1, n_jobs=None, random_state=0):
        self.classifier = classifier
        self.val_split = val_split
        self.bandwidth = bandwidth
        self.n_jobs = n_jobs
        self.random_state = random_state

    def fit(self, data: LabelledCollection, fit_classifier=True, val_split: Union[float, LabelledCollection] = None):
        """
        :param data: the training set
        :param fit_classifier: set to False to bypass the training (the learner is assumed to be already fit)
        :param val_split: either a float in (0,1) indicating the proportion of training instances to use for
            validation (e.g., 0.3 for using 30% of the training set as validation data), or a LabelledCollection
            indicating the validation set itself, or an int indicating the number k of folds to be used in kFCV
            to estimate the parameters
        """
        if val_split is None:
            val_split = self.val_split

        self.classifier, y, posteriors, classes, class_count = cross_generate_predictions(
            data, self.classifier, val_split, probabilistic=True, fit_classifier=fit_classifier, n_jobs=self.n_jobs
        )

        assert all(sorted(np.unique(y)) == np.arange(data.n_classes)), \
            'label name gaps not allowed in current implementation'

        n = data.n_classes
        h = self.bandwidth
        P = posteriors

        # counts_inv keeps track of the relative weight of each datapoint within its class
        # (i.e., the weight in its KDE model)
        counts_inv = 1 / (data.counts())

        # compute the class-class block sums of the training gram matrix
        tr_tr_sums = np.zeros(shape=(n, n), dtype=float)
        for i in range(n):
            for j in range(n):
                if i > j:
                    tr_tr_sums[i, j] = tr_tr_sums[j, i]
                else:
                    block = gram_matrix_mix_sum(h, P[y == i], P[y == j] if i != j else None)
                    tr_tr_sums[i, j] = block

        self.Ptr = posteriors
        self.ytr = y
        self.tr_tr_sums = tr_tr_sums
        self.counts_inv = counts_inv

        return self

    def aggregate(self, posteriors: np.ndarray):
        Ptr = self.Ptr
        Pte = posteriors

        K, nD = Pte.shape
        Kinv = (1/K)
        h = self.bandwidth
        n = Ptr.shape[1]
        y = self.ytr
        tr_tr_sums = self.tr_tr_sums

        partC = 0.5*np.log(gram_matrix_mix_sum(h, Pte) * Kinv * Kinv)

        tr_te_sums = np.zeros(shape=n, dtype=float)
        for i in range(n):
            tr_te_sums[i] = gram_matrix_mix_sum(h, Ptr[y == i], Pte) * self.counts_inv[i] * Kinv

        def match(alpha):
            partA = -np.log((alpha * tr_te_sums).sum())
            alpha_l = alpha * self.counts_inv
            partB = 0.5 * np.log((alpha_l[:, np.newaxis] * tr_tr_sums * alpha_l).sum())
            return partA + partB + partC

        # the initial point is set as the uniform distribution
        uniform_distribution = np.full(fill_value=1 / n, shape=(n,))

        # solutions are bounded to those contained in the unit-simplex
        bounds = tuple((0, 1) for _ in range(n))  # values in [0,1]
        constraints = ({'type': 'eq', 'fun': lambda x: 1 - sum(x)})  # values summing up to 1
        r = optimize.minimize(match, x0=uniform_distribution, method='SLSQP', bounds=bounds, constraints=constraints)
        return r.x
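
The objective assembled in match (partA + partB + partC) appears to be, term by term, the Cauchy-Schwarz divergence between the class-weighted training mixture p_alpha and the test KDE q; this reading is suggested by the variable name Dcs in the non-efficient variant and is stated here as an interpretation, not taken from the source:

D_CS(p_\alpha || q) = -\log \int p_\alpha(x) q(x) dx + \frac{1}{2}\log \int p_\alpha(x)^2 dx + \frac{1}{2}\log \int q(x)^2 dx

with the three terms corresponding to partA, partB, and partC, respectively. For Gaussian KDEs, each integral of a product of two mixture densities reduces to a sum of N(x_i|x_j, 2h^2 I) evaluations, which is exactly what the gram_matrix_mix_sum block sums provide in closed form.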

View File

@@ -1,49 +1,21 @@
Key idea:
KDE can be used to generate two distributions (one is a mixture model of KDEs fit on training data, conditioned
on each class; the other is a KDE fit on test), between which a divergence (the objective to minimize) is then
computed. Another option is to generate only one distribution (the training mixture model) and take the
likelihood of the test points as the objective to maximize.

1.- I don't know whether it would be easier to take r=uniform rather than r=mixture model; it simplifies the sampling a lot and might even produce less error
2.- For now I have KLD and HD:
    - for KLD I have not understood whether I need to add the -x + y term
3.- Can the Topsoe distance be cast as an f-divergence?
    Topsoe seems to be twice the Jensen-Shannon divergence, i.e.,
    topsoe(p,q) = kld(p|m) + kld(q|m), with m = (p+q)/2
4.- Can the Wasserstein distance be cast as an f-divergence?
5.- In general, what is the relation with "distances"?

- take a look at the hyperparameters
- make some plots
- study the case in which the target is to minimize a divergence. Possibilities:
    - evaluate on the test points only
    - evaluate an APP over the simplex?
    - evaluate a UPP over the simplex? (= Monte Carlo)
    - which divergences? HD, Topsoe, L1?
- I don't think I am evaluating in kFCV mode either...
    1) produce lequa-kfcv and all the kfcv runs that may make sense on tweets
    2) implement the 'auto' bandwidth
        - internal optimization for likelihood [none seems to work well]
            - over everything (e.g., the whole training set)?
            - independently for each labelled set? (e.g., positives, negatives, neutrals, and test)
        - optimization as a GridSearchQ parameter
6) optimize the kernel? optimize the distance?
7) sklearn's KDE or statsmodels' multivariate KDE? Also look into this (it seems to give P(Y|X), so it might
    make it possible to remove the classifier?):
    https://www.statsmodels.org/dev/_modules/statsmodels/nonparametric/kernel_density.html#KDEMultivariateConditional
8) remove the last dimension in sklearn too? I don't see why
9) optimize for RAE instead of AE? It does not work well...
10) Define a classifier that returns, for each class, a posterior computed as the likelihood in the class-conditional KDE
    divided by the likelihood over all classes (as Juanjo proposes) and plug it into EMD. Do the opposite: re-calibrate
    with EMD and plug it into KDEy
11) KDEx?
12) Dirichlet (the DIR method) should be fixed and its results shown...
13) Statistical tests.

Notes:
I am trying to replace the max_likelihood target with a min_divergence one:
- since the divergence between two KDEs is now over a continuous space, it is not easy to obtain. I am trying
  an evaluation on test, but the problem is that it is overconfident with respect to the KDE obtained on test.
  Another option is a Monte Carlo approach, which is what I am trying now. For this experiment I removed the model
  selection of the classifier, keeping only the one over the bandwidth, to speed things up. The KDE-nomonte results are a
  max_likelihood run under equal conditions (bandwidth only); KDE-monte1 is a Monte Carlo run with HD at 1000 points, and
  KDE-monte2 is the same with 5000 points; both work badly. I am going to delete KDE-monte1 and KDE-monte2.
  Now I am trying KDE-monte3, the same but with an L2 divergence. It looks much more similar to KDE-nomonte
  (but it is still somewhat worse).
    - try with more points (KDE-monte4 is at 5000 points)
    - Topsoe should be tried (KDE-monte5)
    - try optimizing the LR (KDE-monte6, and with kfcv)
    - try L1 instead of L2 (KDE-monte7, with 5000 points and without LR)
    - maybe L2, which works well, should be tried within the min_divergence variant that evaluated on test, or test+train

View File

@@ -15,70 +15,71 @@ if __name__ == '__main__':
    qp.environ['N_JOBS'] = -1
    n_bags_val = 250
    n_bags_test = 1000

    for optim in ['mae', 'mrae']:

        result_dir = f'results/tweet/{optim}'
        os.makedirs(result_dir, exist_ok=True)

        for method in METHODS:

            print('Init method', method)

            global_result_path = f'{result_dir}/{method}'

            if not os.path.exists(global_result_path+'.csv'):
                with open(global_result_path+'.csv', 'wt') as csv:
                    csv.write(f'Method\tDataset\tMAE\tMRAE\tKLD\n')

            with open(global_result_path+'.csv', 'at') as csv:
                # the four semeval datasets share the training set, so it is useless to optimize the hyperparameters
                # four times; this variable keeps track of whether the model selection has already been done, and
                # skips it otherwise
                semeval_trained = False

                for dataset in qp.datasets.TWITTER_SENTIMENT_DATASETS_TEST[::-1]:
                    print('init', dataset)

                    local_result_path = global_result_path + '_' + dataset
                    if os.path.exists(local_result_path+'.dataframe'):
                        print(f'result file {local_result_path}.dataframe already exists; skipping')
                        continue

                    with qp.util.temp_seed(SEED):
                        is_semeval = dataset.startswith('semeval')

                        if not is_semeval or not semeval_trained:

                            param_grid, quantifier = new_method(method)

                            # model selection
                            data = qp.datasets.fetch_twitter(dataset, min_df=3, pickle=True, for_model_selection=True)

                            protocol = UPP(data.test, repeats=n_bags_val)
                            modsel = GridSearchQ(quantifier, param_grid, protocol, refit=False, n_jobs=-1, verbose=1, error=optim)

                            modsel.fit(data.training)
                            print(f'best params {modsel.best_params_}')
                            print(f'best score {modsel.best_score_}')
                            pickle.dump(
                                (modsel.best_params_, modsel.best_score_,),
                                open(f'{local_result_path}.hyper.pkl', 'wb'), pickle.HIGHEST_PROTOCOL)

                            quantifier = modsel.best_model()

                            if is_semeval:
                                semeval_trained = True
                        else:
                            print(f'model selection for {dataset} already done; skipping')

                        data = qp.datasets.fetch_twitter(dataset, min_df=3, pickle=True, for_model_selection=False)
                        quantifier.fit(data.training)
                        protocol = UPP(data.test, repeats=n_bags_test)
                        report = qp.evaluation.evaluation_report(quantifier, protocol, error_metrics=['mae', 'mrae', 'kld'], verbose=True)
                        report.to_csv(f'{local_result_path}.dataframe')
                        means = report.mean()
                        csv.write(f'{method}\t{data.name}\t{means["mae"]:.5f}\t{means["mrae"]:.5f}\t{means["kld"]:.5f}\n')
                        csv.flush()

            show_results(global_result_path)

View File

@@ -1,3 +1,17 @@
Change Log 0.1.8
----------------

- A conceptual error in MedianSweep and MedianSweep2 has been fixed. The error consisted of computing all the TPRs
  and FPRs, taking their medians, and then performing a single adjustment based on these median values. Instead, the
  original method proposed by G. Forman comes down to generating all the prevalence predictions, one per TPR-FPR
  pair, and then computing the median of these predictions (see the toy sketch below).

- qp.evaluation now runs in parallel <improve, remove or fix the ongoing error; put it at the qp. level instead of
  qp.evaluation, because I don't like the qp.evaluation.evaluate thing>

- <fix> remove dependencies on LabelledCollection in the library.
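
A toy sketch of the MedianSweep fix (illustrative numbers only; `ccs` stands for the classify-and-count estimates observed at each threshold):

import numpy as np

# per-threshold rates and classify & count estimates (toy values)
tprs = np.array([0.91, 0.85, 0.78])
fprs = np.array([0.12, 0.08, 0.05])
ccs = np.array([0.45, 0.40, 0.36])

# previous (incorrect) behaviour: a single adjustment using the median TPR/FPR
wrong = (np.median(ccs) - np.median(fprs)) / (np.median(tprs) - np.median(fprs))

# Forman's method: adjust at every threshold, then take the median prevalence
right = np.median((ccs - fprs) / (tprs - fprs))

print(wrong, right)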
Change Log 0.1.7
----------------

View File

@@ -176,7 +176,7 @@ class NeuralClassifierTrainer:
        self.classes_ = train.classes_
        opt = self.trainer_hyperparams
        checkpoint = self.checkpointpath
        self.reset_net_params(self.vocab_size, train.arange_classes)

        train_generator = TorchDataset(train.instances, train.labels).asDataloader(
            opt['batch_size'], shuffle=True, pad_length=opt['padding_length'], device=opt['device'])

View File

@@ -269,6 +269,7 @@ class LabelledCollection:
        test = self.sampling_from_index(right)
        return training, test

    def __add__(self, other):
        """
        Returns a new :class:`LabelledCollection` as the union of this collection with another collection.

View File

@@ -1,3 +1,4 @@
from copy import deepcopy
from typing import Union, Callable, Iterable
import numpy as np
from tqdm import tqdm
@@ -12,7 +13,8 @@ def prediction(
        protocol: AbstractProtocol,
        aggr_speedup: Union[str, bool] = 'auto',
        verbose=False,
        verbose_error=None,
        n_jobs=1):
    """
    Uses a quantification model to generate predictions for the samples generated via a specific protocol.
    This function is central to all evaluation processes, and is endowed with an optimization to speed up the
@@ -34,6 +36,10 @@ def prediction(
        convenient or not. Set to False to deactivate.
    :param verbose: boolean, show or not information in stdout
    :param verbose_error: an evaluation function to be used to display intermediate results if verbose=True (default None)
    :param n_jobs: number of parallel workers. The default is 1 so that, unless explicitly requested, the evaluation
        phase is carried out on a single core; that is, this parameter does not inspect the environment variable
        N_JOBS by default. This is often convenient, since parallelizing the evaluation entails an overhead for
        cloning the objects across threads that is frequently not worth the effort.
    :return: a tuple `(true_prevs, estim_prevs)` in which each element in the tuple is an array of shape
        `(n_samples, n_classes)` containing the true, or predicted, prevalence values for each sample
    """
@@ -63,21 +69,44 @@ def prediction(
    if apply_optimization:
        pre_classified = model.classify(protocol.get_labelled_collection().instances)
        protocol_with_predictions = protocol.on_preclassified_instances(pre_classified)
        return __prediction_helper(model, protocol_with_predictions, True, verbose, verbose_error, n_jobs)
    else:
        return __prediction_helper(model, protocol, False, verbose, verbose_error, n_jobs)


def __delayed_prediction(args):
    quantifier, aggregate, sample_instances, sample_prev = args
    quantifier = deepcopy(quantifier)
    quant_fn = quantifier.aggregate if aggregate else quantifier.quantify
    predicted = quant_fn(sample_instances)
    return sample_prev, predicted


def __prediction_helper(quantifier, protocol: AbstractProtocol, aggregate: bool, verbose=False, verbose_error=None, n_jobs=1):
    true_prevs, estim_prevs = [], []
    ongoing_errors = []
    if verbose:
        pbar = tqdm(protocol(), total=protocol.total(), desc='predicting')

    if n_jobs == 1:
        quant_fn = quantifier.aggregate if aggregate else quantifier.quantify
        for sample_instances, sample_prev in pbar if verbose else protocol():
            predicted = quant_fn(sample_instances)
            estim_prevs.append(predicted)
            true_prevs.append(sample_prev)
            if verbose and verbose_error is not None:
                err = verbose_error(sample_prev, predicted)
                ongoing_errors.append(err)
                pbar.set_description(f'predicting: ongoing error={np.mean(ongoing_errors):.5f}')
    else:
        if verbose:
            print('parallelizing prediction')
        outputs = qp.util.parallel(
            __delayed_prediction,
            ((quantifier, aggregate, sample_X, sample_p) for (sample_X, sample_p) in (pbar if verbose else protocol())),
            seed=qp.environ.get('_R_SEED', None),
            n_jobs=n_jobs
        )
        true_prevs, estim_prevs = list(zip(*outputs))

    true_prevs = np.asarray(true_prevs)
    estim_prevs = np.asarray(estim_prevs)
@@ -89,7 +118,7 @@ def evaluation_report(model: BaseQuantifier,
                      protocol: AbstractProtocol,
                      error_metrics: Iterable[Union[str,Callable]] = 'mae',
                      aggr_speedup: Union[str, bool] = 'auto',
                      verbose=False, verbose_error=None, n_jobs=1):
    """
    Generates a report (a pandas' DataFrame) containing information on the evaluation of the model according
    to a specific protocol and in terms of one or more evaluation metrics (errors).
@@ -107,12 +136,19 @@ def evaluation_report(model: BaseQuantifier,
        in the samples to be generated. Set to True or "auto" (default) for letting QuaPy decide whether it is
        convenient or not. Set to False to deactivate.
    :param verbose: boolean, show or not information in stdout
    :param verbose_error: an evaluation function to be used to display intermediate results if verbose=True (default None)
    :param n_jobs: number of parallel workers. The default is 1 so that, unless explicitly requested, the evaluation
        phase is carried out on a single core; that is, this parameter does not inspect the environment variable
        N_JOBS by default. This is often convenient, since parallelizing the evaluation entails an overhead for
        cloning the objects across threads that is frequently not worth the effort.
    :return: a pandas' DataFrame containing the columns 'true-prev' (the true prevalence of each sample),
        'estim-prev' (the prevalence estimated by the model for each sample), and as many columns as error metrics
        have been indicated, each displaying the score in terms of that metric for every sample.
    """

    true_prevs, estim_prevs = prediction(
        model, protocol, aggr_speedup=aggr_speedup, verbose=verbose, verbose_error=verbose_error, n_jobs=n_jobs
    )
    return _prevalence_report(true_prevs, estim_prevs, error_metrics)
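
A minimal usage sketch of the new verbose_error and n_jobs parameters (assuming a QuaPy dataset and quantifier at hand; 'yeast' is an arbitrary choice):

import quapy as qp
from quapy.protocol import UPP
from quapy.method.aggregative import PACC
from sklearn.linear_model import LogisticRegression

data = qp.datasets.fetch_UCIDataset('yeast')
quantifier = PACC(LogisticRegression()).fit(data.training)

report = qp.evaluation.evaluation_report(
    quantifier, protocol=UPP(data.test, repeats=100),
    error_metrics=['mae', 'mrae'],
    verbose=True,                # show a progress bar
    verbose_error=qp.error.mae,  # display the running MAE in the bar
    n_jobs=1                     # single-core unless parallelism is explicitly requested
)
print(report.mean())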

View File

@@ -32,7 +32,7 @@ class QuaNetTrainer(BaseQuantifier):
    >>> qp.domains.preprocessing.index(dataset, min_df=5, inplace=True)
    >>>
    >>> # the text classifier is a CNN trained by NeuralClassifierTrainer
    >>> cnn = CNNnet(dataset.vocabulary_size, dataset.arange_classes)
    >>> classifier = NeuralClassifierTrainer(cnn, device='cuda')
    >>>
    >>> # train QuaNet (QuaNet is an alias to QuaNetTrainer)

View File

@@ -54,6 +54,7 @@ class GridSearchQ(BaseQuantifier):
        self.__check_error(error)
        assert isinstance(protocol, AbstractProtocol), 'unknown protocol'

    def _sout(self, msg):
        if self.verbose:
            print(f'[{self.__class__.__name__}:{self.model.__class__.__name__}]: {msg}')