KDE with closed form working

2023-10-13 17:34:26 +02:00 · 2023-10-13 17:34:26 +02:00 · 7593fde2e0
parent 32d6aa58f6
commit 7593fde2e0
17 changed files with 667 additions and 29 deletions
--- a/distribution_matching/binary_experiments.py
+++ b/distribution_matching/binary_experiments.py
@ -51,7 +51,7 @@ if __name__ == '__main__':
                        # model selection
                        train, test = data.train_test
-                        train, val = train.split_stratified()
+                        train, val = train.split_stratified(random_state=SEED)
                        protocol = UPP(val, repeats=n_bags_val)
                        modsel = GridSearchQ(
--- a/distribution_matching/commons.py
+++ b/distribution_matching/commons.py
@ -1,12 +1,14 @@
 import numpy as np
 import pandas as pd
 from distribution_matching.method_kdey import KDEy
 from distribution_matching.method_kdey_closed import KDEyclosed
 from distribution_matching.method_kdey_closed_efficient_correct import KDEyclosed_efficient_corr
 from quapy.method.aggregative import EMQ, CC, PCC, DistributionMatching, PACC, HDy, OneVsAllAggregative, ACC
 from distribution_matching.method_dirichlety import DIRy
 from sklearn.linear_model import LogisticRegression
 from method_kdey_closed_efficient import KDEyclosed_efficient
-
+METHODS  = ['KDEy-closed++', 'KDEy-closed+', 'KDEy-closed', 'ACC', 'PACC', 'HDy-OvA', 'DIR', 'DM', 'KDEy-DMhd3', 'EMQ', 'KDEy-ML'] #, 'KDEy-DMhd2'] #, 'KDEy-DMhd2', 'DM-HD'] 'KDEy-DMjs', 'KDEy-DM', 'KDEy-ML+', 'KDEy-DMhd3+',
 METHODS  = ['KDEy-DMjs', 'ACC', 'PACC', 'HDy-OvA', 'DIR', 'DM', 'KDEy-DM', 'EMQ', 'KDEy-ML'] #, 'KDEy-DMhd2'] #, 'KDEy-DMhd2', 'DM-HD']
 BIN_METHODS = [x.replace('-OvA', '') for x in METHODS]
@ -32,9 +34,21 @@ def new_method(method, **lr_kwargs):
        param_grid = hyper_LR
        quantifier = PACC(lr)
    elif method == 'KDEy-ML':
-        method_params = {'bandwidth': np.linspace(0.01, 0.2, 20)}
+        method_params = {'bandwidth': np.linspace(0.01, 0.3, 30)}
        param_grid = {**method_params, **hyper_LR}
        quantifier = KDEy(lr, target='max_likelihood', val_split=10)
    elif method == 'KDEy-closed':
        method_params = {'bandwidth': np.linspace(0.01, 0.2, 20)}
        param_grid = {**method_params, **hyper_LR}
        quantifier = KDEyclosed(lr, val_split=10)
    elif method == 'KDEy-closed+':
        method_params = {'bandwidth': np.linspace(0.01, 0.2, 20)}
        param_grid = {**method_params, **hyper_LR}
        quantifier = KDEyclosed_efficient(lr, val_split=10)
    elif method == 'KDEy-closed++':
        method_params = {'bandwidth': np.linspace(0.01, 0.2, 20)}
        param_grid = {**method_params, **hyper_LR}
        quantifier = KDEyclosed_efficient_corr(lr, val_split=10)
    elif method in ['KDEy-DM']:
        method_params = {'bandwidth': np.linspace(0.01, 0.2, 20)}
        param_grid = {**method_params, **hyper_LR}
@ -62,10 +76,11 @@ def new_method(method, **lr_kwargs):
        method_params = {'bandwidth': np.linspace(0.01, 0.2, 20)}
        param_grid = {**method_params, **hyper_LR}
        quantifier = KDEy(lr, target='min_divergence', divergence='KLD', montecarlo_trials=5000, val_split=10)
-    elif method in ['KDEy-DMhd']:
+    # elif method in ['KDEy-DMhd']:
-        method_params = {'bandwidth': np.linspace(0.01, 0.2, 20)}
+    #     The code to reproduce this run is commented in the min_divergence target, I think it was incorrect...
-        param_grid = {**method_params, **hyper_LR}
+    #     method_params = {'bandwidth': np.linspace(0.01, 0.2, 20)}
-        quantifier = KDEy(lr, target='min_divergence', divergence='HD', montecarlo_trials=5000, val_split=10)
+    #     param_grid = {**method_params, **hyper_LR}
    #     quantifier = KDEy(lr, target='min_divergence', divergence='HD', montecarlo_trials=5000, val_split=10)
    elif method in ['KDEy-DMhd2']:
        method_params = {'bandwidth': np.linspace(0.01, 0.2, 20)}
        param_grid = {**method_params, **hyper_LR}
@ -74,6 +89,15 @@ def new_method(method, **lr_kwargs):
        method_params = {'bandwidth': np.linspace(0.01, 0.2, 20)}
        param_grid = {**method_params, **hyper_LR}
        quantifier = KDEy(lr, target='min_divergence_uniform', divergence='JS', montecarlo_trials=5000, val_split=10)
    elif method in ['KDEy-DMhd3']:
        # I have realized that there was an error. I am sampling from the validation distribution (V) and not from the
        # test distribution (T) just because the validation can be sampled in fit only once and pre-computed densities
        # can be stored. This means that the reference distribution is V and not T. Then I have found that an
        # f-divergence is defined as D(p||q) \int_{R^n}q(x)f(p(x)/q(x))dx = E_{x~q}[f(p(x)/q(x))], so if I am sampling
        # V then I am computing D(T||V) (and not D(V||T) as I thought).
        method_params = {'bandwidth': np.linspace(0.01, 0.3, 30)}
        param_grid = {**method_params, **hyper_LR}
        quantifier = KDEy(lr, target='min_divergence', divergence='HD', montecarlo_trials=5000, val_split=10)
    elif method == 'DM-HD':
        method_params = {
            'nbins': [4,8,16,32],
--- a/distribution_matching/figures/nclasses_plot.py
+++ b/distribution_matching/figures/nclasses_plot.py
@ -0,0 +1,29 @@
 import numpy as np
 import pandas as pd
 import seaborn as sns
 import matplotlib.pyplot as plt
 """
 This script generates plots of sensibility for the number of classes
 Plots results for MAE, MRAE, and KLD
 The hyperparameters were set as: 
    quantifier.set_params(classifier__C=0.01, classifier__class_weight='balanced', bandwidth=0.2)
 """
 methods = ['DM', 'KDEy-ML', 'EMQ']
 optim = 'mae'
 dfs = [pd.read_csv(f'../results/lequa/nclasses/{optim}/{method}.csv', sep='\t') for method in methods]
 df = pd.concat(dfs)
 for err in ['MAE', 'MRAE']:
    piv = df.pivot_table(index='nClasses', columns='Method', values=err)
    g = sns.lineplot(data=piv, markers=True, dashes=False)
    g.set(xlim=(1, 28))
    g.legend(loc="center left", bbox_to_anchor=(1, 0.5))
    g.set_ylabel(err)
    g.set_xticks(np.linspace(1, 28, 28))
    plt.xticks(rotation=90)
    plt.grid()
    plt.savefig(f'./nclasses_{err}.pdf', bbox_inches='tight')
    plt.clf()
--- a/distribution_matching/figures/sensibility_plot.py
+++ b/distribution_matching/figures/sensibility_plot.py
@ -9,8 +9,8 @@ Plots results for MAE, MRAE, and KLD
 The rest of hyperparameters were set to their default values
 """
-df_tweet = pd.read_csv('../results_tweet_sensibility/KDEy-MLE.csv', sep='\t')
+df_tweet = pd.read_csv('../results/tweet/sensibility/KDEy-ML.csv', sep='\t')
-df_lequa = pd.read_csv('../results_lequa_sensibility/KDEy-MLE.csv', sep='\t')
+df_lequa = pd.read_csv('../results/lequa/sensibility/KDEy-ML.csv', sep='\t')
 df = pd.concat([df_tweet, df_lequa])
 for err in ['MAE', 'MRAE', 'KLD']:
--- a/distribution_matching/lequa_experiments.py
+++ b/distribution_matching/lequa_experiments.py
@ -52,10 +52,14 @@ if __name__ == '__main__':
                    print('debug mode... skipping model selection')
                    quantifier.fit(train)
-                report = qp.evaluation.evaluation_report(quantifier, protocol=test_gen, error_metrics=['mae', 'mrae', 'kld'], verbose=True)
+                report = qp.evaluation.evaluation_report(
                    quantifier, protocol=test_gen, error_metrics=['mae', 'mrae', 'kld'],
                    verbose=True, verbose_error=optim[1:], n_jobs=-1
                )
                means = report.mean()
                report.to_csv(result_path+'.dataframe')
                csv.write(f'{method}\tLeQua-T1B\t{means["mae"]:.5f}\t{means["mrae"]:.5f}\t{means["kld"]:.5f}\n')
                csv.flush()
                print(means)
        show_results(result_path)
--- a/distribution_matching/lequa_nclasses_sensibility.py
+++ b/distribution_matching/lequa_nclasses_sensibility.py
@ -0,0 +1,74 @@
 import pickle
 import numpy as np
 import os
 from os.path import join
 import pandas as pd
 from quapy.protocol import UPP
 from quapy.data import LabelledCollection
 from distribution_matching.commons import METHODS, new_method, show_results
 import quapy as qp
 SEED=1
 def extract_classes(data:LabelledCollection, classes):
    X, y = data.Xy
    counts = data.counts()
    Xs, ys = [], []
    for class_i in classes:
        Xs.append(X[y==class_i])
        ys.append([class_i]*counts[class_i])
    Xs = np.concatenate(Xs)
    ys = np.concatenate(ys)
    return LabelledCollection(Xs, ys, classes=classes
                              )
 def task(nclasses):
    in_classes = np.arange(0, nclasses)
    train = extract_classes(train_pool, classes=in_classes)
    test = extract_classes(test_pool, classes=in_classes)
    with qp.util.temp_seed(SEED):
        hyper, quantifier = new_method(method)
        quantifier.set_params(classifier__C=1, classifier__class_weight='balanced')
        hyper = {h:v for h,v in hyper.items() if not h.startswith('classifier__')}
        tr, va = train.split_stratified(random_state=SEED)
        quantifier = qp.model_selection.GridSearchQ(quantifier, hyper, UPP(va), optim).fit(tr)
        report = qp.evaluation.evaluation_report(quantifier, protocol=UPP(test), error_metrics=['mae', 'mrae', 'kld'], verbose=True)
        return report
 # only the quantifier-dependent hyperparameters are explored; the classifier is a LR with default parameters
 if __name__ == '__main__':
    qp.environ['SAMPLE_SIZE'] = qp.datasets.LEQUA2022_SAMPLE_SIZE['T1B']
    qp.environ['N_JOBS'] = -1
    for optim in ['mae']: #, 'mrae']:
        result_dir = f'results/lequa/nclasses/{optim}'
        os.makedirs(result_dir, exist_ok=True)
        for method in ['DM', 'EMQ', 'KDEy-ML']: # 'KDEy-ML', 'KDEy-DMhd3']:
            result_path = join(result_dir, f'{method}.csv')
            if os.path.exists(result_path): continue
            train_orig, _, _ = qp.datasets.fetch_lequa2022('T1B')
            train_pool, test_pool = train_orig.split_stratified(0.5, random_state=SEED)
            arange_classes = np.arange(2, train_orig.n_classes + 1)
            reports = qp.util.parallel(task, arange_classes, n_jobs=-1)
            with open(result_path, 'at') as csv:
                csv.write(f'Method\tDataset\tnClasses\tMAE\tMRAE\tKLD\n')
                for num_classes, report in zip(arange_classes, reports):
                    means = report.mean()
                    report_result_path = join(result_dir, f'{method}_{num_classes}')+'.dataframe'
                    report.to_csv(report_result_path)
                    csv.write(f'{method}\tLeQua-T1B\t{num_classes}\t{means["mae"]:.5f}\t{means["mrae"]:.5f}\t{means["kld"]:.5f}\n')
                    csv.flush()
            means = report.mean()
            print(means)
--- a/distribution_matching/method_kdey.py
+++ b/distribution_matching/method_kdey.py
@ -35,7 +35,8 @@ class KDEy(AggregativeProbabilisticQuantifier):
            f'unknown bandwidth_method, valid ones are {KDEy.BANDWIDTH_METHOD}'
        assert engine in KDEy.ENGINE, f'unknown engine, valid ones are {KDEy.ENGINE}'
        assert target in KDEy.TARGET, f'unknown target, valid ones are {KDEy.TARGET}'
-        assert divergence in ['KLD', 'HD', 'JS'], 'in this version I will only allow KLD or squared HD as a divergence'
+        assert target=='max_likelihood' or divergence in ['KLD', 'HD', 'JS'], \
            'in this version I will only allow KLD or squared HD as a divergence'
        self.classifier = classifier
        self.val_split = val_split
        self.divergence = divergence
@ -250,7 +251,9 @@ class KDEy(AggregativeProbabilisticQuantifier):
            test_likelihood = np.concatenate(
                [samples_i[:num_i] for samples_i, num_i in zip(test_densities_per_class, num_variates_per_class)]
            )
-            return fdivergence(val_likelihood, test_likelihood)
+            # return fdivergence(val_likelihood, test_likelihood)  # this is wrong, If I sample from the val distribution
            # then I am computing D(Test||Val), so it should be E_{x ~ Val}[f(Test(x)/Val(x))]
            return fdivergence(test_likelihood, val_likelihood)
        # the initial point is set as the uniform distribution
        uniform_distribution = np.full(fill_value=1 / n_classes, shape=(n_classes,))
--- a/distribution_matching/method_kdey_closed.py
+++ b/distribution_matching/method_kdey_closed.py
@ -0,0 +1,174 @@
 from cgi import test
 import os
 import sys
 from typing import Union, Callable
 import numpy as np
 from sklearn.base import BaseEstimator
 from sklearn.linear_model import LogisticRegression
 import pandas as pd
 from sklearn.model_selection import GridSearchCV
 from sklearn.neighbors import KernelDensity
 from scipy.stats import multivariate_normal
 import quapy as qp
 from quapy.data import LabelledCollection
 from quapy.protocol import APP, UPP
 from quapy.method.aggregative import AggregativeProbabilisticQuantifier, _training_helper, cross_generate_predictions, \
    DistributionMatching, _get_divergence
 import scipy
 from scipy import optimize
 from statsmodels.nonparametric.kernel_density import KDEMultivariateConditional
 from time import time
 from sklearn.metrics.pairwise import rbf_kernel
 def gram_matrix_mix(bandwidth, X, Y=None):
    # this adapts the output of the rbf_kernel function (pairwise evaluations of Gaussian kernels k(x,y))
    # to contain pairwise evaluations of N(x|mu,Sigma1+Sigma2) with mu=y and Sigma1 and Sigma2 are 
    # two "scalar matrices" (h^2) I each, so Sigma1+Sigma2 has scalar 2(h^2) (h is the bandwidth)
    variance = 2 * (bandwidth**2)
    nD = X.shape[1]
    gamma = 1/(2*variance)
    gram = rbf_kernel(X, Y, gamma=gamma)
    norm_factor = 1/np.sqrt(((2*np.pi)**nD) * (variance**(nD)))
    gram *= norm_factor
    print('GRAM SUM:', gram.sum())
    return gram
 def weighted_prod(pi, tau, G):
    return pi[:,np.newaxis] * G * tau
 def tril_weighted_prod(pi, G):
    M = weighted_prod(pi, pi, G)
    return np.triu(M,1)
 class KDEyclosed(AggregativeProbabilisticQuantifier):
    def __init__(self, classifier: BaseEstimator, val_split=0.4, bandwidth=0.1, n_jobs=None, random_state=0):
        self.classifier = classifier
        self.val_split = val_split
        self.bandwidth = bandwidth
        self.n_jobs = n_jobs
        self.random_state=random_state
    def fit(self, data: LabelledCollection, fit_classifier=True, val_split: Union[float, LabelledCollection] = None):
        """
        :param data: the training set
        :param fit_classifier: set to False to bypass the training (the learner is assumed to be already fit)
        :param val_split: either a float in (0,1) indicating the proportion of training instances to use for
         validation (e.g., 0.3 for using 30% of the training set as validation data), or a LabelledCollection
         indicating the validation set itself, or an int indicating the number k of folds to be used in kFCV
         to estimate the parameters
        """
        # print('[fit] enter')
        if val_split is None:
            val_split = self.val_split
        self.classifier, y, posteriors, classes, class_count = cross_generate_predictions(
            data, self.classifier, val_split, probabilistic=True, fit_classifier=fit_classifier, n_jobs=self.n_jobs
        )
        # from distribution_matching.binary_debug import HACK
        # posteriors, y = HACK(posteriors, y)
        # print('[fit] precomputing stuff')
        n = data.n_classes
        #L = [posteriors[y==i] for i in range(n)]
        #l = [len(Li) for Li in L]
        D = n
        h = self.bandwidth
        #cov_mix_scalar = 2 * h * h  # corresponds to a bandwidth of sqrt(2)*h
        #Kernel = multivariate_normal(mean=np.zeros(D), cov=cov_mix_scalar)
        # print('[fit] classifier ready; precomputing gram')
        self.gram_tr_tr = gram_matrix_mix(h, posteriors)
        # li_inv keeps track of the relative weight of each datapoint within its class 
        # (i.e., the weight in its KDE model)
        counts_inv = 1/(data.counts())
        self.li_inv = counts_inv[y]
 #        Khash = {}
 #        for a in range(n):
 #            for b in range(l[a]):
 #                for i in range(n):
 #                    Khash[(a,b,i)] = sum(Kernel.pdf(L[i][j] - L[a][b]) for j in range(l[i]))
                    # for j in range(l[i]):  # this for, and index j, can be supressed and store the sum across j
                    #     Khash[(a, b, i, j)] = Kernel.pdf(L[i][j] - L[a][b])
        self.n = n
        #self.L = L
        #self.l = l
        #self.Kernel = Kernel
        #self.Khash = Khash
        self.C = ((2 * np.pi) ** (-D / 2)) * h ** (-D)
        print('C:', self.C)
        self.Ptr = posteriors
        self.ytr = y
        assert all(sorted(np.unique(y)) == np.arange(data.n_classes)), 'label name gaps not allowed in current implementation'
        # print('[fit] exit')
        return self
    def aggregate(self, posteriors: np.ndarray):
        # print('[aggregate] enter')
        Ptr = self.Ptr
        Pte = posteriors
        gram_te_te = gram_matrix_mix(self.bandwidth, Pte, Pte)
        gram_tr_te = gram_matrix_mix(self.bandwidth, Ptr, Pte)
        K = Pte.shape[0]
        tau = np.full(shape=K, fill_value=1/K, dtype=float)
        h = self.bandwidth
        D = Ptr.shape[1]
        C = self.C
        partC = 0.5 * np.log( C/K + 2 * tril_weighted_prod(tau, gram_te_te).sum())
        def match(alpha):
            pi = alpha[self.ytr] * self.li_inv
            partA = -np.log(weighted_prod(pi, tau, gram_tr_te).sum())
            # print('gram_Tr_Tr sum', self.gram_tr_tr.sum())
            # print('pretril', (np.triu(self.gram_tr_tr,1).sum()))
            # print('tril', (2 * tril_weighted_prod(pi, self.gram_tr_tr).sum()))
            # print('pi', pi.sum(), pi[:10])
            # print('Cs', C*(pi**2).sum())
            partB = 0.5 * np.log(C*(pi**2).sum() + 2*tril_weighted_prod(pi, self.gram_tr_tr).sum())
            Dcs = partA + partB + partC
            # print(f'{alpha=}\t{partA=}\t{partB=}\t{partC}')
            # print()
            return Dcs
        # print('[aggregate] starts search')
        # the initial point is set as the uniform distribution
        uniform_distribution = np.full(fill_value=1 / self.n, shape=(self.n,))
        # uniform_distribution = [0.2, 0.8]
        # solutions are bounded to those contained in the unit-simplex
        bounds = tuple((0, 1) for _ in range(self.n))  # values in [0,1]
        constraints = ({'type': 'eq', 'fun': lambda x: 1 - sum(x)})  # values summing up to 1
        r = optimize.minimize(match, x0=uniform_distribution, method='SLSQP', bounds=bounds, constraints=constraints)
        # print('[aggregate] end')
        return r.x
--- a/distribution_matching/method_kdey_closed_efficient.py
+++ b/distribution_matching/method_kdey_closed_efficient.py
@ -0,0 +1,145 @@
 from cgi import test
 import os
 import sys
 from typing import Union, Callable
 import numpy as np
 from sklearn.base import BaseEstimator
 from sklearn.linear_model import LogisticRegression
 import pandas as pd
 from sklearn.model_selection import GridSearchCV
 from sklearn.neighbors import KernelDensity
 from scipy.stats import multivariate_normal
 import quapy as qp
 from quapy.data import LabelledCollection
 from quapy.protocol import APP, UPP
 from quapy.method.aggregative import AggregativeProbabilisticQuantifier, _training_helper, cross_generate_predictions, \
    DistributionMatching, _get_divergence
 import scipy
 from scipy import optimize
 from statsmodels.nonparametric.kernel_density import KDEMultivariateConditional
 from time import time
 from sklearn.metrics.pairwise import rbf_kernel
 def gram_matrix_mix_sum(bandwidth, X, Y=None, reduce=True):
    # this adapts the output of the rbf_kernel function (pairwise evaluations of Gaussian kernels k(x,y))
    # to contain pairwise evaluations of N(x|mu,Sigma1+Sigma2) with mu=y and Sigma1 and Sigma2 are 
    # two "scalar matrices" (h^2) I each, so Sigma1+Sigma2 has scalar 2(h^2) (h is the bandwidth)
    variance = 2 * (bandwidth**2)
    nRows,nD = X.shape
    gamma = 1/(2*variance)
    gram = rbf_kernel(X, Y, gamma=gamma)
    norm_factor = 1/np.sqrt(((2*np.pi)**nD) * (variance**(nD)))
    gram *= norm_factor
    if Y is None:
        # ignores the diagonal
        aggr = (2 * np.triu(gram, 1)).sum()
    else:
        aggr = gram.sum()
    return aggr
 class KDEyclosed_efficient(AggregativeProbabilisticQuantifier):
    def __init__(self, classifier: BaseEstimator, val_split=0.4, bandwidth=0.1, n_jobs=None, random_state=0):
        self.classifier = classifier
        self.val_split = val_split
        self.bandwidth = bandwidth
        self.n_jobs = n_jobs
        self.random_state=random_state
    def fit(self, data: LabelledCollection, fit_classifier=True, val_split: Union[float, LabelledCollection] = None):
        """
        :param data: the training set
        :param fit_classifier: set to False to bypass the training (the learner is assumed to be already fit)
        :param val_split: either a float in (0,1) indicating the proportion of training instances to use for
         validation (e.g., 0.3 for using 30% of the training set as validation data), or a LabelledCollection
         indicating the validation set itself, or an int indicating the number k of folds to be used in kFCV
         to estimate the parameters
        """
        # print('[fit] enter')
        if val_split is None:
            val_split = self.val_split
        self.classifier, y, posteriors, classes, class_count = cross_generate_predictions(
            data, self.classifier, val_split, probabilistic=True, fit_classifier=fit_classifier, n_jobs=self.n_jobs
        )
        assert all(sorted(np.unique(y)) == np.arange(data.n_classes)), \
            'label name gaps not allowed in current implementation'
        n = data.n_classes
        h = self.bandwidth
        P = posteriors
        counts_inv = 1 / (data.counts())
        nD = P.shape[1]
        C = ((2 * np.pi) ** (-nD / 2)) * (self.bandwidth ** (-nD))
        tr_tr_sums = np.zeros(shape=(n,n), dtype=float)
        self.tr_C = []
        for i in range(n):
            for j in range(n):
                if i > j:
                    tr_tr_sums[i,j] = tr_tr_sums[j,i]
                else:
                    if i == j:
                        tr_tr_sums[i, j] = gram_matrix_mix_sum(h, P[y == i])
                        self.tr_C.append(C * sum(y == i))
                    else:
                        block = gram_matrix_mix_sum(h, P[y == i], P[y == j])
                        tr_tr_sums[i, j] = block
        self.tr_C = np.asarray(self.tr_C)
        self.Ptr = posteriors
        self.ytr = y
        self.tr_tr_sums = tr_tr_sums
        self.counts_inv = counts_inv
        return self
    def aggregate(self, posteriors: np.ndarray):
        # print('[aggregate] enter')
        Ptr = self.Ptr
        Pte = posteriors
        K,nD = Pte.shape
        Kinv = (1/K)
        h = self.bandwidth
        n = Ptr.shape[1]
        y = self.ytr
        tr_tr_sums = self.tr_tr_sums
        C = ((2 * np.pi) ** (-nD / 2)) * (self.bandwidth ** (-nD))
        partC = 0.5*np.log(gram_matrix_mix_sum(h, Pte) * Kinv * Kinv + C*Kinv)
        tr_te_sums = np.zeros(shape=n, dtype=float)
        for i in range(n):
            tr_te_sums[i] = gram_matrix_mix_sum(h, Ptr[y==i], Pte) * self.counts_inv[i] * Kinv
        def match(alpha):
            partA = -np.log((alpha * tr_te_sums).sum())
            alpha_l = alpha * self.counts_inv
            partB = 0.5 * np.log((alpha_l[:,np.newaxis] * tr_tr_sums * alpha_l).sum() + (self.tr_C*(alpha_l**2)).sum())
            return partA + partB + partC
        # print('[aggregate] starts search')
        # the initial point is set as the uniform distribution
        uniform_distribution = np.full(fill_value=1 / n, shape=(n,))
        # uniform_distribution = [0.2, 0.8]
        # solutions are bounded to those contained in the unit-simplex
        bounds = tuple((0, 1) for _ in range(n))  # values in [0,1]
        constraints = ({'type': 'eq', 'fun': lambda x: 1 - sum(x)})  # values summing up to 1
        r = optimize.minimize(match, x0=uniform_distribution, method='SLSQP', bounds=bounds, constraints=constraints)
        # print('[aggregate] end')
        return r.x
--- a/distribution_matching/method_kdey_closed_efficient_correct.py
+++ b/distribution_matching/method_kdey_closed_efficient_correct.py
@ -0,0 +1,133 @@
 from cgi import test
 import os
 import sys
 from typing import Union, Callable
 import numpy as np
 from sklearn.base import BaseEstimator
 from sklearn.linear_model import LogisticRegression
 import pandas as pd
 from sklearn.model_selection import GridSearchCV
 from sklearn.neighbors import KernelDensity
 from scipy.stats import multivariate_normal
 import quapy as qp
 from quapy.data import LabelledCollection
 from quapy.protocol import APP, UPP
 from quapy.method.aggregative import AggregativeProbabilisticQuantifier, _training_helper, cross_generate_predictions, \
    DistributionMatching, _get_divergence
 import scipy
 from scipy import optimize
 from statsmodels.nonparametric.kernel_density import KDEMultivariateConditional
 from time import time
 from sklearn.metrics.pairwise import rbf_kernel
 def gram_matrix_mix_sum(bandwidth, X, Y=None, reduce=True):
    # this adapts the output of the rbf_kernel function (pairwise evaluations of Gaussian kernels k(x,y))
    # to contain pairwise evaluations of N(x|mu,Sigma1+Sigma2) with mu=y and Sigma1 and Sigma2 are 
    # two "scalar matrices" (h^2) I each, so Sigma1+Sigma2 has scalar 2(h^2) (h is the bandwidth)
    variance = 2 * (bandwidth**2)
    nRows,nD = X.shape
    gamma = 1/(2*variance)
    norm_factor = 1/np.sqrt(((2*np.pi)**nD) * (variance**(nD)))
    gram = norm_factor * rbf_kernel(X, Y, gamma=gamma)
    return gram.sum()
 class KDEyclosed_efficient_corr(AggregativeProbabilisticQuantifier):
    def __init__(self, classifier: BaseEstimator, val_split=0.4, bandwidth=0.1, n_jobs=None, random_state=0):
        self.classifier = classifier
        self.val_split = val_split
        self.bandwidth = bandwidth
        self.n_jobs = n_jobs
        self.random_state=random_state
    def fit(self, data: LabelledCollection, fit_classifier=True, val_split: Union[float, LabelledCollection] = None):
        """
        :param data: the training set
        :param fit_classifier: set to False to bypass the training (the learner is assumed to be already fit)
        :param val_split: either a float in (0,1) indicating the proportion of training instances to use for
         validation (e.g., 0.3 for using 30% of the training set as validation data), or a LabelledCollection
         indicating the validation set itself, or an int indicating the number k of folds to be used in kFCV
         to estimate the parameters
        """
        # print('[fit] enter')
        if val_split is None:
            val_split = self.val_split
        self.classifier, y, posteriors, classes, class_count = cross_generate_predictions(
            data, self.classifier, val_split, probabilistic=True, fit_classifier=fit_classifier, n_jobs=self.n_jobs
        )
        assert all(sorted(np.unique(y)) == np.arange(data.n_classes)), \
            'label name gaps not allowed in current implementation'
        # print('[fit] precomputing stuff')
        # from distribution_matching.binary_debug import HACK
        # posteriors, y = HACK(posteriors, y)
        n = data.n_classes
        h = self.bandwidth
        # print('[fit] classifier ready; precomputing gram')
        P = posteriors
        # li_inv keeps track of the relative weight of each datapoint within its class
        # (i.e., the weight in its KDE model)
        counts_inv = 1 / (data.counts())
        tr_tr_sums = np.zeros(shape=(n,n), dtype=float)
        for i in range(n):
            for j in range(n):
                if i > j:
                    tr_tr_sums[i,j] = tr_tr_sums[j,i]
                else:
                    block = gram_matrix_mix_sum(h, P[y == i], P[y == j] if i!=j else None)
                    tr_tr_sums[i, j] = block
        # compute class-class block-sums
        self.Ptr = posteriors
        self.ytr = y
        self.tr_tr_sums = tr_tr_sums
        self.counts_inv = counts_inv
        return self
    def aggregate(self, posteriors: np.ndarray):
        Ptr = self.Ptr
        Pte = posteriors
        K,nD = Pte.shape
        Kinv = (1/K)
        h = self.bandwidth
        n = Ptr.shape[1]
        y = self.ytr
        tr_tr_sums = self.tr_tr_sums
        partC = 0.5*np.log(gram_matrix_mix_sum(h, Pte) * Kinv * Kinv)
        tr_te_sums = np.zeros(shape=n, dtype=float)
        for i in range(n):
            tr_te_sums[i] = gram_matrix_mix_sum(h, Ptr[y==i], Pte) * self.counts_inv[i] * Kinv
        def match(alpha):
            partA = -np.log((alpha * tr_te_sums).sum())
            alpha_l = alpha * self.counts_inv
            partB = 0.5 * np.log((alpha_l[:,np.newaxis] * tr_tr_sums * alpha_l).sum())
            return partA + partB + partC
        # the initial point is set as the uniform distribution
        uniform_distribution = np.full(fill_value=1 / n, shape=(n,))
        # solutions are bounded to those contained in the unit-simplex
        bounds = tuple((0, 1) for _ in range(n))  # values in [0,1]
        constraints = ({'type': 'eq', 'fun': lambda x: 1 - sum(x)})  # values summing up to 1
        r = optimize.minimize(match, x0=uniform_distribution, method='SLSQP', bounds=bounds, constraints=constraints)
        return r.x
--- a/distribution_matching/tweets_experiments.py
+++ b/distribution_matching/tweets_experiments.py
@ -36,7 +36,7 @@ if __name__ == '__main__':
                # this variable controls that the mod sel has already been done, and skip this otherwise
                semeval_trained = False
-                for dataset in qp.datasets.TWITTER_SENTIMENT_DATASETS_TEST:
+                for dataset in qp.datasets.TWITTER_SENTIMENT_DATASETS_TEST[::-1]:
                    print('init', dataset)
                    local_result_path = global_result_path + '_' + dataset
--- a/quapy/CHANGE_LOG.txt
+++ b/quapy/CHANGE_LOG.txt
@ -1,3 +1,17 @@
 Change Log 0.1.8
 ----------------
 - A conceptual error in MedianSweep and MedianSweep2 has been solved. The error consisted on computing all TPRs and
    FPRs and report the median; then, the adjustment then operated on these single values. Instead, the original
    method proposed by G.Forman comes down to generating all prevalence predictions, for all TPRs and FPRs, and then
    computing the median of it.
 - qp.evaluation now runs in parallel <improve, remove or fix the ongoing error, put at the qp. level instead of
    qp.evaluation because I don't like the qp.evaluation.evaluate thing>
 - <fix> remove dependencies with LabelledCollection in the library.
 Change Log 0.1.7
 ----------------
--- a/quapy/classification/neural.py
+++ b/quapy/classification/neural.py
@ -176,7 +176,7 @@ class NeuralClassifierTrainer:
        self.classes_ = train.classes_
        opt = self.trainer_hyperparams
        checkpoint = self.checkpointpath
-        self.reset_net_params(self.vocab_size, train.n_classes)
+        self.reset_net_params(self.vocab_size, train.arange_classes)
        train_generator = TorchDataset(train.instances, train.labels).asDataloader(
            opt['batch_size'], shuffle=True, pad_length=opt['padding_length'], device=opt['device'])
--- a/quapy/data/base.py
+++ b/quapy/data/base.py
@ -269,6 +269,7 @@ class LabelledCollection:
        test = self.sampling_from_index(right)
        return training, test
    def __add__(self, other):
        """
        Returns a new :class:`LabelledCollection` as the union of this collection with another collection.
--- a/quapy/evaluation.py
+++ b/quapy/evaluation.py
@ -1,3 +1,4 @@
 from copy import deepcopy
 from typing import Union, Callable, Iterable
 import numpy as np
 from tqdm import tqdm
@ -12,7 +13,8 @@ def prediction(
        protocol: AbstractProtocol,
        aggr_speedup: Union[str, bool] = 'auto',
        verbose=False,
-        verbose_error=None):
+        verbose_error=None,
        n_jobs=1):
    """
    Uses a quantification model to generate predictions for the samples generated via a specific protocol.
    This function is central to all evaluation processes, and is endowed with an optimization to speed-up the
@ -34,6 +36,10 @@ def prediction(
        convenient or not. Set to False to deactivate.
    :param verbose: boolean, show or not information in stdout
    :param verbose_error: an evaluation function to be used to display intermediate results if verbose=True (default None)
    :param n_jobs: number of parallel workers. Default is 1 so that, if not explicitly requested, the evaluation phase
        is to be carried out in a single core. That is to say, this parameter will not inspect the environment variable
        N_JOBS by default. This might be convenient in many situations, since parallelizing the evaluation entails
        adding an overhead for cloning the objects within different threads that is often not worth the effort.
    :return: a tuple `(true_prevs, estim_prevs)` in which each element in the tuple is an array of shape
        `(n_samples, n_classes)` containing the true, or predicted, prevalence values for each sample
    """
@ -63,21 +69,44 @@ def prediction(
    if apply_optimization:
        pre_classified = model.classify(protocol.get_labelled_collection().instances)
        protocol_with_predictions = protocol.on_preclassified_instances(pre_classified)
-        return __prediction_helper(model.aggregate, protocol_with_predictions, verbose, verbose_error)
+        return __prediction_helper(model, protocol_with_predictions, True, verbose, verbose_error, n_jobs)
    else:
-        return __prediction_helper(model.quantify, protocol, verbose, verbose_error)
+        return __prediction_helper(model, protocol, False, verbose, verbose_error, n_jobs)
-def __prediction_helper(quantification_fn, protocol: AbstractProtocol, verbose=False, verbose_error=None):
+def __delayed_prediction(args):
    quantifier, aggregate, sample_instances, sample_prev = args
    quantifier = deepcopy(quantifier)
    quant_fn = quantifier.aggregate if aggregate else quantifier.quantify
    predicted = quant_fn(sample_instances)
    return sample_prev, predicted
 def __prediction_helper(quantifier, protocol: AbstractProtocol, aggregate: bool, verbose=False, verbose_error=None, n_jobs=1):
    true_prevs, estim_prevs = [], []
    ongoing_errors = []
    if verbose:
        pbar =  tqdm(protocol(), total=protocol.total(), desc='predicting')
-    for sample_instances, sample_prev in pbar if verbose else protocol():
+    if n_jobs==1:
-        estim_prevs.append(quantification_fn(sample_instances))
+        quant_fn = quantifier.aggregate if aggregate else quantifier.quantify
-        true_prevs.append(sample_prev)
+        for sample_instances, sample_prev in pbar if verbose else protocol():
-        if verbose and verbose_error is not None:
+            predicted = quant_fn(sample_instances)
-            err = verbose_error(true_prevs, estim_prevs)
+            estim_prevs.append(predicted)
-            pbar.set_description('predicting: ongoing error={err:.5f}')
+            true_prevs.append(sample_prev)
            if verbose and verbose_error is not None:
                err = verbose_error(sample_prev, predicted)
                ongoing_errors.append(err)
                pbar.set_description(f'predicting: ongoing error={np.mean(ongoing_errors):.5f}')
    else:
        if verbose:
            print('parallelizing prediction')
        outputs = qp.util.parallel(
            __delayed_prediction,
            ((quantifier, aggregate, sample_X, sample_p) for (sample_X, sample_p) in (pbar if verbose else protocol())),
            seed=qp.environ.get('_R_SEED', None),
            n_jobs=n_jobs
        )
        true_prevs, estim_prevs = list(zip(*outputs))
    true_prevs = np.asarray(true_prevs)
    estim_prevs = np.asarray(estim_prevs)
@ -89,7 +118,7 @@ def evaluation_report(model: BaseQuantifier,
                      protocol: AbstractProtocol,
                      error_metrics: Iterable[Union[str,Callable]] = 'mae',
                      aggr_speedup: Union[str, bool] = 'auto',
-                      verbose=False):
+                      verbose=False, verbose_error=None, n_jobs=1):
    """
    Generates a report (a pandas' DataFrame) containing information of the evaluation of the model as according
    to a specific protocol and in terms of one or more evaluation metrics (errors).
@ -107,12 +136,19 @@ def evaluation_report(model: BaseQuantifier,
        in the samples to be generated. Set to True or "auto" (default) for letting QuaPy decide whether it is
        convenient or not. Set to False to deactivate.
    :param verbose: boolean, show or not information in stdout
    :param verbose_error: an evaluation function to be used to display intermediate results if verbose=True (default None)
    :param n_jobs: number of parallel workers. Default is 1 so that, if not explicitly requested, the evaluation phase
        is to be carried out in a single core. That is to say, this parameter will not inspect the environment variable
        N_JOBS by default. This might be convenient in many situations, since parallelizing the evaluation entails
        adding an overhead for cloning the objects within different threads that is often not worth the effort.
    :return: a pandas' DataFrame containing the columns 'true-prev' (the true prevalence of each sample),
        'estim-prev' (the prevalence estimated by the model for each sample), and as many columns as error metrics
        have been indicated, each displaying the score in terms of that metric for every sample.
    """
-    true_prevs, estim_prevs = prediction(model, protocol, aggr_speedup=aggr_speedup, verbose=verbose)
+    true_prevs, estim_prevs = prediction(
        model, protocol, aggr_speedup=aggr_speedup, verbose=verbose, verbose_error=verbose_error, n_jobs=n_jobs
    )
    return _prevalence_report(true_prevs, estim_prevs, error_metrics)
--- a/quapy/method/neural.py
+++ b/quapy/method/neural.py
@ -32,7 +32,7 @@ class QuaNetTrainer(BaseQuantifier):
    >>> qp.domains.preprocessing.index(dataset, min_df=5, inplace=True)
    >>>
    >>> # the text classifier is a CNN trained by NeuralClassifierTrainer
-    >>> cnn = CNNnet(dataset.vocabulary_size, dataset.n_classes)
+    >>> cnn = CNNnet(dataset.vocabulary_size, dataset.arange_classes)
    >>> classifier = NeuralClassifierTrainer(cnn, device='cuda')
    >>>
    >>> # train QuaNet (QuaNet is an alias to QuaNetTrainer)
--- a/quapy/model_selection.py
+++ b/quapy/model_selection.py
@ -54,6 +54,7 @@ class GridSearchQ(BaseQuantifier):
        self.__check_error(error)
        assert isinstance(protocol, AbstractProtocol), 'unknown protocol'
    def _sout(self, msg):
        if self.verbose:
            print(f'[{self.__class__.__name__}:{self.model.__class__.__name__}]: {msg}')