KDE with closed form working
This commit is contained in:
parent
32d6aa58f6
commit
7593fde2e0
|
@ -51,7 +51,7 @@ if __name__ == '__main__':
|
||||||
|
|
||||||
# model selection
|
# model selection
|
||||||
train, test = data.train_test
|
train, test = data.train_test
|
||||||
train, val = train.split_stratified()
|
train, val = train.split_stratified(random_state=SEED)
|
||||||
|
|
||||||
protocol = UPP(val, repeats=n_bags_val)
|
protocol = UPP(val, repeats=n_bags_val)
|
||||||
modsel = GridSearchQ(
|
modsel = GridSearchQ(
|
||||||
|
|
|
@ -1,12 +1,14 @@
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
from distribution_matching.method_kdey import KDEy
|
from distribution_matching.method_kdey import KDEy
|
||||||
|
from distribution_matching.method_kdey_closed import KDEyclosed
|
||||||
|
from distribution_matching.method_kdey_closed_efficient_correct import KDEyclosed_efficient_corr
|
||||||
from quapy.method.aggregative import EMQ, CC, PCC, DistributionMatching, PACC, HDy, OneVsAllAggregative, ACC
|
from quapy.method.aggregative import EMQ, CC, PCC, DistributionMatching, PACC, HDy, OneVsAllAggregative, ACC
|
||||||
from distribution_matching.method_dirichlety import DIRy
|
from distribution_matching.method_dirichlety import DIRy
|
||||||
from sklearn.linear_model import LogisticRegression
|
from sklearn.linear_model import LogisticRegression
|
||||||
|
from method_kdey_closed_efficient import KDEyclosed_efficient
|
||||||
|
|
||||||
|
METHODS = ['KDEy-closed++', 'KDEy-closed+', 'KDEy-closed', 'ACC', 'PACC', 'HDy-OvA', 'DIR', 'DM', 'KDEy-DMhd3', 'EMQ', 'KDEy-ML'] #, 'KDEy-DMhd2'] #, 'KDEy-DMhd2', 'DM-HD'] 'KDEy-DMjs', 'KDEy-DM', 'KDEy-ML+', 'KDEy-DMhd3+',
|
||||||
METHODS = ['KDEy-DMjs', 'ACC', 'PACC', 'HDy-OvA', 'DIR', 'DM', 'KDEy-DM', 'EMQ', 'KDEy-ML'] #, 'KDEy-DMhd2'] #, 'KDEy-DMhd2', 'DM-HD']
|
|
||||||
BIN_METHODS = [x.replace('-OvA', '') for x in METHODS]
|
BIN_METHODS = [x.replace('-OvA', '') for x in METHODS]
|
||||||
|
|
||||||
|
|
||||||
|
@ -32,9 +34,21 @@ def new_method(method, **lr_kwargs):
|
||||||
param_grid = hyper_LR
|
param_grid = hyper_LR
|
||||||
quantifier = PACC(lr)
|
quantifier = PACC(lr)
|
||||||
elif method == 'KDEy-ML':
|
elif method == 'KDEy-ML':
|
||||||
method_params = {'bandwidth': np.linspace(0.01, 0.2, 20)}
|
method_params = {'bandwidth': np.linspace(0.01, 0.3, 30)}
|
||||||
param_grid = {**method_params, **hyper_LR}
|
param_grid = {**method_params, **hyper_LR}
|
||||||
quantifier = KDEy(lr, target='max_likelihood', val_split=10)
|
quantifier = KDEy(lr, target='max_likelihood', val_split=10)
|
||||||
|
elif method == 'KDEy-closed':
|
||||||
|
method_params = {'bandwidth': np.linspace(0.01, 0.2, 20)}
|
||||||
|
param_grid = {**method_params, **hyper_LR}
|
||||||
|
quantifier = KDEyclosed(lr, val_split=10)
|
||||||
|
elif method == 'KDEy-closed+':
|
||||||
|
method_params = {'bandwidth': np.linspace(0.01, 0.2, 20)}
|
||||||
|
param_grid = {**method_params, **hyper_LR}
|
||||||
|
quantifier = KDEyclosed_efficient(lr, val_split=10)
|
||||||
|
elif method == 'KDEy-closed++':
|
||||||
|
method_params = {'bandwidth': np.linspace(0.01, 0.2, 20)}
|
||||||
|
param_grid = {**method_params, **hyper_LR}
|
||||||
|
quantifier = KDEyclosed_efficient_corr(lr, val_split=10)
|
||||||
elif method in ['KDEy-DM']:
|
elif method in ['KDEy-DM']:
|
||||||
method_params = {'bandwidth': np.linspace(0.01, 0.2, 20)}
|
method_params = {'bandwidth': np.linspace(0.01, 0.2, 20)}
|
||||||
param_grid = {**method_params, **hyper_LR}
|
param_grid = {**method_params, **hyper_LR}
|
||||||
|
@ -62,10 +76,11 @@ def new_method(method, **lr_kwargs):
|
||||||
method_params = {'bandwidth': np.linspace(0.01, 0.2, 20)}
|
method_params = {'bandwidth': np.linspace(0.01, 0.2, 20)}
|
||||||
param_grid = {**method_params, **hyper_LR}
|
param_grid = {**method_params, **hyper_LR}
|
||||||
quantifier = KDEy(lr, target='min_divergence', divergence='KLD', montecarlo_trials=5000, val_split=10)
|
quantifier = KDEy(lr, target='min_divergence', divergence='KLD', montecarlo_trials=5000, val_split=10)
|
||||||
elif method in ['KDEy-DMhd']:
|
# elif method in ['KDEy-DMhd']:
|
||||||
method_params = {'bandwidth': np.linspace(0.01, 0.2, 20)}
|
# The code to reproduce this run is commented in the min_divergence target, I think it was incorrect...
|
||||||
param_grid = {**method_params, **hyper_LR}
|
# method_params = {'bandwidth': np.linspace(0.01, 0.2, 20)}
|
||||||
quantifier = KDEy(lr, target='min_divergence', divergence='HD', montecarlo_trials=5000, val_split=10)
|
# param_grid = {**method_params, **hyper_LR}
|
||||||
|
# quantifier = KDEy(lr, target='min_divergence', divergence='HD', montecarlo_trials=5000, val_split=10)
|
||||||
elif method in ['KDEy-DMhd2']:
|
elif method in ['KDEy-DMhd2']:
|
||||||
method_params = {'bandwidth': np.linspace(0.01, 0.2, 20)}
|
method_params = {'bandwidth': np.linspace(0.01, 0.2, 20)}
|
||||||
param_grid = {**method_params, **hyper_LR}
|
param_grid = {**method_params, **hyper_LR}
|
||||||
|
@ -74,6 +89,15 @@ def new_method(method, **lr_kwargs):
|
||||||
method_params = {'bandwidth': np.linspace(0.01, 0.2, 20)}
|
method_params = {'bandwidth': np.linspace(0.01, 0.2, 20)}
|
||||||
param_grid = {**method_params, **hyper_LR}
|
param_grid = {**method_params, **hyper_LR}
|
||||||
quantifier = KDEy(lr, target='min_divergence_uniform', divergence='JS', montecarlo_trials=5000, val_split=10)
|
quantifier = KDEy(lr, target='min_divergence_uniform', divergence='JS', montecarlo_trials=5000, val_split=10)
|
||||||
|
elif method in ['KDEy-DMhd3']:
|
||||||
|
# I have realized that there was an error. I am sampling from the validation distribution (V) and not from the
|
||||||
|
# test distribution (T) just because the validation can be sampled in fit only once and pre-computed densities
|
||||||
|
# can be stored. This means that the reference distribution is V and not T. Then I have found that an
|
||||||
|
# f-divergence is defined as D(p||q) \int_{R^n}q(x)f(p(x)/q(x))dx = E_{x~q}[f(p(x)/q(x))], so if I am sampling
|
||||||
|
# V then I am computing D(T||V) (and not D(V||T) as I thought).
|
||||||
|
method_params = {'bandwidth': np.linspace(0.01, 0.3, 30)}
|
||||||
|
param_grid = {**method_params, **hyper_LR}
|
||||||
|
quantifier = KDEy(lr, target='min_divergence', divergence='HD', montecarlo_trials=5000, val_split=10)
|
||||||
elif method == 'DM-HD':
|
elif method == 'DM-HD':
|
||||||
method_params = {
|
method_params = {
|
||||||
'nbins': [4,8,16,32],
|
'nbins': [4,8,16,32],
|
||||||
|
|
|
@ -0,0 +1,29 @@
|
||||||
|
import numpy as np
|
||||||
|
import pandas as pd
|
||||||
|
import seaborn as sns
|
||||||
|
import matplotlib.pyplot as plt
|
||||||
|
|
||||||
|
"""
|
||||||
|
This script generates plots of sensibility for the number of classes
|
||||||
|
Plots results for MAE, MRAE, and KLD
|
||||||
|
The hyperparameters were set as:
|
||||||
|
quantifier.set_params(classifier__C=0.01, classifier__class_weight='balanced', bandwidth=0.2)
|
||||||
|
"""
|
||||||
|
|
||||||
|
methods = ['DM', 'KDEy-ML', 'EMQ']
|
||||||
|
optim = 'mae'
|
||||||
|
dfs = [pd.read_csv(f'../results/lequa/nclasses/{optim}/{method}.csv', sep='\t') for method in methods]
|
||||||
|
df = pd.concat(dfs)
|
||||||
|
|
||||||
|
|
||||||
|
for err in ['MAE', 'MRAE']:
|
||||||
|
piv = df.pivot_table(index='nClasses', columns='Method', values=err)
|
||||||
|
g = sns.lineplot(data=piv, markers=True, dashes=False)
|
||||||
|
g.set(xlim=(1, 28))
|
||||||
|
g.legend(loc="center left", bbox_to_anchor=(1, 0.5))
|
||||||
|
g.set_ylabel(err)
|
||||||
|
g.set_xticks(np.linspace(1, 28, 28))
|
||||||
|
plt.xticks(rotation=90)
|
||||||
|
plt.grid()
|
||||||
|
plt.savefig(f'./nclasses_{err}.pdf', bbox_inches='tight')
|
||||||
|
plt.clf()
|
|
@ -9,8 +9,8 @@ Plots results for MAE, MRAE, and KLD
|
||||||
The rest of hyperparameters were set to their default values
|
The rest of hyperparameters were set to their default values
|
||||||
"""
|
"""
|
||||||
|
|
||||||
df_tweet = pd.read_csv('../results_tweet_sensibility/KDEy-MLE.csv', sep='\t')
|
df_tweet = pd.read_csv('../results/tweet/sensibility/KDEy-ML.csv', sep='\t')
|
||||||
df_lequa = pd.read_csv('../results_lequa_sensibility/KDEy-MLE.csv', sep='\t')
|
df_lequa = pd.read_csv('../results/lequa/sensibility/KDEy-ML.csv', sep='\t')
|
||||||
df = pd.concat([df_tweet, df_lequa])
|
df = pd.concat([df_tweet, df_lequa])
|
||||||
|
|
||||||
for err in ['MAE', 'MRAE', 'KLD']:
|
for err in ['MAE', 'MRAE', 'KLD']:
|
||||||
|
|
|
@ -52,10 +52,14 @@ if __name__ == '__main__':
|
||||||
print('debug mode... skipping model selection')
|
print('debug mode... skipping model selection')
|
||||||
quantifier.fit(train)
|
quantifier.fit(train)
|
||||||
|
|
||||||
report = qp.evaluation.evaluation_report(quantifier, protocol=test_gen, error_metrics=['mae', 'mrae', 'kld'], verbose=True)
|
report = qp.evaluation.evaluation_report(
|
||||||
|
quantifier, protocol=test_gen, error_metrics=['mae', 'mrae', 'kld'],
|
||||||
|
verbose=True, verbose_error=optim[1:], n_jobs=-1
|
||||||
|
)
|
||||||
means = report.mean()
|
means = report.mean()
|
||||||
report.to_csv(result_path+'.dataframe')
|
report.to_csv(result_path+'.dataframe')
|
||||||
csv.write(f'{method}\tLeQua-T1B\t{means["mae"]:.5f}\t{means["mrae"]:.5f}\t{means["kld"]:.5f}\n')
|
csv.write(f'{method}\tLeQua-T1B\t{means["mae"]:.5f}\t{means["mrae"]:.5f}\t{means["kld"]:.5f}\n')
|
||||||
csv.flush()
|
csv.flush()
|
||||||
|
print(means)
|
||||||
|
|
||||||
show_results(result_path)
|
show_results(result_path)
|
||||||
|
|
|
@ -0,0 +1,74 @@
|
||||||
|
import pickle
|
||||||
|
import numpy as np
|
||||||
|
import os
|
||||||
|
from os.path import join
|
||||||
|
import pandas as pd
|
||||||
|
from quapy.protocol import UPP
|
||||||
|
from quapy.data import LabelledCollection
|
||||||
|
from distribution_matching.commons import METHODS, new_method, show_results
|
||||||
|
import quapy as qp
|
||||||
|
|
||||||
|
|
||||||
|
SEED=1
|
||||||
|
|
||||||
|
|
||||||
|
def extract_classes(data:LabelledCollection, classes):
|
||||||
|
X, y = data.Xy
|
||||||
|
counts = data.counts()
|
||||||
|
Xs, ys = [], []
|
||||||
|
for class_i in classes:
|
||||||
|
Xs.append(X[y==class_i])
|
||||||
|
ys.append([class_i]*counts[class_i])
|
||||||
|
Xs = np.concatenate(Xs)
|
||||||
|
ys = np.concatenate(ys)
|
||||||
|
return LabelledCollection(Xs, ys, classes=classes
|
||||||
|
)
|
||||||
|
|
||||||
|
def task(nclasses):
|
||||||
|
in_classes = np.arange(0, nclasses)
|
||||||
|
train = extract_classes(train_pool, classes=in_classes)
|
||||||
|
test = extract_classes(test_pool, classes=in_classes)
|
||||||
|
with qp.util.temp_seed(SEED):
|
||||||
|
hyper, quantifier = new_method(method)
|
||||||
|
quantifier.set_params(classifier__C=1, classifier__class_weight='balanced')
|
||||||
|
hyper = {h:v for h,v in hyper.items() if not h.startswith('classifier__')}
|
||||||
|
tr, va = train.split_stratified(random_state=SEED)
|
||||||
|
quantifier = qp.model_selection.GridSearchQ(quantifier, hyper, UPP(va), optim).fit(tr)
|
||||||
|
report = qp.evaluation.evaluation_report(quantifier, protocol=UPP(test), error_metrics=['mae', 'mrae', 'kld'], verbose=True)
|
||||||
|
return report
|
||||||
|
|
||||||
|
|
||||||
|
# only the quantifier-dependent hyperparameters are explored; the classifier is a LR with default parameters
|
||||||
|
if __name__ == '__main__':
|
||||||
|
|
||||||
|
qp.environ['SAMPLE_SIZE'] = qp.datasets.LEQUA2022_SAMPLE_SIZE['T1B']
|
||||||
|
qp.environ['N_JOBS'] = -1
|
||||||
|
|
||||||
|
|
||||||
|
for optim in ['mae']: #, 'mrae']:
|
||||||
|
|
||||||
|
result_dir = f'results/lequa/nclasses/{optim}'
|
||||||
|
os.makedirs(result_dir, exist_ok=True)
|
||||||
|
|
||||||
|
for method in ['DM', 'EMQ', 'KDEy-ML']: # 'KDEy-ML', 'KDEy-DMhd3']:
|
||||||
|
|
||||||
|
result_path = join(result_dir, f'{method}.csv')
|
||||||
|
if os.path.exists(result_path): continue
|
||||||
|
|
||||||
|
train_orig, _, _ = qp.datasets.fetch_lequa2022('T1B')
|
||||||
|
|
||||||
|
train_pool, test_pool = train_orig.split_stratified(0.5, random_state=SEED)
|
||||||
|
arange_classes = np.arange(2, train_orig.n_classes + 1)
|
||||||
|
reports = qp.util.parallel(task, arange_classes, n_jobs=-1)
|
||||||
|
with open(result_path, 'at') as csv:
|
||||||
|
csv.write(f'Method\tDataset\tnClasses\tMAE\tMRAE\tKLD\n')
|
||||||
|
for num_classes, report in zip(arange_classes, reports):
|
||||||
|
means = report.mean()
|
||||||
|
report_result_path = join(result_dir, f'{method}_{num_classes}')+'.dataframe'
|
||||||
|
report.to_csv(report_result_path)
|
||||||
|
csv.write(f'{method}\tLeQua-T1B\t{num_classes}\t{means["mae"]:.5f}\t{means["mrae"]:.5f}\t{means["kld"]:.5f}\n')
|
||||||
|
csv.flush()
|
||||||
|
|
||||||
|
means = report.mean()
|
||||||
|
print(means)
|
||||||
|
|
|
@ -35,7 +35,8 @@ class KDEy(AggregativeProbabilisticQuantifier):
|
||||||
f'unknown bandwidth_method, valid ones are {KDEy.BANDWIDTH_METHOD}'
|
f'unknown bandwidth_method, valid ones are {KDEy.BANDWIDTH_METHOD}'
|
||||||
assert engine in KDEy.ENGINE, f'unknown engine, valid ones are {KDEy.ENGINE}'
|
assert engine in KDEy.ENGINE, f'unknown engine, valid ones are {KDEy.ENGINE}'
|
||||||
assert target in KDEy.TARGET, f'unknown target, valid ones are {KDEy.TARGET}'
|
assert target in KDEy.TARGET, f'unknown target, valid ones are {KDEy.TARGET}'
|
||||||
assert divergence in ['KLD', 'HD', 'JS'], 'in this version I will only allow KLD or squared HD as a divergence'
|
assert target=='max_likelihood' or divergence in ['KLD', 'HD', 'JS'], \
|
||||||
|
'in this version I will only allow KLD or squared HD as a divergence'
|
||||||
self.classifier = classifier
|
self.classifier = classifier
|
||||||
self.val_split = val_split
|
self.val_split = val_split
|
||||||
self.divergence = divergence
|
self.divergence = divergence
|
||||||
|
@ -250,7 +251,9 @@ class KDEy(AggregativeProbabilisticQuantifier):
|
||||||
test_likelihood = np.concatenate(
|
test_likelihood = np.concatenate(
|
||||||
[samples_i[:num_i] for samples_i, num_i in zip(test_densities_per_class, num_variates_per_class)]
|
[samples_i[:num_i] for samples_i, num_i in zip(test_densities_per_class, num_variates_per_class)]
|
||||||
)
|
)
|
||||||
return fdivergence(val_likelihood, test_likelihood)
|
# return fdivergence(val_likelihood, test_likelihood) # this is wrong, If I sample from the val distribution
|
||||||
|
# then I am computing D(Test||Val), so it should be E_{x ~ Val}[f(Test(x)/Val(x))]
|
||||||
|
return fdivergence(test_likelihood, val_likelihood)
|
||||||
|
|
||||||
# the initial point is set as the uniform distribution
|
# the initial point is set as the uniform distribution
|
||||||
uniform_distribution = np.full(fill_value=1 / n_classes, shape=(n_classes,))
|
uniform_distribution = np.full(fill_value=1 / n_classes, shape=(n_classes,))
|
||||||
|
|
|
@ -0,0 +1,174 @@
|
||||||
|
from cgi import test
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
from typing import Union, Callable
|
||||||
|
import numpy as np
|
||||||
|
from sklearn.base import BaseEstimator
|
||||||
|
from sklearn.linear_model import LogisticRegression
|
||||||
|
import pandas as pd
|
||||||
|
from sklearn.model_selection import GridSearchCV
|
||||||
|
from sklearn.neighbors import KernelDensity
|
||||||
|
from scipy.stats import multivariate_normal
|
||||||
|
import quapy as qp
|
||||||
|
from quapy.data import LabelledCollection
|
||||||
|
from quapy.protocol import APP, UPP
|
||||||
|
from quapy.method.aggregative import AggregativeProbabilisticQuantifier, _training_helper, cross_generate_predictions, \
|
||||||
|
DistributionMatching, _get_divergence
|
||||||
|
import scipy
|
||||||
|
from scipy import optimize
|
||||||
|
from statsmodels.nonparametric.kernel_density import KDEMultivariateConditional
|
||||||
|
from time import time
|
||||||
|
from sklearn.metrics.pairwise import rbf_kernel
|
||||||
|
|
||||||
|
|
||||||
|
def gram_matrix_mix(bandwidth, X, Y=None):
|
||||||
|
# this adapts the output of the rbf_kernel function (pairwise evaluations of Gaussian kernels k(x,y))
|
||||||
|
# to contain pairwise evaluations of N(x|mu,Sigma1+Sigma2) with mu=y and Sigma1 and Sigma2 are
|
||||||
|
# two "scalar matrices" (h^2) I each, so Sigma1+Sigma2 has scalar 2(h^2) (h is the bandwidth)
|
||||||
|
variance = 2 * (bandwidth**2)
|
||||||
|
nD = X.shape[1]
|
||||||
|
gamma = 1/(2*variance)
|
||||||
|
gram = rbf_kernel(X, Y, gamma=gamma)
|
||||||
|
norm_factor = 1/np.sqrt(((2*np.pi)**nD) * (variance**(nD)))
|
||||||
|
gram *= norm_factor
|
||||||
|
print('GRAM SUM:', gram.sum())
|
||||||
|
return gram
|
||||||
|
|
||||||
|
def weighted_prod(pi, tau, G):
|
||||||
|
return pi[:,np.newaxis] * G * tau
|
||||||
|
|
||||||
|
def tril_weighted_prod(pi, G):
|
||||||
|
M = weighted_prod(pi, pi, G)
|
||||||
|
return np.triu(M,1)
|
||||||
|
|
||||||
|
|
||||||
|
class KDEyclosed(AggregativeProbabilisticQuantifier):
|
||||||
|
|
||||||
|
def __init__(self, classifier: BaseEstimator, val_split=0.4, bandwidth=0.1, n_jobs=None, random_state=0):
|
||||||
|
self.classifier = classifier
|
||||||
|
self.val_split = val_split
|
||||||
|
self.bandwidth = bandwidth
|
||||||
|
self.n_jobs = n_jobs
|
||||||
|
self.random_state=random_state
|
||||||
|
|
||||||
|
def fit(self, data: LabelledCollection, fit_classifier=True, val_split: Union[float, LabelledCollection] = None):
|
||||||
|
"""
|
||||||
|
|
||||||
|
:param data: the training set
|
||||||
|
:param fit_classifier: set to False to bypass the training (the learner is assumed to be already fit)
|
||||||
|
:param val_split: either a float in (0,1) indicating the proportion of training instances to use for
|
||||||
|
validation (e.g., 0.3 for using 30% of the training set as validation data), or a LabelledCollection
|
||||||
|
indicating the validation set itself, or an int indicating the number k of folds to be used in kFCV
|
||||||
|
to estimate the parameters
|
||||||
|
"""
|
||||||
|
# print('[fit] enter')
|
||||||
|
if val_split is None:
|
||||||
|
val_split = self.val_split
|
||||||
|
|
||||||
|
self.classifier, y, posteriors, classes, class_count = cross_generate_predictions(
|
||||||
|
data, self.classifier, val_split, probabilistic=True, fit_classifier=fit_classifier, n_jobs=self.n_jobs
|
||||||
|
)
|
||||||
|
|
||||||
|
# from distribution_matching.binary_debug import HACK
|
||||||
|
# posteriors, y = HACK(posteriors, y)
|
||||||
|
|
||||||
|
# print('[fit] precomputing stuff')
|
||||||
|
|
||||||
|
n = data.n_classes
|
||||||
|
#L = [posteriors[y==i] for i in range(n)]
|
||||||
|
#l = [len(Li) for Li in L]
|
||||||
|
|
||||||
|
D = n
|
||||||
|
h = self.bandwidth
|
||||||
|
#cov_mix_scalar = 2 * h * h # corresponds to a bandwidth of sqrt(2)*h
|
||||||
|
#Kernel = multivariate_normal(mean=np.zeros(D), cov=cov_mix_scalar)
|
||||||
|
|
||||||
|
# print('[fit] classifier ready; precomputing gram')
|
||||||
|
self.gram_tr_tr = gram_matrix_mix(h, posteriors)
|
||||||
|
|
||||||
|
# li_inv keeps track of the relative weight of each datapoint within its class
|
||||||
|
# (i.e., the weight in its KDE model)
|
||||||
|
counts_inv = 1/(data.counts())
|
||||||
|
self.li_inv = counts_inv[y]
|
||||||
|
|
||||||
|
# Khash = {}
|
||||||
|
# for a in range(n):
|
||||||
|
# for b in range(l[a]):
|
||||||
|
# for i in range(n):
|
||||||
|
# Khash[(a,b,i)] = sum(Kernel.pdf(L[i][j] - L[a][b]) for j in range(l[i]))
|
||||||
|
# for j in range(l[i]): # this for, and index j, can be supressed and store the sum across j
|
||||||
|
# Khash[(a, b, i, j)] = Kernel.pdf(L[i][j] - L[a][b])
|
||||||
|
|
||||||
|
self.n = n
|
||||||
|
#self.L = L
|
||||||
|
#self.l = l
|
||||||
|
#self.Kernel = Kernel
|
||||||
|
#self.Khash = Khash
|
||||||
|
self.C = ((2 * np.pi) ** (-D / 2)) * h ** (-D)
|
||||||
|
print('C:', self.C)
|
||||||
|
self.Ptr = posteriors
|
||||||
|
self.ytr = y
|
||||||
|
|
||||||
|
assert all(sorted(np.unique(y)) == np.arange(data.n_classes)), 'label name gaps not allowed in current implementation'
|
||||||
|
|
||||||
|
# print('[fit] exit')
|
||||||
|
|
||||||
|
return self
|
||||||
|
|
||||||
|
|
||||||
|
def aggregate(self, posteriors: np.ndarray):
|
||||||
|
|
||||||
|
# print('[aggregate] enter')
|
||||||
|
|
||||||
|
Ptr = self.Ptr
|
||||||
|
Pte = posteriors
|
||||||
|
|
||||||
|
gram_te_te = gram_matrix_mix(self.bandwidth, Pte, Pte)
|
||||||
|
gram_tr_te = gram_matrix_mix(self.bandwidth, Ptr, Pte)
|
||||||
|
|
||||||
|
K = Pte.shape[0]
|
||||||
|
tau = np.full(shape=K, fill_value=1/K, dtype=float)
|
||||||
|
|
||||||
|
h = self.bandwidth
|
||||||
|
D = Ptr.shape[1]
|
||||||
|
C = self.C
|
||||||
|
|
||||||
|
partC = 0.5 * np.log( C/K + 2 * tril_weighted_prod(tau, gram_te_te).sum())
|
||||||
|
|
||||||
|
def match(alpha):
|
||||||
|
|
||||||
|
pi = alpha[self.ytr] * self.li_inv
|
||||||
|
|
||||||
|
partA = -np.log(weighted_prod(pi, tau, gram_tr_te).sum())
|
||||||
|
# print('gram_Tr_Tr sum', self.gram_tr_tr.sum())
|
||||||
|
# print('pretril', (np.triu(self.gram_tr_tr,1).sum()))
|
||||||
|
# print('tril', (2 * tril_weighted_prod(pi, self.gram_tr_tr).sum()))
|
||||||
|
# print('pi', pi.sum(), pi[:10])
|
||||||
|
# print('Cs', C*(pi**2).sum())
|
||||||
|
partB = 0.5 * np.log(C*(pi**2).sum() + 2*tril_weighted_prod(pi, self.gram_tr_tr).sum())
|
||||||
|
|
||||||
|
Dcs = partA + partB + partC
|
||||||
|
|
||||||
|
# print(f'{alpha=}\t{partA=}\t{partB=}\t{partC}')
|
||||||
|
# print()
|
||||||
|
|
||||||
|
return Dcs
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
# print('[aggregate] starts search')
|
||||||
|
|
||||||
|
# the initial point is set as the uniform distribution
|
||||||
|
uniform_distribution = np.full(fill_value=1 / self.n, shape=(self.n,))
|
||||||
|
# uniform_distribution = [0.2, 0.8]
|
||||||
|
|
||||||
|
# solutions are bounded to those contained in the unit-simplex
|
||||||
|
bounds = tuple((0, 1) for _ in range(self.n)) # values in [0,1]
|
||||||
|
constraints = ({'type': 'eq', 'fun': lambda x: 1 - sum(x)}) # values summing up to 1
|
||||||
|
r = optimize.minimize(match, x0=uniform_distribution, method='SLSQP', bounds=bounds, constraints=constraints)
|
||||||
|
# print('[aggregate] end')
|
||||||
|
return r.x
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1,145 @@
|
||||||
|
from cgi import test
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
from typing import Union, Callable
|
||||||
|
import numpy as np
|
||||||
|
from sklearn.base import BaseEstimator
|
||||||
|
from sklearn.linear_model import LogisticRegression
|
||||||
|
import pandas as pd
|
||||||
|
from sklearn.model_selection import GridSearchCV
|
||||||
|
from sklearn.neighbors import KernelDensity
|
||||||
|
from scipy.stats import multivariate_normal
|
||||||
|
import quapy as qp
|
||||||
|
from quapy.data import LabelledCollection
|
||||||
|
from quapy.protocol import APP, UPP
|
||||||
|
from quapy.method.aggregative import AggregativeProbabilisticQuantifier, _training_helper, cross_generate_predictions, \
|
||||||
|
DistributionMatching, _get_divergence
|
||||||
|
import scipy
|
||||||
|
from scipy import optimize
|
||||||
|
from statsmodels.nonparametric.kernel_density import KDEMultivariateConditional
|
||||||
|
from time import time
|
||||||
|
from sklearn.metrics.pairwise import rbf_kernel
|
||||||
|
|
||||||
|
|
||||||
|
def gram_matrix_mix_sum(bandwidth, X, Y=None, reduce=True):
|
||||||
|
# this adapts the output of the rbf_kernel function (pairwise evaluations of Gaussian kernels k(x,y))
|
||||||
|
# to contain pairwise evaluations of N(x|mu,Sigma1+Sigma2) with mu=y and Sigma1 and Sigma2 are
|
||||||
|
# two "scalar matrices" (h^2) I each, so Sigma1+Sigma2 has scalar 2(h^2) (h is the bandwidth)
|
||||||
|
variance = 2 * (bandwidth**2)
|
||||||
|
nRows,nD = X.shape
|
||||||
|
gamma = 1/(2*variance)
|
||||||
|
gram = rbf_kernel(X, Y, gamma=gamma)
|
||||||
|
|
||||||
|
norm_factor = 1/np.sqrt(((2*np.pi)**nD) * (variance**(nD)))
|
||||||
|
gram *= norm_factor
|
||||||
|
if Y is None:
|
||||||
|
# ignores the diagonal
|
||||||
|
aggr = (2 * np.triu(gram, 1)).sum()
|
||||||
|
else:
|
||||||
|
aggr = gram.sum()
|
||||||
|
return aggr
|
||||||
|
|
||||||
|
|
||||||
|
class KDEyclosed_efficient(AggregativeProbabilisticQuantifier):
|
||||||
|
|
||||||
|
def __init__(self, classifier: BaseEstimator, val_split=0.4, bandwidth=0.1, n_jobs=None, random_state=0):
|
||||||
|
self.classifier = classifier
|
||||||
|
self.val_split = val_split
|
||||||
|
self.bandwidth = bandwidth
|
||||||
|
self.n_jobs = n_jobs
|
||||||
|
self.random_state=random_state
|
||||||
|
|
||||||
|
def fit(self, data: LabelledCollection, fit_classifier=True, val_split: Union[float, LabelledCollection] = None):
|
||||||
|
"""
|
||||||
|
|
||||||
|
:param data: the training set
|
||||||
|
:param fit_classifier: set to False to bypass the training (the learner is assumed to be already fit)
|
||||||
|
:param val_split: either a float in (0,1) indicating the proportion of training instances to use for
|
||||||
|
validation (e.g., 0.3 for using 30% of the training set as validation data), or a LabelledCollection
|
||||||
|
indicating the validation set itself, or an int indicating the number k of folds to be used in kFCV
|
||||||
|
to estimate the parameters
|
||||||
|
"""
|
||||||
|
|
||||||
|
# print('[fit] enter')
|
||||||
|
if val_split is None:
|
||||||
|
val_split = self.val_split
|
||||||
|
|
||||||
|
self.classifier, y, posteriors, classes, class_count = cross_generate_predictions(
|
||||||
|
data, self.classifier, val_split, probabilistic=True, fit_classifier=fit_classifier, n_jobs=self.n_jobs
|
||||||
|
)
|
||||||
|
|
||||||
|
assert all(sorted(np.unique(y)) == np.arange(data.n_classes)), \
|
||||||
|
'label name gaps not allowed in current implementation'
|
||||||
|
|
||||||
|
n = data.n_classes
|
||||||
|
h = self.bandwidth
|
||||||
|
|
||||||
|
P = posteriors
|
||||||
|
counts_inv = 1 / (data.counts())
|
||||||
|
|
||||||
|
nD = P.shape[1]
|
||||||
|
C = ((2 * np.pi) ** (-nD / 2)) * (self.bandwidth ** (-nD))
|
||||||
|
tr_tr_sums = np.zeros(shape=(n,n), dtype=float)
|
||||||
|
self.tr_C = []
|
||||||
|
for i in range(n):
|
||||||
|
for j in range(n):
|
||||||
|
if i > j:
|
||||||
|
tr_tr_sums[i,j] = tr_tr_sums[j,i]
|
||||||
|
else:
|
||||||
|
if i == j:
|
||||||
|
tr_tr_sums[i, j] = gram_matrix_mix_sum(h, P[y == i])
|
||||||
|
self.tr_C.append(C * sum(y == i))
|
||||||
|
else:
|
||||||
|
block = gram_matrix_mix_sum(h, P[y == i], P[y == j])
|
||||||
|
tr_tr_sums[i, j] = block
|
||||||
|
self.tr_C = np.asarray(self.tr_C)
|
||||||
|
self.Ptr = posteriors
|
||||||
|
self.ytr = y
|
||||||
|
self.tr_tr_sums = tr_tr_sums
|
||||||
|
self.counts_inv = counts_inv
|
||||||
|
|
||||||
|
return self
|
||||||
|
|
||||||
|
|
||||||
|
def aggregate(self, posteriors: np.ndarray):
|
||||||
|
|
||||||
|
# print('[aggregate] enter')
|
||||||
|
|
||||||
|
Ptr = self.Ptr
|
||||||
|
Pte = posteriors
|
||||||
|
|
||||||
|
K,nD = Pte.shape
|
||||||
|
Kinv = (1/K)
|
||||||
|
h = self.bandwidth
|
||||||
|
n = Ptr.shape[1]
|
||||||
|
y = self.ytr
|
||||||
|
tr_tr_sums = self.tr_tr_sums
|
||||||
|
|
||||||
|
C = ((2 * np.pi) ** (-nD / 2)) * (self.bandwidth ** (-nD))
|
||||||
|
partC = 0.5*np.log(gram_matrix_mix_sum(h, Pte) * Kinv * Kinv + C*Kinv)
|
||||||
|
|
||||||
|
|
||||||
|
tr_te_sums = np.zeros(shape=n, dtype=float)
|
||||||
|
for i in range(n):
|
||||||
|
tr_te_sums[i] = gram_matrix_mix_sum(h, Ptr[y==i], Pte) * self.counts_inv[i] * Kinv
|
||||||
|
|
||||||
|
def match(alpha):
|
||||||
|
partA = -np.log((alpha * tr_te_sums).sum())
|
||||||
|
alpha_l = alpha * self.counts_inv
|
||||||
|
partB = 0.5 * np.log((alpha_l[:,np.newaxis] * tr_tr_sums * alpha_l).sum() + (self.tr_C*(alpha_l**2)).sum())
|
||||||
|
return partA + partB + partC
|
||||||
|
|
||||||
|
# print('[aggregate] starts search')
|
||||||
|
|
||||||
|
# the initial point is set as the uniform distribution
|
||||||
|
uniform_distribution = np.full(fill_value=1 / n, shape=(n,))
|
||||||
|
# uniform_distribution = [0.2, 0.8]
|
||||||
|
|
||||||
|
# solutions are bounded to those contained in the unit-simplex
|
||||||
|
bounds = tuple((0, 1) for _ in range(n)) # values in [0,1]
|
||||||
|
constraints = ({'type': 'eq', 'fun': lambda x: 1 - sum(x)}) # values summing up to 1
|
||||||
|
r = optimize.minimize(match, x0=uniform_distribution, method='SLSQP', bounds=bounds, constraints=constraints)
|
||||||
|
# print('[aggregate] end')
|
||||||
|
return r.x
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1,133 @@
|
||||||
|
from cgi import test
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
from typing import Union, Callable
|
||||||
|
import numpy as np
|
||||||
|
from sklearn.base import BaseEstimator
|
||||||
|
from sklearn.linear_model import LogisticRegression
|
||||||
|
import pandas as pd
|
||||||
|
from sklearn.model_selection import GridSearchCV
|
||||||
|
from sklearn.neighbors import KernelDensity
|
||||||
|
from scipy.stats import multivariate_normal
|
||||||
|
import quapy as qp
|
||||||
|
from quapy.data import LabelledCollection
|
||||||
|
from quapy.protocol import APP, UPP
|
||||||
|
from quapy.method.aggregative import AggregativeProbabilisticQuantifier, _training_helper, cross_generate_predictions, \
|
||||||
|
DistributionMatching, _get_divergence
|
||||||
|
import scipy
|
||||||
|
from scipy import optimize
|
||||||
|
from statsmodels.nonparametric.kernel_density import KDEMultivariateConditional
|
||||||
|
from time import time
|
||||||
|
from sklearn.metrics.pairwise import rbf_kernel
|
||||||
|
|
||||||
|
|
||||||
|
def gram_matrix_mix_sum(bandwidth, X, Y=None, reduce=True):
|
||||||
|
# this adapts the output of the rbf_kernel function (pairwise evaluations of Gaussian kernels k(x,y))
|
||||||
|
# to contain pairwise evaluations of N(x|mu,Sigma1+Sigma2) with mu=y and Sigma1 and Sigma2 are
|
||||||
|
# two "scalar matrices" (h^2) I each, so Sigma1+Sigma2 has scalar 2(h^2) (h is the bandwidth)
|
||||||
|
variance = 2 * (bandwidth**2)
|
||||||
|
nRows,nD = X.shape
|
||||||
|
gamma = 1/(2*variance)
|
||||||
|
norm_factor = 1/np.sqrt(((2*np.pi)**nD) * (variance**(nD)))
|
||||||
|
gram = norm_factor * rbf_kernel(X, Y, gamma=gamma)
|
||||||
|
return gram.sum()
|
||||||
|
|
||||||
|
|
||||||
|
class KDEyclosed_efficient_corr(AggregativeProbabilisticQuantifier):
|
||||||
|
|
||||||
|
def __init__(self, classifier: BaseEstimator, val_split=0.4, bandwidth=0.1, n_jobs=None, random_state=0):
|
||||||
|
self.classifier = classifier
|
||||||
|
self.val_split = val_split
|
||||||
|
self.bandwidth = bandwidth
|
||||||
|
self.n_jobs = n_jobs
|
||||||
|
self.random_state=random_state
|
||||||
|
|
||||||
|
def fit(self, data: LabelledCollection, fit_classifier=True, val_split: Union[float, LabelledCollection] = None):
|
||||||
|
"""
|
||||||
|
|
||||||
|
:param data: the training set
|
||||||
|
:param fit_classifier: set to False to bypass the training (the learner is assumed to be already fit)
|
||||||
|
:param val_split: either a float in (0,1) indicating the proportion of training instances to use for
|
||||||
|
validation (e.g., 0.3 for using 30% of the training set as validation data), or a LabelledCollection
|
||||||
|
indicating the validation set itself, or an int indicating the number k of folds to be used in kFCV
|
||||||
|
to estimate the parameters
|
||||||
|
"""
|
||||||
|
|
||||||
|
# print('[fit] enter')
|
||||||
|
if val_split is None:
|
||||||
|
val_split = self.val_split
|
||||||
|
|
||||||
|
self.classifier, y, posteriors, classes, class_count = cross_generate_predictions(
|
||||||
|
data, self.classifier, val_split, probabilistic=True, fit_classifier=fit_classifier, n_jobs=self.n_jobs
|
||||||
|
)
|
||||||
|
|
||||||
|
assert all(sorted(np.unique(y)) == np.arange(data.n_classes)), \
|
||||||
|
'label name gaps not allowed in current implementation'
|
||||||
|
|
||||||
|
# print('[fit] precomputing stuff')
|
||||||
|
|
||||||
|
# from distribution_matching.binary_debug import HACK
|
||||||
|
# posteriors, y = HACK(posteriors, y)
|
||||||
|
|
||||||
|
n = data.n_classes
|
||||||
|
h = self.bandwidth
|
||||||
|
|
||||||
|
# print('[fit] classifier ready; precomputing gram')
|
||||||
|
P = posteriors
|
||||||
|
|
||||||
|
# li_inv keeps track of the relative weight of each datapoint within its class
|
||||||
|
# (i.e., the weight in its KDE model)
|
||||||
|
counts_inv = 1 / (data.counts())
|
||||||
|
|
||||||
|
tr_tr_sums = np.zeros(shape=(n,n), dtype=float)
|
||||||
|
for i in range(n):
|
||||||
|
for j in range(n):
|
||||||
|
if i > j:
|
||||||
|
tr_tr_sums[i,j] = tr_tr_sums[j,i]
|
||||||
|
else:
|
||||||
|
block = gram_matrix_mix_sum(h, P[y == i], P[y == j] if i!=j else None)
|
||||||
|
tr_tr_sums[i, j] = block
|
||||||
|
|
||||||
|
# compute class-class block-sums
|
||||||
|
self.Ptr = posteriors
|
||||||
|
self.ytr = y
|
||||||
|
self.tr_tr_sums = tr_tr_sums
|
||||||
|
self.counts_inv = counts_inv
|
||||||
|
|
||||||
|
return self
|
||||||
|
|
||||||
|
|
||||||
|
def aggregate(self, posteriors: np.ndarray):
|
||||||
|
|
||||||
|
Ptr = self.Ptr
|
||||||
|
Pte = posteriors
|
||||||
|
|
||||||
|
K,nD = Pte.shape
|
||||||
|
Kinv = (1/K)
|
||||||
|
h = self.bandwidth
|
||||||
|
n = Ptr.shape[1]
|
||||||
|
y = self.ytr
|
||||||
|
tr_tr_sums = self.tr_tr_sums
|
||||||
|
|
||||||
|
partC = 0.5*np.log(gram_matrix_mix_sum(h, Pte) * Kinv * Kinv)
|
||||||
|
|
||||||
|
tr_te_sums = np.zeros(shape=n, dtype=float)
|
||||||
|
for i in range(n):
|
||||||
|
tr_te_sums[i] = gram_matrix_mix_sum(h, Ptr[y==i], Pte) * self.counts_inv[i] * Kinv
|
||||||
|
|
||||||
|
def match(alpha):
|
||||||
|
partA = -np.log((alpha * tr_te_sums).sum())
|
||||||
|
alpha_l = alpha * self.counts_inv
|
||||||
|
partB = 0.5 * np.log((alpha_l[:,np.newaxis] * tr_tr_sums * alpha_l).sum())
|
||||||
|
return partA + partB + partC
|
||||||
|
|
||||||
|
# the initial point is set as the uniform distribution
|
||||||
|
uniform_distribution = np.full(fill_value=1 / n, shape=(n,))
|
||||||
|
|
||||||
|
# solutions are bounded to those contained in the unit-simplex
|
||||||
|
bounds = tuple((0, 1) for _ in range(n)) # values in [0,1]
|
||||||
|
constraints = ({'type': 'eq', 'fun': lambda x: 1 - sum(x)}) # values summing up to 1
|
||||||
|
r = optimize.minimize(match, x0=uniform_distribution, method='SLSQP', bounds=bounds, constraints=constraints)
|
||||||
|
return r.x
|
||||||
|
|
||||||
|
|
|
@ -36,7 +36,7 @@ if __name__ == '__main__':
|
||||||
# this variable controls that the mod sel has already been done, and skip this otherwise
|
# this variable controls that the mod sel has already been done, and skip this otherwise
|
||||||
semeval_trained = False
|
semeval_trained = False
|
||||||
|
|
||||||
for dataset in qp.datasets.TWITTER_SENTIMENT_DATASETS_TEST:
|
for dataset in qp.datasets.TWITTER_SENTIMENT_DATASETS_TEST[::-1]:
|
||||||
print('init', dataset)
|
print('init', dataset)
|
||||||
|
|
||||||
local_result_path = global_result_path + '_' + dataset
|
local_result_path = global_result_path + '_' + dataset
|
||||||
|
|
|
@ -1,3 +1,17 @@
|
||||||
|
Change Log 0.1.8
|
||||||
|
----------------
|
||||||
|
|
||||||
|
- A conceptual error in MedianSweep and MedianSweep2 has been solved. The error consisted on computing all TPRs and
|
||||||
|
FPRs and report the median; then, the adjustment then operated on these single values. Instead, the original
|
||||||
|
method proposed by G.Forman comes down to generating all prevalence predictions, for all TPRs and FPRs, and then
|
||||||
|
computing the median of it.
|
||||||
|
|
||||||
|
- qp.evaluation now runs in parallel <improve, remove or fix the ongoing error, put at the qp. level instead of
|
||||||
|
qp.evaluation because I don't like the qp.evaluation.evaluate thing>
|
||||||
|
|
||||||
|
- <fix> remove dependencies with LabelledCollection in the library.
|
||||||
|
|
||||||
|
|
||||||
Change Log 0.1.7
|
Change Log 0.1.7
|
||||||
----------------
|
----------------
|
||||||
|
|
||||||
|
|
|
@ -176,7 +176,7 @@ class NeuralClassifierTrainer:
|
||||||
self.classes_ = train.classes_
|
self.classes_ = train.classes_
|
||||||
opt = self.trainer_hyperparams
|
opt = self.trainer_hyperparams
|
||||||
checkpoint = self.checkpointpath
|
checkpoint = self.checkpointpath
|
||||||
self.reset_net_params(self.vocab_size, train.n_classes)
|
self.reset_net_params(self.vocab_size, train.arange_classes)
|
||||||
|
|
||||||
train_generator = TorchDataset(train.instances, train.labels).asDataloader(
|
train_generator = TorchDataset(train.instances, train.labels).asDataloader(
|
||||||
opt['batch_size'], shuffle=True, pad_length=opt['padding_length'], device=opt['device'])
|
opt['batch_size'], shuffle=True, pad_length=opt['padding_length'], device=opt['device'])
|
||||||
|
|
|
@ -269,6 +269,7 @@ class LabelledCollection:
|
||||||
test = self.sampling_from_index(right)
|
test = self.sampling_from_index(right)
|
||||||
return training, test
|
return training, test
|
||||||
|
|
||||||
|
|
||||||
def __add__(self, other):
|
def __add__(self, other):
|
||||||
"""
|
"""
|
||||||
Returns a new :class:`LabelledCollection` as the union of this collection with another collection.
|
Returns a new :class:`LabelledCollection` as the union of this collection with another collection.
|
||||||
|
|
|
@ -1,3 +1,4 @@
|
||||||
|
from copy import deepcopy
|
||||||
from typing import Union, Callable, Iterable
|
from typing import Union, Callable, Iterable
|
||||||
import numpy as np
|
import numpy as np
|
||||||
from tqdm import tqdm
|
from tqdm import tqdm
|
||||||
|
@ -12,7 +13,8 @@ def prediction(
|
||||||
protocol: AbstractProtocol,
|
protocol: AbstractProtocol,
|
||||||
aggr_speedup: Union[str, bool] = 'auto',
|
aggr_speedup: Union[str, bool] = 'auto',
|
||||||
verbose=False,
|
verbose=False,
|
||||||
verbose_error=None):
|
verbose_error=None,
|
||||||
|
n_jobs=1):
|
||||||
"""
|
"""
|
||||||
Uses a quantification model to generate predictions for the samples generated via a specific protocol.
|
Uses a quantification model to generate predictions for the samples generated via a specific protocol.
|
||||||
This function is central to all evaluation processes, and is endowed with an optimization to speed-up the
|
This function is central to all evaluation processes, and is endowed with an optimization to speed-up the
|
||||||
|
@ -34,6 +36,10 @@ def prediction(
|
||||||
convenient or not. Set to False to deactivate.
|
convenient or not. Set to False to deactivate.
|
||||||
:param verbose: boolean, show or not information in stdout
|
:param verbose: boolean, show or not information in stdout
|
||||||
:param verbose_error: an evaluation function to be used to display intermediate results if verbose=True (default None)
|
:param verbose_error: an evaluation function to be used to display intermediate results if verbose=True (default None)
|
||||||
|
:param n_jobs: number of parallel workers. Default is 1 so that, if not explicitly requested, the evaluation phase
|
||||||
|
is to be carried out in a single core. That is to say, this parameter will not inspect the environment variable
|
||||||
|
N_JOBS by default. This might be convenient in many situations, since parallelizing the evaluation entails
|
||||||
|
adding an overhead for cloning the objects within different threads that is often not worth the effort.
|
||||||
:return: a tuple `(true_prevs, estim_prevs)` in which each element in the tuple is an array of shape
|
:return: a tuple `(true_prevs, estim_prevs)` in which each element in the tuple is an array of shape
|
||||||
`(n_samples, n_classes)` containing the true, or predicted, prevalence values for each sample
|
`(n_samples, n_classes)` containing the true, or predicted, prevalence values for each sample
|
||||||
"""
|
"""
|
||||||
|
@ -63,21 +69,44 @@ def prediction(
|
||||||
if apply_optimization:
|
if apply_optimization:
|
||||||
pre_classified = model.classify(protocol.get_labelled_collection().instances)
|
pre_classified = model.classify(protocol.get_labelled_collection().instances)
|
||||||
protocol_with_predictions = protocol.on_preclassified_instances(pre_classified)
|
protocol_with_predictions = protocol.on_preclassified_instances(pre_classified)
|
||||||
return __prediction_helper(model.aggregate, protocol_with_predictions, verbose, verbose_error)
|
return __prediction_helper(model, protocol_with_predictions, True, verbose, verbose_error, n_jobs)
|
||||||
else:
|
else:
|
||||||
return __prediction_helper(model.quantify, protocol, verbose, verbose_error)
|
return __prediction_helper(model, protocol, False, verbose, verbose_error, n_jobs)
|
||||||
|
|
||||||
|
|
||||||
def __prediction_helper(quantification_fn, protocol: AbstractProtocol, verbose=False, verbose_error=None):
|
def __delayed_prediction(args):
|
||||||
|
quantifier, aggregate, sample_instances, sample_prev = args
|
||||||
|
quantifier = deepcopy(quantifier)
|
||||||
|
quant_fn = quantifier.aggregate if aggregate else quantifier.quantify
|
||||||
|
predicted = quant_fn(sample_instances)
|
||||||
|
return sample_prev, predicted
|
||||||
|
|
||||||
|
|
||||||
|
def __prediction_helper(quantifier, protocol: AbstractProtocol, aggregate: bool, verbose=False, verbose_error=None, n_jobs=1):
|
||||||
true_prevs, estim_prevs = [], []
|
true_prevs, estim_prevs = [], []
|
||||||
|
ongoing_errors = []
|
||||||
if verbose:
|
if verbose:
|
||||||
pbar = tqdm(protocol(), total=protocol.total(), desc='predicting')
|
pbar = tqdm(protocol(), total=protocol.total(), desc='predicting')
|
||||||
for sample_instances, sample_prev in pbar if verbose else protocol():
|
if n_jobs==1:
|
||||||
estim_prevs.append(quantification_fn(sample_instances))
|
quant_fn = quantifier.aggregate if aggregate else quantifier.quantify
|
||||||
true_prevs.append(sample_prev)
|
for sample_instances, sample_prev in pbar if verbose else protocol():
|
||||||
if verbose and verbose_error is not None:
|
predicted = quant_fn(sample_instances)
|
||||||
err = verbose_error(true_prevs, estim_prevs)
|
estim_prevs.append(predicted)
|
||||||
pbar.set_description('predicting: ongoing error={err:.5f}')
|
true_prevs.append(sample_prev)
|
||||||
|
if verbose and verbose_error is not None:
|
||||||
|
err = verbose_error(sample_prev, predicted)
|
||||||
|
ongoing_errors.append(err)
|
||||||
|
pbar.set_description(f'predicting: ongoing error={np.mean(ongoing_errors):.5f}')
|
||||||
|
else:
|
||||||
|
if verbose:
|
||||||
|
print('parallelizing prediction')
|
||||||
|
outputs = qp.util.parallel(
|
||||||
|
__delayed_prediction,
|
||||||
|
((quantifier, aggregate, sample_X, sample_p) for (sample_X, sample_p) in (pbar if verbose else protocol())),
|
||||||
|
seed=qp.environ.get('_R_SEED', None),
|
||||||
|
n_jobs=n_jobs
|
||||||
|
)
|
||||||
|
true_prevs, estim_prevs = list(zip(*outputs))
|
||||||
|
|
||||||
true_prevs = np.asarray(true_prevs)
|
true_prevs = np.asarray(true_prevs)
|
||||||
estim_prevs = np.asarray(estim_prevs)
|
estim_prevs = np.asarray(estim_prevs)
|
||||||
|
@ -89,7 +118,7 @@ def evaluation_report(model: BaseQuantifier,
|
||||||
protocol: AbstractProtocol,
|
protocol: AbstractProtocol,
|
||||||
error_metrics: Iterable[Union[str,Callable]] = 'mae',
|
error_metrics: Iterable[Union[str,Callable]] = 'mae',
|
||||||
aggr_speedup: Union[str, bool] = 'auto',
|
aggr_speedup: Union[str, bool] = 'auto',
|
||||||
verbose=False):
|
verbose=False, verbose_error=None, n_jobs=1):
|
||||||
"""
|
"""
|
||||||
Generates a report (a pandas' DataFrame) containing information of the evaluation of the model as according
|
Generates a report (a pandas' DataFrame) containing information of the evaluation of the model as according
|
||||||
to a specific protocol and in terms of one or more evaluation metrics (errors).
|
to a specific protocol and in terms of one or more evaluation metrics (errors).
|
||||||
|
@ -107,12 +136,19 @@ def evaluation_report(model: BaseQuantifier,
|
||||||
in the samples to be generated. Set to True or "auto" (default) for letting QuaPy decide whether it is
|
in the samples to be generated. Set to True or "auto" (default) for letting QuaPy decide whether it is
|
||||||
convenient or not. Set to False to deactivate.
|
convenient or not. Set to False to deactivate.
|
||||||
:param verbose: boolean, show or not information in stdout
|
:param verbose: boolean, show or not information in stdout
|
||||||
|
:param verbose_error: an evaluation function to be used to display intermediate results if verbose=True (default None)
|
||||||
|
:param n_jobs: number of parallel workers. Default is 1 so that, if not explicitly requested, the evaluation phase
|
||||||
|
is to be carried out in a single core. That is to say, this parameter will not inspect the environment variable
|
||||||
|
N_JOBS by default. This might be convenient in many situations, since parallelizing the evaluation entails
|
||||||
|
adding an overhead for cloning the objects within different threads that is often not worth the effort.
|
||||||
:return: a pandas' DataFrame containing the columns 'true-prev' (the true prevalence of each sample),
|
:return: a pandas' DataFrame containing the columns 'true-prev' (the true prevalence of each sample),
|
||||||
'estim-prev' (the prevalence estimated by the model for each sample), and as many columns as error metrics
|
'estim-prev' (the prevalence estimated by the model for each sample), and as many columns as error metrics
|
||||||
have been indicated, each displaying the score in terms of that metric for every sample.
|
have been indicated, each displaying the score in terms of that metric for every sample.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
true_prevs, estim_prevs = prediction(model, protocol, aggr_speedup=aggr_speedup, verbose=verbose)
|
true_prevs, estim_prevs = prediction(
|
||||||
|
model, protocol, aggr_speedup=aggr_speedup, verbose=verbose, verbose_error=verbose_error, n_jobs=n_jobs
|
||||||
|
)
|
||||||
return _prevalence_report(true_prevs, estim_prevs, error_metrics)
|
return _prevalence_report(true_prevs, estim_prevs, error_metrics)
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -32,7 +32,7 @@ class QuaNetTrainer(BaseQuantifier):
|
||||||
>>> qp.domains.preprocessing.index(dataset, min_df=5, inplace=True)
|
>>> qp.domains.preprocessing.index(dataset, min_df=5, inplace=True)
|
||||||
>>>
|
>>>
|
||||||
>>> # the text classifier is a CNN trained by NeuralClassifierTrainer
|
>>> # the text classifier is a CNN trained by NeuralClassifierTrainer
|
||||||
>>> cnn = CNNnet(dataset.vocabulary_size, dataset.n_classes)
|
>>> cnn = CNNnet(dataset.vocabulary_size, dataset.arange_classes)
|
||||||
>>> classifier = NeuralClassifierTrainer(cnn, device='cuda')
|
>>> classifier = NeuralClassifierTrainer(cnn, device='cuda')
|
||||||
>>>
|
>>>
|
||||||
>>> # train QuaNet (QuaNet is an alias to QuaNetTrainer)
|
>>> # train QuaNet (QuaNet is an alias to QuaNetTrainer)
|
||||||
|
|
|
@ -54,6 +54,7 @@ class GridSearchQ(BaseQuantifier):
|
||||||
self.__check_error(error)
|
self.__check_error(error)
|
||||||
assert isinstance(protocol, AbstractProtocol), 'unknown protocol'
|
assert isinstance(protocol, AbstractProtocol), 'unknown protocol'
|
||||||
|
|
||||||
|
|
||||||
def _sout(self, msg):
|
def _sout(self, msg):
|
||||||
if self.verbose:
|
if self.verbose:
|
||||||
print(f'[{self.__class__.__name__}:{self.model.__class__.__name__}]: {msg}')
|
print(f'[{self.__class__.__name__}:{self.model.__class__.__name__}]: {msg}')
|
||||||
|
|
Loading…
Reference in New Issue