Trying PCA reduction for CIFAR-100

Alejandro Moreo Fernandez 2026-01-31 00:11:19 +01:00
parent 877bfb2b18
commit 81472b9d25
9 changed files with 199 additions and 156 deletions

View File

@@ -1,5 +1,6 @@
from sklearn.base import BaseEstimator
import numpy as np
from sklearn.decomposition import PCA
from BayesianKDEy.commons import ILRtransformation, in_simplex
from quapy.method._kdey import KDEBase
@@ -60,6 +61,7 @@ class BayesianKDEy(AggregativeSoftQuantifier, KDEBase, WithConfidenceABC):
                 temperature=1.,
                 engine='numpyro',
                 prior='uniform',
                 reduce=None,
                 verbose: bool = False,
                 **kwargs):
@@ -91,13 +93,22 @@ class BayesianKDEy(AggregativeSoftQuantifier, KDEBase, WithConfidenceABC):
        self.temperature = temperature
        self.engine = engine
        self.prior = prior
        self.reduce = reduce
        self.verbose = verbose

    def aggregation_fit(self, classif_predictions, labels):
        if self.reduce is not None:
            # project the posteriors onto a lower-dimensional space before fitting the per-class KDEs
            self.pca = PCA(n_components=self.reduce)
            classif_predictions = self.pca.fit_transform(classif_predictions)
            #print(f'reduce to ', classif_predictions.shape)
        self.mix_densities = self.get_mixture_components(classif_predictions, labels, self.classes_, self.bandwidth, self.kernel)
        #print('num mix ', len(self.mix_densities))
        return self

    def aggregate(self, classif_predictions: np.ndarray):
        if hasattr(self, 'pca'):
            # re-apply the projection fitted on the training posteriors
            classif_predictions = self.pca.transform(classif_predictions)
        if self.engine == 'rw-mh':
            if self.prior != 'uniform':
                raise RuntimeError('prior is not yet implemented in rw-mh')
@@ -245,6 +256,7 @@ class BayesianKDEy(AggregativeSoftQuantifier, KDEBase, WithConfidenceABC):
        return samples

    def _bayesian_numpyro(self, X_probs):
        #print("bayesian_numpyro", X_probs.shape)
        kdes = self.mix_densities
        # test_densities = np.asarray(
        #     [self.pdf(kde_i, X_probs, self.kernel) for kde_i in kdes]
@@ -252,12 +264,14 @@ class BayesianKDEy(AggregativeSoftQuantifier, KDEBase, WithConfidenceABC):
        test_log_densities = np.asarray(
            [self.pdf(kde_i, X_probs, self.kernel, log_densities=True) for kde_i in kdes]
        )
        #print(f'min={np.min(test_log_densities)}')
        #print(f'max={np.max(test_log_densities)}')
        #print("bayesian_numpyro", test_log_densities.shape)
        #print("len kdes ", len(kdes))
        # import sys
        # sys.exit(0)
        # after PCA reduction, X_probs may no longer have one column per class, so the
        # number of classes is recovered from the number of per-class mixture components
        n_classes = len(kdes)
        if isinstance(self.prior, str) and self.prior == 'uniform':
            alpha = [1.] * n_classes
        else:
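
In isolation, the pattern this hunk introduces is: fit a PCA projection once on the training-time posteriors, re-apply the very same projection at aggregation time, and recover the number of classes from the number of per-class mixture components (after the projection, X_probs no longer has one column per class, which is why n_classes = X_probs.shape[-1] was replaced by n_classes = len(kdes)). A minimal self-contained sketch of the pattern (hypothetical data, labels, and bandwidth; not the repository's code):

import numpy as np
from sklearn.decomposition import PCA
from sklearn.neighbors import KernelDensity

n_classes, n_components = 100, 5
rng = np.random.default_rng(0)
P_train = rng.dirichlet(np.ones(n_classes), size=1000)   # training posteriors (on the simplex)
y_train = np.arange(1000) % n_classes                    # balanced labels
P_test = rng.dirichlet(np.ones(n_classes), size=200)     # test posteriors

pca = PCA(n_components=n_components).fit(P_train)        # fitted once, at training time
Z_train, Z_test = pca.transform(P_train), pca.transform(P_test)

# one KDE per class, fitted in the reduced space
kdes = [KernelDensity(bandwidth=0.1).fit(Z_train[y_train == c]) for c in range(n_classes)]

# the reduced posteriors have n_components columns, not n_classes,
# hence the number of classes must be read off the mixture, not the data
assert Z_test.shape[-1] == n_components and len(kdes) == n_classes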

View File

@@ -0,0 +1,78 @@
from sklearn.neighbors import KernelDensity
import quapy.functional as F
import numpy as np
import matplotlib.pyplot as plt

# aitchison=True
aitchison = False
clr = F.CLRtransformation()
# h = 0.1
# dims = list(range(5, 100, 5))
dims = [10, 28, 100]
center_densities = []
vertex_densities = []
center_densities_scaled = []
vertex_densities_scaled = []
for n in dims:
    h0 = 0.4
    simplex_center = F.uniform_prevalence(n)
    simplex_vertex = np.asarray([.9] + [.1 / (n - 1)] * (n - 1), dtype=float)

    # KDE trained on a single point (the center), with a fixed bandwidth
    kde = KernelDensity(bandwidth=h0)
    X = simplex_center[None, :]
    if aitchison:
        X = clr(X)
    kde.fit(X)
    X = np.vstack([simplex_center, simplex_vertex])
    if aitchison:
        X = clr(X)
    density = np.exp(kde.score_samples(X))
    center_densities.append(density[0])
    vertex_densities.append(density[1])

    # same KDE, but with the bandwidth scaled with the dimension
    h1 = h0 * np.sqrt(n / 2)
    kde = KernelDensity(bandwidth=h1)
    X = simplex_center[None, :]
    if aitchison:
        X = clr(X)
    kde.fit(X)
    X = np.vstack([simplex_center, simplex_vertex])
    if aitchison:
        X = clr(X)
    density = np.exp(kde.score_samples(X))
    center_densities_scaled.append(density[0])
    vertex_densities_scaled.append(density[1])
# Plot
plt.figure(figsize=(6*4, 4*4))
plt.plot(dims, center_densities, marker='o', label='Center of simplex')
plt.plot(dims, vertex_densities, marker='s', label='Vertex of simplex')
plt.plot(dims, center_densities_scaled, marker='o', label='Center of simplex (scaled)')
plt.plot(dims, vertex_densities_scaled, marker='s', label='Vertex of simplex (scaled)')
plt.xlabel('Number of classes (simplex dimension)')
# plt.ylim(min(center_densities+vertex_densities), max(center_densities+vertex_densities))
plt.ylabel('Kernel density')
plt.yscale('log') # crucial to see anything meaningful
plt.title(f'KDE density vs dimension (bandwidth = {h0}) in {"Simplex" if not aitchison else "CLR-space"}')
plt.legend()
plt.grid(alpha=0.3)
plt.tight_layout()
plt.show()
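
Why the log scale is "crucial to see anything meaningful": for a Gaussian KDE fitted on a single point, the log-density at that point has the closed form -(d/2)·log(2πh²), which is linear in the dimension d, so the density itself grows (small h) or shrinks (large h) exponentially with the number of classes; scaling the bandwidth with √d, as h1 does above, compensates for the fact that typical distances between points also grow as √d. A quick sanity check of the closed form (a sketch, independent of the script above):

import numpy as np
from sklearn.neighbors import KernelDensity

h = 0.1
for d in (5, 20, 100):
    x = np.zeros((1, d))
    kde = KernelDensity(bandwidth=h).fit(x)
    empirical = kde.score_samples(x)[0]                  # log-density at the training point itself
    closed_form = -0.5 * d * np.log(2 * np.pi * h ** 2)  # log of the d-dimensional Gaussian peak
    assert np.isclose(empirical, closed_form)
    print(d, closed_form)  # linear in d: the density is exponential in the dimension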

View File

@@ -4,6 +4,7 @@ from pathlib import Path
from jax import numpy as jnp
from sklearn.base import BaseEstimator
from sklearn.decomposition import PCA
import quapy.functional as F
@@ -42,6 +43,57 @@ def antagonistic_prevalence(p, strength=1):
    return p_ant
"""
class KDEyScaledB(KDEyML):
def __init__(self, classifier: BaseEstimator=None, fit_classifier=True, val_split=5, bandwidth=1., random_state=None):
super().__init__(
classifier=classifier, fit_classifier=fit_classifier, val_split=val_split, bandwidth=bandwidth,
random_state=random_state, kernel='gaussian'
)
def aggregation_fit(self, classif_predictions, labels):
if not hasattr(self, '_changed'):
def scale_bandwidth(n_classes, beta=0.5):
return self.bandwidth * np.power(n_classes, beta)
n_classes = len(set(y))
scaled = scale_bandwidth(n_classes)
print(f'bandwidth scaling: {self.bandwidth:.4f} => {scaled:.4f}')
self.bandwidth = scaled
self._changed = True
return super().aggregation_fit(classif_predictions, labels)
"""
class KDEyScaledB(KDEyML):
def __init__(self, classifier: BaseEstimator=None, fit_classifier=True, val_split=5, bandwidth=1., random_state=None):
super().__init__(
classifier=classifier, fit_classifier=fit_classifier, val_split=val_split, bandwidth=bandwidth,
random_state=random_state, kernel='gaussian'
)
class KDEyFresh(KDEyML):
    def __init__(self, classifier: BaseEstimator=None, fit_classifier=True, val_split=5, bandwidth=1., random_state=None):
        super().__init__(
            classifier=classifier, fit_classifier=fit_classifier, val_split=val_split, bandwidth=bandwidth,
            random_state=random_state, kernel='gaussian'
        )

class KDEyReduce(KDEyML):
    def __init__(self, classifier: BaseEstimator=None, fit_classifier=True, val_split=5, bandwidth=1., n_components=10, random_state=None):
        super().__init__(
            classifier=classifier, fit_classifier=fit_classifier, val_split=val_split, bandwidth=bandwidth,
            random_state=random_state, kernel='gaussian'
        )
        self.n_components = n_components

    def aggregation_fit(self, classif_predictions, labels):
        # fit the projection on the training posteriors and reuse it at aggregation time
        self.pca = PCA(n_components=self.n_components)
        classif_predictions = self.pca.fit_transform(classif_predictions)
        return super().aggregation_fit(classif_predictions, labels)

    def aggregate(self, posteriors: np.ndarray):
        posteriors = self.pca.transform(posteriors)
        return super().aggregate(posteriors)
class KDEyCLR(KDEyML):
    def __init__(self, classifier: BaseEstimator=None, fit_classifier=True, val_split=5, bandwidth=1., random_state=None):
        super().__init__(
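
A hypothetical usage sketch for KDEyReduce (the fetch call and fit signature follow quapy's API as used elsewhere in this commit; the dataset name, bandwidth, and n_components are placeholders):

from sklearn.linear_model import LogisticRegression
import quapy as qp

# reduce many-class posteriors to 10 principal components before the KDE step
train, test = qp.datasets.fetch_UCIMulticlassDataset('academic-success').train_test
kdey = KDEyReduce(LogisticRegression(), val_split=5, bandwidth=0.1, n_components=10)
kdey.fit(*train.Xy)
print(kdey.quantify(test.X))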

View File

@@ -3,9 +3,10 @@ from pathlib import Path
from sklearn.linear_model import LogisticRegression
from copy import deepcopy as cp
import quapy as qp
from BayesianKDEy.commons import KDEyReduce
from _bayeisan_kdey import BayesianKDEy
from _bayesian_mapls import BayesianMAPLS
from commons import experiment_path, KDEyCLR, RESULT_DIR, MockClassifierFromPosteriors, KDEyScaledB, KDEyFresh
# import datasets
from datasets import LeQuaHandler, UCIMulticlassHandler, DatasetHandler, VisualDataHandler, CIFAR100Handler
from temperature_calibration import temp_calibration
@@ -55,6 +56,9 @@ def methods(data_handler: DatasetHandler):
    acc = ACC(Cls(), val_split=val_split)
    hdy = DMy(Cls(), val_split=val_split)
    kde_gau = KDEyML(Cls(), val_split=val_split)
    kde_gau_scale = KDEyScaledB(Cls(), val_split=val_split)
    kde_gau_pca = KDEyReduce(Cls(), val_split=val_split, n_components=5)
    kde_gau_pca10 = KDEyReduce(Cls(), val_split=val_split, n_components=10)
    kde_ait = KDEyCLR(Cls(), val_split=val_split)
    emq = EMQ(Cls(), exact_train_prev=False, val_split=val_split)
@@ -71,15 +75,33 @@ def methods(data_handler: DatasetHandler):
    # yield 'BayesianACC', acc, acc_hyper, lambda hyper: BayesianCC(Cls(), val_split=val_split, mcmc_seed=0), multiclass_method
    #yield 'BayesianHDy', hdy, hdy_hyper, lambda hyper: PQ(Cls(), val_split=val_split, stan_seed=0, **hyper), only_binary
    # yield f'BaKDE-Ait-numpyro', kde_ait, kdey_hyper_clr, lambda hyper: BayesianKDEy(Cls(), kernel='aitchison', mcmc_seed=0, engine='numpyro', val_split=val_split, **hyper), multiclass_method
    #yield f'BaKDE-Gau-numpyro', kde_gau, kdey_hyper, lambda hyper: BayesianKDEy(Cls(), kernel='gaussian', mcmc_seed=0, engine='numpyro', val_split=val_split, **hyper), multiclass_method
    #yield f'BaKDE-Gau-scale', kde_gau_scale, kdey_hyper, lambda hyper: BayesianKDEy(Cls(), kernel='gaussian', mcmc_seed=0, engine='numpyro', val_split=val_split, **hyper), multiclass_method
    yield f'BaKDE-Gau-pca5', kde_gau_pca, kdey_hyper, lambda hyper: BayesianKDEy(Cls(), reduce=5, kernel='gaussian', mcmc_seed=0, engine='numpyro', val_split=val_split, **hyper), multiclass_method
    yield f'BaKDE-Gau-pca5*', kde_gau_pca, kdey_hyper, lambda hyper: BayesianKDEy(Cls(), reduce=5, temperature=None, kernel='gaussian', mcmc_seed=0, engine='numpyro', val_split=val_split, **hyper), multiclass_method
    yield f'BaKDE-Gau-pca10', kde_gau_pca10, kdey_hyper, lambda hyper: BayesianKDEy(Cls(), reduce=10, kernel='gaussian', mcmc_seed=0, engine='numpyro', val_split=val_split, **hyper), multiclass_method
    yield f'BaKDE-Gau-pca10*', kde_gau_pca10, kdey_hyper, lambda hyper: BayesianKDEy(Cls(), reduce=10, temperature=None, kernel='gaussian', mcmc_seed=0, engine='numpyro', val_split=val_split, **hyper), multiclass_method
    # yield f'BaKDE-Gau-H0', KDEyFresh(Cls(), bandwidth=0.4), cls_hyper, lambda hyper: BayesianKDEy(Cls(), bandwidth=0.4, kernel='gaussian', mcmc_seed=0, engine='numpyro', **hyper), multiclass_method
    # yield f'BaKDE-Gau-H1', KDEyFresh(Cls(), bandwidth=1.), cls_hyper, lambda hyper: BayesianKDEy(Cls(), bandwidth=1., kernel='gaussian', mcmc_seed=0, engine='numpyro', **hyper), multiclass_method
    # yield f'BaKDE-Gau-H2', KDEyFresh(Cls(), bandwidth=1.5), cls_hyper, lambda hyper: BayesianKDEy(Cls(), bandwidth=1.5, kernel='gaussian', mcmc_seed=0, engine='numpyro', **hyper), multiclass_method
    # yield f'BaKDE-Ait-T*', kde_ait, kdey_hyper_clr, lambda hyper: BayesianKDEy(Cls(), kernel='aitchison', mcmc_seed=0, engine='numpyro', temperature=None, val_split=val_split, **hyper), multiclass_method
    #yield f'BaKDE-Gau-T*', kde_gau, kdey_hyper, lambda hyper: BayesianKDEy(Cls(), kernel='gaussian', mcmc_seed=0, engine='numpyro', temperature=None, val_split=val_split, **hyper), multiclass_method
    # yield 'BayEMQ', emq, acc_hyper, lambda hyper: BayesianMAPLS(Cls(), prior='uniform', temperature=1, exact_train_prev=False, val_split=val_split), multiclass_method
    # yield 'BayEMQ*', emq, acc_hyper, lambda hyper: BayesianMAPLS(Cls(), prior='uniform', temperature=None, exact_train_prev=False, val_split=val_split), multiclass_method

def model_selection(dataset: DatasetHandler, point_quantifier: AggregativeQuantifier, grid: dict):
    with qp.util.temp_seed(0):
        if isinstance(point_quantifier, KDEyScaledB) and 'bandwidth' in grid:
            # heuristic: with beta=0.5 the candidate bandwidths are enlarged by a factor
            # sqrt(n_classes), compensating for the growth of typical distances with the dimension
            def scale_bandwidth(bandwidth, n_classes, beta=0.5):
                return bandwidth * np.power(n_classes, beta)
            n = dataset.get_training().n_classes
            grid['bandwidth'] = [scale_bandwidth(b, n) for b in grid['bandwidth']]
            print('bandwidth scaled')
        print(f'performing model selection for {point_quantifier.__class__.__name__} with grid {grid}')
        # model selection
        if len(grid)>0:
@@ -108,8 +130,8 @@ def temperature_calibration(dataset: DatasetHandler, uncertainty_quantifier):
        if dataset.name.startswith('LeQua'):
            temp_grid=[100., 500, 1000, 5_000, 10_000, 50_000]
        else:
            temp_grid=[.5, 1., 1.5, 2., 5., 10., 100., 1000.]
        temperature = temp_calibration(uncertainty_quantifier, train, val_prot, temp_grid=temp_grid, n_jobs=-1, amplitude_threshold=.999)
        uncertainty_quantifier.temperature = temperature
    else:
        temperature = uncertainty_quantifier.temperature
@@ -179,10 +201,12 @@ if __name__ == '__main__':
    result_dir = RESULT_DIR

    for data_handler in [CIFAR100Handler]:  #, UCIMulticlassHandler, LeQuaHandler, VisualDataHandler, CIFAR100Handler]:
        for dataset in data_handler.iter():
            qp.environ['SAMPLE_SIZE'] = dataset.sample_size
            print(f'dataset={dataset.name}')
            #if dataset.name != 'abalone':
            #    continue
            problem_type = 'binary' if dataset.is_binary() else 'multiclass'

View File

@@ -9,7 +9,7 @@ from pathlib import Path
import quapy as qp
from BayesianKDEy.commons import RESULT_DIR
from BayesianKDEy.datasets import LeQuaHandler, UCIMulticlassHandler, VisualDataHandler, CIFAR100Handler
from quapy.error import dist_aitchison
from quapy.method.confidence import ConfidenceIntervals
from quapy.method.confidence import ConfidenceEllipseSimplex, ConfidenceEllipseCLR, ConfidenceEllipseILR, ConfidenceIntervals, ConfidenceRegionABC
import quapy.functional as F
@@ -27,7 +27,7 @@ methods = ['BayesianACC',
           'BaKDE-Ait-numpyro',
           'BaKDE-Ait-T*',
           'BaKDE-Gau-numpyro',
           'BaKDE-Gau-T*', 'BaKDE-Gau-pca5', 'BaKDE-Gau-pca5*', 'BaKDE-Gau-pca10', 'BaKDE-Gau-pca10*',
           # 'BayEMQ-U-Temp1-2',
           # 'BayEMQ-T*',
           'BayEMQ',

View File

@@ -1,134 +0,0 @@
import numpy as np
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.neighbors import KernelDensity
from scipy.special import logsumexp
from sklearn.model_selection import StratifiedKFold, cross_val_predict

def class_scale_factors(X, y):
    lambdas = {}
    scales = []
    for c in np.unique(y):
        Xc = X[y == c]
        cov = np.cov(Xc.T)
        scale = np.trace(cov)
        lambdas[c] = scale
        scales.append(scale)
    mean_scale = np.mean(scales)
    for c in lambdas:
        lambdas[c] /= mean_scale
    return lambdas

class ClassConditionalKDE:
    def __init__(self, kernel="gaussian", lambdas=None):
        self.kernel = kernel
        self.lambdas = lambdas or {}
        self.models = {}

    def fit(self, X, y, bandwidth):
        self.classes_ = np.unique(y)
        for c in self.classes_:
            h_c = bandwidth * self.lambdas.get(c, 1.0)
            kde = KernelDensity(kernel=self.kernel, bandwidth=h_c)
            kde.fit(X[y == c])
            self.models[c] = kde
        return self

    def log_density(self, X):
        logp = np.column_stack([
            self.models[c].score_samples(X)
            for c in self.classes_
        ])
        return logp

def conditional_log_likelihood(logp, y, priors=None):
    if priors is None:
        priors = np.ones(logp.shape[1]) / logp.shape[1]
    log_prior = np.log(priors)
    log_joint = logp + log_prior
    denom = logsumexp(log_joint, axis=1)
    num = log_joint[np.arange(len(y)), y]
    return np.sum(num - denom)

def cv_objective(
        bandwidth,
        X,
        y,
        lambdas,
        kernel="gaussian",
        n_splits=5,
):
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=0)
    score = 0.0
    for train, test in skf.split(X, y):
        model = ClassConditionalKDE(kernel=kernel, lambdas=lambdas)
        model.fit(X[train], y[train], bandwidth)
        logp = model.log_density(X[test])
        score += conditional_log_likelihood(logp, y[test])
    return score

if __name__ == '__main__':
    from BayesianKDEy.datasets import UCIMulticlassHandler
    from quapy.method.aggregative import KDEyML
    from quapy.model_selection import GridSearchQ
    from quapy.evaluation import evaluation_report

    dataset = UCIMulticlassHandler('academic-success')
    training = dataset.get_training()
    X, y = training.Xy

    cls = LogisticRegression()
    P = cross_val_predict(cls, X, y, cv=5, n_jobs=-1, method='predict_proba')

    bandwidths = np.logspace(-3, 0, 50)
    lambdas=None
    scores = [
        cv_objective(h, P, y, lambdas)
        for h in bandwidths
    ]
    best_h = bandwidths[np.argmax(scores)]
    print(best_h)

    cls = LogisticRegression()
    kdey = KDEyML(cls, val_split=5, random_state=0)
    train, val_prot = dataset.get_train_valprot_for_modsel()
    modsel = GridSearchQ(kdey, param_grid={'bandwidth': bandwidths}, protocol=val_prot, n_jobs=-1, verbose=True)
    modsel.fit(*train.Xy)
    best_bandwidth = modsel.best_params_['bandwidth']
    print(best_bandwidth)

    print(f'First experiment, with bandwidth={best_h:.4f}')
    cls = LogisticRegression()
    kdey = KDEyML(cls, val_split=5, random_state=0, bandwidth=best_h)
    train, test_prot = dataset.get_train_testprot_for_eval()
    kdey.fit(*train.Xy)
    report = evaluation_report(kdey, test_prot, error_metrics=['ae'])
    print(report.mean(numeric_only=True))

    print(f'Second experiment, with bandwidth={best_bandwidth:.4f}')
    cls = LogisticRegression()
    kdey = KDEyML(cls, val_split=5, random_state=0, bandwidth=best_bandwidth)
    # train, test_prot = dataset.get_train_testprot_for_eval()
    kdey.fit(*train.Xy)
    report = evaluation_report(kdey, test_prot, error_metrics=['ae'])
    print(report.mean(numeric_only=True))
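
For reference, the bandwidth-selection objective this removed module implemented is the cross-validated class-conditional log-likelihood; reconstructed from conditional_log_likelihood above (with \(\pi_c\) the class priors and \(f_c(\cdot;h)\) the per-class KDE with bandwidth \(h\)):

\[
\mathcal{L}(h) \;=\; \sum_{(x,y)} \log \frac{\pi_y\, f_y(x;\,h)}{\sum_{c} \pi_c\, f_c(x;\,h)}
\]

That is, it selects the bandwidth that makes the KDE-based posteriors most predictive of the true labels, as opposed to the GridSearchQ run in the main block, which selects the bandwidth that maximizes quantification accuracy.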

View File

@@ -9,6 +9,7 @@ Change Log 0.2.1
- Added ReadMe method by Daniel Hopkins and Gary King
- Internal index in LabelledCollection is now "lazy", and is only constructed if required.
- Added dist_aitchison and mean_dist_aitchison as a new error evaluation metric.
- Improved the numerical stability of KDEyML through logsumexp; useful for cases with a large number of classes, where the densities obtained with small bandwidths may become huge.

Change Log 0.2.0
-----------------

View File

@@ -47,13 +47,16 @@ To remove the LabelledCollection from the methods:
- fit_classifier=False:
- [TODO] check whether the logsumexp variant of KDEyML is slower than the original one, or whether we can explore
    an unconstrained space in which the parameter is already log(prev); maybe also move to cvxq
- [TODO] why not simplify the epsilon of RAE? After all, it is only meant to smooth the denominator to avoid division by zero
- [TODO] document confidence in manuals
- [TODO] Test the return_type="index" in protocols and finish the "distributing_samples.py" example
- [TODO] Add EDy (an implementation is available at quantificationlib)
- [TODO] add ensemble methods SC-MQ, MC-SQ, MC-MQ
- [TODO] add HistNetQ
- [TODO] add CDE-iteration (https://github.com/Arctickirillas/Rubrication/blob/master/quantification.py#L593 or
    Schumacher's code) and Bayes-CDE methods
- [TODO] add Friedman's method and DeBias
- [TODO] check ignore warning stuff
    check https://docs.python.org/3/library/warnings.html#temporarily-suppressing-warnings

View File

@@ -5,7 +5,7 @@ from sklearn.neighbors import KernelDensity
import quapy as qp
from quapy.method.aggregative import AggregativeSoftQuantifier
import quapy.functional as F
from scipy.special import logsumexp
from sklearn.metrics.pairwise import rbf_kernel
@@ -29,9 +29,10 @@ class KDEBase:
        assert bandwidth in KDEBase.BANDWIDTH_METHOD or isinstance(bandwidth, float), \
            f'invalid bandwidth, valid ones are {KDEBase.BANDWIDTH_METHOD} or float values'
        if isinstance(bandwidth, float):
            #assert kernel!='gaussian' or (0 < bandwidth < 1), \
            #    ("the bandwidth for a Gaussian kernel in KDEy should be in (0,1), "
            #     "since this method models the unit simplex")
            pass
        return bandwidth

    @classmethod
@@ -175,14 +176,18 @@ class KDEyML(AggregativeSoftQuantifier, KDEBase):
        :return: a vector of class prevalence estimates
        """
        with qp.util.temp_seed(self.random_state):
            epsilon = 1e-12
            n_classes = len(self.mix_densities)
            #test_densities = [self.pdf(kde_i, posteriors, self.kernel) for kde_i in self.mix_densities]
            test_log_densities = [self.pdf(kde_i, posteriors, self.kernel, log_densities=True) for kde_i in self.mix_densities]

            #def neg_loglikelihood(prev):
            #    prev = np.clip(prev, epsilon, 1.0)
            #    test_mixture_likelihood = prev @ test_densities
            #    test_loglikelihood = np.log(test_mixture_likelihood + epsilon)
            #    return -np.sum(test_loglikelihood)

            def neg_loglikelihood(prev):
                # evaluate log p(x) = logsumexp_i(log prev_i + log dens_i(x)) entirely in log-space
                test_loglikelihood = logsumexp(np.log(np.clip(prev, epsilon, 1.0))[:, None] + test_log_densities, axis=0)
                return -np.sum(test_loglikelihood)

            return F.optim_minimize(neg_loglikelihood, n_classes)
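
The rewritten objective rests on the identity log Σ_i p_i f_i(x) = logsumexp_i(log p_i + log f_i(x)): the mixture log-likelihood is evaluated without ever materializing the raw densities f_i(x), which can overflow when the number of classes is large and the bandwidth small (see the change log entry above). A minimal equivalence check (a sketch; test_log_densities mirrors the shape used above, (n_classes, n_samples)):

import numpy as np
from scipy.special import logsumexp

rng = np.random.default_rng(0)
n_classes, n_samples = 4, 10
prev = rng.dirichlet(np.ones(n_classes))                      # a prevalence vector
test_log_densities = rng.normal(size=(n_classes, n_samples))  # log f_i(x_j)

# naive version: exponentiate, mix, take logs (prone to under/overflow)
naive = np.log(prev @ np.exp(test_log_densities))

# stable version: stay in log-space throughout
stable = logsumexp(np.log(prev)[:, None] + test_log_densities, axis=0)

assert np.allclose(naive, stable)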