QuaPy/KDEy/kdey_devel.py

172 lines
8.8 KiB
Python

from typing import Union, Callable
import numpy as np
from sklearn.base import BaseEstimator
from sklearn.neighbors import KernelDensity
import quapy as qp
from quapy.data import LabelledCollection
from quapy.method.aggregative import AggregativeSoftQuantifier, KDEyML
import quapy.functional as F
from sklearn.metrics.pairwise import rbf_kernel
from scipy import optimize
class KDEyMLauto(KDEyML):
def __init__(self, classifier: BaseEstimator = None, val_split=5, random_state=None, optim='two_steps'):
self.classifier = qp._get_classifier(classifier)
self.val_split = val_split
self.bandwidth = None
self.random_state = random_state
self.optim = optim
def chose_bandwidth(self, train, test_instances):
classif_predictions = self.classifier_fit_predict(train, fit_classifier=True, predict_on=self.val_split)
te_posteriors = self.classify(test_instances)
return self.transduce(classif_predictions, te_posteriors)
def transduce(self, classif_predictions, te_posteriors):
tr_posteriors, tr_y = classif_predictions.Xy
classes = classif_predictions.classes_
n_classes = len(classes)
current_bandwidth = 0.05
if self.optim == 'both_fine':
current_bandwidth = np.full(fill_value=current_bandwidth, shape=(n_classes,))
current_prevalence = np.full(fill_value=1 / n_classes, shape=(n_classes,))
iterations = 0
convergence = False
with qp.util.temp_seed(self.random_state):
while not convergence:
previous_bandwidth = current_bandwidth
previous_prevalence = current_prevalence
iterations += 1
print(f'{iterations}:')
if self.optim == 'two_steps':
current_prevalence = self.optim_minimize_prevalence(current_bandwidth, current_prevalence, tr_posteriors, tr_y, te_posteriors, classes)
print(f'\testim-prev={F.strprev(current_prevalence)}')
current_bandwidth = self.optim_minimize_bandwidth(current_bandwidth, current_prevalence, tr_posteriors, tr_y, te_posteriors, classes)
print(f'\tbandwidth={current_bandwidth}')
if np.isclose(previous_bandwidth, current_bandwidth, atol=0.0001) and all(
np.isclose(previous_prevalence, current_prevalence, atol=0.0001)):
convergence = True
elif self.optim == 'both':
current_prevalence, current_bandwidth = self.optim_minimize_both(current_bandwidth, current_prevalence, tr_posteriors, tr_y, te_posteriors, classes)
if np.isclose(previous_bandwidth, current_bandwidth, atol=0.0001) and all(np.isclose(previous_prevalence, current_prevalence, atol=0.0001)):
convergence = True
elif self.optim == 'both_fine':
current_prevalence, current_bandwidth = self.optim_minimize_both_fine(current_bandwidth, current_prevalence, tr_posteriors, tr_y,
te_posteriors, classes)
if all(np.isclose(previous_bandwidth, current_bandwidth, atol=0.0001)) and all(np.isclose(previous_prevalence, current_prevalence, atol=0.0001)):
convergence = True
self.bandwidth = current_bandwidth
print('bandwidth=', current_bandwidth)
print('prevalence=', current_prevalence)
return current_prevalence
def optim_minimize_prevalence(self, current_bandwidth, current_prev, tr_posteriors, tr_y, te_posteriors, classes):
epsilon = 1e-10
mix_densities = self.get_mixture_components(tr_posteriors, tr_y, classes, current_bandwidth)
test_densities = [self.pdf(kde_i, te_posteriors) for kde_i in mix_densities]
def neg_loglikelihood_prev(prev):
test_mixture_likelihood = sum(prev_i * dens_i for prev_i, dens_i in zip(prev, test_densities))
test_loglikelihood = np.log(test_mixture_likelihood + epsilon)
return -np.sum(test_loglikelihood)
return optim_minimize(neg_loglikelihood_prev, current_prev)
def optim_minimize_bandwidth(self, current_bandwidth, current_prev, tr_posteriors, tr_y, te_posteriors, classes):
epsilon = 1e-10
def neg_loglikelihood_bandwidth(bandwidth):
mix_densities = self.get_mixture_components(tr_posteriors, tr_y, classes, bandwidth[0])
test_densities = [self.pdf(kde_i, te_posteriors) for kde_i in mix_densities]
test_mixture_likelihood = sum(prev_i * dens_i for prev_i, dens_i in zip(current_prev, test_densities))
test_loglikelihood = np.log(test_mixture_likelihood + epsilon)
return -np.sum(test_loglikelihood)
bounds = [(0.00001, 1)]
r = optimize.minimize(neg_loglikelihood_bandwidth, x0=[current_bandwidth], method='SLSQP', bounds=bounds)
print(f'iterations-bandwidth={r.nit}')
return r.x[0]
def optim_minimize_both(self, current_bandwidth, current_prev, tr_posteriors, tr_y, te_posteriors, classes):
epsilon = 1e-10
n_classes = len(current_prev)
def neg_loglikelihood_bandwidth(prevalence_bandwidth):
bandwidth = prevalence_bandwidth[-1]
prevalence = prevalence_bandwidth[:-1]
mix_densities = self.get_mixture_components(tr_posteriors, tr_y, classes, bandwidth)
test_densities = [self.pdf(kde_i, te_posteriors) for kde_i in mix_densities]
test_mixture_likelihood = sum(prev_i * dens_i for prev_i, dens_i in zip(prevalence, test_densities))
test_loglikelihood = np.log(test_mixture_likelihood + epsilon)
return -np.sum(test_loglikelihood)
bounds = [(0, 1) for _ in range(n_classes)] + [(0.00001, 1)]
constraints = ({'type': 'eq', 'fun': lambda x: 1 - sum(x[:n_classes])})
prevalence_bandwidth = np.append(current_prev, current_bandwidth)
r = optimize.minimize(neg_loglikelihood_bandwidth, x0=prevalence_bandwidth, method='SLSQP', bounds=bounds, constraints=constraints)
print(f'iterations-both={r.nit}')
prev_band = r.x
current_prevalence = prev_band[:-1]
current_bandwidth = prev_band[-1]
return current_prevalence, current_bandwidth
def optim_minimize_both_fine(self, current_bandwidth, current_prev, tr_posteriors, tr_y, te_posteriors, classes):
epsilon = 1e-10
n_classes = len(current_bandwidth)
def neg_loglikelihood_bandwidth(prevalence_bandwidth):
prevalence = prevalence_bandwidth[:n_classes]
bandwidth = prevalence_bandwidth[n_classes:]
mix_densities = self.get_mixture_components(tr_posteriors, tr_y, classes, bandwidth)
test_densities = [self.pdf(kde_i, te_posteriors) for kde_i in mix_densities]
test_mixture_likelihood = sum(prev_i * dens_i for prev_i, dens_i in zip(prevalence, test_densities))
test_loglikelihood = np.log(test_mixture_likelihood + epsilon)
return -np.sum(test_loglikelihood)
bounds = [(0, 1) for _ in range(n_classes)] + [(0.00001, 1) for _ in range(n_classes)]
constraints = ({'type': 'eq', 'fun': lambda x: 1 - sum(x[:n_classes])})
prevalence_bandwidth = np.concatenate((current_prev, current_bandwidth))
r = optimize.minimize(neg_loglikelihood_bandwidth, x0=prevalence_bandwidth, method='SLSQP', bounds=bounds, constraints=constraints)
print(f'iterations-both-fine={r.nit}')
prev_band = r.x
current_prevalence = prev_band[:n_classes]
current_bandwidth = prev_band[n_classes:]
return current_prevalence, current_bandwidth
def aggregation_fit(self, classif_predictions: LabelledCollection, data: LabelledCollection):
self.classif_predictions = classif_predictions
return self
def aggregate(self, posteriors: np.ndarray):
return self.transduce(self.classif_predictions, posteriors)
def optim_minimize(loss: Callable, init_prev: np.ndarray):
"""
Searches for the optimal prevalence values, i.e., an `n_classes`-dimensional vector of the (`n_classes`-1)-simplex
that yields the smallest lost. This optimization is carried out by means of a constrained search using scipy's
SLSQP routine.
:param loss: (callable) the function to minimize
:return: (ndarray) the best prevalence vector found
"""
n_classes = len(init_prev)
# solutions are bounded to those contained in the unit-simplex
bounds = tuple((0, 1) for _ in range(n_classes)) # values in [0,1]
constraints = ({'type': 'eq', 'fun': lambda x: 1 - sum(x)}) # values summing up to 1
r = optimize.minimize(loss, x0=init_prev, method='SLSQP', bounds=bounds, constraints=constraints)
print(f'iterations-prevalence={r.nit}')
return r.x