From 8876d311e784c0b6fae9af8ebb20300b0a718b8c Mon Sep 17 00:00:00 2001 From: Alejandro Moreo Date: Mon, 2 Dec 2024 17:39:06 +0100 Subject: [PATCH] working on SCMQ, MCSQ, MCMQ ensembles --- examples/16.KDEy_bandwidth.py | 83 +++++++++++++++++++++++++++++++++++ examples/ensembles.py | 26 +++++++++-- quapy/functional.py | 14 ++++-- quapy/method/_kdey.py | 63 ++++++++++++++++++++++++-- quapy/method/meta.py | 75 ++++++++++++++++++++++++------- quapy/model_selection.py | 4 +- 6 files changed, 236 insertions(+), 29 deletions(-) create mode 100644 examples/16.KDEy_bandwidth.py diff --git a/examples/16.KDEy_bandwidth.py b/examples/16.KDEy_bandwidth.py new file mode 100644 index 0000000..cc81ade --- /dev/null +++ b/examples/16.KDEy_bandwidth.py @@ -0,0 +1,83 @@ +import quapy as qp +import numpy as np +from quapy.protocol import UPP +from quapy.method.aggregative import KDEyML +import quapy.functional as F +from time import time + +""" +Let see one example: +""" + +# load some data +qp.environ['SAMPLE_SIZE'] = 100 +data = qp.datasets.fetch_UCIMulticlassDataset('molecular') +training, test = data.train_test +training, validation = training.split_stratified(train_prop=0.7, random_state=0) +protocol = UPP(validation) + +hyper_C = np.logspace(-3, 3, 7) + +model = KDEyML() + +with qp.util.temp_seed(0): + + param_grid = { + 'classifier__C': hyper_C, + 'bandwidth': np.linspace(0.01, 0.20, 20) # [0.01, 0.02, 0.03, ..., 0.20] + } + + model = qp.model_selection.GridSearchQ( + model=model, + param_grid=param_grid, + protocol=protocol, + error='mae', # the error to optimize is the MAE (a quantification-oriented loss) + refit=False, # retrain on the whole labelled set once done + n_jobs=-1, + verbose=True # show information as the process goes on + ).fit(training) + +best_params = model.best_params_ +took = model.fit_time_ +model = model.best_model_ +print(f'model selection ended: best hyper-parameters={best_params}') + +# evaluation in terms of MAE +# we use the same evaluation protocol (APP) on the test set +mae_score = qp.evaluation.evaluate(model, protocol=UPP(test), error_metric='mae') + +print(f'MAE={mae_score:.5f}') +print(f'model selection took {took:.1f}s') + + +model = KDEyML(bandwidth='auto') + +with qp.util.temp_seed(0): + + param_grid = { + 'classifier__C': hyper_C, + } + + model = qp.model_selection.GridSearchQ( + model=model, + param_grid=param_grid, + protocol=protocol, + error='mae', # the error to optimize is the MAE (a quantification-oriented loss) + refit=False, # retrain on the whole labelled set once done + n_jobs=-1, + verbose=True # show information as the process goes on + ).fit(training) + +best_params = model.best_params_ +took = model.fit_time_ +model = model.best_model_ +bandwidth = model.bandwidth_val +print(f'model selection ended: best hyper-parameters={best_params} ({bandwidth=})') + +# evaluation in terms of MAE +# we use the same evaluation protocol (APP) on the test set +mae_score = qp.evaluation.evaluate(model, protocol=UPP(test), error_metric='mae') + +print(f'MAE={mae_score:.5f}') +print(f'model selection took {took:.1f}s') + diff --git a/examples/ensembles.py b/examples/ensembles.py index 8c0d07e..84aeb2c 100644 --- a/examples/ensembles.py +++ b/examples/ensembles.py @@ -1,10 +1,16 @@ +from sklearn.exceptions import ConvergenceWarning from sklearn.linear_model import LogisticRegression +from sklearn.naive_bayes import MultinomialNB +from sklearn.neighbors import KNeighborsClassifier from statsmodels.sandbox.distributions.genpareto import quant import quapy as qp from quapy.protocol import UPP from quapy.method.aggregative import PACC, DMy, EMQ, KDEyML -from quapy.method.meta import SCMQ +from quapy.method.meta import SCMQ, MCMQ, MCSQ +import warnings +warnings.filterwarnings("ignore", category=DeprecationWarning) +warnings.filterwarnings("ignore", category=ConvergenceWarning) qp.environ["SAMPLE_SIZE"]=100 @@ -32,5 +38,19 @@ scmq = SCMQ(classifier, quantifiers) train_and_test_model(scmq, train, test) -for quantifier in quantifiers: - train_and_test_model(quantifier, train, test) \ No newline at end of file +# for quantifier in quantifiers: +# train_and_test_model(quantifier, train, test) + +classifiers = [ + LogisticRegression(), + KNeighborsClassifier(), + # MultinomialNB() +] + +mcmq = MCMQ(classifiers, quantifiers) + +train_and_test_model(mcmq, train, test) + +mcsq = MCSQ(classifiers, PACC()) + +train_and_test_model(mcsq, train, test) \ No newline at end of file diff --git a/quapy/functional.py b/quapy/functional.py index fd7a88f..fac6647 100644 --- a/quapy/functional.py +++ b/quapy/functional.py @@ -416,7 +416,7 @@ def argmin_prevalence(loss: Callable, raise NotImplementedError() -def optim_minimize(loss: Callable, n_classes: int): +def optim_minimize(loss: Callable, n_classes: int, return_loss=False): """ Searches for the optimal prevalence values, i.e., an `n_classes`-dimensional vector of the (`n_classes`-1)-simplex that yields the smallest lost. This optimization is carried out by means of a constrained search using scipy's @@ -424,18 +424,24 @@ def optim_minimize(loss: Callable, n_classes: int): :param loss: (callable) the function to minimize :param n_classes: (int) the number of classes, i.e., the dimensionality of the prevalence vector - :return: (ndarray) the best prevalence vector found + :param return_loss: bool, if True, returns also the value of the loss (default is False). + :return: (ndarray) the best prevalence vector found or a tuple which also contains the value of the loss + if return_loss=True """ from scipy import optimize # the initial point is set as the uniform distribution - uniform_distribution = np.full(fill_value=1 / n_classes, shape=(n_classes,)) + uniform_distribution = uniform_prevalence(n_classes=n_classes) # solutions are bounded to those contained in the unit-simplex bounds = tuple((0, 1) for _ in range(n_classes)) # values in [0,1] constraints = ({'type': 'eq', 'fun': lambda x: 1 - sum(x)}) # values summing up to 1 r = optimize.minimize(loss, x0=uniform_distribution, method='SLSQP', bounds=bounds, constraints=constraints) - return r.x + + if return_loss: + return r.x, r.fun + else: + return r.x def linear_search(loss: Callable, n_classes: int): diff --git a/quapy/method/_kdey.py b/quapy/method/_kdey.py index e50c99c..a396324 100644 --- a/quapy/method/_kdey.py +++ b/quapy/method/_kdey.py @@ -1,5 +1,8 @@ from typing import Union import numpy as np +from scipy.optimize import optimize, minimize_scalar + +from quapy.protocol import UPP from sklearn.base import BaseEstimator from sklearn.neighbors import KernelDensity @@ -111,16 +114,70 @@ class KDEyML(AggregativeSoftQuantifier, KDEBase): :param random_state: a seed to be set before fitting any base quantifier (default None) """ - def __init__(self, classifier: BaseEstimator=None, val_split=5, bandwidth=0.1, random_state=None): + def __init__(self, classifier: BaseEstimator=None, val_split=5, bandwidth=0.1, auto_reduction=500, auto_repeats=25, random_state=None): self.classifier = qp._get_classifier(classifier) self.val_split = val_split - self.bandwidth = KDEBase._check_bandwidth(bandwidth) + self.bandwidth = bandwidth + if bandwidth!='auto': + self.bandwidth = KDEBase._check_bandwidth(bandwidth) + + assert auto_reduction is None or (isinstance(auto_reduction, int) and auto_reduction>0), \ + (f'param {auto_reduction=} should either be None (no reduction) or a positive integer ' + f'(number of training instances).') + + self.auto_reduction = auto_reduction + self.auto_repeats = auto_repeats self.random_state=random_state def aggregation_fit(self, classif_predictions: LabelledCollection, data: LabelledCollection): - self.mix_densities = self.get_mixture_components(*classif_predictions.Xy, data.classes_, self.bandwidth) + if self.bandwidth == 'auto': + self.bandwidth_val = self.auto_bandwidth_likelihood(classif_predictions) + else: + self.bandwidth_val = self.bandwidth + self.mix_densities = self.get_mixture_components(*classif_predictions.Xy, data.classes_, self.bandwidth_val) return self + def auto_bandwidth_likelihood(self, classif_predictions: LabelledCollection): + train, val = classif_predictions.split_stratified(train_prop=0.5, random_state=self.random_state) + n_classes = classif_predictions.n_classes + epsilon = 1e-8 + repeats = self.auto_repeats + + auto_reduction = self.auto_reduction + if auto_reduction is None: + auto_reduction = len(classif_predictions) + else: + # reduce samples to speed up computation + train = train.sampling(auto_reduction) + + prot = UPP(val, sample_size=auto_reduction, repeats=repeats, random_state=self.random_state) + + def eval_bandwidth_nll(bandwidth): + mix_densities = self.get_mixture_components(*train.Xy, train.classes_, bandwidth) + loss_accum = 0 + for (sample, prevtrue) in prot(): + test_densities = [self.pdf(kde_i, sample) for kde_i in mix_densities] + + def neg_loglikelihood_prev(prev): + test_mixture_likelihood = sum(prev_i * dens_i for prev_i, dens_i in zip(prev, test_densities)) + test_loglikelihood = np.log(test_mixture_likelihood + epsilon) + nll = -np.sum(test_loglikelihood) + return nll + + pred_prev, neglikelihood = F.optim_minimize(neg_loglikelihood_prev, n_classes=n_classes, return_loss=True) + loss_accum += neglikelihood + return loss_accum + + r = minimize_scalar(eval_bandwidth_nll, bounds=(0.0001, 0.2), options={'xatol': 0.005}) + best_band = r.x + best_loss_value = r.fun + nit = r.nit + + # print(f'[{self.__class__.__name__}:autobandwidth] ' + # f'found bandwidth={best_band:.8f} after {nit=} iterations loss_val={best_loss_value:.5f})') + + return best_band + def aggregate(self, posteriors: np.ndarray): """ Searches for the mixture model parameter (the sought prevalence values) that maximizes the likelihood diff --git a/quapy/method/meta.py b/quapy/method/meta.py index 0fc8916..05dd06c 100644 --- a/quapy/method/meta.py +++ b/quapy/method/meta.py @@ -693,14 +693,26 @@ def EEMQ(classifier, param_grid=None, optim=None, param_mod_sel=None, **kwargs): return ensembleFactory(classifier, EMQ, param_grid, optim, param_mod_sel, **kwargs) +def merge(prev_predictions, merge_fun): + prev_predictions = np.asarray(prev_predictions) + if merge_fun == 'median': + prevalences = np.median(prev_predictions, axis=0) + prevalences = F.normalize_prevalence(prevalences, method='l1') + elif merge_fun == 'mean': + prevalences = np.mean(prev_predictions, axis=0) + else: + raise NotImplementedError(f'merge function {merge_fun} not implemented!') + return prevalences + + class SCMQ(AggregativeSoftQuantifier): - MERGE_FUNCTIONS = ['median'] + MERGE_FUNCTIONS = ['median', 'mean'] def __init__(self, classifier, quantifiers: List[AggregativeSoftQuantifier], merge_fun='median', val_split=5): self.classifier = classifier - self.quantifiers = quantifiers - assert merge_fun in self.MERGE_FUNCTIONS, f'unknwon {merge_fun=}, valid ones are {self.MERGE_FUNCTIONS}' + self.quantifiers = [deepcopy(q) for q in quantifiers] + assert merge_fun in self.MERGE_FUNCTIONS, f'unknown {merge_fun=}, valid ones are {self.MERGE_FUNCTIONS}' self.merge_fun = merge_fun self.val_split = val_split @@ -715,22 +727,51 @@ class SCMQ(AggregativeSoftQuantifier): for quantifier_i in self.quantifiers: prevalence_i = quantifier_i.aggregate(classif_predictions) prev_predictions.append(prevalence_i) - return self.merge(prev_predictions) - - def merge(self, prev_predictions): - prev_predictions = np.asarray(prev_predictions) - if self.merge_fun == 'median': - prevalences = np.median(prev_predictions, axis=0) - prevalences = F.normalize_prevalence(prevalences, method='l1') - elif self.merge_fun == 'mean': - prevalences = np.mean(prev_predictions, axis=0) - else: - raise NotImplementedError(f'merge function {self.merge_fun} not implemented!') - return prevalences - - + return merge(prev_predictions, merge_fun=self.merge_fun) +class MCSQ(BaseQuantifier): + def __init__(self, classifiers, quantifier: AggregativeSoftQuantifier, merge_fun='median', val_split=5): + self.merge_fun = merge_fun + self.val_split = val_split + self.mcsqs = [] + for classifier in classifiers: + quantifier = deepcopy(quantifier) + quantifier.classifier = classifier + self.mcsqs.append(quantifier) + + def fit(self, data: LabelledCollection): + for q in self.mcsqs: + q.fit(data, val_split=self.val_split) + return self + + def quantify(self, instances): + prev_predictions = [] + for q in self.mcsqs: + prevalence_i = q.quantify(instances) + prev_predictions.append(prevalence_i) + return merge(prev_predictions, merge_fun=self.merge_fun) + + +class MCMQ(BaseQuantifier): + def __init__(self, classifiers, quantifiers: List[AggregativeSoftQuantifier], merge_fun='median', val_split=5): + self.merge_fun = merge_fun + self.scmqs = [] + for classifier in classifiers: + self.scmqs.append(SCMQ(classifier, quantifiers, val_split=val_split)) + + def fit(self, data: LabelledCollection): + for q in self.scmqs: + q.fit(data) + return self + + def quantify(self, instances): + prev_predictions = [] + for q in self.scmqs: + prevalence_i = q.quantify(instances) + prev_predictions.append(prevalence_i) + return merge(prev_predictions, merge_fun=self.merge_fun) + diff --git a/quapy/model_selection.py b/quapy/model_selection.py index 0c600ca..ad2825f 100644 --- a/quapy/model_selection.py +++ b/quapy/model_selection.py @@ -248,13 +248,13 @@ class GridSearchQ(BaseQuantifier): self.param_scores_[str(params)] = status.status self.error_collector.append(status) - tend = time()-tinit + self.fit_time_ = time()-tinit if self.best_score_ is None: raise ValueError('no combination of hyperparameters seemed to work') self._sout(f'optimization finished: best params {self.best_params_} (score={self.best_score_:.5f}) ' - f'[took {tend:.4f}s]') + f'[took {self.fit_time_:.4f}s]') no_errors = len(self.error_collector) if no_errors>0: