from sklearn.base import clone
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
import numpy as np
from sklearn.model_selection import GridSearchCV
import quapy as qp
from quapy.data import LabelledCollection
from quapy.method.base import BaseQuantifier
from quapy.method.aggregative import AggregativeQuantifier, AggregativeProbabilisticQuantifier, CC, ACC, PCC, PACC

"""
Possible extensions:
    - add CC and ClassWeightCC
    - understand how to optimize the hyper-parameters of the final PCC quantifier. This is not trivial: once
      class_weight has been set, the C parameter plays a secondary role. The reason is that I strongly doubt the
      cross-validation takes into account the fact that one class might be more important than the other, and so
      the best C for quantifying, conditioned on this class prevalence, has nothing to do with the best C for
      classifying the current data... unless I define an evaluation metric that weights each class accordingly,
      but this is very tricky (it amounts to implementing the "adjustment" inside the evaluation metric...)
    - it might be worth investigating more deeply the role of CV, and of val_split, in ACC/PACC. Is it something
      that consistently delivers improved accuracy (for quantification), or is there a tricky trade-off between
      data usage, the instability due to adjusting for slightly different quantifiers, and so on?
    - argue that this method is only interesting when we have little data (the adjustment discards data), and not
      when the classifier is a costly one (since we require training at test time). Argue that the computational
      burden can be transferred to the training stage by training many LRs for different class_weight ratios and
      then, at test time, using the one closest to the guessed prevalence (see the sketch at the end of this module).
    - better investigate the "iterative" nature of the method.
    - better investigate the implications with other learners, e.g., using EMQ as the first-stage (prevalence-guessing)
      quantifier, or using EMQ in the second stage (at test time).
    - test with SVMs (not working well... and problematic due to the fact that SVMs need to be calibrated)
    - test in multiclass scenarios
"""


class ClassWeightPCC(BaseQuantifier):

    def __init__(self, estimator=LogisticRegression, **pcc_param_grid):
        self.estimator = estimator
        self.learner = PACC(self.estimator())
        if 'class_weight' in pcc_param_grid:
            raise ValueError('parameter "class_weight" cannot be included in "pcc_param_grid"')
        self.pcc_param_grid = dict(pcc_param_grid)
        self.deployed = False

    def fit(self, data: LabelledCollection, fit_learner=True):
        self.train = data
        self.learner.fit(self.train)
        return self

    def quantify(self, instances):
        guessed_prevalence = self.learner.quantify(instances)
        class_weight = self._get_class_weight(guessed_prevalence)
        if self.pcc_param_grid and self.deployed:
            """If the param grid has been specified, then use it to find good hyper-parameters for the classifier.
            In this case, we know (an approximation of) the target prevalence, so we might simply want to optimize
            for classification (and not for quantification)"""
            # NOTE: this branch is currently disabled (self.deployed is never set to True); kept for reference
            # pcc = PCC(GridSearchCV(LogisticRegression(class_weight=class_weight), param_grid=self.pcc_param_grid, n_jobs=-1))
            pcc = PCC(LogisticRegressionCV(Cs=self.pcc_param_grid['C'], class_weight=class_weight, n_jobs=-1, cv=3))
            raise ValueError('this cannot work...')
        else:
            """If the param grid has not been specified, we take the best parameters found for the base quantifier"""
            base_parameters = dict(self.learner.get_params())
            for p, v in self.learner.get_params().items():
                # this search allows quantifiers whose learner is wrapped (e.g., in a CalibratedClassifierCV) to work,
                # since the class_weight parameter name may then carry a prefix
                if 'class_weight' in p:
                    base_parameters[p] = class_weight
                    break
            base_estimator = clone(self.learner.learner)
            base_estimator.set_params(**base_parameters)
            pcc = PCC(base_estimator)
        return pcc.fit(self.train).quantify(instances)

    def _get_class_weight(self, prevalence):
        # class_weight = compute_class_weight('balanced', classes=[0, 1], y=mock_y(prevalence))
        # return {0: class_weight[1], 1: class_weight[0]}
        # weights = prevalence/prevalence.min()
        weights = prevalence / self.train.prevalence()
        normfactor = weights.min()
        if normfactor <= 0:
            normfactor = 1E-3
        weights /= normfactor
        return {0: weights[0], 1: weights[1]}

    def set_params(self, **parameters):
        # parameters = {p: v for p, v in parameters.items()}
        # print(parameters)
        self.learner.set_params(**parameters)

    def get_params(self, deep=True):
        return self.learner.get_params()

    @property
    def classes_(self):
        return self.train.classes_
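

# ---------------------------------------------------------------------------------------------------
# A minimal sketch (not part of the original method) of the idea mentioned in the module docstring:
# transferring the computational burden to the training stage by pre-training a bank of classifiers
# for different class_weight ratios and, at test time, reusing the one whose assumed prevalence is
# closest to the guessed prevalence. The class name PretrainedBankPCC and the default weight_grid are
# illustrative assumptions, not part of quapy's API; the re-weighting mirrors
# ClassWeightPCC._get_class_weight.
# ---------------------------------------------------------------------------------------------------
class PretrainedBankPCC(BaseQuantifier):
    """Binary-only sketch: pre-trains one PCC per candidate positive-class prevalence."""

    def __init__(self, weight_grid=np.linspace(0.05, 0.95, 19)):
        self.weight_grid = weight_grid  # candidate positive-class prevalences (hypothetical grid)

    def fit(self, data: LabelledCollection, fit_learner=True):
        self.train = data
        # first-stage quantifier used to guess the test prevalence
        self.first_stage = PACC(LogisticRegression()).fit(data)
        # bank of second-stage PCCs, one per candidate prevalence, all trained at training time
        self.bank = {}
        for p in self.weight_grid:
            candidate = np.asarray([1. - p, p])
            weights = candidate / data.prevalence()  # same re-weighting as ClassWeightPCC
            weights /= weights.min()
            pcc = PCC(LogisticRegression(class_weight={0: weights[0], 1: weights[1]}))
            self.bank[p] = pcc.fit(data)
        return self

    def quantify(self, instances):
        guessed_pos_prev = self.first_stage.quantify(instances)[1]
        # pick the pre-trained PCC whose candidate prevalence is closest to the guessed one
        closest = min(self.bank, key=lambda p: abs(p - guessed_pos_prev))
        return self.bank[closest].quantify(instances)

    def set_params(self, **parameters):
        self.weight_grid = parameters.get('weight_grid', self.weight_grid)

    def get_params(self, deep=True):
        return {'weight_grid': self.weight_grid}

    @property
    def classes_(self):
        return self.train.classes_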
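

# ---------------------------------------------------------------------------------------------------
# Hedged usage example (not in the original file): runs ClassWeightPCC against a plain PCC on a quapy
# sentiment dataset. The choice of the 'kindle' reviews dataset, tf-idf weighting, min_df=5, and
# absolute error as the measure are illustrative assumptions.
# ---------------------------------------------------------------------------------------------------
if __name__ == '__main__':
    dataset = qp.datasets.fetch_reviews('kindle', tf_idf=True, min_df=5)

    methods = {
        'PCC': PCC(LogisticRegression()).fit(dataset.training),
        'ClassWeightPCC': ClassWeightPCC().fit(dataset.training),
    }

    true_prev = dataset.test.prevalence()
    for name, method in methods.items():
        estim_prev = method.quantify(dataset.test.instances)
        print(f'{name}: true={true_prev}, estimated={estim_prev}, AE={qp.error.ae(true_prev, estim_prev):.4f}')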