from sklearn import clone
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
import numpy as np
from sklearn.model_selection import GridSearchCV

import quapy as qp
from quapy.data import LabelledCollection
from quapy.method.base import BaseQuantifier
from quapy.method.aggregative import AggregativeQuantifier, AggregativeProbabilisticQuantifier, CC, ACC, PCC, PACC

"""
|
|
Possible extensions:
|
|
- add CC and ClassWeightCC
|
|
- understanding how to optimize hyper-parameters for the final PCC quantifier. It is not trivial, since once
|
|
class_weight has been set, the C parameter plays a secondary role. The reason is that I hardly doubt that
|
|
the cross-validation is taking into account the fact that one class might be more important than the other,
|
|
and so the best C parameter for quantifying, conditioned on this class prevelance, has nothing to do with the
|
|
best C for classifying the current data... Unless I define an evaluation metric weighting for each class weight,
|
|
but this is very tricky (it is like implementing the "adjustment" in the evaluation metric...)
|
|
- might be worth investigating deeper about the role of CV, and val_split, in ACC/PACC. Is it something that
|
|
consistently deliver improved accuracies (for quantification) or there is a tricky trade-off between the data
|
|
usage, the instability due to adjusting for slightly different quantifiers, and so on?
|
|
- argue that this method is only interesting in cases in which we have few data (adjustment discards data),
|
|
and not when the classifier is a costly one (we require training during test). Argue that the computational
|
|
burden can be transfered to the training stage, by training many LR for different class_weight ratios, and
|
|
then using the most similar one, to the guessed prevalence, during test.
|
|
- better investigate the "iterative" nature of the method.
|
|
- better investigate the implications with other learners. E.g., using EMQ as a prompt, or using EMQ in the second
|
|
stage (test).
|
|
- test with SVM (not working well... and problematic due to the fact that svms need to be calibrated)
|
|
- test in multiclass scenarios
|
|
"""
|
|
|
|
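# Example usage (a sketch: fetch_reviews is QuaPy's public API, but the concrete dataset
# choice is an illustrative assumption):
#
#   dataset = qp.datasets.fetch_reviews('kindle', tfidf=True)
#   model = ClassWeightPCC().fit(dataset.training)
#   estim_prevalence = model.quantify(dataset.test.instances)
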
class ClassWeightPCC(BaseQuantifier):

    def __init__(self, estimator=LogisticRegression, **pcc_param_grid):
        self.estimator = estimator
        self.learner = PACC(self.estimator())
        if 'class_weight' in pcc_param_grid:
            raise ValueError('parameter "class_weight" cannot be included in "pcc_param_grid"')
        self.pcc_param_grid = dict(pcc_param_grid)
        self.deployed = False

    def fit(self, data: LabelledCollection, fit_learner=True):
        self.train = data
        self.learner.fit(self.train)
        return self

    def quantify(self, instances):
        # stage 1: guess the test prevalence with the base quantifier (PACC)
        guessed_prevalence = self.learner.quantify(instances)
        class_weight = self._get_class_weight(guessed_prevalence)
        if self.pcc_param_grid and self.deployed:
            """If the param grid has been specified, then use it to find good hyper-parameters for the classifier.
            In this case, we know (an approximation of) the target prevalence, so we might simply want to optimize
            for classification (and not for quantification)"""
            # pcc = PCC(GridSearchCV(LogisticRegression(class_weight=class_weight), param_grid=self.pcc_param_grid, n_jobs=-1))
            pcc = PCC(LogisticRegressionCV(Cs=self.pcc_param_grid['C'], class_weight=class_weight, n_jobs=-1, cv=3))
            raise ValueError('this cannot work...')
        else:
            """If the param grid has not been specified, we take the best parameters found for the base quantifier"""
            base_parameters = dict(self.learner.get_params())
            for p, v in self.learner.get_params().items():
                # this search allows quantifiers whose learner is wrapped (e.g., in a CalibratedClassifierCV)
                # to receive the class_weight through the prefixed parameter name
                if 'class_weight' in p:
                    base_parameters[p] = class_weight
                    break
            base_estimator = clone(self.learner.learner)
            base_estimator.set_params(**base_parameters)
            pcc = PCC(base_estimator)
        # stage 2: retrain on the training data with the adjusted class weights and quantify via PCC
        return pcc.fit(self.train).quantify(instances)

    def _get_class_weight(self, prevalence):
        # class_weight = compute_class_weight('balanced', classes=[0, 1], y=mock_y(prevalence))
        # return {0: class_weight[1], 1: class_weight[0]}
        # weights = prevalence/prevalence.min()
        # weight each class by the ratio between its guessed test prevalence and its training prevalence
        weights = prevalence / self.train.prevalence()
        # normalize so that the smallest weight is 1, guarding against division by zero
        normfactor = weights.min()
        if normfactor <= 0:
            normfactor = 1E-3
        weights /= normfactor
        return {0: weights[0], 1: weights[1]}

    def set_params(self, **parameters):
        self.learner.set_params(**parameters)

    def get_params(self, deep=True):
        return self.learner.get_params()

    @property
    def classes_(self):
        return self.train.classes_
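

# A minimal sketch of the precomputation idea listed in the module docstring. This is not
# part of the original method: it shifts the computational burden to training by fitting one
# PCC per candidate positive-class prevalence, and at test time reuses the precomputed PCC
# whose prevalence is closest to the one guessed by a first-stage quantifier. The class name
# and the default prevalence grid are illustrative assumptions; like ClassWeightPCC, it
# assumes a binary problem.
class PrecomputedClassWeightPCC(BaseQuantifier):

    def __init__(self, estimator=LogisticRegression, prevalence_grid=None):
        self.estimator = estimator
        # candidate positive-class prevalences for which a PCC is precomputed
        self.prevalence_grid = np.linspace(0.05, 0.95, 19) if prevalence_grid is None else prevalence_grid

    def fit(self, data: LabelledCollection, fit_learner=True):
        self.train = data
        # first-stage quantifier used at test time to guess the prevalence
        self.guesser = PACC(self.estimator()).fit(data)
        # one PCC per candidate prevalence, all trained up front
        self.pccs = {}
        for p in self.prevalence_grid:
            # same weighting scheme as ClassWeightPCC._get_class_weight
            weights = np.asarray([1 - p, p]) / data.prevalence()
            weights /= max(weights.min(), 1E-3)
            learner = self.estimator(class_weight={0: weights[0], 1: weights[1]})
            self.pccs[p] = PCC(learner).fit(data)
        return self

    def quantify(self, instances):
        guessed = self.guesser.quantify(instances)[1]  # guessed positive-class prevalence
        closest = min(self.pccs, key=lambda p: abs(p - guessed))
        return self.pccs[closest].quantify(instances)

    def set_params(self, **parameters):
        self.guesser.set_params(**parameters)

    def get_params(self, deep=True):
        return self.guesser.get_params()

    @property
    def classes_(self):
        return self.train.classes_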