# QuaPy/NewMethods/class_weight_model.py

from sklearn import clone
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.model_selection import GridSearchCV
import quapy as qp
from quapy.data import LabelledCollection
from quapy.method.base import BaseQuantifier
from quapy.method.aggregative import PCC, PACC

"""
Possible extensions:
- add CC and ClassWeightCC
- understand how to optimize hyper-parameters for the final PCC quantifier. This is not trivial since, once
  class_weight has been set, the C parameter plays a secondary role. The reason is that I strongly doubt that
  the cross-validation takes into account the fact that one class might be more important than the other,
  and so the best C parameter for quantifying, conditioned on this class prevalence, has little to do with the
  best C for classifying the current data... unless I define an evaluation metric that weights each class
  accordingly, but this is very tricky (it amounts to implementing the "adjustment" inside the evaluation
  metric...)
- it might be worth investigating more deeply the role of CV, and of val_split, in ACC/PACC. Is it something
  that consistently delivers improved (quantification) accuracy, or is there a tricky trade-off between data
  usage, the instability caused by adjusting with slightly different quantifiers, and so on?
- argue that this method is only interesting when little data is available (the adjustment discards data),
  and not when the classifier is a costly one (we require training at test time). Argue that the computational
  burden can be transferred to the training stage, by training many LRs for different class_weight ratios and
  then, at test time, using the one whose ratio is closest to the guessed prevalence (a rough sketch of this
  idea appears at the end of this file).
- investigate further the "iterative" nature of the method.
- investigate further the implications with other learners, e.g., using EMQ to produce the initial prevalence
  guess, or using EMQ in the second stage (test).
- test with SVM (not working well... and problematic because SVMs need to be calibrated)
- test in multiclass scenarios
"""

class ClassWeightPCC(BaseQuantifier):
    """Two-stage quantifier: a PACC model first guesses the test prevalence; the classifier is then re-trained
    with a class_weight derived from that guess (relative to the training prevalence), and the final prevalence
    is estimated with PCC."""

    def __init__(self, estimator=LogisticRegression, **pcc_param_grid):
        self.estimator = estimator
        self.learner = PACC(self.estimator())
        if 'class_weight' in pcc_param_grid:
            raise ValueError('parameter "class_weight" cannot be included in "pcc_param_grid"')
        self.pcc_param_grid = dict(pcc_param_grid)
        self.deployed = False

    def fit(self, data: LabelledCollection, fit_learner=True):
        self.train = data
        self.learner.fit(self.train)
        return self

    def quantify(self, instances):
        guessed_prevalence = self.learner.quantify(instances)
        class_weight = self._get_class_weight(guessed_prevalence)
        if self.pcc_param_grid and self.deployed:
            """If the param grid has been specified, then use it to find good hyper-parameters for the classifier.
            In this case, we know (an approximation of) the target prevalence, so we might simply want to optimize
            for classification (and not for quantification)"""
            # pcc = PCC(GridSearchCV(LogisticRegression(class_weight=class_weight), param_grid=self.pcc_param_grid, n_jobs=-1))
            pcc = PCC(LogisticRegressionCV(Cs=self.pcc_param_grid['C'], class_weight=class_weight, n_jobs=-1, cv=3))
            # this branch is not yet functional; see the note at the top about optimizing hyper-parameters
            # for the final PCC quantifier
            raise ValueError('this cannot work...')
        else:
            """If the param grid has not been specified, we take the best parameters found for the base quantifier"""
            base_parameters = dict(self.learner.get_params())
            for p, v in self.learner.get_params().items():
                # this search allows quantifiers whose learner is wrapped in a CalibratedClassifierCV to work as well
                if 'class_weight' in p:
                    base_parameters[p] = class_weight
                    break
            base_estimator = clone(self.learner.learner)
            base_estimator.set_params(**base_parameters)
            pcc = PCC(base_estimator)
        return pcc.fit(self.train).quantify(instances)

    def _get_class_weight(self, prevalence):
        # class_weight = compute_class_weight('balanced', classes=[0, 1], y=mock_y(prevalence))
        # return {0: class_weight[1], 1: class_weight[0]}
        # weights = prevalence/prevalence.min()
        weights = prevalence / self.train.prevalence()
        normfactor = weights.min()
        if normfactor <= 0:
            normfactor = 1E-3
        weights /= normfactor
        return {0: weights[0], 1: weights[1]}
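    # Worked example (illustrative values, assuming a binary task with balanced training data):
    #   train prevalence      = [0.5, 0.5], guessed prevalence = [0.2, 0.8]
    #   weights               = [0.2/0.5, 0.8/0.5] = [0.4, 1.6]
    #   normfactor            = 0.4, so weights /= 0.4 gives [1.0, 4.0]
    #   returned class_weight = {0: 1.0, 1: 4.0}: the class with the smallest guessed/train ratio keeps
    #   weight 1 and the other class is up-weighted proportionally to the estimated prevalence shift.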

    def set_params(self, **parameters):
        self.learner.set_params(**parameters)

    def get_params(self, deep=True):
        return self.learner.get_params()

    @property
    def classes_(self):
        return self.train.classes_
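

# A rough, illustrative sketch (not used by ClassWeightPCC) of the idea, mentioned in the notes at the top,
# of moving the computational burden to the training stage: pre-train one LR per candidate class_weight ratio
# and, at test time, pick the one whose ratio is closest to the prevalence shift guessed by the first-stage
# quantifier. The function names and the ratio grid are hypothetical.
def pretrain_weight_bank(X, y, ratios=(0.1, 0.25, 0.5, 1.0, 2.0, 4.0, 10.0)):
    """Fits one LogisticRegression per ratio, with class_weight={0: 1, 1: ratio}."""
    return {r: LogisticRegression(class_weight={0: 1.0, 1: r}).fit(X, y) for r in ratios}


def closest_pretrained(bank, guessed_prevalence, train_prevalence):
    """Selects the pre-trained classifier whose weight ratio best matches the guessed prevalence shift."""
    # ratio between the class-1 and class-0 weights that ClassWeightPCC._get_class_weight would produce
    target_ratio = (guessed_prevalence[1] / train_prevalence[1]) / (guessed_prevalence[0] / train_prevalence[0])
    return bank[min(bank, key=lambda r: abs(r - target_ratio))]


# A minimal usage sketch (an assumption about intended use, not part of the original method): fit
# ClassWeightPCC on one of QuaPy's review datasets and quantify a prevalence-shifted test sample.
if __name__ == '__main__':
    dataset = qp.datasets.fetch_reviews('kindle', tfidf=True, min_df=5)

    model = ClassWeightPCC()
    model.fit(dataset.training)

    # draw a test sample whose prevalence differs from that of the training set
    sample = dataset.test.sampling(500, 0.2, 0.8)
    estim_prev = model.quantify(sample.instances)

    print(f'true prevalence:      {sample.prevalence()}')
    print(f'estimated prevalence: {estim_prev}')
    print(f'absolute error:       {qp.error.ae(sample.prevalence(), estim_prev):.4f}')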