QuaPy/MultiLabel/mlquantification.py

362 lines
13 KiB
Python

import numpy as np
from copy import deepcopy
import sklearn.preprocessing
from sklearn.ensemble import StackingRegressor
from sklearn.metrics import confusion_matrix
from sklearn.multioutput import MultiOutputRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC, LinearSVR
from sklearn.linear_model import LogisticRegression, Ridge, Lasso, LassoCV, MultiTaskLassoCV, LassoLars, LassoLarsCV, \
ElasticNet, MultiTaskElasticNetCV, MultiTaskElasticNet, LinearRegression, ARDRegression, BayesianRidge, SGDRegressor
import quapy as qp
from MultiLabel.mlclassification import MLStackedClassifier, MLStackedRegressor
from MultiLabel.mldata import MultilabelledCollection
from method.aggregative import CC, ACC, PACC, AggregativeQuantifier
from method.base import BaseQuantifier
from abc import abstractmethod
class MLQuantifier:
    """Interface shared by the multi-label quantifiers defined in this module.

    Subclasses provide `fit` (train on a multi-label collection) and
    `quantify` (estimate label prevalences for a set of instances).

    NOTE(review): the methods are decorated with `abstractmethod`, but this
    class does not use `ABCMeta`, so the decorator is informative only and
    instantiation is not blocked -- confirm whether that is intended.
    """

    @abstractmethod
    def fit(self, data: MultilabelledCollection):
        """Train the quantifier on `data`."""
        ...

    @abstractmethod
    def quantify(self, instances):
        """Return the prevalence estimates for `instances`."""
        ...
class MLMLPE(MLQuantifier):
    """Baseline that always answers with the prevalence seen at training time."""

    def fit(self, data: MultilabelledCollection):
        """Store `data.prevalence()` in `self.tr_prev` and return `self`."""
        training_prevalence = data.prevalence()
        self.tr_prev = training_prevalence
        return self

    def quantify(self, instances):
        """Return the stored training prevalence; `instances` is ignored."""
        return self.tr_prev
class MLAggregativeQuantifier(MLQuantifier):
    """Quantifier that first classifies every instance and then aggregates.

    `quantify` is defined as `aggregate(preclassify(instances))`; subclasses
    supply those two steps.
    """

    def __init__(self, mlcls):
        # `mlcls` is the multi-label classifier; it is stored as `self.learner`.
        self.learner = mlcls

    def fit(self, data: MultilabelledCollection):
        """Fit the wrapped learner on `data.Xy` and return `self`."""
        training_pair = data.Xy
        self.learner.fit(*training_pair)
        return self

    @abstractmethod
    def preclassify(self, instances):
        """Produce the per-instance outputs that `aggregate` consumes."""
        ...

    @abstractmethod
    def aggregate(self, predictions):
        """Turn per-instance outputs into prevalence estimates."""
        ...

    def quantify(self, instances):
        """Classify `instances` and aggregate the result."""
        return self.aggregate(self.preclassify(instances))
class MLCC(MLAggregativeQuantifier):
    """Multi-label classify-and-count: averages the learner's predictions."""

    def preclassify(self, instances):
        """Return `self.learner.predict(instances)`."""
        return self.learner.predict(instances)

    def aggregate(self, predictions):
        """Average `predictions` over axis 0 and pair each mean with its complement.

        The result has one row per column of `predictions`, laid out as
        `[1 - mean, mean]`.
        """
        positive = predictions.mean(axis=0)
        negative = 1 - positive
        stacked = np.asarray([negative, positive])
        return stacked.T
class MLPCC(MLCC):
    """Probabilistic variant of `MLCC`: aggregates `predict_proba` outputs."""

    def preclassify(self, instances):
        """Return `self.learner.predict_proba(instances)`."""
        posteriors = self.learner.predict_proba(instances)
        return posteriors
class MLACC(MLCC):
    """Multi-label adjusted classify-and-count.

    A held-out validation split is used to estimate, independently for every
    label, the matrix of P(predicted | true); at quantification time each
    label's classify-and-count estimate is corrected with
    `ACC.solve_adjustment`.
    """

    def fit(self, data: MultilabelledCollection, train_prop=0.6):
        """Fit the learner on a training split and estimate per-label
        misclassification rates on the remaining validation split.

        :param data: the multi-label training collection
        :param train_prop: value forwarded to `data.train_test_split`
        :return: self
        """
        self.classes_ = data.classes_
        train, val = data.train_test_split(train_prop=train_prop)
        self.learner.fit(*train.Xy)
        val_predictions = self.preclassify(val.instances)
        n_val = len(val)
        self.Pte_cond_estim_ = []
        for c in data.classes_:
            true_c = val.labels[:, c]
            pos_c = true_c.sum()
            neg_c = n_val - pos_c
            # labels=[0, 1] forces a 2x2 matrix even when only one value occurs
            # among the truths and predictions of this label; without it
            # confusion_matrix shrinks to 1x1 and the adjustment below breaks.
            counts = confusion_matrix(true_c, val_predictions[:, c], labels=[0, 1])
            # Rows: predicted value, columns: true value; each column is divided
            # by the number of validation items with that true value.
            # NOTE(review): a label with no positives (or no negatives) in the
            # validation split still divides by zero here -- confirm how
            # ACC.solve_adjustment should behave for such labels.
            self.Pte_cond_estim_.append(counts.T / np.array([neg_c, pos_c]))
        return self

    def preclassify(self, instances):
        """Return `self.learner.predict(instances)`."""
        return self.learner.predict(instances)

    def aggregate(self, predictions):
        """Adjust the classify-and-count estimate of every label.

        :param predictions: output of `preclassify`
        :return: array with one adjusted prevalence row per entry of
            `self.classes_`
        """
        cc_prevs = super(MLACC, self).aggregate(predictions)
        acc_prevs = np.asarray(
            [ACC.solve_adjustment(self.Pte_cond_estim_[c], cc_prevs[c]) for c in self.classes_]
        )
        return acc_prevs
class MLPACC(MLPCC):
    """Probabilistic counterpart of `MLACC`, built on posterior probabilities."""

    def fit(self, data: MultilabelledCollection, train_prop=0.6):
        """Fit the learner on a training split and estimate, for every label,
        the correction matrix from validation posteriors via
        `PACC.getPteCondEstim`.

        :param data: the multi-label training collection
        :param train_prop: value forwarded to `data.train_test_split`
        :return: self
        """
        self.classes_ = data.classes_
        train, val = data.train_test_split(train_prop=train_prop)
        self.learner.fit(*train.Xy)
        val_posteriors = self.preclassify(val.instances)
        self.Pte_cond_estim_ = []
        for c in data.classes_:
            p_pos = val_posteriors[:, c]
            # Two-column layout: [1 - posterior, posterior].
            two_columns = np.asarray([1 - p_pos, p_pos]).T
            estimate = PACC.getPteCondEstim([0, 1], val.labels[:, c], two_columns)
            self.Pte_cond_estim_.append(estimate)
        return self

    def aggregate(self, posteriors):
        """Adjust the probabilistic classify-and-count estimate of every label."""
        pcc_prevs = super().aggregate(posteriors)
        adjusted_rows = []
        for c in self.classes_:
            adjusted_rows.append(ACC.solve_adjustment(self.Pte_cond_estim_[c], pcc_prevs[c]))
        return np.asarray(adjusted_rows)
class MLNaiveQuantifier(MLQuantifier):
    """Trains one independent copy of a quantifier per label."""

    def __init__(self, q: BaseQuantifier, n_jobs=-1):
        # `q` is the template quantifier; a deep copy of it is fitted per label.
        self.q = q
        self.estimators = None
        self.n_jobs = n_jobs

    def fit(self, data: MultilabelledCollection):
        """Fit one deep copy of `self.q` on every collection yielded by
        `data.genLabelledCollections()`, using `qp.util.parallel`.

        :return: self
        """
        self.classes_ = data.classes_

        def fit_copy(labelled_collection):
            # Each label gets its own clone so the fits do not share state.
            return deepcopy(self.q).fit(labelled_collection)

        per_label_collections = data.genLabelledCollections()
        self.estimators = qp.util.parallel(fit_copy, per_label_collections, n_jobs=self.n_jobs)
        return self

    def quantify(self, instances):
        """Query every per-label estimator and collect element [1] of its answer.

        :return: array with one `[1 - p, p]` row per label
        """
        n_labels = len(self.classes_)
        pos_prevs = np.zeros(n_labels, dtype=float)
        for c in self.classes_:
            estimate_c = self.estimators[c].quantify(instances)
            pos_prevs[c] = estimate_c[1]
        return np.asarray([1 - pos_prevs, pos_prevs]).T
class MLNaiveAggregativeQuantifier(MLNaiveQuantifier, MLAggregativeQuantifier):
    """Per-label quantification restricted to aggregative quantifiers, which
    exposes the classify and aggregate steps separately."""

    def __init__(self, q: AggregativeQuantifier, n_jobs=-1):
        assert isinstance(q, AggregativeQuantifier), 'the quantifier is not of type aggregative!'
        self.q = q
        self.estimators = None
        self.n_jobs = n_jobs

    def preclassify(self, instances):
        """Stack the `preclassify` output of every per-label estimator and swap
        the first two axes, so the estimator index becomes the second axis."""
        per_estimator = [estimator.preclassify(instances) for estimator in self.estimators]
        return np.asarray(per_estimator).swapaxes(0, 1)

    def aggregate(self, predictions):
        """Aggregate column `c` of `predictions` with estimator `c` and keep
        element [1] of each answer.

        :return: array with one `[1 - p, p]` row per label
        """
        n_labels = len(self.classes_)
        pos_prevs = np.zeros(n_labels, dtype=float)
        for c in self.classes_:
            label_estimate = self.estimators[c].aggregate(predictions[:, c])
            pos_prevs[c] = label_estimate[1]
        return np.asarray([1 - pos_prevs, pos_prevs]).T

    def quantify(self, instances):
        """Classify `instances` and aggregate the result."""
        return self.aggregate(self.preclassify(instances))
class MLRegressionQuantification:
    """Regression-based correction of a multi-label quantifier.

    A base multi-label quantifier is fitted on a training split. Samples are
    then drawn from a validation split, and a regressor is trained to map
    the base quantifier's positive-prevalence estimates for a sample
    (optionally extended with the per-feature mean and standard deviation of
    the sample) onto the sample's true positive prevalences.
    """

    def __init__(self,
                 mlquantifier=None,
                 regression='ridge',
                 protocol='npp',
                 n_samples=500,
                 sample_size=500,
                 norm=True,
                 means=True,
                 stds=True):
        """
        :param mlquantifier: base multi-label quantifier; when None a fresh
            `MLNaiveQuantifier(CC(LinearSVC()))` is created for this instance
        :param regression: 'ridge', 'svr', or an already constructed regressor
        :param protocol: 'npp' or 'app'; selects the sample generator used in `fit`
        :param n_samples: number of validation samples to generate
        :param sample_size: number of instances per validation sample
        :param norm: passed as `normalize` to `Ridge`; only used when
            `regression == 'ridge'`
        :param means: whether per-feature sample means are appended as features
        :param stds: whether per-feature sample standard deviations are appended
        """
        assert protocol in ['npp', 'app'], 'unknown protocol'
        if mlquantifier is None:
            # Built per instance: a default evaluated in the signature would be a
            # single object shared (and re-fitted) by every instance.
            mlquantifier = MLNaiveQuantifier(CC(LinearSVC()))
        self.estimator = mlquantifier
        if isinstance(regression, str):
            assert regression in ['ridge', 'svr'], 'unknown regression model'
            if regression == 'ridge':
                self.reg = Ridge(normalize=norm)
            elif regression == 'svr':
                self.reg = MultiOutputRegressor(LinearSVR())
        else:
            self.reg = regression
        self.protocol = protocol
        # Alternative regressors tried during development (kept for reference):
        #   MultiTaskLassoCV(normalize=norm)
        #   KernelRidge(kernel='rbf')
        #   LassoLarsCV(normalize=norm)
        #   MultiTaskElasticNetCV(normalize=norm)                    <- good
        #   LinearRegression(normalize=norm)                         <- good
        #   MultiOutputRegressor(ARDRegression(normalize=norm))      <- quite good, even without norm
        #   MultiOutputRegressor(BayesianRidge(normalize=False))     <- quite good, even without norm
        #   MultiOutputRegressor(SGDRegressor())                     <- slow, does not work
        # Standardising the features with a StandardScaler and adding covariance
        # features were also sketched; both are currently disabled.
        self.regression = regression
        self.n_samples = n_samples
        self.sample_size = sample_size
        self.means = means
        self.stds = stds

    def _prepare_arrays(self, Xs, ys, samples_mean, samples_std):
        """Convert the accumulated lists to arrays and append the enabled
        extra feature blocks (means, then stds) as additional columns of `Xs`.

        :return: tuple `(Xs, ys)` of arrays
        """
        Xs = np.asarray(Xs)
        ys = np.asarray(ys)
        if self.means:
            Xs = np.hstack([Xs, np.asarray(samples_mean)])
        if self.stds:
            Xs = np.hstack([Xs, np.asarray(samples_std)])
        return Xs, ys

    def _extract_features(self, sample, Xs, ys, samples_mean, samples_std):
        """Append the target and the features of `sample` to the given lists
        (the lists are modified in place).

        The calls to `.getA()` and `.todense()` imply that `sample.instances`
        is a matrix type offering those methods (e.g. a scipy sparse matrix).
        """
        ys.append(sample.prevalence()[:, 1])
        Xs.append(self.estimator.quantify(sample.instances)[:, 1])
        if self.means:
            samples_mean.append(sample.instances.mean(axis=0).getA().flatten())
        if self.stds:
            samples_std.append(sample.instances.todense().std(axis=0).getA().flatten())

    def generate_samples_npp(self, val):
        """Build the regression training set from
        `val.natural_sampling_generator`."""
        Xs, ys = [], []
        samples_mean, samples_std = [], []
        sampler = val.natural_sampling_generator(sample_size=self.sample_size, repeats=self.n_samples)
        for sample in sampler:
            self._extract_features(sample, Xs, ys, samples_mean, samples_std)
        return self._prepare_arrays(Xs, ys, samples_mean, samples_std)

    def generate_samples_app(self, val):
        """Build the regression training set from
        `val.artificial_sampling_generator`, called once per label."""
        Xs, ys = [], []
        samples_mean, samples_std = [], []
        ncats = len(self.classes_)
        nprevs = 21
        # Split the sample budget across labels and prevalence points, with at
        # least one repetition.
        repeats = max(self.n_samples // (ncats * nprevs), 1)
        for cat in self.classes_:
            sampler = val.artificial_sampling_generator(
                sample_size=self.sample_size, category=cat, n_prevalences=nprevs, repeats=repeats
            )
            for sample in sampler:
                self._extract_features(sample, Xs, ys, samples_mean, samples_std)
        return self._prepare_arrays(Xs, ys, samples_mean, samples_std)

    def fit(self, data: "MultilabelledCollection"):
        """Fit the base quantifier on a training split and the regressor on
        samples drawn from the validation split.

        :raises ValueError: if `self.protocol` is neither 'npp' nor 'app'
        :return: self
        """
        self.classes_ = data.classes_
        tr, val = data.train_test_split()
        self.estimator.fit(tr)
        if self.protocol == 'npp':
            Xs, ys = self.generate_samples_npp(val)
        elif self.protocol == 'app':
            Xs, ys = self.generate_samples_app(val)
        else:
            # Only reachable if `protocol` was changed after construction.
            raise ValueError(f'unknown protocol {self.protocol}')
        self.reg.fit(Xs, ys)
        return self

    def quantify(self, instances):
        """Estimate label prevalences for `instances`.

        The base quantifier's positive prevalences (plus the enabled mean/std
        features) form a single-row input for the regressor; its output is
        clipped to [0, 1].

        :return: array with one `[1 - p, p]` row per regressor output
        """
        Xs = self.estimator.quantify(instances)[:, 1].reshape(1, -1)
        if self.means:
            sample_mean = instances.mean(axis=0).getA()
            Xs = np.hstack([Xs, sample_mean])
        if self.stds:
            sample_std = instances.todense().std(axis=0).getA()
            Xs = np.hstack([Xs, sample_std])
        predicted = self.reg.predict(Xs)
        adjusted = np.clip(predicted, 0, 1).flatten()
        neg_prevs = 1 - adjusted
        return np.asarray([neg_prevs, adjusted]).T
class StackMLRQuantifier:
    """`MLRegressionQuantification` whose regressor is wrapped in an
    `MLStackedRegressor`."""

    def __init__(self,
                 mlquantifier=None,
                 regression='ridge',
                 protocol='npp',
                 n_samples=500,
                 sample_size=500,
                 norm=True,
                 means=True,
                 stds=True):
        """
        :param mlquantifier: base multi-label quantifier; when None a fresh
            `MLNaiveQuantifier(CC(LinearSVC()))` is created for this instance
        :param regression: 'ridge' or 'svr'
        :param protocol: forwarded to `MLRegressionQuantification`
        :param n_samples: forwarded to `MLRegressionQuantification`
        :param sample_size: forwarded to `MLRegressionQuantification`
        :param norm: forwarded to `MLRegressionQuantification`; the stacked
            Ridge built here always uses `normalize=True`
        :param means: forwarded to `MLRegressionQuantification`
        :param stds: forwarded to `MLRegressionQuantification`
        :raises ValueError: if `regression` is neither 'ridge' nor 'svr'
        """
        if mlquantifier is None:
            # Built per instance: a default evaluated in the signature would be a
            # single object shared (and re-fitted) by every instance.
            mlquantifier = MLNaiveQuantifier(CC(LinearSVC()))
        if regression == 'ridge':
            reg = MLStackedRegressor(Ridge(normalize=True))
        elif regression == 'svr':
            reg = MLStackedRegressor(MultiOutputRegressor(LinearSVR()))
        else:
            # The exception was previously constructed but never raised, so an
            # unknown name surfaced later as an unbound `reg`.
            raise ValueError(f'unknown regressor {regression}')
        self.base = MLRegressionQuantification(
            mlquantifier=mlquantifier,
            regression=reg,
            protocol=protocol,
            n_samples=n_samples,
            sample_size=sample_size,
            norm=norm,
            means=means,
            stds=stds)

    def fit(self, data: "MultilabelledCollection"):
        """Record `data.classes_`, fit the wrapped quantifier and return `self`."""
        self.classes_ = data.classes_
        self.base.fit(data)
        return self

    def quantify(self, instances):
        """Delegate to the wrapped `MLRegressionQuantification`."""
        return self.base.quantify(instances)
class MLadjustedCount(MLAggregativeQuantifier):
    """Multi-label adjusted count.

    On a validation split, estimates the probability of every true label given
    every predicted label, and uses that matrix to correct the predictions
    made at quantification time.
    """

    def __init__(self, learner):
        self.learner = learner

    def preclassify(self, instances):
        """Return `self.learner.predict(instances)`."""
        return self.learner.predict(instances)

    def fit(self, data: MultilabelledCollection, train_prop=0.6):
        """Fit the learner on a training split and estimate the conditional
        matrix P(true label | predicted label) on the validation split.

        :param data: the multi-label training collection
        :param train_prop: value forwarded to `data.train_test_split`
        :return: self
        """
        self.classes_ = data.classes_
        train, val = data.train_test_split(train_prop=train_prop)
        self.learner.fit(*train.Xy)
        val_predictions = self.preclassify(val.instances)
        val_true = val.labels
        N = len(val)
        # joint probabilities [[P(y1,\hat{y}1), P(y2,\hat{y}1)], ... ]
        C = val_predictions.T.dot(val_true) / N
        # priors of the predictions [P(\hat{y}1), P(\hat{y}2), ...] as a column
        priorP = val_predictions.mean(axis=0).reshape(-1, 1)
        # cond probabilities [[P(y1|\hat{y}1), P(y2|\hat{y}1)], ... ]
        # `out` is pre-filled with zeros: with `where=` and no `out`, rows whose
        # prior is 0 would be left as uninitialised memory.
        self.Pte_cond_estim_ = np.true_divide(
            C, priorP, out=np.zeros_like(C, dtype=float), where=priorP > 0
        )
        return self

    def aggregate(self, predictions):
        """L1-normalise `predictions`, project them through the conditional
        matrix and average the result over axis 0.

        :return: array with one `[1 - p, p]` row per label
        """
        P = sklearn.preprocessing.normalize(predictions, norm='l1')
        correction = P.dot(self.Pte_cond_estim_)
        adjusted = correction.mean(axis=0)
        return np.asarray([1 - adjusted, adjusted]).T
class MLprobAdjustedCount(MLAggregativeQuantifier):
    """Probabilistic variant of the multi-label adjusted count, built on the
    learner's `predict_proba` output."""

    def __init__(self, learner):
        self.learner = learner

    def preclassify(self, instances):
        """Return `self.learner.predict_proba(instances)`."""
        return self.learner.predict_proba(instances)

    def fit(self, data: MultilabelledCollection, train_prop=0.6):
        """Fit the learner on a training split and estimate the conditional
        matrix P(true label | predicted label) on the validation split.

        :param data: the multi-label training collection
        :param train_prop: value forwarded to `data.train_test_split`
        :return: self
        """
        self.classes_ = data.classes_
        train, val = data.train_test_split(train_prop=train_prop)
        self.learner.fit(*train.Xy)
        val_predictions = self.preclassify(val.instances)
        val_true = val.labels
        N = len(val)
        # joint probabilities [[P(y1,\hat{y}1), P(y2,\hat{y}1)], ... ]
        # NOTE(review): the joint uses posteriors thresholded at 0.5 while the
        # prior below averages the raw posteriors; the original author marked
        # this step as "not sure" -- confirm the intended estimator.
        C = (val_predictions > 0.5).T.dot(val_true) / N
        # priors of the predictions [P(\hat{y}1), P(\hat{y}2), ...] as a column
        priorP = val_predictions.mean(axis=0).reshape(-1, 1)
        # cond probabilities [[P(y1|\hat{y}1), P(y2|\hat{y}1)], ... ]
        # `out` is pre-filled with zeros: with `where=` and no `out`, rows whose
        # prior is 0 would be left as uninitialised memory.
        self.Pte_cond_estim_ = np.true_divide(
            C, priorP, out=np.zeros_like(C, dtype=float), where=priorP > 0
        )
        return self

    def aggregate(self, predictions):
        """L1-normalise `predictions`, project them through the conditional
        matrix and average the result over axis 0.

        :return: array with one `[1 - p, p]` row per label
        """
        P = sklearn.preprocessing.normalize(predictions, norm='l1')
        correction = P.dot(self.Pte_cond_estim_)
        adjusted = correction.mean(axis=0)
        return np.asarray([1 - adjusted, adjusted]).T