# Forked from moreo/QuaPy (file listing metadata: 306 lines, 11 KiB, Python).
import itertools
from abc import ABC, abstractmethod
from functools import cache
from time import time

import numpy as np
from densratio import densratio
from scipy.sparse import issparse, vstack
from scipy.stats import multivariate_normal
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, cross_val_predict

# the star import comes first in this group so that the explicit imports below take precedence
from quapy.method.aggregative import *
import quapy as qp
import quapy.functional as F
from quapy.data import LabelledCollection
from quapy.protocol import AbstractStochasticSeededProtocol, OnLabelledCollectionProtocol
from Transduction_office.pykliep import DensityRatioEstimator
|
|
|
|
|
|
def gaussian(mean, cov=1., label=0, size=100, random_state=0):
    """
    Creates a labelled collection in which the instances are distributed according to a Gaussian with specified
    parameters and labels all data points with a specific label.

    :param mean: array-like of shape (n_dimensions,) with the center
    :param cov: ndarray of shape (n_dimensions, n_dimensions) with the covariance matrix, or a scalar `c`
        that stands for the isotropic covariance `c * np.eye(n_dimensions)`
    :param label: the class label for the collection
    :param size: number of instances
    :param random_state: allows for replicating experiments
    :return: an instance of LabelledCollection
    """
    mean = np.asarray(mean)
    assert mean.ndim == 1, 'wrong shape for mean'
    n_features = mean.shape[0]
    # np.number also covers NumPy scalars (e.g., np.float32), which are not instances of int/float
    if isinstance(cov, (int, float, np.number)):
        cov = np.eye(n_features) * cov
    instances = multivariate_normal.rvs(mean, cov, size, random_state=random_state)
    # rvs squeezes singleton axes (1-dimensional Gaussians, or size=1); restore the
    # (n_instances, n_features) layout so that the result is always a 2-D matrix
    instances = np.reshape(instances, (size, n_features))
    return LabelledCollection(instances, labels=[label] * size)
|
|
|
|
|
|
# ------------------------------------------------------------------------------------
# Protocol for generating prior probability shift + covariate shift by mixing "domains"
# ------------------------------------------------------------------------------------
|
|
class CovPriorShift(AbstractStochasticSeededProtocol):
    """
    Sampling protocol in which every sample is a random mixture of "domains".

    Every collection received is broken down by means of its `separate()` method, and each of the resulting
    parts is treated as a domain of its own. To generate a sample, a mixture vector is drawn with
    `F.uniform_simplex_sampling` (one component per domain) and each domain contributes the share of the
    sample size indicated by its component.

    :param domains: list of LabelledCollection from which the domains are obtained
    :param sample_size: requested sample size, resolved through `qp._get_sample_size`
    :param repeats: number of samples to generate
    :param min_support: a mixture is accepted only if every domain contributes strictly more than
        this number of instances
    :param random_state: seed passed on to the parent protocol
    :param return_type: passed on to `OnLabelledCollectionProtocol.get_collator`
    """

    def __init__(self, domains: list[LabelledCollection], sample_size=None, repeats=100, min_support=0,
                 random_state=0, return_type='sample_prev'):
        super().__init__(random_state)
        separated = (collection.separate() for collection in domains)
        self.domains = list(itertools.chain.from_iterable(separated))
        self.sample_size = qp._get_sample_size(sample_size)
        self.repeats = repeats
        self.min_support = min_support
        self.collator = OnLabelledCollectionProtocol.get_collator(return_type)

    def samples_parameters(self):
        """
        Return all the parameters needed to replicate the samples of this protocol.

        :return: a list with `repeats` elements; each element is itself a list containing, for every domain,
            the indexes of the instances that the domain contributes to the sample
        :raises ValueError: if more than 100 consecutive mixtures are rejected because of `min_support`
        """
        n_domains = len(self.domains)
        accepted = []
        rejected_in_a_row = 0
        while len(accepted) < self.repeats:
            mixture = F.uniform_simplex_sampling(n_classes=n_domains)
            # NOTE: the sizes are truncated to integers, so their sum may fall short of sample_size
            counts = (mixture * self.sample_size).astype(int)
            if not (counts > self.min_support).all():
                rejected_in_a_row += 1
                if rejected_in_a_row > 100:
                    raise ValueError('the support is too strict, and it is difficult '
                                     'or impossible to generate valid samples')
                continue
            accepted.append([domain.sampling_index(count) for domain, count in zip(self.domains, counts)])
            rejected_in_a_row = 0
        return accepted

    def sample(self, params):
        """
        Materializes one sample out of the parameters generated by :meth:`samples_parameters`.

        :param params: a list with one index per domain (in the same order as `self.domains`)
        :return: the result of `LabelledCollection.join` on the per-domain samples
        """
        per_domain = [domain.sampling_from_index(idx) for idx, domain in zip(params, self.domains)]
        return LabelledCollection.join(*per_domain)

    def total(self):
        """
        Returns the number of samples that will be generated

        :return: int
        """
        return self.repeats
|
|
|
|
|
|
# -------------------------------------------------------------------------------------------------
# Importance-weighting methods, e.g., based on density-ratio estimation (KLIEP, uLSIF, LogReg)
# -------------------------------------------------------------------------------------------------
|
|
class ImportanceWeight(ABC):
    """
    Interface of the methods that assign one weight to every training instance given a set of
    (unlabelled) test instances.

    The class derives from ABC so that `@abstractmethod` is actually enforced: a subclass that does not
    implement :meth:`weights` cannot be instantiated.
    """

    @abstractmethod
    def weights(self, Xtr, ytr, Xte):
        """
        Computes the weights of the training instances.

        :param Xtr: the training instances
        :param ytr: the training labels
        :param Xte: the test instances
        :return: one weight per training instance
        """
        ...
|
|
|
|
|
|
class KLIEP(ImportanceWeight):
    """
    Importance weights obtained with the `DensityRatioEstimator` of the `pykliep` module.
    """

    def __init__(self):
        """Nothing to configure: a new estimator is created on every call to :meth:`weights`."""

    def weights(self, Xtr, ytr, Xte):
        """
        Fits a `DensityRatioEstimator` on the training and test instances and evaluates it on the
        training instances.

        :param Xtr: the training instances
        :param ytr: the training labels (unused)
        :param Xte: the test instances
        :return: the output of `DensityRatioEstimator.predict` for `Xtr`
        """
        estimator = DensityRatioEstimator()
        estimator.fit(Xtr, Xte)
        train_weights = estimator.predict(Xtr)
        return train_weights
|
|
|
|
|
|
class USILF(ImportanceWeight):
    """
    Importance weights obtained with the `densratio` package.

    :param alpha: the `alpha` parameter passed on to `densratio` (default 0)
    """

    def __init__(self, alpha=0.):
        self.alpha = alpha

    def weights(self, Xtr, ytr, Xte):
        """
        Estimates the test-to-train density ratio and evaluates it on the training instances.

        :param Xtr: the training instances
        :param ytr: the training labels (unused)
        :param Xte: the test instances
        :return: the estimated density ratio for every instance in `Xtr`
        """
        # densratio(x, y) estimates p_x / p_y, i.e., the sample given first plays the role of the
        # numerator; importance weights are p_test / p_train, hence the test sample goes first
        dense_ratio_obj = densratio(Xte, Xtr, alpha=self.alpha, verbose=False)
        return dense_ratio_obj.compute_density_ratio(Xtr)
|
|
|
|
|
|
class LogReg(ImportanceWeight):
    """
    Importance weights derived from a logistic regressor trained to discriminate training instances
    (class 0) from test instances (class 1).
    """

    def __init__(self):
        """Nothing to configure: the discriminator is created anew on every call to :meth:`weights`."""

    @staticmethod
    def _n_instances(X):
        """Number of rows in X; `len` is not defined for scipy sparse matrices."""
        return X.shape[0] if issparse(X) else len(X)

    def weights(self, Xtr, ytr, Xte):
        """
        Computes, for every training instance x, the weight
        (n_train / n_test) * P(test|x) / P(train|x).

        :param Xtr: the training instances (dense or scipy sparse)
        :param ytr: the training labels (unused)
        :param Xte: the test instances
        :return: ndarray with one weight per training instance
        """
        # check "Direct Density Ratio Estimation for
        # Large-scale Covariate Shift Adaptation", Eq.28

        if issparse(Xtr):
            X = vstack([Xtr, Xte])
        else:
            X = np.concatenate([Xtr, Xte])

        n_train = self._n_instances(Xtr)
        n_test = self._n_instances(Xte)
        y = [0] * n_train + [1] * n_test

        logreg = GridSearchCV(
            LogisticRegression(),
            param_grid={'C': np.logspace(-3, 3, 7), 'class_weight': ['balanced', None]},
            n_jobs=-1
        )
        logreg.fit(X, y)

        # a single call suffices: column 0 is P(train|x) and column 1 is P(test|x)
        posteriors = logreg.predict_proba(Xtr)
        prob_train = posteriors[:, 0]
        prob_test = posteriors[:, 1]

        # avoid infinite weights when the discriminator assigns zero probability to "train"
        prob_train = np.clip(prob_train, np.finfo(float).eps, None)

        w = (n_train / n_test) * (prob_test / prob_train)
        return w
|
|
|
|
|
|
class MostTest(ImportanceWeight):
    """
    Scores every training instance with the cross-validated posterior probability of being a test
    instance, as estimated by a logistic regressor trained to discriminate training instances (class 0)
    from test instances (class 1).
    """

    def __init__(self):
        """Nothing to configure: the discriminator is created anew on every call to :meth:`weights`."""

    @staticmethod
    def _n_instances(X):
        """Number of rows in X; `len` is not defined for scipy sparse matrices."""
        return X.shape[0] if issparse(X) else len(X)

    def weights(self, Xtr, ytr, Xte):
        """
        Returns P(test|x) for every training instance x. The probabilities are obtained via
        `cross_val_predict`, so each instance is scored by a model that was not fit on it.

        :param Xtr: the training instances (dense or scipy sparse)
        :param ytr: the training labels (unused)
        :param Xte: the test instances
        :return: ndarray with one score per training instance
        """
        if issparse(Xtr):
            X = vstack([Xtr, Xte])
        else:
            X = np.concatenate([Xtr, Xte])

        n_train = self._n_instances(Xtr)
        n_test = self._n_instances(Xte)
        y = [0] * n_train + [1] * n_test

        logreg = GridSearchCV(
            LogisticRegression(),
            param_grid={'C': np.logspace(-3, 3, 7), 'class_weight': ['balanced', None]},
            n_jobs=-1
        )
        # the training instances occupy the first n_train rows of X; column 1 is P(test|x)
        prob_test = cross_val_predict(logreg, X, y, n_jobs=-1, method="predict_proba")[:n_train, 1]
        return prob_test
|
|
|
|
|
|
class Random(ImportanceWeight):
    """
    Baseline that assigns a random weight to every training instance.
    """

    def __init__(self):
        """Nothing to configure."""

    def weights(self, Xtr, ytr, Xte):
        """
        Draws one weight per training instance with `np.random.rand`.

        :param Xtr: the training instances (dense or scipy sparse)
        :param ytr: the training labels (unused)
        :param Xte: the test instances (unused)
        :return: ndarray with one random weight per training instance
        """
        # len() is not defined for scipy sparse matrices
        n_train = Xtr.shape[0] if issparse(Xtr) else len(Xtr)
        return np.random.rand(n_train)
|
|
|
|
# --------------------------------------------------------------------------------------------
# Quantification methods that rely on importance weights for reweighting the training instances
# --------------------------------------------------------------------------------------------
|
|
class TransductiveQuantifier(BaseQuantifier):
    """
    Base class of the quantifiers in this module that defer all learning to quantification time:
    `fit` only memorizes the training collection.
    """

    def fit(self, data: LabelledCollection):
        """
        Stores the training collection; nothing is trained at this point.

        :param data: the training collection
        :return: self
        """
        self.training_ = data
        return self

    @property
    def training(self):
        """The collection stored by the last call to :meth:`fit`."""
        return self.training_
|
|
|
|
|
|
class ReweightingAggregative(TransductiveQuantifier):
    """
    Quantifier that, for every test sample, reweights the training instances, refits the classifier
    with those weights, and wraps it in an aggregative quantifier.

    :param classifier: the classifier to be fit with `sample_weight`
    :param weighter: the ImportanceWeight method that generates the weights
    :param quantif_method: callable that builds the quantifier from the classifier (default CC)
    """

    def __init__(self, classifier, weighter: ImportanceWeight, quantif_method=CC):
        self.classifier = classifier
        self.weighter = weighter
        self.quantif_method = quantif_method

    def quantify(self, instances):
        """
        Produces the quantifier's output for `instances`.

        :param instances: the test instances
        :return: the output of the underlying quantifier's `quantify` method
        """
        # (timings once recorded here: time_weight = 2.95340 time_train = 0.00619)
        sample_weight = self.weighter.weights(*self.training.Xy, instances)
        self.classifier.fit(*self.training.Xy, sample_weight=sample_weight)
        # the classifier has just been trained, so the quantifier is asked not to fit it again
        aggregator = self.quantif_method(self.classifier)
        fitted = aggregator.fit(self.training, fit_classifier=False)
        return fitted.quantify(instances)
|
|
|
|
|
|
# --------------------------------------------------------------------------------------------
# Quantification methods that rely on importance weights for selecting a validation partition
# --------------------------------------------------------------------------------------------
|
|
|
|
def select_from_weights(w, data: "LabelledCollection", val_prop=0.4):
    """
    Splits `data` in two parts according to the weights: the `int(len(w) * val_prop)` instances with
    the highest weight form the validation part, and the remaining ones form the training part.

    :param w: one weight per instance in `data`
    :param data: the collection to split; only its `sampling_from_index` method is used
    :param val_prop: proportion of instances to reserve for validation
    :return: a tuple (training part, validation part)
    """
    order = np.argsort(w)
    n_instances = len(order)
    n_val = int(n_instances * val_prop)
    # slicing from the front avoids the `-0` pitfall: with n_val == 0, `order[:-0]` is empty and
    # `order[-0:]` is the whole array, which would swap the roles of the two parts
    cut = max(n_instances - n_val, 0)
    train_idx, val_idx = order[:cut], order[cut:]
    return data.sampling_from_index(train_idx), data.sampling_from_index(val_idx)
|
|
|
|
|
|
class SelectorQuantifiers(TransductiveQuantifier):
    """
    Quantifier that, for every test sample, uses the importance weights to choose which training
    instances act as the validation partition of the underlying quantifier.

    :param classifier: the classifier handed to the underlying quantifier
    :param weighter: the ImportanceWeight method that generates the weights
    :param quantif_method: callable that builds the quantifier from the classifier (default ACC)
    :param val_split: proportion of training instances reserved for validation (default 0.4)
    """

    def __init__(self, classifier, weighter: ImportanceWeight, quantif_method=ACC, val_split=0.4):
        self.classifier = classifier
        self.weighter = weighter
        self.quantif_method = quantif_method
        self.val_split = val_split

    def quantify(self, instances):
        """
        Produces the quantifier's output for `instances`.

        :param instances: the test instances
        :return: the output of the underlying quantifier's `quantify` method
        """
        scores = self.weighter.weights(*self.training.Xy, instances)
        train_part, val_part = select_from_weights(scores, self.training, self.val_split)
        aggregator = self.quantif_method(self.classifier)
        fitted = aggregator.fit(train_part, val_split=val_part)
        return fitted.quantify(instances)
|
|
|
|
|
|
|
|
if __name__ == '__main__':
    # Toy experiment: two domains (A and B), each made of two Gaussian classes; all methods are
    # fit on the union of both training halves and evaluated on mixtures of the test halves.
    qp.environ['SAMPLE_SIZE'] = 500

    dA_l0 = gaussian(mean=[0,0], label=0, size=1000)
    dA_l1 = gaussian(mean=[1,0], label=1, size=1000)
    dB_l0 = gaussian(mean=[0,1], label=0, size=1000)
    dB_l1 = gaussian(mean=[1,1], label=1, size=1000)

    dA = LabelledCollection.join(dA_l0, dA_l1)
    dB = LabelledCollection.join(dB_l0, dB_l1)

    dA_train, dA_test = dA.split_stratified(0.5, random_state=0)
    dB_train, dB_test = dB.split_stratified(0.5, random_state=0)

    train = LabelledCollection.join(dA_train, dB_train)

    def lr():
        """Returns a fresh classifier, so that no two methods share the same instance."""
        return LogisticRegression()

    # def lr():
    #     return GridSearchCV(
    #         LogisticRegression(),
    #         param_grid={'C':np.logspace(-3,3,7), 'class_weight': ['balanced', None]},
    #         n_jobs=-1
    #     )

    methods = [
        ('CC', CC(lr())),
        ('PCC', PCC(lr())),
        ('ACC', ACC(lr())),
        ('PACC', PACC(lr())),
        # this entry used to instantiate EMQ, which duplicated the next row under the wrong name
        ('HDy', HDy(lr())),
        ('EMQ', EMQ(lr())),
        ('Sel-ACC', SelectorQuantifiers(lr(), MostTest(), ACC)),
        ('Sel-PACC', SelectorQuantifiers(lr(), MostTest(), PACC)),
        ('Sel-HDy', SelectorQuantifiers(lr(), MostTest(), HDy)),
        ('LogReg-CC', ReweightingAggregative(lr(), LogReg(), CC)),
        ('LogReg-PCC', ReweightingAggregative(lr(), LogReg(), PCC)),
        ('LogReg-EMQ', ReweightingAggregative(lr(), LogReg(), EMQ)),
        # ('KLIEP-CC', ReweightingAggregative(lr(), KLIEP(), CC)),
        # ('KLIEP-PCC', ReweightingAggregative(lr(), KLIEP(), PCC)),
        # ('KLIEP-EMQ', ReweightingAggregative(lr(), KLIEP(), EMQ)),
        # ('SILF-CC', ReweightingAggregative(lr(), USILF(), CC)),
        # ('SILF-PCC', ReweightingAggregative(lr(), USILF(), PCC)),
        # ('SILF-EMQ', ReweightingAggregative(lr(), USILF(), EMQ))
    ]

    for name, model in methods:
        with qp.util.temp_seed(1):
            model.fit(train)

        prot = CovPriorShift([dA_test, dB_test], repeats=10)
        mae = qp.evaluation.evaluate(model, protocol=prot, error_metric='mae')
        print(f'{name}: {mae = :.4f}')
        # mrae = qp.evaluation.evaluate(model, protocol=prot, error_metric='mrae')
        # print(f'{name}: {mrae = :.4f}')
|
|
|
|
|
|
|
|
|
|
|