1
0
Fork 0

Experimental ave-pool method; not working yet, because OneVsAll is aggregative and ave-pool is not

This commit is contained in:
Alejandro Moreo Fernandez 2021-01-20 17:03:12 +01:00
parent 99132c8166
commit 1ba0748b59
4 changed files with 106 additions and 14 deletions

View File

@ -1,7 +1,15 @@
from typing import Union
import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import quapy as qp
from typing import Union
from quapy.data import LabelledCollection
from quapy.method.base import BaseQuantifier, BinaryQuantifier
from quapy.method.aggregative import PACC, EMQ, HDy
import quapy.functional as F
from tqdm import tqdm
class PACCSLD(PACC):
@ -35,3 +43,83 @@ class HDySLD(HDy):
def aggregate(self, classif_posteriors):
    """Refine the posteriors with EMQ's EM routine, then aggregate them as HDy does.

    :param classif_posteriors: posterior probabilities issued by the classifier
    :return: whatever the parent class' ``aggregate`` returns for the refined posteriors
    """
    # EMQ.EM yields (priors, posteriors); only the updated posteriors are
    # forwarded, the prior estimates are discarded.
    _, em_posteriors = EMQ.EM(self.train_prevalence, classif_posteriors, epsilon=1e-4)
    return super(HDySLD, self).aggregate(em_posteriors)
class AveragePoolQuantification(BinaryQuantifier):
    """Experimental binary quantifier that regresses prevalence from averaged instances.

    Each training sample is summarised by the mean of its instance vectors
    ("average pooling"); a regressor is trained to map that mean vector to the
    prevalence of the positive class (index 1). Raw predictions are then
    rescaled using the regressor's mean outputs on validation samples drawn at
    prevalence 0 and at prevalence 1.
    """

    def __init__(self, learner, sample_size, trials, n_components=-1, zscore=False):
        """
        :param learner: regressor exposing ``fit(X, y)``, ``predict(X)``,
            ``set_params`` and ``get_params``
        :param sample_size: number of instances in every generated sample
        :param trials: minimum number of training samples to generate
        :param n_components: if > 0, averaged vectors are reduced with PCA to
            this many components; otherwise PCA is skipped
        :param zscore: if True, averaged vectors are standardised with
            ``StandardScaler``
        """
        self.learner = learner
        self.sample_size = sample_size
        self.trials = trials
        self.do_zscore = zscore
        self.zscore = StandardScaler() if self.do_zscore else None
        self.do_pca = n_components > 0
        self.pca = PCA(n_components) if self.do_pca else None

    @staticmethod
    def _average(instances):
        """Return the column-wise mean of ``instances`` as a (1, n_features) ndarray."""
        # ``mean(axis=0)`` may yield a 1-D array or a (1, d) matrix depending on
        # the container type; normalise to a 2-D row so that it can be stacked
        # and fed to scikit-learn estimators, which expect 2-D input.
        return np.asarray(instances.mean(axis=0)).reshape(1, -1)

    def _project(self, X):
        """Apply the already-fitted PCA and/or z-scoring steps (if enabled) to ``X``."""
        if self.do_pca:
            X = self.pca.transform(X)
        if self.do_zscore:
            X = self.zscore.transform(X)
        return X

    def _averages_at(self, collection, prevalence, repeats=100):
        """Stack ``repeats`` averaged samples drawn from ``collection`` at ``prevalence``."""
        return np.vstack([
            self._average(collection.sampling(self.sample_size, prevalence, shuffle=False).instances)
            for _ in range(repeats)
        ])

    def fit(self, data: LabelledCollection):
        """Generate averaged training samples, train the regressor and compute corrections.

        :param data: labelled collection; 70% is used to generate training
            samples and the remaining 30% to estimate the corrections
        :return: self
        """
        training, validation = data.split_stratified(train_prop=0.7)

        X, y = [], []

        nprevpoints = F.get_nprevpoints_approximation(self.trials, data.n_classes)
        for sample in tqdm(
                training.artificial_sampling_generator(self.sample_size, n_prevalences=nprevpoints, repeats=1),
                desc='generating averages'
        ):
            X.append(self._average(sample.instances))
            y.append(sample.prevalence()[1])

        # top up with randomly drawn prevalences until `trials` samples exist
        while len(X) < self.trials:
            # NOTE(review): the prevalence vector is passed as a single argument,
            # as in the original code -- confirm that `sampling` accepts it in this form.
            sample = training.sampling(self.sample_size, F.uniform_simplex_sampling(data.n_classes))
            X.append(self._average(sample.instances))
            # the target is the positive-class prevalence (a scalar), consistently
            # with the loop above; appending the whole vector made `y` ragged
            y.append(sample.prevalence()[1])

        X = np.vstack(X)
        y = np.asarray(y)

        if self.do_pca:
            X = self.pca.fit_transform(X)
            print(X.shape)
        if self.do_zscore:
            X = self.zscore.fit_transform(X)

        print('training regressor...')
        self.regressor = self.learner.fit(X, y)

        # corrections: mean regressor output on samples of known extreme prevalence
        print('getting corrections...')
        X0 = self._project(self._averages_at(validation, 0.))
        X1 = self._project(self._averages_at(validation, 1.))
        self.correction_0 = self.regressor.predict(X0).mean()
        self.correction_1 = self.regressor.predict(X1).mean()

        print('correction-0', self.correction_0)
        print('correction-1', self.correction_1)
        print('done')

        return self

    def quantify(self, instances):
        """Estimate class prevalences for a set of instances.

        :param instances: the instances of the sample to quantify
        :return: ndarray ``[1 - p, p]`` where ``p`` is the corrected and clipped
            estimate of the positive-class prevalence
        """
        ave = self._project(self._average(instances))
        phat = self.regressor.predict(ave).item()

        span = self.correction_1 - self.correction_0
        if np.isclose(span, 0.):
            # degenerate corrections (the regressor does not separate the two
            # extremes): rescaling would divide by zero, so only clip
            phat = np.clip(phat, 0, 1)
        else:
            phat = np.clip((phat - self.correction_0) / span, 0, 1)
        return np.asarray([1 - phat, phat])

    def set_params(self, **parameters):
        """Forward hyper-parameters to the wrapped learner."""
        self.learner.set_params(**parameters)

    def get_params(self, deep=True):
        """Return the hyper-parameters of the wrapped learner."""
        return self.learner.get_params(deep=deep)

View File

@ -32,6 +32,7 @@ nice = {
'quanet': 'QuaNet',
'hdy': 'HDy',
'dys': 'DyS',
'epaccmaeptr': 'E(PACC)$_\mathrm{Ptr}$',
'svmperf':'',
'sanders': 'Sanders',
'semeval13': 'SemEval13',
@ -116,7 +117,7 @@ if __name__ == '__main__':
datasets = qp.datasets.TWITTER_SENTIMENT_DATASETS_TEST
evaluation_measures = [qp.error.ae, qp.error.rae]
gao_seb_methods = ['cc', 'acc', 'pcc', 'pacc', 'sld', 'svmq', 'svmkld', 'svmnkld']
new_methods = ['hdy']
new_methods = ['hdy', 'quanet', 'epaccptr']
gao_seb_ranks, gao_seb_results = get_ranks_from_Gao_Sebastiani()

View File

@ -547,8 +547,6 @@ class OneVsAll(AggregativeQuantifier):
else:
predictions = self.classify(X)
return self.aggregate(predictions)
#prevalences = self.__parallel(self._delayed_binary_quantify, X)
#return F.normalize_prevalence(prevalences)
def __parallel(self, func, *args, **kwargs):
return np.asarray(

25
test.py
View File

@ -1,10 +1,12 @@
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.svm import LinearSVC
from sklearn.svm import LinearSVC, LinearSVR
import quapy as qp
import quapy.functional as F
import sys
import numpy as np
from NewMethods.methods import AveragePoolQuantification
from classification.methods import PCALR
from classification.neural import NeuralClassifierTrainer, CNNnet
from quapy.model_selection import GridSearchQ
@ -29,7 +31,7 @@ if binary:
else:
dataset = qp.datasets.fetch_twitter('hcr', for_model_selection=False, min_df=10, pickle=True)
dataset.training = dataset.training.sampling(sample_size, 0.2, 0.5, 0.3)
#dataset.training = dataset.training.sampling(sample_size, 0.2, 0.5, 0.3)
print(f'dataset loaded: #training={len(dataset.training)} #test={len(dataset.test)}')
@ -51,14 +53,17 @@ print(f'dataset loaded: #training={len(dataset.training)} #test={len(dataset.tes
#model = qp.method.meta.QuaNet(learner, sample_size, device='cpu')
#learner = GridSearchCV(LogisticRegression(max_iter=1000), param_grid=param_grid, n_jobs=-1, verbose=1)
learner = LogisticRegression(max_iter=1000)
#learner = LogisticRegression(max_iter=1000)
# model = qp.method.aggregative.ClassifyAndCount(learner)
model = qp.method.meta.EPACC(learner, size=10, red_size=5,
param_grid={'C':[1,10,100]},
optim='mae', param_mod_sel={'sample_size':100, 'n_prevpoints':21, 'n_repetitions':5},
policy='ptr', n_jobs=1)
#model = qp.method.meta.EPACC(learner, size=10, red_size=5,
# param_grid={'C':[1,10,100]},
# optim='mae', param_mod_sel={'sample_size':100, 'n_prevpoints':21, 'n_repetitions':5},
# policy='ptr', n_jobs=1)
regressor = LinearSVR(max_iter=10000)
param_grid = {'C': np.logspace(-1,3,5)}
model = AveragePoolQuantification(regressor, sample_size, trials=5000, n_components=500, zscore=False)
#model = qp.method.meta.EHDy(learner, param_grid=param_grid, optim='mae',
# sample_size=sample_size, eval_budget=max_evaluations//10, n_jobs=-1)
@ -75,7 +80,7 @@ if qp.isbinary(model) and not qp.isbinary(dataset):
print(f'fitting model {model.__class__.__name__}')
#train, val = dataset.training.split_stratified(0.6)
#model.fit(train, val_split=val)
model.fit(dataset.training, val_split=dataset.test)
model.fit(dataset.training)
@ -112,7 +117,7 @@ for error in qp.error.QUANTIFICATION_ERROR:
score = error(true_prev, estim_prev)
print(f'{error.__name__}={score:.5f}')
sys.exit(0)
#sys.exit(0)
# Model selection and Evaluation according to the artificial sampling protocol
# ----------------------------------------------------------------------------
@ -123,7 +128,7 @@ model_selection = GridSearchQ(model,
error='mae',
refit=True,
verbose=True,
timeout=4)
timeout=60*60)
model = model_selection.fit(dataset.training, val_split=0.3)
#model = model_selection.fit(train, validation=val)