forked from moreo/QuaPy
cleaning
This commit is contained in:
parent 168c109794
commit 1aafd10e25
@@ -1,174 +0,0 @@
import numpy as np
from sklearn.base import BaseEstimator
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

import quapy as qp
from typing import Union

from quapy.data import LabelledCollection
from quapy.method.base import BaseQuantifier, BinaryQuantifier
from quapy.method.aggregative import PACC, EMQ, HDy
import quapy.functional as F
from tqdm import tqdm
from scipy.sparse import issparse, csr_matrix
import scipy


class PACCSLD(PACC):
    """
    This method combines the EMQ-improved posterior probabilities with PACC.
    Note: the posterior probabilities are re-calibrated with EMQ only during prediction, and not also during fit,
    since for PACC the validation split is known to have the same prevalence as the training set (the split is
    stratified) and thus the posterior probabilities should not be re-calibrated for a different prior (doing so
    actually degrades performance).
    """

    def fit(self, data: qp.data.LabelledCollection, fit_learner=True,
            val_split: Union[float, int, qp.data.LabelledCollection] = 0.4):
        self.train_prevalence = F.prevalence_from_labels(data.labels, data.n_classes)
        return super(PACCSLD, self).fit(data, fit_learner, val_split)

    def aggregate(self, classif_posteriors):
        priors, posteriors = EMQ.EM(self.train_prevalence, classif_posteriors, epsilon=1e-4)
        return super(PACCSLD, self).aggregate(posteriors)


class HDySLD(HDy):
    """
    This method combines the EMQ-improved posterior probabilities with HDy.
    Note: [same as PACCSLD]
    """

    def fit(self, data: qp.data.LabelledCollection, fit_learner=True,
            val_split: Union[float, int, qp.data.LabelledCollection] = 0.4):
        self.train_prevalence = F.prevalence_from_labels(data.labels, data.n_classes)
        return super(HDySLD, self).fit(data, fit_learner, val_split)

    def aggregate(self, classif_posteriors):
        priors, posteriors = EMQ.EM(self.train_prevalence, classif_posteriors, epsilon=1e-4)
        return super(HDySLD, self).aggregate(posteriors)

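
Both classes follow the standard QuaPy fit/quantify workflow. A minimal usage sketch, not part of the original commit (the dataset fetch and the learner settings are illustrative assumptions):

# Hypothetical usage sketch for PACCSLD/HDySLD; dataset choice is an assumption.
from sklearn.linear_model import LogisticRegression
import quapy as qp

dataset = qp.datasets.fetch_twitter('semeval13')     # assumed dataset
model = PACCSLD(LogisticRegression(max_iter=1000))
model.fit(dataset.training)                          # records the training prevalence, then fits PACC
estim_prev = model.quantify(dataset.test.instances)  # EMQ-recalibrated posteriors, aggregated by PACC
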
class AveragePoolQuantification(BinaryQuantifier):
    def __init__(self, learner, sample_size, trials, n_components=-1, zscore=False):
        self.learner = learner
        self.sample_size = sample_size
        self.trials = trials

        self.do_zscore = zscore
        self.zscore = StandardScaler() if self.do_zscore else None

        self.do_pca = n_components > 0
        self.pca = PCA(n_components) if self.do_pca else None

    def fit(self, data: LabelledCollection):
        training, validation = data.split_stratified(train_prop=0.7)

        X, y = [], []

        nprevpoints = F.get_nprevpoints_approximation(self.trials, data.n_classes)
        for sample in tqdm(
                training.artificial_sampling_generator(self.sample_size, n_prevalences=nprevpoints, repeats=1),
                desc='generating averages'
        ):
            X.append(sample.instances.mean(axis=0))
            y.append(sample.prevalence()[1])
        while len(X) < self.trials:
            sample = training.sampling(self.sample_size, F.uniform_simplex_sampling(data.n_classes))
            X.append(sample.instances.mean(axis=0))
            y.append(sample.prevalence()[1])  # positive-class prevalence, matching the loop above
        X = np.asarray(np.vstack(X))
        y = np.asarray(y)

        if self.do_pca:
            X = self.pca.fit_transform(X)
            print(X.shape)

        if self.do_zscore:
            X = self.zscore.fit_transform(X)

        print('training regressor...')
        self.regressor = self.learner.fit(X, y)

        # correction at 0 and 1: the regressor's mean output on samples of pure prevalence
        print('getting corrections...')
        X0 = np.asarray(np.vstack([validation.sampling(self.sample_size, 0., shuffle=False).instances.mean(axis=0) for _ in range(100)]))
        X1 = np.asarray(np.vstack([validation.sampling(self.sample_size, 1., shuffle=False).instances.mean(axis=0) for _ in range(100)]))

        if self.do_pca:
            X0 = self.pca.transform(X0)
            X1 = self.pca.transform(X1)

        if self.do_zscore:
            X0 = self.zscore.transform(X0)
            X1 = self.zscore.transform(X1)

        self.correction_0 = self.regressor.predict(X0).mean()
        self.correction_1 = self.regressor.predict(X1).mean()

        print('correction-0', self.correction_0)
        print('correction-1', self.correction_1)
        print('done')

    def quantify(self, instances):
        ave = np.asarray(instances.mean(axis=0))

        if self.do_pca:
            ave = self.pca.transform(ave)
        if self.do_zscore:
            ave = self.zscore.transform(ave)
        phat = self.regressor.predict(ave).item()
        phat = np.clip((phat - self.correction_0) / (self.correction_1 - self.correction_0), 0, 1)
        return np.asarray([1 - phat, phat])

    def set_params(self, **parameters):
        self.learner.set_params(**parameters)

    def get_params(self, deep=True):
        return self.learner.get_params(deep=deep)

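
The correction learned in fit is applied in quantify as an affine rescaling of the raw regressor output. A standalone sketch of that computation, with assumed values:

# Illustrative numbers; correction_0/correction_1 are the regressor's mean
# predictions on all-negative and all-positive validation samples.
import numpy as np
correction_0, correction_1 = 0.08, 0.93
phat = 0.50                                            # raw regressor output
phat = np.clip((phat - correction_0) / (correction_1 - correction_0), 0, 1)
print(phat)  # ~0.494: outputs are re-anchored so that 0.08 -> 0 and 0.93 -> 1
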
class WinnowOrthogonal(BaseEstimator):

    def __init__(self):
        pass

    def fit(self, X, y):
        self.classes_ = np.asarray(sorted(np.unique(y)))
        w1 = np.asarray(X[y == 0].mean(axis=0)).flatten()
        w2 = np.asarray(X[y == 1].mean(axis=0)).flatten()
        diff = w2 - w1
        # build a unit vector orthogonal to the difference of the class means
        orth = np.ones_like(diff)
        orth[0] = -diff[1:].sum() / diff[0]
        orth /= np.linalg.norm(orth)
        self.w = orth
        self.b = w1.dot(orth)
        return self

    def decision_function(self, X):
        if issparse(X):
            Z = X.dot(csr_matrix(self.w).T).toarray().flatten()
            return Z - self.b
        else:
            return np.matmul(X, self.w) - self.b

    def predict(self, X):
        return 1 * (self.decision_function(X) > 0)

    def split(self, X, y):
        s = self.predict(X)
        X0a = X[np.logical_and(y == 0, s == 0)]
        X0b = X[np.logical_and(y == 0, s == 1)]
        X1a = X[np.logical_and(y == 1, s == 0)]
        X1b = X[np.logical_and(y == 1, s == 1)]
        y0a = np.zeros(X0a.shape[0], dtype=int)  # builtin int: np.int is deprecated
        y0b = np.zeros(X0b.shape[0], dtype=int)
        y1a = np.ones(X1a.shape[0], dtype=int)
        y1b = np.ones(X1b.shape[0], dtype=int)
        return X0a, X0b, X1a, X1b, y0a, y0b, y1a, y1b

    def get_params(self):
        return {}

    def set_params(self, **params):
        pass
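
The fit method above builds a separating direction orthogonal to the difference of the two class means: every component of orth is 1 except the first, which is chosen so the dot product with diff cancels. A quick numeric check, not in the original file:

# Verifies the orthogonality construction on an assumed 3-dimensional example.
import numpy as np
diff = np.asarray([0.5, -0.2, 0.3])   # assumed difference of class means
orth = np.ones_like(diff)
orth[0] = -diff[1:].sum() / diff[0]   # so that diff[0]*orth[0] + diff[1:].sum() == 0
orth /= np.linalg.norm(orth)
print(np.dot(orth, diff))             # 0.0 up to floating-point error
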
@@ -1,48 +0,0 @@
from sklearn.linear_model import LogisticRegression
import quapy as qp
from classification.methods import PCALR
from method.meta import QuaNet
from quapy.method.aggregative import *
from NewMethods.methods import *
from experiments import run, SAMPLE_SIZE
import numpy as np
import itertools
from joblib import Parallel, delayed
import settings
import argparse
import torch

parser = argparse.ArgumentParser(description='Run experiments for Twitter Sentiment Quantification')
parser.add_argument('results', metavar='RESULT_PATH', type=str, help='path to the directory where to store the results')
#parser.add_argument('svmperfpath', metavar='SVMPERF_PATH', type=str, help='path to the directory with svmperf')
args = parser.parse_args()


def quantification_models():
    def newLR():
        return LogisticRegression(max_iter=1000, solver='lbfgs', n_jobs=-1)
    __C_range = np.logspace(-4, 5, 10)
    lr_params = {'C': __C_range, 'class_weight': [None, 'balanced']}
    svmperf_params = {'C': __C_range}
    #yield 'paccsld', PACCSLD(newLR()), lr_params
    yield 'hdysld', OneVsAll(HDySLD(newLR())), lr_params  # <-- promising!

    #device = 'cuda' if torch.cuda.is_available() else 'cpu'
    #print(f'Running QuaNet in {device}')
    #yield 'quanet', QuaNet(PCALR(**newLR().get_params()), SAMPLE_SIZE, device=device), lr_params


if __name__ == '__main__':

    print(f'Result folder: {args.results}')
    np.random.seed(0)

    optim_losses = ['mae']
    datasets = qp.datasets.TWITTER_SENTIMENT_DATASETS_TRAIN
    models = quantification_models()

    results = Parallel(n_jobs=settings.N_JOBS)(
        delayed(run)(experiment) for experiment in itertools.product(optim_losses, datasets, models)
    )
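
Each experiment dispatched to run is one (optimization loss, dataset, model tuple) combination; a small sketch of what itertools.product yields here, with assumed dataset names:

# Illustrative expansion of the experiment grid (values assumed for brevity).
import itertools
optim_losses = ['mae']
datasets = ['semeval13', 'semeval14']
models = [('hdysld', 'model', 'params')]
for experiment in itertools.product(optim_losses, datasets, models):
    print(experiment)
# ('mae', 'semeval13', ('hdysld', 'model', 'params'))
# ('mae', 'semeval14', ('hdysld', 'model', 'params'))
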
@@ -1,148 +0,0 @@
import quapy as qp
import numpy as np
from os import makedirs
import sys, os
import pickle
from experiments import result_path
from gen_tables import save_table, experiment_errors
from tabular import Table
import argparse

tables_path = './tables'
MAXTONE = 50  # sets the intensity of the maximum color reached by the worst (red) and best (green) results

makedirs(tables_path, exist_ok=True)

sample_size = 100
qp.environ['SAMPLE_SIZE'] = sample_size


nice = {
    'mae': 'AE',
    'mrae': 'RAE',
    'ae': 'AE',
    'rae': 'RAE',
    'svmkld': 'SVM(KLD)',
    'svmnkld': 'SVM(NKLD)',
    'svmq': 'SVM(Q)',
    'svmae': 'SVM(AE)',
    'svmnae': 'SVM(NAE)',
    'svmmae': 'SVM(AE)',
    'svmmrae': 'SVM(RAE)',
    'quanet': 'QuaNet',
    'hdy': 'HDy',
    'hdysld': 'HDy-SLD',
    'dys': 'DyS',
    'svmperf': '',
    'sanders': 'Sanders',
    'semeval13': 'SemEval13',
    'semeval14': 'SemEval14',
    'semeval15': 'SemEval15',
    'semeval16': 'SemEval16',
    'Average': 'Average'
}


def nicerm(key):
    return '\\mathrm{' + nice[key] + '}'  # escaped backslash: '\m' is an invalid string escape


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Generate tables for Twitter Sentiment Quantification')
    parser.add_argument('results', metavar='RESULT_PATH', type=str,
                        help='path to the directory containing the results of the methods tested in Gao & Sebastiani')
    parser.add_argument('newresults', metavar='NEW_RESULT_PATH', type=str,
                        help='path to the directory containing the results for the experimental methods')
    args = parser.parse_args()

    datasets = qp.datasets.TWITTER_SENTIMENT_DATASETS_TEST
    evaluation_measures = [qp.error.ae, qp.error.rae]
    gao_seb_methods = ['cc', 'acc', 'pcc', 'pacc', 'sld', 'svmq', 'svmkld', 'svmnkld']
    new_methods = ['hdy']  # methods added to the Gao & Sebastiani methods
    experimental_methods = ['hdysld']  # experimental

    for i, eval_func in enumerate(evaluation_measures):

        # Tables with evaluation scores for AE and RAE (two tables)
        # ----------------------------------------------------

        eval_name = eval_func.__name__

        added_methods = ['svmm' + eval_name] + new_methods
        methods = gao_seb_methods + added_methods + experimental_methods
        nold_methods = len(gao_seb_methods)
        nnew_methods = len(added_methods)
        nexp_methods = len(experimental_methods)

        # fill the data table
        table = Table(benchmarks=datasets, methods=methods)
        for dataset in datasets:
            for method in methods:
                if method in experimental_methods:
                    path = args.newresults
                else:
                    path = args.results
                table.add(dataset, method, experiment_errors(path, dataset, method, eval_name))

        # write the latex table
        tabular = """
        \\begin{tabularx}{\\textwidth}{|c||""" + ('Y|' * nold_methods) + '|' + ('Y|' * nnew_methods) + '|' + ('Y|' * nexp_methods) + """} \\hline
          & \\multicolumn{""" + str(nold_methods) + """}{c||}{Methods tested in~\\cite{Gao:2016uq}} &
          \\multicolumn{""" + str(nnew_methods) + """}{c|}{} &
          \\multicolumn{""" + str(nexp_methods) + """}{c|}{}\\\\ \\hline
        """
        rowreplace = {dataset: nice.get(dataset, dataset.upper()) for dataset in datasets}
        colreplace = {method: '\\side{' + nice.get(method, method.upper()) + '$^{' + nicerm(eval_name) + '}$} ' for method in methods}

        tabular += table.latexTabular(benchmark_replace=rowreplace, method_replace=colreplace)
        tabular += '\n\\end{tabularx}'

        save_table(f'./tables/tab_results_{eval_name}.new.tex', tabular)

        # Tables with ranks for AE and RAE (two tables)
        # ----------------------------------------------------
        # fill the data table
        ranktable = Table(benchmarks=datasets, methods=methods, missing='--')
        for dataset in datasets:
            for method in methods:
                ranktable.add(dataset, method, values=table.get(dataset, method, 'rank'))

        # write the latex table
        tabular = """
        \\begin{tabularx}{\\textwidth}{|c||""" + ('Y|' * nold_methods) + '|' + ('Y|' * nnew_methods) + '|' + ('Y|' * nexp_methods) + """} \\hline
          & \\multicolumn{""" + str(nold_methods) + """}{c||}{Methods tested in~\\cite{Gao:2016uq}} &
          \\multicolumn{""" + str(nnew_methods) + """}{c|}{} &
          \\multicolumn{""" + str(nexp_methods) + """}{c|}{}\\\\ \\hline
        """
        for method in methods:
            tabular += ' & \\side{' + nice.get(method, method.upper()) + '$^{' + nicerm(eval_name) + '}$} '
        tabular += '\\\\\\hline\n'

        for dataset in datasets:
            tabular += nice.get(dataset, dataset.upper()) + ' '
            for method in methods:
                newrank = ranktable.get(dataset, method)
                if newrank != '--':
                    newrank = f'{int(newrank)}'
                color = ranktable.get_color(dataset, method)
                if color == '--':
                    color = ''
                tabular += ' & ' + f'{newrank}' + color
            tabular += '\\\\\\hline\n'
        tabular += '\\hline\n'

        tabular += 'Average '
        for method in methods:
            newrank = ranktable.get_average(method)
            if newrank != '--':
                newrank = f'{newrank:.1f}'
            color = ranktable.get_average(method, 'color')
            if color == '--':
                color = ''
            tabular += ' & ' + f'{newrank}' + color
        tabular += '\\\\\\hline\n'
        tabular += '\\end{tabularx}'

        save_table(f'./tables/tab_rank_{eval_name}.new.tex', tabular)

    print("[Done]")
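
For reference, with eval_name='ae' the lists above give 8 Gao & Sebastiani methods, 2 added methods ('svmmae', 'hdy') and 1 experimental method ('hdysld'), so the tabularx column specification is built as in this sketch:

# Illustrative: the column layout generated for eval_name='ae'.
nold, nnew, nexp = 8, 2, 1
colspec = '|c||' + ('Y|' * nold) + '|' + ('Y|' * nnew) + '|' + ('Y|' * nexp)
print(colspec)  # |c||Y|Y|Y|Y|Y|Y|Y|Y||Y|Y||Y|
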
@@ -1,4 +0,0 @@
import multiprocessing

N_JOBS = -2  # with joblib, -2 means "all CPUs but one"; alternatively: multiprocessing.cpu_count()
SAMPLE_SIZE = 100