From 1aafd10e2558d19558531261ecd6bed2a965565e Mon Sep 17 00:00:00 2001
From: Alex Moreo
Date: Wed, 10 Mar 2021 13:21:17 +0100
Subject: [PATCH] cleaning

---
 NewMethods/fgsld/__init__.py         |   0
 NewMethods/fgsld/em.py               |   0
 NewMethods/fgsld/fglsd_test.py       |   0
 NewMethods/fgsld/fine_grained_sld.py |   0
 NewMethods/fgsld/metrics.py          |   0
 NewMethods/fgsld/plot_fglsd.png      |   0
 NewMethods/methods.py                | 174 ---------------------------
 NewMethods/new_experiments.py        |  48 --------
 NewMethods/new_gen_tables.py         | 148 -----------------------
 NewMethods/settings.py               |   4 -
 10 files changed, 374 deletions(-)
 delete mode 100644 NewMethods/fgsld/__init__.py
 delete mode 100644 NewMethods/fgsld/em.py
 delete mode 100644 NewMethods/fgsld/fglsd_test.py
 delete mode 100644 NewMethods/fgsld/fine_grained_sld.py
 delete mode 100644 NewMethods/fgsld/metrics.py
 delete mode 100644 NewMethods/fgsld/plot_fglsd.png
 delete mode 100644 NewMethods/methods.py
 delete mode 100644 NewMethods/new_experiments.py
 delete mode 100644 NewMethods/new_gen_tables.py
 delete mode 100644 NewMethods/settings.py

diff --git a/NewMethods/fgsld/__init__.py b/NewMethods/fgsld/__init__.py
deleted file mode 100644
index e69de29..0000000
diff --git a/NewMethods/fgsld/em.py b/NewMethods/fgsld/em.py
deleted file mode 100644
index e69de29..0000000
diff --git a/NewMethods/fgsld/fglsd_test.py b/NewMethods/fgsld/fglsd_test.py
deleted file mode 100644
index e69de29..0000000
diff --git a/NewMethods/fgsld/fine_grained_sld.py b/NewMethods/fgsld/fine_grained_sld.py
deleted file mode 100644
index e69de29..0000000
diff --git a/NewMethods/fgsld/metrics.py b/NewMethods/fgsld/metrics.py
deleted file mode 100644
index e69de29..0000000
diff --git a/NewMethods/fgsld/plot_fglsd.png b/NewMethods/fgsld/plot_fglsd.png
deleted file mode 100644
index e69de29..0000000
diff --git a/NewMethods/methods.py b/NewMethods/methods.py
deleted file mode 100644
index b47927d..0000000
--- a/NewMethods/methods.py
+++ /dev/null
@@ -1,174 +0,0 @@
-import numpy as np
-from sklearn.base import BaseEstimator
-from sklearn.decomposition import PCA
-from sklearn.preprocessing import StandardScaler
-
-import quapy as qp
-from typing import Union
-
-from quapy.data import LabelledCollection
-from quapy.method.base import BaseQuantifier, BinaryQuantifier
-from quapy.method.aggregative import PACC, EMQ, HDy
-import quapy.functional as F
-from tqdm import tqdm
-from scipy.sparse import issparse, csr_matrix
-import scipy
-
-
-class PACCSLD(PACC):
-    """
-    This method combines the EMQ improved posterior probabilities with PACC.
-    Note: the posterior probabilities are re-calibrated with EMQ only during prediction, and not also during fit since,
-    for PACC, the validation split is known to have the same prevalence as the training set (this is because the split
-    is stratified) and thus the posterior probabilities should not be re-calibrated for a different prior (it actually
-    happens to degrades performance).
-    """
-
-    def fit(self, data: qp.data.LabelledCollection, fit_learner=True, val_split:Union[float, int, qp.data.LabelledCollection]=0.4):
-        self.train_prevalence = F.prevalence_from_labels(data.labels, data.n_classes)
-        return super(PACCSLD, self).fit(data, fit_learner, val_split)
-
-    def aggregate(self, classif_posteriors):
-        priors, posteriors = EMQ.EM(self.train_prevalence, classif_posteriors, epsilon=1e-4)
-        return super(PACCSLD, self).aggregate(posteriors)
-
-
-class HDySLD(HDy):
-    """
-    This method combines the EMQ improved posterior probabilities with HDy.
-    Note: [same as PACCSLD]
-    """
-    def fit(self, data: qp.data.LabelledCollection, fit_learner=True,
-            val_split: Union[float, int, qp.data.LabelledCollection] = 0.4):
-        self.train_prevalence = F.prevalence_from_labels(data.labels, data.n_classes)
-        return super(HDySLD, self).fit(data, fit_learner, val_split)
-
-    def aggregate(self, classif_posteriors):
-        priors, posteriors = EMQ.EM(self.train_prevalence, classif_posteriors, epsilon=1e-4)
-        return super(HDySLD, self).aggregate(posteriors)
-
-
-
-class AveragePoolQuantification(BinaryQuantifier):
-    def __init__(self, learner, sample_size, trials, n_components=-1, zscore=False):
-        self.learner = learner
-        self.sample_size = sample_size
-        self.trials = trials
-
-        self.do_zscore = zscore
-        self.zscore = StandardScaler() if self.do_zscore else None
-
-        self.do_pca = n_components>0
-        self.pca = PCA(n_components) if self.do_pca else None
-
-    def fit(self, data: LabelledCollection):
-        training, validation = data.split_stratified(train_prop=0.7)
-
-        X, y = [], []
-
-        nprevpoints = F.get_nprevpoints_approximation(self.trials, data.n_classes)
-        for sample in tqdm(
-                training.artificial_sampling_generator(self.sample_size, n_prevalences=nprevpoints, repeats=1),
-                desc='generating averages'
-        ):
-            X.append(sample.instances.mean(axis=0))
-            y.append(sample.prevalence()[1])
-        while len(X) < self.trials:
-            sample = training.sampling(self.sample_size, F.uniform_simplex_sampling(data.n_classes))
-            X.append(sample.instances.mean(axis=0))
-            y.append(sample.prevalence())
-        X = np.asarray(np.vstack(X))
-        y = np.asarray(y)
-
-        if self.do_pca:
-            X = self.pca.fit_transform(X)
-            print(X.shape)
-
-        if self.do_zscore:
-            X = self.zscore.fit_transform(X)
-
-        print('training regressor...')
-        self.regressor = self.learner.fit(X, y)
-
-        # correction at 0:
-        print('getting corrections...')
-        X0 = np.asarray(np.vstack([validation.sampling(self.sample_size, 0., shuffle=False).instances.mean(axis=0) for _ in range(100)]))
-        X1 = np.asarray(np.vstack([validation.sampling(self.sample_size, 1., shuffle=False).instances.mean(axis=0) for _ in range(100)]))
-
-        if self.do_pca:
-            X0 = self.pca.transform(X0)
-            X1 = self.pca.transform(X1)
-
-        if self.do_zscore:
-            X0 = self.zscore.transform(X0)
-            X1 = self.zscore.transform(X1)
-
-        self.correction_0 = self.regressor.predict(X0).mean()
-        self.correction_1 = self.regressor.predict(X1).mean()
-
-        print('correction-0', self.correction_0)
-        print('correction-1', self.correction_1)
-        print('done')
-
-    def quantify(self, instances):
-        ave = np.asarray(instances.mean(axis=0))
-
-        if self.do_pca:
-            ave = self.pca.transform(ave)
-        if self.do_zscore:
-            ave = self.zscore.transform(ave)
-        phat = self.regressor.predict(ave).item()
-        phat = np.clip((phat-self.correction_0)/(self.correction_1-self.correction_0), 0, 1)
-        return np.asarray([1-phat, phat])
-
-    def set_params(self, **parameters):
-        self.learner.set_params(**parameters)
-
-    def get_params(self, deep=True):
-        return self.learner.get_params(deep=deep)
-
-
-class WinnowOrthogonal(BaseEstimator):
-
-    def __init__(self):
-        pass
-
-    def fit(self, X, y):
-        self.classes_ = np.asarray(sorted(np.unique(y)))
-        w1 = np.asarray(X[y == 0].mean(axis=0)).flatten()
-        w2 = np.asarray(X[y == 1].mean(axis=0)).flatten()
-        diff = w2 - w1
-        orth = np.ones_like(diff)
-        orth[0] = -diff[1:].sum() / diff[0]
-        orth /= np.linalg.norm(orth)
-        self.w = orth
-        self.b = w1.dot(orth)
-        return self
-
-    def decision_function(self, X):
-        if issparse(X):
-            Z = X.dot(csr_matrix(self.w).T).toarray().flatten()
-            return Z - self.b
-        else:
-            return np.matmul(X, self.w) - self.b
-
-    def predict(self, X):
-        return 1 * (self.decision_function(X) > 0)
-
-    def split(self, X, y):
-        s = self.predict(X)
-        X0a = X[np.logical_and(y == 0, s == 0)]
-        X0b = X[np.logical_and(y == 0, s == 1)]
-        X1a = X[np.logical_and(y == 1, s == 0)]
-        X1b = X[np.logical_and(y == 1, s == 1)]
-        y0a = np.zeros(X0a.shape[0], dtype=np.int)
-        y0b = np.zeros(X0b.shape[0], dtype=np.int)
-        y1a = np.ones(X1a.shape[0], dtype=np.int)
-        y1b = np.ones(X1b.shape[0], dtype=np.int)
-        return X0a, X0b, X1a, X1b, y0a, y0b, y1a, y1b
-
-    def get_params(self):
-        return {}
-
-    def set_params(self, **params):
-        pass
diff --git a/NewMethods/new_experiments.py b/NewMethods/new_experiments.py
deleted file mode 100644
index d60b158..0000000
--- a/NewMethods/new_experiments.py
+++ /dev/null
@@ -1,48 +0,0 @@
-from sklearn.linear_model import LogisticRegression
-import quapy as qp
-from classification.methods import PCALR
-from method.meta import QuaNet
-from quapy.method.aggregative import *
-from NewMethods.methods import *
-from experiments import run, SAMPLE_SIZE
-import numpy as np
-import itertools
-from joblib import Parallel, delayed
-import settings
-import argparse
-import torch
-
-parser = argparse.ArgumentParser(description='Run experiments for Tweeter Sentiment Quantification')
-parser.add_argument('results', metavar='RESULT_PATH', type=str, help='path to the directory where to store the results')
-#parser.add_argument('svmperfpath', metavar='SVMPERF_PATH', type=str, help='path to the directory with svmperf')
-args = parser.parse_args()
-
-
-def quantification_models():
-    def newLR():
-        return LogisticRegression(max_iter=1000, solver='lbfgs', n_jobs=-1)
-    __C_range = np.logspace(-4, 5, 10)
-    lr_params = {'C': __C_range, 'class_weight': [None, 'balanced']}
-    svmperf_params = {'C': __C_range}
-    #yield 'paccsld', PACCSLD(newLR()), lr_params
-    yield 'hdysld', OneVsAll(HDySLD(newLR())), lr_params # <-- promising!
-
-    #device = 'cuda' if torch.cuda.is_available() else 'cpu'
-    #print(f'Running QuaNet in {device}')
-    #yield 'quanet', QuaNet(PCALR(**newLR().get_params()), SAMPLE_SIZE, device=device), lr_params
-
-
-if __name__ == '__main__':
-
-    print(f'Result folder: {args.results}')
-    np.random.seed(0)
-
-    optim_losses = ['mae']
-    datasets = qp.datasets.TWITTER_SENTIMENT_DATASETS_TRAIN
-    models = quantification_models()
-
-    results = Parallel(n_jobs=settings.N_JOBS)(
-        delayed(run)(experiment) for experiment in itertools.product(optim_losses, datasets, models)
-    )
-
-
diff --git a/NewMethods/new_gen_tables.py b/NewMethods/new_gen_tables.py
deleted file mode 100644
index c6aeb7e..0000000
--- a/NewMethods/new_gen_tables.py
+++ /dev/null
@@ -1,148 +0,0 @@
-import quapy as qp
-import numpy as np
-from os import makedirs
-import sys, os
-import pickle
-from experiments import result_path
-from gen_tables import save_table, experiment_errors
-from tabular import Table
-import argparse
-
-tables_path = './tables'
-MAXTONE = 50  # sets the intensity of the maximum color reached by the worst (red) and best (green) results
-
-makedirs(tables_path, exist_ok=True)
-
-sample_size = 100
-qp.environ['SAMPLE_SIZE'] = sample_size
-
-
-nice = {
-    'mae':'AE',
-    'mrae':'RAE',
-    'ae':'AE',
-    'rae':'RAE',
-    'svmkld': 'SVM(KLD)',
-    'svmnkld': 'SVM(NKLD)',
-    'svmq': 'SVM(Q)',
-    'svmae': 'SVM(AE)',
-    'svmnae': 'SVM(NAE)',
-    'svmmae': 'SVM(AE)',
-    'svmmrae': 'SVM(RAE)',
-    'quanet': 'QuaNet',
-    'hdy': 'HDy',
-    'hdysld': 'HDy-SLD',
-    'dys': 'DyS',
-    'svmperf':'',
-    'sanders': 'Sanders',
-    'semeval13': 'SemEval13',
-    'semeval14': 'SemEval14',
-    'semeval15': 'SemEval15',
-    'semeval16': 'SemEval16',
-    'Average': 'Average'
-}
-
-
-def nicerm(key):
-    return '\mathrm{'+nice[key]+'}'
-
-
-if __name__ == '__main__':
-    parser = argparse.ArgumentParser(description='Generate tables for Tweeter Sentiment Quantification')
-    parser.add_argument('results', metavar='RESULT_PATH', type=str,
-                        help='path to the directory containing the results of the methods tested in Gao & Sebastiani')
-    parser.add_argument('newresults', metavar='RESULT_PATH', type=str,
-                        help='path to the directory containing the results for the experimental methods')
-    args = parser.parse_args()
-
-    datasets = qp.datasets.TWITTER_SENTIMENT_DATASETS_TEST
-    evaluation_measures = [qp.error.ae, qp.error.rae]
-    gao_seb_methods = ['cc', 'acc', 'pcc', 'pacc', 'sld', 'svmq', 'svmkld', 'svmnkld']
-    new_methods = ['hdy']  # methods added to the Gao & Sebastiani methods
-    experimental_methods = ['hdysld']  # experimental
-
-    for i, eval_func in enumerate(evaluation_measures):
-
-        # Tables evaluation scores for AE and RAE (two tables)
-        # ----------------------------------------------------
-
-        eval_name = eval_func.__name__
-
-        added_methods = ['svmm' + eval_name] + new_methods
-        methods = gao_seb_methods + added_methods + experimental_methods
-        nold_methods = len(gao_seb_methods)
-        nnew_methods = len(added_methods)
-        nexp_methods = len(experimental_methods)
-
-        # fill data table
-        table = Table(benchmarks=datasets, methods=methods)
-        for dataset in datasets:
-            for method in methods:
-                if method in experimental_methods:
-                    path = args.newresults
-                else:
-                    path = args.results
-                table.add(dataset, method, experiment_errors(path, dataset, method, eval_name))
-
-        # write the latex table
-        tabular = """
-        \\begin{tabularx}{\\textwidth}{|c||""" + ('Y|'*nold_methods) + '|' + ('Y|'*nnew_methods) + '|' + ('Y|'*nexp_methods) + """} \hline
-          & \multicolumn{"""+str(nold_methods)+"""}{c||}{Methods tested in~\cite{Gao:2016uq}} &
-          \multicolumn{"""+str(nnew_methods)+"""}{c|}{} &
-          \multicolumn{"""+str(nexp_methods)+"""}{c|}{}\\\\ \hline
-        """
-        rowreplace={dataset: nice.get(dataset, dataset.upper()) for dataset in datasets}
-        colreplace={method:'\side{' + nice.get(method, method.upper()) +'$^{' + nicerm(eval_name) + '}$} ' for method in methods}
-
-        tabular += table.latexTabular(benchmark_replace=rowreplace, method_replace=colreplace)
-        tabular += "\n\end{tabularx}"
-
-        save_table(f'./tables/tab_results_{eval_name}.new.tex', tabular)
-
-        # Tables ranks for AE and RAE (two tables)
-        # ----------------------------------------------------
-        # fill the data table
-        ranktable = Table(benchmarks=datasets, methods=methods, missing='--')
-        for dataset in datasets:
-            for method in methods:
-                ranktable.add(dataset, method, values=table.get(dataset, method, 'rank'))
-
-        # write the latex table
-        tabular = """
-        \\begin{tabularx}{\\textwidth}{|c||""" + ('Y|'*nold_methods) + '|' + ('Y|'*nnew_methods) + '|' + ('Y|'*nexp_methods) + """} \hline
-          & \multicolumn{"""+str(nold_methods)+"""}{c||}{Methods tested in~\cite{Gao:2016uq}} &
-          \multicolumn{"""+str(nnew_methods)+"""}{c|}{} &
-          \multicolumn{"""+str(nexp_methods)+"""}{c|}{}\\\\ \hline
-        """
-        for method in methods:
-            tabular += ' & \side{' + nice.get(method, method.upper()) +'$^{' + nicerm(eval_name) + '}$} '
-        tabular += '\\\\\hline\n'
-
-        for dataset in datasets:
-            tabular += nice.get(dataset, dataset.upper()) + ' '
-            for method in methods:
-                newrank = ranktable.get(dataset, method)
-                if newrank != '--':
-                    newrank = f'{int(newrank)}'
-                color = ranktable.get_color(dataset, method)
-                if color == '--':
-                    color = ''
-                tabular += ' & ' + f'{newrank}' + color
-            tabular += '\\\\\hline\n'
-        tabular += '\hline\n'
-
-        tabular += 'Average '
-        for method in methods:
-            newrank = ranktable.get_average(method)
-            if newrank != '--':
-                newrank = f'{newrank:.1f}'
-            color = ranktable.get_average(method, 'color')
-            if color == '--':
-                color = ''
-            tabular += ' & ' + f'{newrank}' + color
-        tabular += '\\\\\hline\n'
-        tabular += "\end{tabularx}"
-
-        save_table(f'./tables/tab_rank_{eval_name}.new.tex', tabular)
-
-    print("[Done]")
diff --git a/NewMethods/settings.py b/NewMethods/settings.py
deleted file mode 100644
index 2ade31a..0000000
--- a/NewMethods/settings.py
+++ /dev/null
@@ -1,4 +0,0 @@
-import multiprocessing
-
-N_JOBS = -2 #multiprocessing.cpu_count()
-SAMPLE_SIZE = 100
\ No newline at end of file