forked from moreo/QuaPy
cleaning
This commit is contained in:
parent 168c109794
commit 1aafd10e25
@@ -1,174 +0,0 @@
import numpy as np
from sklearn.base import BaseEstimator
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

import quapy as qp
from typing import Union

from quapy.data import LabelledCollection
from quapy.method.base import BaseQuantifier, BinaryQuantifier
from quapy.method.aggregative import PACC, EMQ, HDy
import quapy.functional as F
from tqdm import tqdm
from scipy.sparse import issparse, csr_matrix
import scipy


class PACCSLD(PACC):
    """
    This method combines the EMQ-improved posterior probabilities with PACC.
    Note: the posterior probabilities are re-calibrated with EMQ only during prediction, and not also during fit,
    since for PACC the validation split is known to have the same prevalence as the training set (the split is
    stratified) and thus the posterior probabilities should not be re-calibrated for a different prior (doing so
    actually degrades performance).
    """

    def fit(self, data: qp.data.LabelledCollection, fit_learner=True,
            val_split: Union[float, int, qp.data.LabelledCollection] = 0.4):
        self.train_prevalence = F.prevalence_from_labels(data.labels, data.n_classes)
        return super(PACCSLD, self).fit(data, fit_learner, val_split)

    def aggregate(self, classif_posteriors):
        priors, posteriors = EMQ.EM(self.train_prevalence, classif_posteriors, epsilon=1e-4)
        return super(PACCSLD, self).aggregate(posteriors)


class HDySLD(HDy):
    """
    This method combines the EMQ-improved posterior probabilities with HDy.
    Note: [same as PACCSLD]
    """

    def fit(self, data: qp.data.LabelledCollection, fit_learner=True,
            val_split: Union[float, int, qp.data.LabelledCollection] = 0.4):
        self.train_prevalence = F.prevalence_from_labels(data.labels, data.n_classes)
        return super(HDySLD, self).fit(data, fit_learner, val_split)

    def aggregate(self, classif_posteriors):
        priors, posteriors = EMQ.EM(self.train_prevalence, classif_posteriors, epsilon=1e-4)
        return super(HDySLD, self).aggregate(posteriors)

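
Both classes follow the standard QuaPy fit/quantify workflow. A minimal usage sketch, not part of the original commit (the dataset fetch and the learner settings are illustrative assumptions):

# Hypothetical usage sketch for PACCSLD/HDySLD; dataset choice is an assumption.
from sklearn.linear_model import LogisticRegression
import quapy as qp

dataset = qp.datasets.fetch_twitter('semeval13')     # assumed dataset
model = PACCSLD(LogisticRegression(max_iter=1000))
model.fit(dataset.training)                          # records the training prevalence, then fits PACC
estim_prev = model.quantify(dataset.test.instances)  # EMQ-recalibrated posteriors, aggregated by PACC
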
class AveragePoolQuantification(BinaryQuantifier):
    def __init__(self, learner, sample_size, trials, n_components=-1, zscore=False):
        self.learner = learner
        self.sample_size = sample_size
        self.trials = trials

        self.do_zscore = zscore
        self.zscore = StandardScaler() if self.do_zscore else None

        self.do_pca = n_components > 0
        self.pca = PCA(n_components) if self.do_pca else None

    def fit(self, data: LabelledCollection):
        training, validation = data.split_stratified(train_prop=0.7)

        X, y = [], []

        nprevpoints = F.get_nprevpoints_approximation(self.trials, data.n_classes)
        for sample in tqdm(
                training.artificial_sampling_generator(self.sample_size, n_prevalences=nprevpoints, repeats=1),
                desc='generating averages'
        ):
            X.append(sample.instances.mean(axis=0))
            y.append(sample.prevalence()[1])
        while len(X) < self.trials:
            sample = training.sampling(self.sample_size, F.uniform_simplex_sampling(data.n_classes))
            X.append(sample.instances.mean(axis=0))
            y.append(sample.prevalence()[1])  # positive-class prevalence, matching the loop above
        X = np.asarray(np.vstack(X))
        y = np.asarray(y)

        if self.do_pca:
            X = self.pca.fit_transform(X)
            print(X.shape)

        if self.do_zscore:
            X = self.zscore.fit_transform(X)

        print('training regressor...')
        self.regressor = self.learner.fit(X, y)

        # correction at 0 and 1: the regressor's mean output on samples of pure prevalence
        print('getting corrections...')
        X0 = np.asarray(np.vstack([validation.sampling(self.sample_size, 0., shuffle=False).instances.mean(axis=0) for _ in range(100)]))
        X1 = np.asarray(np.vstack([validation.sampling(self.sample_size, 1., shuffle=False).instances.mean(axis=0) for _ in range(100)]))

        if self.do_pca:
            X0 = self.pca.transform(X0)
            X1 = self.pca.transform(X1)

        if self.do_zscore:
            X0 = self.zscore.transform(X0)
            X1 = self.zscore.transform(X1)

        self.correction_0 = self.regressor.predict(X0).mean()
        self.correction_1 = self.regressor.predict(X1).mean()

        print('correction-0', self.correction_0)
        print('correction-1', self.correction_1)
        print('done')

    def quantify(self, instances):
        ave = np.asarray(instances.mean(axis=0))

        if self.do_pca:
            ave = self.pca.transform(ave)
        if self.do_zscore:
            ave = self.zscore.transform(ave)
        phat = self.regressor.predict(ave).item()
        phat = np.clip((phat - self.correction_0) / (self.correction_1 - self.correction_0), 0, 1)
        return np.asarray([1 - phat, phat])

    def set_params(self, **parameters):
        self.learner.set_params(**parameters)

    def get_params(self, deep=True):
        return self.learner.get_params(deep=deep)

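
The correction learned in fit is applied in quantify as an affine rescaling of the raw regressor output. A standalone sketch of that computation, with assumed values:

# Illustrative numbers; correction_0/correction_1 are the regressor's mean
# predictions on all-negative and all-positive validation samples.
import numpy as np
correction_0, correction_1 = 0.08, 0.93
phat = 0.50                                            # raw regressor output
phat = np.clip((phat - correction_0) / (correction_1 - correction_0), 0, 1)
print(phat)  # ~0.494: outputs are re-anchored so that 0.08 -> 0 and 0.93 -> 1
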
class WinnowOrthogonal(BaseEstimator):

    def __init__(self):
        pass

    def fit(self, X, y):
        self.classes_ = np.asarray(sorted(np.unique(y)))
        w1 = np.asarray(X[y == 0].mean(axis=0)).flatten()
        w2 = np.asarray(X[y == 1].mean(axis=0)).flatten()
        diff = w2 - w1
        # build a unit vector orthogonal to the difference of the class means
        orth = np.ones_like(diff)
        orth[0] = -diff[1:].sum() / diff[0]
        orth /= np.linalg.norm(orth)
        self.w = orth
        self.b = w1.dot(orth)
        return self

    def decision_function(self, X):
        if issparse(X):
            Z = X.dot(csr_matrix(self.w).T).toarray().flatten()
            return Z - self.b
        else:
            return np.matmul(X, self.w) - self.b

    def predict(self, X):
        return 1 * (self.decision_function(X) > 0)

    def split(self, X, y):
        s = self.predict(X)
        X0a = X[np.logical_and(y == 0, s == 0)]
        X0b = X[np.logical_and(y == 0, s == 1)]
        X1a = X[np.logical_and(y == 1, s == 0)]
        X1b = X[np.logical_and(y == 1, s == 1)]
        y0a = np.zeros(X0a.shape[0], dtype=int)  # builtin int: np.int is deprecated
        y0b = np.zeros(X0b.shape[0], dtype=int)
        y1a = np.ones(X1a.shape[0], dtype=int)
        y1b = np.ones(X1b.shape[0], dtype=int)
        return X0a, X0b, X1a, X1b, y0a, y0b, y1a, y1b

    def get_params(self):
        return {}

    def set_params(self, **params):
        pass
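
The fit method above builds a separating direction orthogonal to the difference of the two class means: every component of orth is 1 except the first, which is chosen so the dot product with diff cancels. A quick numeric check, not in the original file:

# Verifies the orthogonality construction on an assumed 3-dimensional example.
import numpy as np
diff = np.asarray([0.5, -0.2, 0.3])   # assumed difference of class means
orth = np.ones_like(diff)
orth[0] = -diff[1:].sum() / diff[0]   # so that diff[0]*orth[0] + diff[1:].sum() == 0
orth /= np.linalg.norm(orth)
print(np.dot(orth, diff))             # 0.0 up to floating-point error
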
@@ -1,48 +0,0 @@
from sklearn.linear_model import LogisticRegression
import quapy as qp
from classification.methods import PCALR
from method.meta import QuaNet
from quapy.method.aggregative import *
from NewMethods.methods import *
from experiments import run, SAMPLE_SIZE
import numpy as np
import itertools
from joblib import Parallel, delayed
import settings
import argparse
import torch

parser = argparse.ArgumentParser(description='Run experiments for Twitter Sentiment Quantification')
parser.add_argument('results', metavar='RESULT_PATH', type=str, help='path to the directory where to store the results')
#parser.add_argument('svmperfpath', metavar='SVMPERF_PATH', type=str, help='path to the directory with svmperf')
args = parser.parse_args()


def quantification_models():
    def newLR():
        return LogisticRegression(max_iter=1000, solver='lbfgs', n_jobs=-1)
    __C_range = np.logspace(-4, 5, 10)
    lr_params = {'C': __C_range, 'class_weight': [None, 'balanced']}
    svmperf_params = {'C': __C_range}
    #yield 'paccsld', PACCSLD(newLR()), lr_params
    yield 'hdysld', OneVsAll(HDySLD(newLR())), lr_params  # <-- promising!

    #device = 'cuda' if torch.cuda.is_available() else 'cpu'
    #print(f'Running QuaNet in {device}')
    #yield 'quanet', QuaNet(PCALR(**newLR().get_params()), SAMPLE_SIZE, device=device), lr_params


if __name__ == '__main__':

    print(f'Result folder: {args.results}')
    np.random.seed(0)

    optim_losses = ['mae']
    datasets = qp.datasets.TWITTER_SENTIMENT_DATASETS_TRAIN
    models = quantification_models()

    results = Parallel(n_jobs=settings.N_JOBS)(
        delayed(run)(experiment) for experiment in itertools.product(optim_losses, datasets, models)
    )
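
Each experiment dispatched to run is one (optimization loss, dataset, model tuple) combination; a small sketch of what itertools.product yields here, with assumed dataset names:

# Illustrative expansion of the experiment grid (values assumed for brevity).
import itertools
optim_losses = ['mae']
datasets = ['semeval13', 'semeval14']
models = [('hdysld', 'model', 'params')]
for experiment in itertools.product(optim_losses, datasets, models):
    print(experiment)
# ('mae', 'semeval13', ('hdysld', 'model', 'params'))
# ('mae', 'semeval14', ('hdysld', 'model', 'params'))
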
@@ -1,148 +0,0 @@
import quapy as qp
import numpy as np
from os import makedirs
import sys, os
import pickle
from experiments import result_path
from gen_tables import save_table, experiment_errors
from tabular import Table
import argparse

tables_path = './tables'
MAXTONE = 50  # sets the intensity of the maximum color reached by the worst (red) and best (green) results

makedirs(tables_path, exist_ok=True)

sample_size = 100
qp.environ['SAMPLE_SIZE'] = sample_size


nice = {
    'mae': 'AE',
    'mrae': 'RAE',
    'ae': 'AE',
    'rae': 'RAE',
    'svmkld': 'SVM(KLD)',
    'svmnkld': 'SVM(NKLD)',
    'svmq': 'SVM(Q)',
    'svmae': 'SVM(AE)',
    'svmnae': 'SVM(NAE)',
    'svmmae': 'SVM(AE)',
    'svmmrae': 'SVM(RAE)',
    'quanet': 'QuaNet',
    'hdy': 'HDy',
    'hdysld': 'HDy-SLD',
    'dys': 'DyS',
    'svmperf': '',
    'sanders': 'Sanders',
    'semeval13': 'SemEval13',
    'semeval14': 'SemEval14',
    'semeval15': 'SemEval15',
    'semeval16': 'SemEval16',
    'Average': 'Average'
}


def nicerm(key):
    return '\\mathrm{' + nice[key] + '}'  # escaped backslash: '\m' is an invalid string escape


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Generate tables for Twitter Sentiment Quantification')
    parser.add_argument('results', metavar='RESULT_PATH', type=str,
                        help='path to the directory containing the results of the methods tested in Gao & Sebastiani')
    parser.add_argument('newresults', metavar='NEW_RESULT_PATH', type=str,
                        help='path to the directory containing the results for the experimental methods')
    args = parser.parse_args()

    datasets = qp.datasets.TWITTER_SENTIMENT_DATASETS_TEST
    evaluation_measures = [qp.error.ae, qp.error.rae]
    gao_seb_methods = ['cc', 'acc', 'pcc', 'pacc', 'sld', 'svmq', 'svmkld', 'svmnkld']
    new_methods = ['hdy']  # methods added to the Gao & Sebastiani methods
    experimental_methods = ['hdysld']  # experimental

    for i, eval_func in enumerate(evaluation_measures):

        # Tables with evaluation scores for AE and RAE (two tables)
        # ----------------------------------------------------

        eval_name = eval_func.__name__

        added_methods = ['svmm' + eval_name] + new_methods
        methods = gao_seb_methods + added_methods + experimental_methods
        nold_methods = len(gao_seb_methods)
        nnew_methods = len(added_methods)
        nexp_methods = len(experimental_methods)

        # fill the data table
        table = Table(benchmarks=datasets, methods=methods)
        for dataset in datasets:
            for method in methods:
                if method in experimental_methods:
                    path = args.newresults
                else:
                    path = args.results
                table.add(dataset, method, experiment_errors(path, dataset, method, eval_name))

        # write the latex table
        tabular = """
        \\begin{tabularx}{\\textwidth}{|c||""" + ('Y|' * nold_methods) + '|' + ('Y|' * nnew_methods) + '|' + ('Y|' * nexp_methods) + """} \\hline
          & \\multicolumn{""" + str(nold_methods) + """}{c||}{Methods tested in~\\cite{Gao:2016uq}} &
          \\multicolumn{""" + str(nnew_methods) + """}{c|}{} &
          \\multicolumn{""" + str(nexp_methods) + """}{c|}{}\\\\ \\hline
        """
        rowreplace = {dataset: nice.get(dataset, dataset.upper()) for dataset in datasets}
        colreplace = {method: '\\side{' + nice.get(method, method.upper()) + '$^{' + nicerm(eval_name) + '}$} ' for method in methods}

        tabular += table.latexTabular(benchmark_replace=rowreplace, method_replace=colreplace)
        tabular += '\n\\end{tabularx}'

        save_table(f'./tables/tab_results_{eval_name}.new.tex', tabular)

        # Tables with ranks for AE and RAE (two tables)
        # ----------------------------------------------------
        # fill the data table
        ranktable = Table(benchmarks=datasets, methods=methods, missing='--')
        for dataset in datasets:
            for method in methods:
                ranktable.add(dataset, method, values=table.get(dataset, method, 'rank'))

        # write the latex table
        tabular = """
        \\begin{tabularx}{\\textwidth}{|c||""" + ('Y|' * nold_methods) + '|' + ('Y|' * nnew_methods) + '|' + ('Y|' * nexp_methods) + """} \\hline
          & \\multicolumn{""" + str(nold_methods) + """}{c||}{Methods tested in~\\cite{Gao:2016uq}} &
          \\multicolumn{""" + str(nnew_methods) + """}{c|}{} &
          \\multicolumn{""" + str(nexp_methods) + """}{c|}{}\\\\ \\hline
        """
        for method in methods:
            tabular += ' & \\side{' + nice.get(method, method.upper()) + '$^{' + nicerm(eval_name) + '}$} '
        tabular += '\\\\\\hline\n'

        for dataset in datasets:
            tabular += nice.get(dataset, dataset.upper()) + ' '
            for method in methods:
                newrank = ranktable.get(dataset, method)
                if newrank != '--':
                    newrank = f'{int(newrank)}'
                color = ranktable.get_color(dataset, method)
                if color == '--':
                    color = ''
                tabular += ' & ' + f'{newrank}' + color
            tabular += '\\\\\\hline\n'
        tabular += '\\hline\n'

        tabular += 'Average '
        for method in methods:
            newrank = ranktable.get_average(method)
            if newrank != '--':
                newrank = f'{newrank:.1f}'
            color = ranktable.get_average(method, 'color')
            if color == '--':
                color = ''
            tabular += ' & ' + f'{newrank}' + color
        tabular += '\\\\\\hline\n'
        tabular += '\\end{tabularx}'

        save_table(f'./tables/tab_rank_{eval_name}.new.tex', tabular)

    print("[Done]")
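
For reference, with eval_name='ae' the lists above give 8 Gao & Sebastiani methods, 2 added methods ('svmmae', 'hdy') and 1 experimental method ('hdysld'), so the tabularx column specification is built as in this sketch:

# Illustrative: the column layout generated for eval_name='ae'.
nold, nnew, nexp = 8, 2, 1
colspec = '|c||' + ('Y|' * nold) + '|' + ('Y|' * nnew) + '|' + ('Y|' * nexp)
print(colspec)  # |c||Y|Y|Y|Y|Y|Y|Y|Y||Y|Y||Y|
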
@@ -1,4 +0,0 @@
import multiprocessing

N_JOBS = -2  # with joblib, -2 means "all CPUs but one"; alternatively: multiprocessing.cpu_count()
SAMPLE_SIZE = 100