More UCI datasets, improved plots (larger fonts), and an evaluation script that prints numerical results on the command line

2021-01-27 22:49:54 +01:00 · 2021-01-27 22:49:54 +01:00 · 1d89301089
parent e609c262b4
commit 1d89301089
7 changed files with 108 additions and 32 deletions
--- a/TweetSentQuant/evaluate_results.py
+++ b/TweetSentQuant/evaluate_results.py
@ -0,0 +1,28 @@
+import quapy as qp
+import settings
+import os
+import pickle
+from glob import glob
+import itertools
+import pathlib
+
+qp.environ['SAMPLE_SIZE'] = settings.SAMPLE_SIZE
+
+resultdir = './results'
+methods = ['*']
+
+def evaluate_results(methods, datasets, error_name):
+    """Print the error of every stored experiment matching the given patterns.
+
+    Result files are searched as
+    ``{resultdir}/{dataset}-{method}-{error_name}.pkl``. The path is handed to
+    ``glob``, so the entries of ``methods`` and ``datasets`` may contain
+    wildcards. One line per matching file is printed, sorted alphabetically,
+    in the form ``<file name>: <error with 3 decimals>``.
+
+    :param methods: iterable of method names (or glob patterns)
+    :param datasets: iterable of dataset names (or glob patterns)
+    :param error_name: name of the error measure; it is resolved through
+        ``qp.error.from_name`` and is also part of the result file name
+    """
+    error = qp.error.from_name(error_name)
+    results_str = []
+    for method, dataset in itertools.product(methods, datasets):
+        pattern = f'{resultdir}/{dataset}-{method}-{error_name}.pkl'
+        for experiment in glob(pattern):
+            # open through a context manager so the handle is always closed
+            # NOTE: pickle is only safe on trusted, locally generated files
+            with open(experiment, 'rb') as fin:
+                true_prevalences, estim_prevalences, _tr_prev, _te_prev, _te_prev_estim, _best_params = \
+                    pickle.load(fin)
+            result = error(true_prevalences, estim_prevalences)
+            results_str.append(f'{pathlib.Path(experiment).name}: {result:.3f}')
+    for line in sorted(results_str):
+        print(line)
+
+evaluate_results(methods=['epacc*mae1k'], datasets=['*'], error_name='mae')
--- a/TweetSentQuant/gen_plots.py
+++ b/TweetSentQuant/gen_plots.py
@ -10,6 +10,7 @@ from os.path import join


 qp.environ['SAMPLE_SIZE'] = settings.SAMPLE_SIZE
+plotext='png'

 resultdir = './results'
 plotdir = './plots'
@ -30,7 +31,7 @@ def gather_results(methods, error_name):
 def plot_error_by_drift(methods, error_name, logscale=False, path=None):
    print('plotting error by drift')
    if path is not None:
-        path = join(path, f'error_by_drift_{error_name}.pdf')
+        path = join(path, f'error_by_drift_{error_name}.{plotext}')
    method_names, true_prevs, estim_prevs, tr_prevs = gather_results(methods, error_name)
    qp.plot.error_by_drift(
        method_names,
@ -51,9 +52,9 @@ def diagonal_plot(methods, error_name, path=None):
    if path is not None:
        path = join(path, f'diag_{error_name}')
    method_names, true_prevs, estim_prevs, tr_prevs = gather_results(methods, error_name)
-    qp.plot.binary_diagonal(method_names, true_prevs, estim_prevs, pos_class=0, title='Negative', legend=False, show_std=False, savepath=path+'_neg.pdf')
-    qp.plot.binary_diagonal(method_names, true_prevs, estim_prevs, pos_class=1, title='Neutral',  legend=False, show_std=False, savepath=path+'_neu.pdf')
-    qp.plot.binary_diagonal(method_names, true_prevs, estim_prevs, pos_class=2, title='Positive', legend=True, show_std=False, savepath=path+'_pos.pdf')
+    qp.plot.binary_diagonal(method_names, true_prevs, estim_prevs, pos_class=0, title='Negative', legend=False, show_std=False, savepath=f'{path}_neg.{plotext}')
+    qp.plot.binary_diagonal(method_names, true_prevs, estim_prevs, pos_class=1, title='Neutral',  legend=False, show_std=False, savepath=f'{path}_neu.{plotext}')
+    qp.plot.binary_diagonal(method_names, true_prevs, estim_prevs, pos_class=2, title='Positive', legend=True, show_std=False, savepath=f'{path}_pos.{plotext}')


 def binary_bias_global(methods, error_name, path=None):
@ -61,9 +62,9 @@ def binary_bias_global(methods, error_name, path=None):
    if path is not None:
        path = join(path, f'globalbias_{error_name}')
    method_names, true_prevs, estim_prevs, tr_prevs = gather_results(methods, error_name)
-    qp.plot.binary_bias_global(method_names, true_prevs, estim_prevs, pos_class=0, title='Negative', savepath=path+'_neg.pdf')
-    qp.plot.binary_bias_global(method_names, true_prevs, estim_prevs, pos_class=1, title='Neutral', savepath=path+'_neu.pdf')
-    qp.plot.binary_bias_global(method_names, true_prevs, estim_prevs, pos_class=2, title='Positive', savepath=path+'_pos.pdf')
+    qp.plot.binary_bias_global(method_names, true_prevs, estim_prevs, pos_class=0, title='Negative', savepath=f'{path}_neg.{plotext}')
+    qp.plot.binary_bias_global(method_names, true_prevs, estim_prevs, pos_class=1, title='Neutral', savepath=f'{path}_neu.{plotext}')
+    qp.plot.binary_bias_global(method_names, true_prevs, estim_prevs, pos_class=2, title='Positive', savepath=f'{path}_pos.{plotext}')


 def binary_bias_bins(methods, error_name, path=None):
@ -71,24 +72,24 @@ def binary_bias_bins(methods, error_name, path=None):
    if path is not None:
        path = join(path, f'localbias_{error_name}')
    method_names, true_prevs, estim_prevs, tr_prevs = gather_results(methods, error_name)
-    qp.plot.binary_bias_bins(method_names, true_prevs, estim_prevs, pos_class=0, title='Negative', legend=False, savepath=path+'_neg.pdf')
-    qp.plot.binary_bias_bins(method_names, true_prevs, estim_prevs, pos_class=1, title='Neutral', legend=False, savepath=path+'_neu.pdf')
-    qp.plot.binary_bias_bins(method_names, true_prevs, estim_prevs, pos_class=2, title='Positive', legend=True, savepath=path+'_pos.pdf')
+    qp.plot.binary_bias_bins(method_names, true_prevs, estim_prevs, pos_class=0, title='Negative', legend=False, savepath=f'{path}_neg.{plotext}')
+    qp.plot.binary_bias_bins(method_names, true_prevs, estim_prevs, pos_class=1, title='Neutral', legend=False, savepath=f'{path}_neu.{plotext}')
+    qp.plot.binary_bias_bins(method_names, true_prevs, estim_prevs, pos_class=2, title='Positive', legend=True, savepath=f'{path}_pos.{plotext}')


 gao_seb_methods = ['cc', 'acc', 'pcc', 'pacc', 'sld', 'svmq', 'svmkld', 'svmnkld']
 new_methods_ae = ['svmmae' , 'epaccmaeptr', 'epaccmaemae', 'hdy', 'quanet']
 new_methods_rae = ['svmmrae' , 'epaccmraeptr', 'epaccmraemrae', 'hdy', 'quanet']

-# plot_error_by_drift(gao_seb_methods+new_methods_ae, error_name='ae', path=plotdir)
-# plot_error_by_drift(gao_seb_methods+new_methods_rae, error_name='rae', logscale=True, path=plotdir)
+plot_error_by_drift(gao_seb_methods+new_methods_ae, error_name='ae', path=plotdir)
+plot_error_by_drift(gao_seb_methods+new_methods_rae, error_name='rae', logscale=True, path=plotdir)

-# diagonal_plot(gao_seb_methods+new_methods_ae, error_name='ae', path=plotdir)
-# diagonal_plot(gao_seb_methods+new_methods_rae, error_name='rae', path=plotdir)
+diagonal_plot(gao_seb_methods+new_methods_ae, error_name='ae', path=plotdir)
+diagonal_plot(gao_seb_methods+new_methods_rae, error_name='rae', path=plotdir)

 binary_bias_global(gao_seb_methods+new_methods_ae, error_name='ae', path=plotdir)
 binary_bias_global(gao_seb_methods+new_methods_rae, error_name='rae', path=plotdir)

-# binary_bias_bins(gao_seb_methods+new_methods_ae, error_name='ae', path=plotdir)
-# binary_bias_bins(gao_seb_methods+new_methods_rae, error_name='rae', path=plotdir)
+#binary_bias_bins(gao_seb_methods+new_methods_ae, error_name='ae', path=plotdir)
+#binary_bias_bins(gao_seb_methods+new_methods_rae, error_name='rae', path=plotdir)

--- a/TweetSentQuant/util.py
+++ b/TweetSentQuant/util.py
@ -1,3 +1,5 @@
+import numpy as np
+

 nice = {
    'mae':'AE',
--- a/quapy/init.py
+++ b/quapy/init.py
@ -10,6 +10,8 @@ from . import model_selection
 from . import classification
 from quapy.method.base import isprobabilistic, isaggregative

+__version__ = '0.1'
+
 environ = {
    'SAMPLE_SIZE': None,
    'UNK_TOKEN': '[UNK]',
@ -18,6 +20,5 @@ environ = {
    'PAD_INDEX': 1,
 }

-
 def isbinary(x):
    return x.binary
--- a/quapy/data/datasets.py
+++ b/quapy/data/datasets.py
@ -148,7 +148,11 @@ UCI_DATASETS = ['acute.a', 'acute.b',
                'pageblocks.5',
                #'phoneme', # <-- I haven't found this one...
                'semeion',
-                'sonar'] # ongoing...
+                'sonar',
+                'spambase',
+                'spectf',
+                'tictactoe',
+                'transfusion'] # ongoing...

 def fetch_UCIDataset(dataset_name, data_home=None, verbose=False, test_split=0.3):

@ -180,8 +184,11 @@ def fetch_UCIDataset(dataset_name, data_home=None, verbose=False, test_split=0.3
        'mammographic': 'Mammographic Mass',
        'pageblocks.5': 'Page Blocks Classification (5)',
        'semeion': 'Semeion Handwritten Digit (8)',
-        'sonar': 'Sonar, Mines vs. Rocks'
-
+        'sonar': 'Sonar, Mines vs. Rocks',
+        'spambase': 'Spambase Data Set',
+        'spectf': 'SPECTF Heart Data',
+        'tictactoe': 'Tic-Tac-Toe Endgame Database',
+        'transfusion': 'Blood Transfusion Service Center Data Set '
    }

    # the identifier is an alias for the dataset group, it's part of the url data-folder, and is the name we use
@ -208,8 +215,11 @@ def fetch_UCIDataset(dataset_name, data_home=None, verbose=False, test_split=0.3
        'mammographic': 'mammographic-masses',
        'pageblocks.5': 'page-blocks',
        'semeion': 'semeion',
-        'sonar': 'undocumented/connectionist-bench/sonar'
-
+        'sonar': 'undocumented/connectionist-bench/sonar',
+        'spambase': 'spambase',
+        'spectf': 'spect',
+        'tictactoe': 'tic-tac-toe',
+        'transfusion': 'blood-transfusion'
    }

    # the filename is the name of the file within the data_folder indexed by the identifier
@ -219,7 +229,9 @@ def fetch_UCIDataset(dataset_name, data_home=None, verbose=False, test_split=0.3
        'statlog/german': 'german.data-numeric',
        'mammographic-masses': 'mammographic_masses.data',
        'page-blocks': 'page-blocks.data.Z',
-        'undocumented/connectionist-bench/sonar': 'sonar.all-data'
+        'undocumented/connectionist-bench/sonar': 'sonar.all-data',
+        'spect': ['SPECTF.train', 'SPECTF.test'],
+        'blood-transfusion': 'transfusion.data'
    }

    # the filename containing the dataset description (if any)
@ -228,7 +240,9 @@ def fetch_UCIDataset(dataset_name, data_home=None, verbose=False, test_split=0.3
        '00193': None,
        'statlog/german': 'german.doc',
        'mammographic-masses': 'mammographic_masses.names',
-        'undocumented/connectionist-bench/sonar': 'sonar.names'
+        'undocumented/connectionist-bench/sonar': 'sonar.names',
+        'spect': 'SPECTF.names',
+        'blood-transfusion': 'transfusion.names'
    }

    identifier = identifier_map[dataset_name]
@ -238,8 +252,9 @@ def fetch_UCIDataset(dataset_name, data_home=None, verbose=False, test_split=0.3

    URL = f'http://archive.ics.uci.edu/ml/machine-learning-databases/{identifier}'
    data_dir = join(data_home, 'uci_datasets', identifier)
-    data_path = join(data_dir, filename)
-    download_file_if_not_exists(f'{URL}/{filename}', data_path)
+    if isinstance(filename, str):  # filename could be a list of files, in which case it will be processed later
+        data_path = join(data_dir, filename)
+        download_file_if_not_exists(f'{URL}/{filename}', data_path)

    if descfile:
        try:
@ -368,11 +383,38 @@ def fetch_UCIDataset(dataset_name, data_home=None, verbose=False, test_split=0.3

    if identifier == 'undocumented/connectionist-bench/sonar':
        df = pd.read_csv(data_path, header=None, sep=',')
-        print(df)
        X = df.iloc[:, 0:60].astype(float).values
-        y = df[60].values 
+        y = df[60].values
        y = binarize(y, pos_class='R')

+    if identifier == 'spambase':
+        df = pd.read_csv(data_path, header=None, sep=',')
+        X = df.iloc[:, 0:57].astype(float).values
+        y = df[57].values
+        y = binarize(y, pos_class=1)
+
+    if identifier == 'spect':
+        dfs = []
+        for file in filename:  # filename is a list here: one file per split, all concatenated below
+            data_path = join(data_dir, file)
+            download_file_if_not_exists(f'{URL}/{file}', data_path)  # URL of this file, not of the whole list
+            dfs.append(pd.read_csv(data_path, header=None, sep=','))
+        df = pd.concat(dfs)
+        X = df.iloc[:, 1:45].astype(float).values
+        y = df[0].values
+        y = binarize(y, pos_class=0)
+
+    if identifier == 'tic-tac-toe':
+        df = pd.read_csv(data_path, header=None, sep=',')
+        X = df.iloc[:, 0:9].replace('o',0).replace('b',1).replace('x',2).values
+        y = df[9].values
+        y = binarize(y, pos_class='negative')
+
+    if identifier == 'blood-transfusion':
+        df = pd.read_csv(data_path, sep=',')
+        X = df.iloc[:, 0:4].astype(float).values
+        y = df.iloc[:, 4].values
+        y = binarize(y, pos_class=1)

    data = LabelledCollection(X, y)
    data.stats()
--- a/quapy/plot.py
+++ b/quapy/plot.py
@ -5,9 +5,11 @@ import numpy as np
 from matplotlib import cm

 import quapy as qp
+from matplotlib.font_manager import FontProperties

 plt.rcParams['figure.figsize'] = [12, 8]
 plt.rcParams['figure.dpi'] = 200
+plt.rcParams['font.size'] = 16


 def binary_diagonal(method_names, true_prevs, estim_prevs, pos_class=1, title=None, show_std=True, legend=True, savepath=None):
@ -44,11 +46,11 @@ def binary_diagonal(method_names, true_prevs, estim_prevs, pos_class=1, title=No


 def binary_bias_global(method_names, true_prevs, estim_prevs, pos_class=1, title=None, savepath=None):
+    method_names, true_prevs, estim_prevs = _merge(method_names, true_prevs, estim_prevs)
+
    fig, ax = plt.subplots()
    ax.grid()

-    method_names, true_prevs, estim_prevs = _merge(method_names, true_prevs, estim_prevs)
-
    data, labels = [], []
    for method, true_prev, estim_prev in zip(method_names, true_prevs, estim_prevs):
        true_prev = true_prev[:,pos_class]
--- a/test.py
+++ b/test.py
@ -12,8 +12,8 @@ from classification.neural import NeuralClassifierTrainer, CNNnet
 from method.meta import EPACC
 from quapy.model_selection import GridSearchQ

-# dataset = qp.datasets.fetch_UCIDataset('sonar', verbose=True)
-# sys.exit(0)
+dataset = qp.datasets.fetch_UCIDataset('transfusion', verbose=True)
+sys.exit(0)


 qp.environ['SAMPLE_SIZE'] = 500