making everything work like in the wiki

2021-02-17 18:05:22 +01:00 · 2021-02-17 18:05:22 +01:00 · 854d759dc4
parent 70da8f7925
commit 854d759dc4
9 changed files with 80 additions and 70 deletions
--- a/NewMethods/new_gen_tables.py
+++ b/NewMethods/new_gen_tables.py
@ -75,7 +75,7 @@ if __name__ == '__main__':
        nexp_methods = len(experimental_methods)

        # fill data table
-        table = Table(rows=datasets, cols=methods)
+        table = Table(benchmarks=datasets, methods=methods)
        for dataset in datasets:
            for method in methods:
                if method in experimental_methods:
@ -94,7 +94,7 @@ if __name__ == '__main__':
        rowreplace={dataset: nice.get(dataset, dataset.upper()) for dataset in datasets}
        colreplace={method:'\side{' + nice.get(method, method.upper()) +'$^{' + nicerm(eval_name) + '}$} ' for method in methods}

-        tabular += table.latexTabular(rowreplace=rowreplace, colreplace=colreplace)
+        tabular += table.latexTabular(benchmark_replace=rowreplace, method_replace=colreplace)
        tabular += "\n\end{tabularx}"

        save_table(f'./tables/tab_results_{eval_name}.new.tex', tabular)
@ -102,7 +102,7 @@ if __name__ == '__main__':
        # Tables ranks for AE and RAE (two tables)
        # ----------------------------------------------------
        # fill the data table
-        ranktable = Table(rows=datasets, cols=methods, missing='--')
+        ranktable = Table(benchmarks=datasets, methods=methods, missing='--')
        for dataset in datasets:
            for method in methods:
                ranktable.add(dataset, method, values=table.get(dataset, method, 'rank'))
--- a/TODO.txt
+++ b/TODO.txt
@ -1,20 +1,15 @@
 Documentation with sphinx
-Add quantification_report (akin to classification_report from sklearn) (?)
 Add NAE, NRAE
 Add "measures for evaluating ordinal"?
 Document methods with paper references
-The parallel training in svmperf seems not to work (not sure...)
 In binary quantification (hp, kindle, imdb) we used F1 in the minority class (which in kindle and hp happens to be the
 negative class). This is not covered in this new implementation, in which the binary case is not treated as such, but as
 an instance of single-label with 2 labels. Check
 Add classnames to LabelledCollection ?
-Check the overhead in OneVsAll for SVMperf-based (?)
-Add HDy to QuaNet? if so, wrap HDy into OneVsAll in case the dataset is not binary.
+Add automatic reindexing of class labels in LabelledCollection (currently, class indexes must be ordered and have no gaps)
 Add datasets for topic.
 Clarify whether QuaNet is an aggregative method or not.
-Add medium swap method
 Explore the hyperparameter "number of bins" in HDy
-Implement HDy for single-label?
 Rename EMQ to SLD ?
 How many times is the system of equations for ACC and PACC not solved? How many times is it clipped? Do they sum up
    to one always?
--- a/TweetSentQuant/tabular.py
+++ b/TweetSentQuant/tabular.py
@ -16,7 +16,8 @@ class Table:
        self.methods = np.asarray(methods)
        self.method_index = {col:j for j, col in enumerate(methods)}

-        self.map = {}  # keyed (#rows,#cols)-ndarrays holding computations from self.map['values']
+        self.map = {}  
+        # keyed (#rows,#cols)-ndarrays holding computations from self.map['values']
        self._addmap('values', dtype=object)
        self.lower_is_better = lower_is_better
        self.ttest = ttest
@ -28,6 +29,7 @@ class Table:
        self.missing = missing
        self.missing_str = missing_str
        self.color = color
+        
        self.touch()

    @property
@ -39,10 +41,10 @@ class Table:
        return len(self.methods)

    def touch(self):
-        self.modif = True
+        self._modif = True

    def update(self):
-        if self.modif:
+        if self._modif:
            self.compute()

    def _getfilled(self):
@ -61,8 +63,6 @@ class Table:
            return
        m = self.map[map]
        f = func
-        if f is None:
-            return
        indexes = self._indexes() if map == 'fill' else self._getfilled()
        for i, j in indexes:
            m[i, j] = f(self.values[i, j])
@ -75,7 +75,7 @@ class Table:
            if not self.lower_is_better:
                ranked_cols_idx = ranked_cols_idx[::-1]
            self.map['rank'][i, ranked_cols_idx] = np.arange(1, len(filled_cols_idx)+1)
-
+            
    def _addcolor(self):
        for i in range(self.nbenchmarks):
            filled_cols_idx = np.argwhere(self.map['fill'][i]).flatten()
@ -95,7 +95,6 @@ class Table:
                    normval = 1 - normval
                self.map['color'][i, col_idx] = color_red2green_01(normval)

-
    def _run_ttest(self, row, col1, col2):
        mean1 = self.map['mean'][row, col1]
        std1 = self.map['std'][row, col1]
@ -112,7 +111,7 @@ class Table:
        _, p_val = wilcoxon(values1, values2)
        return p_val

-    def _addttest(self):
+    def _add_statistical_test(self):
        if self.ttest is None:
            return
        self.some_similar = [False]*self.nmethods
@ -147,10 +146,10 @@ class Table:
        self._addmap('latex', dtype=object, func=None)
        self._addrank()
        self._addcolor()
-        self._addttest()
+        self._add_statistical_test()
        if self.add_average:
            self._addave()
-        self.modif = False
+        self._modif = False

    def _is_column_full(self, col):
        return all(self.map['fill'][:, self.method_index[col]])
@ -189,11 +188,11 @@ class Table:
        else:
            return self.missing

-    def _coordinates(self, row, col):
-        assert row in self.benchmark_index, f'row {row} out of range'
-        assert col in self.method_index, f'col {col} out of range'
-        rid = self.benchmark_index[row]
-        cid = self.method_index[col]
+    def _coordinates(self, benchmark, method):
+        assert benchmark in self.benchmark_index, f'benchmark {benchmark} out of range'
+        assert method in self.method_index, f'method {method} out of range'
+        rid = self.benchmark_index[benchmark]
+        cid = self.method_index[method]
        return rid, cid

    def get_average(self, method, attr='mean'):
@ -283,7 +282,6 @@ class Table:
        t.compute()
        return t

-
    def dropMethods(self, methods):
        drop_index = [self.method_index[m] for m in methods]
        new_methods = np.delete(self.methods, drop_index)
@ -295,8 +293,6 @@ class Table:
        self.touch()


-
-
 def pval_interpretation(p_val):
    if 0.005 >= p_val:
        return 'Diff'
--- a/quapy/classification/neural.py
+++ b/quapy/classification/neural.py
@ -32,7 +32,7 @@ class NeuralClassifierTrainer:
        super().__init__()

        assert isinstance(net, TextClassifierNet), f'net is not an instance of {TextClassifierNet.__name__}'
-        self.net = net
+        self.net = net.to(device)
        self.vocab_size = self.net.vocabulary_size
        self.trainer_hyperparams={
            'lr': lr,
@ -50,10 +50,12 @@ class NeuralClassifierTrainer:
        self.classes_ = np.asarray([0, 1])

        print(f'[NeuralNetwork running on {device}]')
+
        os.makedirs(Path(checkpointpath).parent, exist_ok=True)

    def reset_net_params(self, vocab_size, n_classes):
        self.net = self.net.__class__(vocab_size, n_classes, **self.learner_hyperparams)
+        self.net = self.net.to(self.trainer_hyperparams['device'])
        self.net.xavier_uniform()

    def get_params(self):
@ -65,7 +67,7 @@ class NeuralClassifierTrainer:
        for key, val in params.items():
            if key in trainer_hyperparams and key in learner_hyperparams:
                raise ValueError(f'the use of parameter {key} is ambiguous since it can refer to '
-                                 f'a parameters of the Trainer or the learner {self.netclass.__name__}')
+                                 f'a parameters of the Trainer or the learner {self.net.__name__}')
            elif key not in trainer_hyperparams and key not in learner_hyperparams:
                raise ValueError(f'parameter {key} is not valid')

@ -81,17 +83,7 @@ class NeuralClassifierTrainer:
    def device(self):
        return next(self.net.parameters()).device

-    def __update_progress_bar(self, pbar):
-        pbar.set_description(f'[{self.net.__class__.__name__}] training epoch={self.current_epoch} '
-                             f'tr-loss={self.status["tr"]["loss"]:.5f} '
-                             f'tr-acc={100 * self.status["tr"]["acc"]:.2f}% '
-                             f'tr-macroF1={100 * self.status["tr"]["f1"]:.2f}% '
-                             f'patience={self.early_stop.patience}/{self.early_stop.PATIENCE_LIMIT} '
-                             f'val-loss={self.status["va"]["loss"]:.5f} '
-                             f'val-acc={100 * self.status["va"]["acc"]:.2f}% '
-                             f'macroF1={100 * self.status["va"]["f1"]:.2f}%')
-
-    def _train_epoch(self, data, status, pbar):
+    def _train_epoch(self, data, status, pbar, epoch):
        self.net.train()
        criterion = torch.nn.CrossEntropyLoss()
        losses, predictions, true_labels = [], [], []
@ -109,9 +101,9 @@ class NeuralClassifierTrainer:
            true_labels.extend(yi.detach().cpu().numpy().tolist())
            status["acc"] = accuracy_score(true_labels, predictions)
            status["f1"] = f1_score(true_labels, predictions, average='macro')
-            self.__update_progress_bar(pbar)
+            self.__update_progress_bar(pbar, epoch)

-    def _test_epoch(self, data, status, pbar):
+    def _test_epoch(self, data, status, pbar, epoch):
        self.net.eval()
        criterion = torch.nn.CrossEntropyLoss()
        losses, predictions, true_labels = [], [], []
@ -127,7 +119,17 @@ class NeuralClassifierTrainer:
            status["loss"] = np.mean(losses)
            status["acc"] = accuracy_score(true_labels, predictions)
            status["f1"] = f1_score(true_labels, predictions, average='macro')
-            self.__update_progress_bar(pbar)
+            self.__update_progress_bar(pbar, epoch)
+
+    def __update_progress_bar(self, pbar, epoch):
+        pbar.set_description(f'[{self.net.__class__.__name__}] training epoch={epoch} '
+                             f'tr-loss={self.status["tr"]["loss"]:.5f} '
+                             f'tr-acc={100 * self.status["tr"]["acc"]:.2f}% '
+                             f'tr-macroF1={100 * self.status["tr"]["f1"]:.2f}% '
+                             f'patience={self.early_stop.patience}/{self.early_stop.PATIENCE_LIMIT} '
+                             f'val-loss={self.status["va"]["loss"]:.5f} '
+                             f'val-acc={100 * self.status["va"]["acc"]:.2f}% '
+                             f'macroF1={100 * self.status["va"]["f1"]:.2f}%')

    def fit(self, instances, labels, val_split=0.3):
        train, val = LabelledCollection(instances, labels).split_stratified(1-val_split)
@ -147,11 +149,11 @@ class NeuralClassifierTrainer:
        self.early_stop = EarlyStop(opt['patience'], lower_is_better=False)

        with tqdm(range(1, opt['epochs'] + 1)) as pbar:
-            for self.current_epoch in pbar:
-                self._train_epoch(train_generator, self.status['tr'], pbar)
-                self._test_epoch(valid_generator, self.status['va'], pbar)
+            for epoch in pbar:
+                self._train_epoch(train_generator, self.status['tr'], pbar, epoch)
+                self._test_epoch(valid_generator, self.status['va'], pbar, epoch)

-                self.early_stop(self.status['va']['f1'], self.current_epoch)
+                self.early_stop(self.status['va']['f1'], epoch)
                if self.early_stop.IMPROVED:
                    torch.save(self.net.state_dict(), checkpoint)
                elif self.early_stop.STOP:
@ -161,7 +163,7 @@ class NeuralClassifierTrainer:
                    break

        print('performing one training pass over the validation set...')
-        self._train_epoch(valid_generator, self.status['tr'], pbar)
+        self._train_epoch(valid_generator, self.status['tr'], pbar, epoch=0)
        print('[done]')

        return self
@ -170,9 +172,6 @@ class NeuralClassifierTrainer:
        return np.argmax(self.predict_proba(instances), axis=-1)

    def predict_proba(self, instances):
-        return self.net.predict_proba(instances)
-
-    def predict_probability_positive(self, instances):
        self.net.eval()
        opt = self.trainer_hyperparams
        with torch.no_grad():
@ -185,9 +184,10 @@ class NeuralClassifierTrainer:
    def transform(self, instances):
        self.net.eval()
        embeddings = []
+        opt = self.trainer_hyperparams
        with torch.no_grad():
            for xi in TorchDataset(instances).asDataloader(
-                    self.batch_size_test, shuffle=False, pad_length=self.padding_length, device=self.device):
+                    opt['batch_size_test'], shuffle=False, pad_length=opt['padding_length'], device=opt['device']):
                embeddings.append(self.net.document_embedding(xi).detach().cpu().numpy())
        return np.concatenate(embeddings)

@ -233,7 +233,7 @@ class TextClassifierNet(torch.nn.Module, metaclass=ABCMeta):

    def predict_proba(self, x):
        logits = self(x)
-        return torch.softmax(logits).detach().cpu().numpy()
+        return torch.softmax(logits, dim=1).detach().cpu().numpy()

    def xavier_uniform(self):
        for p in self.parameters():
--- a/quapy/classification/svmperf.py
+++ b/quapy/classification/svmperf.py
@ -1,9 +1,10 @@
 import random
 import subprocess
 import tempfile
-from os import remove
+from os import remove, makedirs
 from os.path import join, exists
 from subprocess import PIPE, STDOUT
+import shutil

 import numpy as np
 from sklearn.base import BaseEstimator, ClassifierMixin
@ -22,7 +23,6 @@ class SVMperf(BaseEstimator, ClassifierMixin):
        self.verbose = verbose
        self.loss = loss

-
    def set_params(self, **parameters):
        assert list(parameters.keys()) == ['C'], 'currently, only the C parameter is supported'
        self.C = parameters['C']
@ -42,10 +42,15 @@ class SVMperf(BaseEstimator, ClassifierMixin):
        local_random = random.Random()
        # this would allow to run parallel instances of predict
        random_code = '-'.join(str(local_random.randint(0,1000000)) for _ in range(5))
-        self.tmpdir = tempfile.TemporaryDirectory(suffix=random_code)
+        # self.tmpdir = tempfile.TemporaryDirectory(suffix=random_code)
+        # temporary directories are removed as soon as fit terminates under multiprocessing; use a regular directory, removed in __del__
+        self.tmpdir = '.svmperf-' + random_code
+        makedirs(self.tmpdir, exist_ok=True)

-        self.model = join(self.tmpdir.name, 'model-'+random_code)
-        traindat = join(self.tmpdir.name, f'train-{random_code}.dat')
+        # self.model = join(self.tmpdir.name, 'model-'+random_code)
+        # traindat = join(self.tmpdir.name, f'train-{random_code}.dat')
+        self.model = join(self.tmpdir, 'model-'+random_code)
+        traindat = join(self.tmpdir, f'train-{random_code}.dat')

        dump_svmlight_file(X, y, traindat, zero_based=False)

@ -75,8 +80,10 @@ class SVMperf(BaseEstimator, ClassifierMixin):
        # in order to allow for parallel runs of predict, a random code is assigned
        local_random = random.Random()
        random_code = '-'.join(str(local_random.randint(0, 1000000)) for _ in range(5))
-        predictions_path = join(self.tmpdir.name, 'predictions'+random_code+'.dat')
-        testdat = join(self.tmpdir.name, 'test'+random_code+'.dat')
+        # predictions_path = join(self.tmpdir.name, 'predictions'+random_code+'.dat')
+        # testdat = join(self.tmpdir.name, 'test'+random_code+'.dat')
+        predictions_path = join(self.tmpdir, 'predictions' + random_code + '.dat')
+        testdat = join(self.tmpdir, 'test' + random_code + '.dat')
        dump_svmlight_file(X, y, testdat, zero_based=False)

        cmd = ' '.join([self.svmperf_classify, testdat, self.model, predictions_path])
@ -93,4 +100,7 @@ class SVMperf(BaseEstimator, ClassifierMixin):

        return scores

+    def __del__(self):
+        if hasattr(self, 'tmpdir'):
+            shutil.rmtree(self.tmpdir)

--- a/quapy/data/preprocessing.py
+++ b/quapy/data/preprocessing.py
@ -149,7 +149,7 @@ class IndexTransformer:

    def index(self, documents):
        vocab = self.vocabulary_.copy()
-        return [[vocab.getscore(word, self.unk) for word in self.analyzer(doc)] for doc in tqdm(documents, 'indexing')]
+        return [[vocab.get(word, self.unk) for word in self.analyzer(doc)] for doc in tqdm(documents, 'indexing')]

    def fit_transform(self, X, n_jobs=-1):
        return self.fit(X).transform(X, n_jobs=n_jobs)
--- a/quapy/method/aggregative.py
+++ b/quapy/method/aggregative.py
@ -580,7 +580,9 @@ class OneVsAll(AggregativeQuantifier):

    def __parallel(self, func, *args, **kwargs):
        return np.asarray(
-            Parallel(n_jobs=self.n_jobs)(
+            # some quantifiers (in particular, ELM-based ones) cannot be run with multiprocessing, since the temp dir
+            # they create during fit would be removed and no longer be available for predict; hence backend='threading'
+            Parallel(n_jobs=self.n_jobs, backend='threading')(
                delayed(func)(c, *args, **kwargs) for c in self.classes
            )
        )
--- a/quapy/method/meta.py
+++ b/quapy/method/meta.py
@ -1,5 +1,7 @@
 from copy import deepcopy
 from typing import Union
+
+from sklearn.metrics import f1_score, make_scorer, accuracy_score
 from tqdm import tqdm

 import numpy as np
@ -267,7 +269,11 @@ def _instantiate_ensemble(learner, base_quantifier_class, param_grid, optim, par
    if optim is None:
        base_quantifier = base_quantifier_class(learner)
    elif optim in qp.error.CLASSIFICATION_ERROR:
-        learner = GridSearchCV(learner, param_grid)
+        if optim == qp.error.f1e:
+            scoring = make_scorer(f1_score)
+        elif optim == qp.error.acce:
+            scoring = make_scorer(accuracy_score)
+        learner = GridSearchCV(learner, param_grid, scoring=scoring)
        base_quantifier = base_quantifier_class(learner)
    else:
        base_quantifier = GridSearchQ(base_quantifier_class(learner),
--- a/quapy/method/neural.py
+++ b/quapy/method/neural.py
@ -79,13 +79,6 @@ class QuaNetTrainer(BaseQuantifier):
        self.tr_prev = data.prevalence()

        self.learner.fit(*classifier_data.Xy)
-        self.quantifiers = {
-            'cc': CC(self.learner).fit(classifier_data, fit_learner=False),
-            'acc': ACC(self.learner).fit(classifier_data, fit_learner=False, val_split=valid_data),
-            'pcc': PCC(self.learner).fit(classifier_data, fit_learner=False),
-            'pacc': PACC(self.learner).fit(classifier_data, fit_learner=False, val_split=valid_data),
-            'emq': EMQ(self.learner).fit(classifier_data, fit_learner=False),
-        }

        # compute the posterior probabilities of the instances
        valid_posteriors = self.learner.predict_proba(valid_data.instances)
@ -95,6 +88,14 @@ class QuaNetTrainer(BaseQuantifier):
        valid_data.instances = self.learner.transform(valid_data.instances)
        train_data.instances = self.learner.transform(train_data.instances)

+        self.quantifiers = {
+            'cc': CC(self.learner).fit(classifier_data, fit_learner=False),
+            'acc': ACC(self.learner).fit(classifier_data, fit_learner=False, val_split=valid_data),
+            'pcc': PCC(self.learner).fit(classifier_data, fit_learner=False),
+            'pacc': PACC(self.learner).fit(classifier_data, fit_learner=False, val_split=valid_data),
+            'emq': EMQ(self.learner).fit(classifier_data, fit_learner=False),
+        }
+
        self.status = {
            'tr-loss': -1,
            'va-loss': -1,