refactoring everything

This commit is contained in:
parent 8399552c8d
commit 985f430d52
@@ -8,50 +8,28 @@ from quapy.protocol import AbstractProtocol
 import json
 
 
-def load_txt_sample(path, parse_columns, verbose=False, max_lines=None):
-    # print('reading', path)
-    if verbose:
-        print(f'loading {path}...', end='')
-    df = pd.read_csv(path, sep='\t')
-    if verbose:
-        print('[done]')
-    X = df['text'].values
-    y = df['continent'].values
-
-    if parse_columns:
-        rank = df['rank'].values
-        scores = df['score'].values
-        rank = rank[y != 'Antarctica']
-        scores = scores[y != 'Antarctica']
-
-    X = X[y!='Antarctica']
-    y = y[y!='Antarctica']
-
-    if parse_columns:
-        order = np.argsort(rank)
-        X = X[order]
-        y = y[order]
-        rank = rank[order]
-        scores = scores[order]
-
-    if max_lines is not None:
-        X = X[:max_lines]
-        y = y[:max_lines]
-
-    return X, y
-
-
-def load_json_sample(path, class_name, max_lines=-1):
-    obj = json.load(open(path, 'rt'))
-    keys = [f'{id}' for id in range(len(obj['text'].keys()))]
-    text = [obj['text'][id] for id in keys]
-    #print(list(obj.keys()))
-    #import sys; sys.exit(0)
-    classes = [obj[class_name][id] for id in keys]
+def load_sample(path, class_name, max_lines=-1):
+    """
+    Loads a sample json as a dataframe and returns text and labels for
+    the given class_name
+
+    :param path: path to a json file
+    :param class_name: string representing the target class
+    :param max_lines: if provided and > 0 then returns only the
+        first requested number of instances
+    :return: texts and labels for class_name
+    """
+    df = pd.read_json(path)
+    text = df.text.values
+    try:
+        labels = df[class_name].values
+    except KeyError as e:
+        print(f'error in {path}; key {class_name} not found')
+        raise e
     if max_lines is not None and max_lines>0:
         text = text[:max_lines]
-        classes = classes[:max_lines]
-    return text, classes
+        labels = labels[:max_lines]
+    return text, labels
 
 
 class TextRankings:
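A minimal usage sketch of the new load_sample loader (not from this commit; the file name and the 'continent' attribute below are illustrative assumptions):

# hypothetical file name and target attribute, for illustration only
texts, labels = load_sample('training_Query-1Sample-200SPLIT.json',
                            class_name='continent',
                            max_lines=100)
print(len(texts), len(labels))  # at most 100 texts and their labels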
@@ -75,49 +53,81 @@ class TextRankings:
         return texts, labels
 
 
-def get_query_id_from_path(path, prefix='training', posfix='200SPLIT'):
-    qid = path
-    qid = qid[:qid.index(posfix)]
-    qid = qid[qid.index(prefix)+len(prefix):]
-    return qid
+def filter_by_classes(X, y, classes):
+    idx = np.isin(y, classes)
+    return X[idx], y[idx]
 
 
 class RetrievedSamples(AbstractProtocol):
 
-    def __init__(self, path_dir: str, load_fn, vectorizer, max_train_lines=None, max_test_lines=None, classes=None, class_name=None):
-        self.path_dir = path_dir
+    def __init__(self,
+                 class_home: str,
+                 test_rankings_path: str,
+                 load_fn,
+                 vectorizer,
+                 class_name,
+                 max_train_lines=None,
+                 max_test_lines=None,
+                 classes=None
+                 ):
+        self.class_home = class_home
+        self.test_rankings_df = pd.read_json(test_rankings_path)
         self.load_fn = load_fn
         self.vectorizer = vectorizer
+        self.class_name = class_name
         self.max_train_lines = max_train_lines
         self.max_test_lines = max_test_lines
         self.classes=classes
-        assert class_name is not None, 'class name should be specified'
-        self.class_name = class_name
-        self.text_samples = TextRankings(join(self.path_dir, 'testRankingsRetrieval.json'), class_name=class_name)
 
 
     def __call__(self):
 
-        for file in glob(join(self.path_dir, 'training*SPLIT.json')):
+        for file in self._list_queries():
 
-            X, y = self.load_fn(file, class_name=self.class_name, max_lines=self.max_train_lines)
-            X = self.vectorizer.transform(X)
+            texts, y = self.load_fn(file, class_name=self.class_name, max_lines=self.max_train_lines)
+            texts, y = filter_by_classes(texts, y, self.classes)
+            X = self.vectorizer.transform(texts)
             train_sample = LabelledCollection(X, y, classes=self.classes)
 
-            query_id = get_query_id_from_path(file)
-            X, y = self.text_samples.get_sample_Xy(query_id, max_lines=self.max_test_lines)
+            query_id = self._get_query_id_from_path(file)
+            texts, y = self._get_test_sample(query_id, max_lines=self.max_test_lines)
+            texts, y = filter_by_classes(texts, y, self.classes)
+            X = self.vectorizer.transform(texts)
 
-            # if len(X)!=qp.environ['SAMPLE_SIZE']:
-            #     print(f'[warning]: file {file} contains {len(X)} instances (expected: {qp.environ["SAMPLE_SIZE"]})')
-            # assert len(X) == qp.environ['SAMPLE_SIZE'], f'unexpected sample size for file {file}, found {len(X)}'
-            X = self.vectorizer.transform(X)
             try:
                 test_sample = LabelledCollection(X, y, classes=train_sample.classes_)
+                yield train_sample, test_sample
             except ValueError as e:
-                print(f'file {file} caused error {e}')
+                print(f'file {file} caused an exception: {e}')
                 yield None, None
 
-            # print('train #classes:', train_sample.n_classes, train_sample.prevalence())
-            # print('test #classes:', test_sample.n_classes, test_sample.prevalence())
-
-            yield train_sample, test_sample
+    def _list_queries(self):
+        return sorted(glob(join(self.class_home, 'training_Query*200SPLIT.json')))
+
+    def _get_test_sample(self, query_id, max_lines=-1):
+        df = self.test_rankings_df
+        sel_df = df[df.qid==int(query_id)]
+        texts = sel_df.text.values
+        try:
+            labels = sel_df[self.class_name].values
+        except KeyError as e:
+            print(f'error: key {self.class_name} not found in test rankings')
+            raise e
+        if max_lines > 0 and len(texts) > max_lines:
+            ranks = sel_df.rank.values
+            idx = np.argsort(ranks)[:max_lines]
+            texts = np.asarray(texts)[idx]
+            labels = np.asarray(labels)[idx]
+        return texts, labels
+
+    def total(self):
+        return len(self._list_queries())
+
+    def _get_query_id_from_path(self, path):
+        prefix = 'training_Query-'
+        posfix = 'Sample-200SPLIT'
+        qid = path
+        qid = qid[:qid.index(posfix)]
+        qid = qid[qid.index(prefix) + len(prefix):]
+        return qid
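A hypothetical sketch of how the refactored RetrievedSamples protocol could be driven (not from this commit; paths, class names, and the class list are illustrative, and the vectorizer is assumed to be fitted beforehand since the protocol only calls transform):

from sklearn.feature_extraction.text import TfidfVectorizer

# assumption: the vectorizer has already been fitted on some training corpus
vectorizer = TfidfVectorizer(sublinear_tf=True, min_df=10)

prot = RetrievedSamples(class_home='continent',
                        test_rankings_path='testRankingsRetrieval.json',
                        load_fn=load_sample,
                        vectorizer=vectorizer,
                        class_name='continent',
                        classes=['Africa', 'Asia', 'Europe'],
                        max_test_lines=100)

print('queries found:', prot.total())
for train_sample, test_sample in prot():
    if train_sample is None:
        continue  # queries whose test sample could not be built yield (None, None)
    print(train_sample.prevalence(), test_sample.prevalence())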
@@ -1,427 +0,0 @@
-import os.path
-import numpy as np
-import itertools
-from scipy.stats import ttest_ind_from_stats, wilcoxon
-from pathlib import Path
-from os.path import join
-
-
-class Table:
-    VALID_TESTS = [None, "wilcoxon", "ttest"]
-
-    def __init__(self, benchmarks, methods, lower_is_better=True, ttest='ttest', prec_mean=3,
-                 clean_zero=False, show_std=False, prec_std=3, average=True, missing=None, missing_str='--',
-                 color=True, color_mode='local', maxtone=50):
-        assert ttest in self.VALID_TESTS, f'unknown test, valid are {self.VALID_TESTS}'
-
-        self.benchmarks = np.asarray(benchmarks)
-        self.benchmark_index = {row:i for i, row in enumerate(benchmarks)}
-
-        self.methods = np.asarray(methods)
-        self.method_index = {col:j for j, col in enumerate(methods)}
-
-        self.map = {}
-        # keyed (#rows,#cols)-ndarrays holding computations from self.map['values']
-        self._addmap('values', dtype=object)
-        self.lower_is_better = lower_is_better
-        self.ttest = ttest
-        self.prec_mean = prec_mean
-        self.clean_zero = clean_zero
-        self.show_std = show_std
-        self.prec_std = prec_std
-        self.add_average = average
-        self.missing = missing
-        self.missing_str = missing_str
-        self.color = color
-        self.color_mode = color_mode
-        self.maxtone = maxtone
-
-        self.touch()
-
-    @property
-    def nbenchmarks(self):
-        return len(self.benchmarks)
-
-    @property
-    def nmethods(self):
-        return len(self.methods)
-
-    def touch(self):
-        self._modif = True
-
-    def update(self):
-        if self._modif:
-            self.compute()
-
-    def _getfilled(self):
-        return np.argwhere(self.map['fill'])
-
-    @property
-    def values(self):
-        return self.map['values']
-
-    def _indexes(self):
-        return itertools.product(range(self.nbenchmarks), range(self.nmethods))
-
-    def _addmap(self, map, dtype, func=None):
-        self.map[map] = np.empty((self.nbenchmarks, self.nmethods), dtype=dtype)
-        if func is None:
-            return
-        m = self.map[map]
-        f = func
-        indexes = self._indexes() if map == 'fill' else self._getfilled()
-        for i, j in indexes:
-            m[i, j] = f(self.values[i, j])
-
-    def _addrank(self):
-        for i in range(self.nbenchmarks):
-            filled_cols_idx = np.argwhere(self.map['fill'][i]).flatten()
-            col_means = [self.map['mean'][i,j] for j in filled_cols_idx]
-            ranked_cols_idx = filled_cols_idx[np.argsort(col_means)]
-            if not self.lower_is_better:
-                ranked_cols_idx = ranked_cols_idx[::-1]
-            self.map['rank'][i, ranked_cols_idx] = np.arange(1, len(filled_cols_idx)+1)
-
-    def _addcolor(self):
-        minval = {}
-        maxval = {}
-
-        if self.color_mode == 'global':
-            filled_cols_idx = np.argwhere(self.map['fill'])
-            col_means = [self.map['mean'][i, j] for i, j in filled_cols_idx]
-            if len(filled_cols_idx) > 0:
-                global_minval = min(col_means)
-                global_maxval = max(col_means)
-                for i in range(self.nbenchmarks):
-                    minval[i] = global_minval
-                    maxval[i] = global_maxval
-        elif self.color_mode == 'local':
-            for i in range(self.nbenchmarks):
-                filled_cols_idx = np.argwhere(self.map['fill'][i, i + 1])
-                if len(filled_cols_idx)>0:
-                    col_means = [self.map['mean'][i, j] for j in filled_cols_idx]
-                    minval[i] = min(col_means)
-                    maxval[i] = max(col_means)
-
-        else:
-            print(f'color mode {self.color_mode} not understood, valid ones are "local" and "global"; skip')
-            return
-
-        for i in range(self.nbenchmarks):
-            filled_cols_idx = np.argwhere(self.map['fill'][i]).flatten()
-            for col_idx in filled_cols_idx:
-                val = self.map['mean'][i,col_idx]
-                if i not in maxval or i not in minval:
-                    continue
-                norm = (maxval[i] - minval[i])
-                if norm > 0:
-                    normval = (val - minval[i]) / norm
-                else:
-                    normval = 0.5
-
-                if self.lower_is_better:
-                    normval = 1 - normval
-
-                normval = np.clip(normval, 0,1)
-
-                self.map['color'][i, col_idx] = color_red2green_01(normval, self.maxtone)
-
-    def _run_ttest(self, row, col1, col2):
-        mean1 = self.map['mean'][row, col1]
-        std1 = self.map['std'][row, col1]
-        nobs1 = self.map['nobs'][row, col1]
-        mean2 = self.map['mean'][row, col2]
-        std2 = self.map['std'][row, col2]
-        nobs2 = self.map['nobs'][row, col2]
-        _, p_val = ttest_ind_from_stats(mean1, std1, nobs1, mean2, std2, nobs2)
-        return p_val
-
-    def _run_wilcoxon(self, row, col1, col2):
-        values1 = self.map['values'][row, col1]
-        values2 = self.map['values'][row, col2]
-        try:
-            _, p_val = wilcoxon(values1, values2)
-        except ValueError:
-            p_val = 0
-        return p_val
-
-    def _add_statistical_test(self):
-        if self.ttest is None:
-            return
-        self.some_similar = [False]*self.nmethods
-        for i in range(self.nbenchmarks):
-            filled_cols_idx = np.argwhere(self.map['fill'][i]).flatten()
-            if len(filled_cols_idx) <= 1:
-                continue
-            col_means = [self.map['mean'][i,j] for j in filled_cols_idx]
-            best_pos = filled_cols_idx[np.argmin(col_means)]
-
-            for j in filled_cols_idx:
-                if j==best_pos:
-                    continue
-                if self.ttest == 'ttest':
-                    p_val = self._run_ttest(i, best_pos, j)
-                else:
-                    p_val = self._run_wilcoxon(i, best_pos, j)
-
-                pval_outcome = pval_interpretation(p_val)
-                self.map['ttest'][i, j] = pval_outcome
-                if pval_outcome != 'Diff':
-                    self.some_similar[j] = True
-
-    def compute(self):
-        self._addmap('fill', dtype=bool, func=lambda x: x is not None)
-        self._addmap('mean', dtype=float, func=np.mean)
-        self._addmap('std', dtype=float, func=np.std)
-        self._addmap('nobs', dtype=float, func=len)
-        self._addmap('rank', dtype=int, func=None)
-        self._addmap('color', dtype=object, func=None)
-        self._addmap('ttest', dtype=object, func=None)
-        self._addmap('latex', dtype=object, func=None)
-        self._addrank()
-        self._addcolor()
-        self._add_statistical_test()
-        if self.add_average:
-            self._addave()
-        self._modif = False
-
-    def _is_column_full(self, col):
-        return all(self.map['fill'][:, self.method_index[col]])
-
-    def _addave(self):
-        ave = Table(['ave'], self.methods,
-                    lower_is_better=self.lower_is_better,
-                    ttest=self.ttest,
-                    average=False,
-                    missing=self.missing,
-                    missing_str=self.missing_str,
-                    prec_mean=self.prec_mean,
-                    prec_std=self.prec_std,
-                    clean_zero=self.clean_zero,
-                    show_std=self.show_std,
-                    color=self.color,
-                    maxtone=self.maxtone)
-        for col in self.methods:
-            values = None
-            if self._is_column_full(col):
-                if self.ttest == 'ttest':
-                    # values = np.asarray(self.map['mean'][:, self.method_index[col]])
-                    values = np.concatenate(self.values[:, self.method_index[col]])
-                else:  # wilcoxon
-                    # values = np.asarray(self.map['mean'][:, self.method_index[col]])
-                    values = np.concatenate(self.values[:, self.method_index[col]])
-            ave.add('ave', col, values)
-        self.average = ave
-
-    def add(self, benchmark, method, values):
-        if values is not None:
-            values = np.asarray(values)
-            if values.ndim==0:
-                values = values.flatten()
-        rid, cid = self._coordinates(benchmark, method)
-        self.map['values'][rid, cid] = values
-        self.touch()
-
-    def get(self, benchmark, method, attr='mean'):
-        self.update()
-        assert attr in self.map, f'unknwon attribute {attr}'
-        rid, cid = self._coordinates(benchmark, method)
-        if self.map['fill'][rid, cid]:
-            v = self.map[attr][rid, cid]
-            if v is None or (isinstance(v,float) and np.isnan(v)):
-                return self.missing
-            return v
-        else:
-            return self.missing
-
-    def _coordinates(self, benchmark, method):
-        assert benchmark in self.benchmark_index, f'benchmark {benchmark} out of range'
-        assert method in self.method_index, f'method {method} out of range'
-        rid = self.benchmark_index[benchmark]
-        cid = self.method_index[method]
-        return rid, cid
-
-    def get_average(self, method, attr='mean'):
-        self.update()
-        if self.add_average:
-            return self.average.get('ave', method, attr=attr)
-        return None
-
-    def get_color(self, benchmark, method):
-        color = self.get(benchmark, method, attr='color')
-        if color is None:
-            return ''
-        return color
-
-    def latex(self, benchmark, method):
-        self.update()
-        i,j = self._coordinates(benchmark, method)
-        if self.map['fill'][i,j] == False:
-            return self.missing_str
-
-        mean = self.map['mean'][i,j]
-        l = f" {mean:.{self.prec_mean}f}"
-        if self.clean_zero:
-            l = l.replace(' 0.', '.')
-
-        isbest = self.map['rank'][i,j] == 1
-        if isbest:
-            l = "\\textbf{"+l.strip()+"}"
-
-        stat = '' if self.ttest is None else '^{\phantom{\ddag}}'
-        if self.ttest is not None and self.some_similar[j]:
-            test_label = self.map['ttest'][i,j]
-            if test_label == 'Sim':
-                stat = '^{\dag}'
-            elif test_label == 'Same':
-                stat = '^{\ddag}'
-            elif isbest or test_label == 'Diff':
-                stat = '^{\phantom{\ddag}}'
-
-        std = ''
-        if self.show_std:
-            std = self.map['std'][i,j]
-            std = f" {std:.{self.prec_std}f}"
-            if self.clean_zero:
-                std = std.replace(' 0.', '.')
-            std = f"\pm {std:{self.prec_std}}"
-
-        if stat!='' or std!='':
-            l = f'{l}${stat}{std}$'
-
-        if self.color:
-            l += ' ' + self.map['color'][i,j]
-
-        return l
-
-    def latexPDF(self, path, name:str, *args, **kwargs):
-        if not name.endswith('.tex'):
-            name += '.tex'
-
-        self.latexSaveDocument(join(path, name), *args, **kwargs)
-
-        print("[Tables Done] runing latex")
-        os.chdir(path)
-        os.system('pdflatex '+name)
-        basename = name.replace('.tex', '')
-        os.system(f'rm {basename}.aux {basename}.bbl {basename}.blg {basename}.log {basename}.out {basename}.dvi')
-        os.chdir('..')
-
-    def latexSaveDocument(self, path, *args, **kwargs):
-        document = self.latexDocument(*args, **kwargs)
-        parent = Path(path).parent
-        os.makedirs(parent, exist_ok=True)
-        with open(path, 'wt') as foo:
-            foo.write(document)
-        print('text file save at ', path)
-
-    def latexDocument(self, *args, **kwargs):
-        document = """
-\\documentclass[10pt,a4paper]{article}
-\\usepackage[utf8]{inputenc}
-\\usepackage{amsmath}
-\\usepackage{amsfonts}
-\\usepackage{amssymb}
-\\usepackage{graphicx}
-\\usepackage{xcolor}
-\\usepackage{colortbl}
-
-\\begin{document}
-"""
-        document += self.latexTable(*args, **kwargs)
-        document += "\n\end{document}\n"
-        return document
-
-    def latexTable(self, benchmark_replace={}, method_replace={}, aslines=False, endl='\\\\\hline', resizebox=True):
-        table = """
-\\begin{table}
-\center
-%%%\\resizebox{\\textwidth}{!}{% \n
-"""
-        table += "\n\\begin{tabular}{|c"+"|c" * self.nmethods + "|}\n"
-        table += self.latexTabular(benchmark_replace, method_replace, aslines, endl)
-        table += "\n\\end{tabular}\n"
-        table += """
-%%%}%
-\end{table}
-"""
-        if resizebox:
-            table = table.replace("%%%", "")
-        return table
-
-    def latexTabular(self, benchmark_replace={}, method_replace={}, aslines=False, endl='\\\\\hline'):
-        lines = []
-        l = '\multicolumn{1}{c|}{} & '
-        l += ' & '.join([method_replace.get(col, col) for col in self.methods])
-        l += ' \\\\\hline'
-        lines.append(l)
-
-        for row in self.benchmarks:
-            rowname = benchmark_replace.get(row, row)
-            l = rowname + ' & '
-            l += self.latexRow(row, endl=endl)
-            lines.append(l)
-
-        if self.add_average:
-            # l += '\hline\n'
-            l = '\hline \n \\textit{Average} & '
-            l += self.latexAverage(endl=endl)
-            lines.append(l)
-        if not aslines:
-            lines='\n'.join(lines)
-        return lines
-
-    def latexRow(self, benchmark, endl='\\\\\hline\n'):
-        s = [self.latex(benchmark, col) for col in self.methods]
-        s = ' & '.join(s)
-        s += ' ' + endl
-        return s
-
-    def latexAverage(self, endl='\\\\\hline\n'):
-        if self.add_average:
-            return self.average.latexRow('ave', endl=endl)
-
-    def getRankTable(self, prec_mean=0):
-        t = Table(benchmarks=self.benchmarks, methods=self.methods, prec_mean=prec_mean, average=True, maxtone=self.maxtone, ttest=None)
-        for rid, cid in self._getfilled():
-            row = self.benchmarks[rid]
-            col = self.methods[cid]
-            t.add(row, col, self.get(row, col, 'rank'))
-        t.compute()
-        return t
-
-    def dropMethods(self, methods):
-        drop_index = [self.method_index[m] for m in methods]
-        new_methods = np.delete(self.methods, drop_index)
-        new_index = {col:j for j, col in enumerate(new_methods)}
-
-        self.map['values'] = self.values[:,np.asarray([self.method_index[m] for m in new_methods], dtype=int)]
-        self.methods = new_methods
-        self.method_index = new_index
-        self.touch()
-
-
-def pval_interpretation(p_val):
-    if 0.005 >= p_val:
-        return 'Diff'
-    elif 0.05 >= p_val > 0.005:
-        return 'Sim'
-    elif p_val > 0.05:
-        return 'Same'
-
-
-def color_red2green_01(val, maxtone=50):
-    if np.isnan(val): return None
-    assert 0 <= val <= 1, f'val {val} out of range [0,1]'
-
-    # rescale to [-1,1]
-    val = val * 2 - 1
-    if val < 0:
-        color = 'red'
-        tone = maxtone * (-val)
-    else:
-        color = 'green'
-        tone = maxtone * val
-    return '\cellcolor{' + color + f'!{int(tone)}' + '}'
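For context, the removed Table helper was driven roughly as in the following hypothetical sketch (not from this commit; benchmark and method names and the values are made up, and averaging and local coloring are left out for brevity):

import numpy as np

# two benchmarks x two methods, each cell holding a vector of per-sample scores
table = Table(benchmarks=['query-1', 'query-2'], methods=['MethodA', 'MethodB'],
              lower_is_better=True, ttest='ttest', average=False, color_mode='global')
table.add('query-1', 'MethodA', np.random.rand(10) * 0.2)  # e.g. per-sample errors
table.add('query-1', 'MethodB', np.random.rand(10) * 0.1)
table.add('query-2', 'MethodA', np.random.rand(10) * 0.2)
table.add('query-2', 'MethodB', np.random.rand(10) * 0.1)
print(table.latexTabular())  # one LaTeX row per benchmark, best result in bold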
@@ -1,66 +0,0 @@
-import numpy as np
-import pandas as pd
-from sklearn.feature_extraction.text import TfidfVectorizer
-from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
-from sklearn.metrics import make_scorer, f1_score
-from sklearn.svm import LinearSVC
-
-from quapy.data.base import LabelledCollection
-from sklearn.model_selection import cross_val_score, GridSearchCV
-
-from os.path import join
-
-"""
-In this experiment, I simply try to understand whether the learning task can be learned or not.
-The problem is that we are quantifying the categories based on the alphabetical order (of what?).
-"""
-
-def load_txt_sample(path, parse_columns, verbose=False, max_lines=None):
-    if verbose:
-        print(f'loading {path}...', end='')
-    df = pd.read_csv(path, sep='\t')
-    if verbose:
-        print('[done]')
-    X = df['text'].values
-    y = df['continent'].values
-
-    if parse_columns:
-        rank = df['rank'].values
-        scores = df['score'].values
-        order = np.argsort(rank)
-        X = X[order]
-        y = y[order]
-        rank = rank[order]
-        scores = scores[order]
-
-    if max_lines is not None:
-        X = X[:max_lines]
-        y = y[:max_lines]
-
-    return X, y
-
-data_path = './50_50_split_trec'
-train_path = join(data_path, 'train_50_50_continent.txt')
-
-tfidf = TfidfVectorizer(sublinear_tf=True, min_df=10)
-data = LabelledCollection.load(train_path, loader_func=load_txt_sample, verbose=True, parse_columns=False)
-data = data.sampling(20000)
-train, test = data.split_stratified()
-train.instances = tfidf.fit_transform(train.instances)
-test.instances = tfidf.transform(test.instances)
-
-# svm = LinearSVC()
-# cls = GridSearchCV(svm, param_grid={'C':np.logspace(-3,3,7), 'class_weight':['balanced', None]})
-cls = LogisticRegression()
-cls.fit(*train.Xy)
-
-# score = cross_val_score(LogisticRegressionCV(), *data.Xy, scoring=make_scorer(f1_score, average='macro'), n_jobs=-1, cv=5)
-# print(score)
-# print(np.mean(score))
-
-y_pred = cls.predict(test.instances)
-macrof1 = f1_score(y_true=test.labels, y_pred=y_pred, average='macro')
-microf1 = f1_score(y_true=test.labels, y_pred=y_pred, average='micro')
-
-print('macro', macrof1)
-print('micro', microf1)