From 90e974f0a3d2eb4e2051dcf39efd918ddab50448 Mon Sep 17 00:00:00 2001
From: andrea
Date: Tue, 26 Jan 2021 12:40:23 +0100
Subject: [PATCH] Add CLI argument parser + fix BERT pad token id

---
 refactor/data/datamodule.py  |   1 -
 refactor/main.py             | 148 ++++++++++++++++++++++++-----------
 refactor/models/pl_bert.py   |   2 +-
 refactor/util/common.py      |  13 +++
 refactor/util/file.py        |   8 +-
 refactor/util/results_csv.py |   5 +-
 6 files changed, 123 insertions(+), 54 deletions(-)

diff --git a/refactor/data/datamodule.py b/refactor/data/datamodule.py
index 711d5a3..12d7e02 100644
--- a/refactor/data/datamodule.py
+++ b/refactor/data/datamodule.py
@@ -147,7 +147,6 @@ def tokenize(l_raw, max_len):
     :param max_len:
     :return:
     """
-    # TODO: check BertTokenizerFast https://huggingface.co/transformers/model_doc/bert.html#berttokenizerfast
     tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
     l_tokenized = {}
     for lang in l_raw.keys():
diff --git a/refactor/main.py b/refactor/main.py
index 027649b..bab9189 100644
--- a/refactor/main.py
+++ b/refactor/main.py
@@ -2,60 +2,56 @@ from argparse import ArgumentParser
 from funnelling import *
 from view_generators import *
 from data.dataset_builder import MultilingualDataset
-from util.common import MultilingualIndex, get_params
+from util.common import MultilingualIndex, get_params, get_method_name
 from util.evaluation import evaluate
 from util.results_csv import CSVlog
 from time import time


 def main(args):
-    OPTIMC = False  # TODO
-    N_JOBS = 8
-    print('Running refactored...')
+    assert args.post_embedder or args.muse_embedder or args.wce_embedder or args.gru_embedder or args.bert_embedder, \
+        'empty set of document embeddings is not allowed!'

-    # _DATASET = '/homenfs/a.pedrotti1/datasets/CLESA/rcv2/rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle'
-    # EMBEDDINGS_PATH = '/homenfs/a.pedrotti1/embeddings/MUSE'
+    print('Running generalized funnelling...')

-    _DATASET = '/home/moreo/CLESA/rcv2/rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle'
-    EMBEDDINGS_PATH = '/home/andreapdr/gfun/embeddings'
-    data = MultilingualDataset.load(_DATASET)
+    data = MultilingualDataset.load(args.dataset)
     data.set_view(languages=['it', 'fr'])
     data.show_dimensions()
     lX, ly = data.training()
     lXte, lyte = data.test()

     # Init multilingualIndex - mandatory when deploying Neural View Generators...
-    multilingualIndex = MultilingualIndex()
-    lMuse = MuseLoader(langs=sorted(lX.keys()), cache=EMBEDDINGS_PATH)
-    multilingualIndex.index(lX, ly, lXte, lyte, l_pretrained_vocabulary=lMuse.vocabulary())
+    if args.gru_embedder or args.bert_embedder:
+        multilingualIndex = MultilingualIndex()
+        lMuse = MuseLoader(langs=sorted(lX.keys()), cache=args.muse_dir)
+        multilingualIndex.index(lX, ly, lXte, lyte, l_pretrained_vocabulary=lMuse.vocabulary())

     embedder_list = []
-    if args.X:
-        posteriorEmbedder = VanillaFunGen(base_learner=get_learner(calibrate=True), n_jobs=N_JOBS)
+    if args.post_embedder:
+        posteriorEmbedder = VanillaFunGen(base_learner=get_learner(calibrate=True), n_jobs=args.n_jobs)
         embedder_list.append(posteriorEmbedder)

-    if args.M:
-        museEmbedder = MuseGen(muse_dir=EMBEDDINGS_PATH, n_jobs=N_JOBS)
+    if args.muse_embedder:
+        museEmbedder = MuseGen(muse_dir=args.muse_dir, n_jobs=args.n_jobs)
         embedder_list.append(museEmbedder)

-    if args.W:
-        wceEmbedder = WordClassGen(n_jobs=N_JOBS)
+    if args.wce_embedder:
+        wceEmbedder = WordClassGen(n_jobs=args.n_jobs)
         embedder_list.append(wceEmbedder)

-    if args.G:
-        rnnEmbedder = RecurrentGen(multilingualIndex, pretrained_embeddings=lMuse, wce=False, batch_size=256,
-                                   nepochs=250, gpus=args.gpus, n_jobs=N_JOBS)
+    if args.gru_embedder:
+        rnnEmbedder = RecurrentGen(multilingualIndex, pretrained_embeddings=lMuse, wce=args.gru_wce, batch_size=256,
+                                   nepochs=args.nepochs, gpus=args.gpus, n_jobs=args.n_jobs)
         embedder_list.append(rnnEmbedder)

-    if args.B:
-        bertEmbedder = BertGen(multilingualIndex, batch_size=4, nepochs=1, gpus=args.gpus, n_jobs=N_JOBS)
+    if args.bert_embedder:
+        bertEmbedder = BertGen(multilingualIndex, batch_size=4, nepochs=10, gpus=args.gpus, n_jobs=args.n_jobs)
         embedder_list.append(bertEmbedder)

-    # Init DocEmbedderList
+    # Init DocEmbedderList (i.e., first-tier learners or view generators) and the meta-classifier
     docEmbedders = DocEmbedderList(embedder_list=embedder_list, probabilistic=True)
-    meta_parameters = None if not OPTIMC else [{'C': [1, 1e3, 1e2, 1e1, 1e-1]}]
     meta = MetaClassifier(meta_learner=get_learner(calibrate=False, kernel='rbf'),
-                          meta_parameters=get_params(optimc=OPTIMC))
+                          meta_parameters=get_params(optimc=args.optimc))

     # Init Funnelling Architecture
     gfun = Funnelling(first_tier=docEmbedders, meta_classifier=meta)
@@ -78,39 +74,93 @@ def main(args):

     # Logging ---------------------------------------
     print('\n[Results]')
-    results = CSVlog('test_log.csv')
+    results = CSVlog(args.csv_dir)
     metrics = []
     for lang in lXte.keys():
         macrof1, microf1, macrok, microk = l_eval[lang]
         metrics.append([macrof1, microf1, macrok, microk])
         print(f'Lang {lang}: macro-F1 = {macrof1:.3f} micro-F1 = {microf1:.3f}')
-        results.add_row(method='gfun',
-                        setting='TODO',
-                        sif='True',
-                        zscore='True',
-                        l2='True',
-                        dataset='TODO',
-                        time_tr=time_tr,
-                        time_te=time_te,
-                        lang=lang,
-                        macrof1=macrof1,
-                        microf1=microf1,
-                        macrok=macrok,
-                        microk=microk,
-                        notes='')
+        if results is not None:
+            _id, _dataset = get_method_name(args)
+            results.add_row(method='gfun',
+                            setting=_id,
+                            optimc=args.optimc,
+                            sif='True',
+                            zscore='True',
+                            l2='True',
+                            dataset=_dataset,
+                            time_tr=time_tr,
+                            time_te=time_te,
+                            lang=lang,
+                            macrof1=macrof1,
+                            microf1=microf1,
+                            macrok=macrok,
+                            microk=microk,
+                            notes='')
     print('Averages: MF1, mF1, MK, mK', np.round(np.mean(np.array(metrics), axis=0), 3))

     overall_time = round(time() - time_init, 3)
-    exit(f'\nExecuted in: {overall_time } seconds!')
+    print(f'\nExecuted in: {overall_time} seconds!')


 if __name__ == '__main__':
-    parser = ArgumentParser()
-    parser.add_argument('--X')
-    parser.add_argument('--M')
-    parser.add_argument('--W')
-    parser.add_argument('--G')
-    parser.add_argument('--B')
-    parser.add_argument('--gpus', default=None)
+    parser = ArgumentParser(description='Run generalized funnelling, A. Moreo, A. Pedrotti and F. Sebastiani')
+
+    parser.add_argument('dataset', help='Path to the dataset')
+
+    parser.add_argument('-o', '--output', dest='csv_dir',
+                        help='Result file (default csv_logs/gfun/gfun_results.csv)', type=str,
+                        default='csv_logs/gfun/gfun_results.csv')
+
+    parser.add_argument('-x', '--post_embedder', dest='post_embedder', action='store_true',
+                        help='deploy posterior probabilities embedder to compute document embeddings',
+                        default=False)
+
+    parser.add_argument('-w', '--wce_embedder', dest='wce_embedder', action='store_true',
+                        help='deploy (supervised) Word-Class embedder to compute document embeddings',
+                        default=False)
+
+    parser.add_argument('-m', '--muse_embedder', dest='muse_embedder', action='store_true',
+                        help='deploy (pretrained) MUSE embedder to compute document embeddings',
+                        default=False)
+
+    parser.add_argument('-b', '--bert_embedder', dest='bert_embedder', action='store_true',
+                        help='deploy multilingual BERT to compute document embeddings',
+                        default=False)
+
+    parser.add_argument('-g', '--gru_embedder', dest='gru_embedder', action='store_true',
+                        help='deploy a GRU to compute document embeddings',
+                        default=False)
+
+    parser.add_argument('-c', '--c_optimize', dest='optimc', action='store_true',
+                        help='Optimize the C hyperparameter of the SVMs',
+                        default=False)
+
+    parser.add_argument('-n', '--nepochs', dest='nepochs', type=int, default=250,
+                        help='Maximum number of epochs to train the recurrent embedder, i.e., -g (default 250)')
+
+    parser.add_argument('-j', '--n_jobs', dest='n_jobs', type=int,
+                        help='Number of parallel jobs (default is -1, i.e., all)',
+                        default=-1)
+
+    parser.add_argument('--muse_dir', dest='muse_dir', type=str,
+                        help='Path to the MUSE polylingual word embeddings (default ../embeddings)',
+                        default='../embeddings')
+
+    parser.add_argument('--gru_wce', dest='gru_wce', action='store_true',
+                        help='Deploy WCE embeddings as the embedding layer of the GRU view generator',
+                        default=False)
+
+    parser.add_argument('--gru_dir', dest='gru_dir', type=str,
+                        help='Path to a pretrained GRU model (i.e., -g view generator)',
+                        default=None)
+
+    parser.add_argument('--bert_dir', dest='bert_dir', type=str,
+                        help='Path to a pretrained mBERT model (i.e., -b view generator)',
+                        default=None)
+
+    parser.add_argument('--gpus', help='Specifies how many GPUs to use per node',
+                        default=None)
+
     args = parser.parse_args()
     main(args)
diff --git a/refactor/models/pl_bert.py b/refactor/models/pl_bert.py
index 11fe0ce..67f37f4 100644
--- a/refactor/models/pl_bert.py
+++ b/refactor/models/pl_bert.py
@@ -161,7 +161,7 @@ class BertModel(pl.LightningModule):
         else:
             batch = lX[lang][i:i + batch_size]
         max_pad_len = define_pad_length(batch)
-        batch = pad(batch, pad_index='101', max_pad_length=max_pad_len)  # TODO: check pad index!
+ batch = pad(batch, pad_index=self.bert.config.pad_token_id, max_pad_length=max_pad_len) batch = torch.LongTensor(batch).to('cuda' if self.gpus else 'cpu') _, output = self.forward(batch) doc_embeds = output[-1][:, 0, :] diff --git a/refactor/util/common.py b/refactor/util/common.py index a624528..0cd95e6 100644 --- a/refactor/util/common.py +++ b/refactor/util/common.py @@ -369,3 +369,16 @@ def get_params(optimc=False): c_range = [1e4, 1e3, 1e2, 1e1, 1, 1e-1] kernel = 'rbf' return [{'kernel': [kernel], 'C': c_range, 'gamma':['auto']}] + + +def get_method_name(args): + _id = '' + _id_conf = [args.post_embedder, args.wce_embedder, args.muse_embedder, args.bert_embedder, args.gru_embedder] + _id_name = ['X', 'W', 'M', 'B', 'G'] + for i, conf in enumerate(_id_conf): + if conf: + _id += _id_name[i] + _id = _id if not args.gru_wce else _id + '_wce' + _dataset_path = args.dataset.split('/')[-1].split('_') + dataset_id = _dataset_path[0] + _dataset_path[-1] + return _id, dataset_id diff --git a/refactor/util/file.py b/refactor/util/file.py index a3d0a3a..98c9910 100644 --- a/refactor/util/file.py +++ b/refactor/util/file.py @@ -1,6 +1,5 @@ from os import listdir, makedirs from os.path import isdir, isfile, join, exists, dirname -#from sklearn.externals.six.moves import urllib import urllib from pathlib import Path @@ -14,6 +13,7 @@ def download_file(url, archive_filename): urllib.request.urlretrieve(url, filename=archive_filename, reporthook=progress) print("") + def download_file_if_not_exists(url, archive_path): if exists(archive_path): return makedirs_if_not_exist(dirname(archive_path)) @@ -25,20 +25,26 @@ def ls(dir, typecheck): el.sort() return el + def list_dirs(dir): return ls(dir, typecheck=isdir) + def list_files(dir): return ls(dir, typecheck=isfile) + def makedirs_if_not_exist(path): if not exists(path): makedirs(path) + def create_if_not_exist(path): if not exists(path): makedirs(path) + def get_parent_name(path): return Path(path).parent + def get_file_name(path): return Path(path).name diff --git a/refactor/util/results_csv.py b/refactor/util/results_csv.py index 85a7de1..df80c59 100644 --- a/refactor/util/results_csv.py +++ b/refactor/util/results_csv.py @@ -8,6 +8,7 @@ class CSVlog: self.file = file self.columns = ['method', 'setting', + 'optimc', 'sif', 'zscore', 'l2', @@ -34,9 +35,9 @@ class CSVlog: def already_calculated(self, id): return (self.df['id'] == id).any() - def add_row(self, method, setting, sif, zscore, l2, dataset, time_tr, time_te, lang, + def add_row(self, method, setting, optimc, sif, zscore, l2, dataset, time_tr, time_te, lang, macrof1, microf1, macrok=np.nan, microk=np.nan, notes=''): - s = pd.Series([method, setting,sif, zscore, l2, dataset, time_tr, time_te, lang, + s = pd.Series([method, setting, optimc, sif, zscore, l2, dataset, time_tr, time_te, lang, macrof1, microf1, macrok, microk, notes], index=self.columns) self.df = self.df.append(s, ignore_index=True)
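Usage note (illustrative, not part of the patch): the 'setting' and 'dataset' columns of the csv log are now derived from the active command-line flags by the new util.common.get_method_name helper. The sketch below is standalone and simply mirrors the new parser flags and that helper's logic, so the mapping can be checked without loading a dataset; the .pickle path is a placeholder.

    # Minimal standalone sketch mirroring main.py's new flags and
    # util/common.py's get_method_name (same logic, copied here for illustration).
    from argparse import ArgumentParser, Namespace

    parser = ArgumentParser()
    parser.add_argument('dataset')
    for flag, dest in [('-x', 'post_embedder'), ('-w', 'wce_embedder'),
                       ('-m', 'muse_embedder'), ('-b', 'bert_embedder'),
                       ('-g', 'gru_embedder')]:
        parser.add_argument(flag, dest=dest, action='store_true', default=False)
    parser.add_argument('--gru_wce', dest='gru_wce', action='store_true', default=False)

    def get_method_name(args: Namespace):
        # one letter per active view generator, in the fixed order X, W, M, B, G
        _id = ''
        _id_conf = [args.post_embedder, args.wce_embedder, args.muse_embedder,
                    args.bert_embedder, args.gru_embedder]
        _id_name = ['X', 'W', 'M', 'B', 'G']
        for i, conf in enumerate(_id_conf):
            if conf:
                _id += _id_name[i]
        _id = _id if not args.gru_wce else _id + '_wce'
        # dataset id = first and last '_'-separated tokens of the file name
        _dataset_path = args.dataset.split('/')[-1].split('_')
        return _id, _dataset_path[0] + _dataset_path[-1]

    args = parser.parse_args(['path/to/rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle',
                              '-x', '-m', '-g', '--gru_wce'])
    print(get_method_name(args))  # -> ('XMG_wce', 'rcv1-2run0.pickle')

The equivalent end-to-end invocation would be, e.g.: python main.py <dataset.pickle> -x -m -g --gru_wce -n 250 --gpus 1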