diff --git a/main.py b/main.py
index b99f024..e236c50 100644
--- a/main.py
+++ b/main.py
@@ -15,7 +15,7 @@ def main(args):
     print('Running generalized funnelling...')
 
     data = MultilingualDataset.load(args.dataset)
-    data.set_view(languages=['it', 'fr'])
+    # data.set_view(languages=['it', 'da'])
     data.show_dimensions()
     lX, ly = data.training()
     lXte, lyte = data.test()
@@ -42,11 +42,14 @@ def main(args):
 
     if args.gru_embedder:
         rnnEmbedder = RecurrentGen(multilingualIndex, pretrained_embeddings=lMuse, wce=args.gru_wce, batch_size=256,
-                                   nepochs=args.nepochs_rnn, gpus=args.gpus, n_jobs=args.n_jobs)
+                                   nepochs=args.nepochs_rnn, patience=args.patience_rnn, gpus=args.gpus,
+                                   n_jobs=args.n_jobs)
         embedder_list.append(rnnEmbedder)
 
     if args.bert_embedder:
-        bertEmbedder = BertGen(multilingualIndex, batch_size=4, nepochs=args.nepochs_bert, gpus=args.gpus, n_jobs=args.n_jobs)
+        bertEmbedder = BertGen(multilingualIndex, batch_size=4, nepochs=args.nepochs_bert, gpus=args.gpus,
+                               n_jobs=args.n_jobs)
+        bertEmbedder.transform(lX)
         embedder_list.append(bertEmbedder)
 
     # Init DocEmbedderList (i.e., first-tier learners or view generators) and metaclassifier
@@ -137,20 +140,24 @@ if __name__ == '__main__':
                         default=False)
 
     parser.add_argument('-j', '--n_jobs', dest='n_jobs', type=int, metavar='',
-                        help='Number of parallel jobs (default is -1, all)',
+                        help='number of parallel jobs (default is -1, all)',
                         default=-1)
 
     parser.add_argument('--nepochs_rnn', dest='nepochs_rnn', type=int, metavar='',
-                        help='Number of max epochs to train Recurrent embedder (i.e., -g), default 150',
+                        help='number of max epochs to train Recurrent embedder (i.e., -g), default 150',
                         default=150)
 
     parser.add_argument('--nepochs_bert', dest='nepochs_bert', type=int, metavar='',
-                        help='Number of max epochs to train Bert model (i.e., -g), default 10',
+                        help='number of max epochs to train Bert model (i.e., -g), default 10',
                         default=10)
 
+    parser.add_argument('--patience_rnn', dest='patience_rnn', type=int, metavar='',
+                        help='set early stop patience for the RecurrentGen, default 50',
+                        default=50)
+
     parser.add_argument('--muse_dir', dest='muse_dir', type=str, metavar='',
-                        help='Path to the MUSE polylingual word embeddings (default ../embeddings)',
-                        default='../embeddings')
+                        help='Path to the MUSE polylingual word embeddings (default embeddings/)',
+                        default='embeddings/')
 
     parser.add_argument('--gru_wce', dest='gru_wce', action='store_true',
                         help='Deploy WCE embedding as embedding layer of the GRU View Generator',
diff --git a/readme.md b/readme.md
index 4569ba8..401a883 100644
--- a/readme.md
+++ b/readme.md
@@ -37,11 +37,12 @@ optional arguments:
   -b, --bert_embedder   deploy multilingual Bert to compute document embeddings
   -g, --gru_embedder    deploy a GRU in order to compute document embeddings
   -c, --c_optimize      optimize SVMs C hyperparameter
-  -j, --n_jobs          number of parallel jobs (default is -1, all)
+  -j, --n_jobs          number of parallel jobs, default is -1 i.e., all
   --nepochs_rnn         number of max epochs to train Recurrent embedder (i.e., -g), default 150
   --nepochs_bert        number of max epochs to train Bert model (i.e., -g), default 10
   --muse_dir            path to the MUSE polylingual word embeddings (default ../embeddings)
   --gru_wce             deploy WCE embedding as embedding layer of the GRU View Generator
+  --patience_rnn        set early stop patience for the RecurrentGen, default 50
   --gru_dir             set the path to a pretrained GRU model (i.e., -g view generator)
   --bert_dir            set the path to a pretrained mBERT model (i.e., -b view generator)
   --gpus                specifies how many GPUs to use per node
diff --git a/run.sh b/run.sh
index 04365f9..fd7f4f0 100644
--- a/run.sh
+++ b/run.sh
@@ -1,6 +1,8 @@
 #!/usr/bin/env bash
 
-for i in {0..10..1}
-do
-  python main.py --gpus 0
-done
\ No newline at end of file
+python main.py /home/moreo/CLESA/rcv2/rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle -g --gpus 0
+
+#for i in {0..10..1}
+#do
+#  python main.py --gpus 0
+#done
\ No newline at end of file
diff --git a/src/view_generators.py b/src/view_generators.py
index 9b352f8..fab56c7 100644
--- a/src/view_generators.py
+++ b/src/view_generators.py
@@ -26,10 +26,10 @@ from src.data.datamodule import RecurrentDataModule, BertDataModule, tokenize
 from src.models.learners import *
 from src.models.pl_bert import BertModel
 from src.models.pl_gru import RecurrentModel
-from src.util.common import TfidfVectorizerMultilingual, _normalize
+from src.util.common import TfidfVectorizerMultilingual, _normalize, index
 from src.util.embeddings_manager import MuseLoader, XdotM, wce_matrix
 from src.util.file import create_if_not_exist
 
-# TODO: add model checkpointing and loading from checkpoint + training on validation after convergence is reached
+# TODO: (1) add model checkpointing and loading from checkpoint + training on validation after convergence is reached
 
 class ViewGen(ABC):
@@ -203,7 +203,7 @@ class RecurrentGen(ViewGen):
     the network internal state at the second feed-forward layer level. Training metrics are logged via TensorBoard.
     """
     def __init__(self, multilingualIndex, pretrained_embeddings, wce, batch_size=512, nepochs=50,
-                 gpus=0, n_jobs=-1, patience=5, stored_path=None):
+                 gpus=0, n_jobs=-1, patience=20, stored_path=None):
         """
         Init RecurrentGen.
         :param multilingualIndex: MultilingualIndex, it is a dictionary of training and test documents
@@ -237,8 +237,7 @@ class RecurrentGen(ViewGen):
         self.multilingualIndex.train_val_split(val_prop=0.2, max_val=2000, seed=1)
         self.multilingualIndex.embedding_matrices(self.pretrained, supervised=self.wce)
         self.model = self._init_model()
-        self.logger = TensorBoardLogger(save_dir='../tb_logs', name='rnn', default_hp_metric=False)
-        # self.logger = CSVLogger(save_dir='csv_logs', name='rnn_dev')
+        self.logger = TensorBoardLogger(save_dir='tb_logs', name='rnn', default_hp_metric=False)
         self.early_stop_callback = EarlyStopping(monitor='val-macroF1', min_delta=0.00,
                                                  patience=self.patience, verbose=False, mode='max')
 
@@ -297,14 +296,19 @@ class RecurrentGen(ViewGen):
         :param lX: dict {lang: indexed documents}
         :return: documents projected to the common latent space.
         """
+        data = {}
+        for lang in lX.keys():
+            indexed = index(data=lX[lang],
+                            vocab=self.multilingualIndex.l_index[lang].word2index,
+                            known_words=set(self.multilingualIndex.l_index[lang].word2index.keys()),
+                            analyzer=self.multilingualIndex.l_vectorizer.get_analyzer(lang),
+                            unk_index=self.multilingualIndex.l_index[lang].unk_index,
+                            out_of_vocabulary=self.multilingualIndex.l_index[lang].out_of_vocabulary)
+            data[lang] = indexed
         l_pad = self.multilingualIndex.l_pad()
-        data = self.multilingualIndex.l_devel_index()
         self.model.to('cuda' if self.gpus else 'cpu')
         self.model.eval()
-        # time_init = time.time()
         l_embeds = self.model.encode(data, l_pad, batch_size=256)
-        # transform_time = round(time.time() - time_init, 3)
-        # print(f'Executed! Transform took: {transform_time}')
         return l_embeds
 
     def fit_transform(self, lX, ly):
@@ -338,7 +342,7 @@ class BertGen(ViewGen):
         self.stored_path = stored_path
         self.model = self._init_model()
         self.patience = patience
-        self.logger = TensorBoardLogger(save_dir='../tb_logs', name='bert', default_hp_metric=False)
+        self.logger = TensorBoardLogger(save_dir='tb_logs', name='bert', default_hp_metric=False)
         self.early_stop_callback = EarlyStopping(monitor='val-macroF1', min_delta=0.00,
                                                  patience=self.patience, verbose=False, mode='max')
 
@@ -371,14 +375,10 @@ class BertGen(ViewGen):
         :param lX: dict {lang: indexed documents}
         :return: documents projected to the common latent space.
         """
-        data = self.multilingualIndex.l_devel_raw_index()
-        data = tokenize(data, max_len=512)
+        data = tokenize(lX, max_len=512)
         self.model.to('cuda' if self.gpus else 'cpu')
         self.model.eval()
-        # time_init = time.time()
         l_embeds = self.model.encode(data, batch_size=64)
-        # transform_time = round(time.time() - time_init, 3)
-        # print(f'Executed! Transform took: {transform_time}')
         return l_embeds
 
     def fit_transform(self, lX, ly):