diff --git a/src/main.py b/main.py
similarity index 92%
rename from src/main.py
rename to main.py
index e6b25eb..f6bbeae 100644
--- a/src/main.py
+++ b/main.py
@@ -1,12 +1,12 @@
 from argparse import ArgumentParser
-from data.dataset_builder import MultilingualDataset
-from funnelling import *
-from util.common import MultilingualIndex, get_params, get_method_name
-from util.evaluation import evaluate
-from util.results_csv import CSVlog
-from view_generators import *
-from time import time
+import time
+
+from src.data.dataset_builder import MultilingualDataset
+from src.funnelling import *
+from src.util.common import MultilingualIndex, get_params, get_method_name
+from src.util.evaluation import evaluate
+from src.util.results_csv import CSVlog
+from src.view_generators import *


 def main(args):
@@ -60,18 +59,17 @@ def main(args):

     # Training ---------------------------------------
     print('\n[Training Generalized Funnelling]')
-    time_init = time()
-    time_tr = time()
+    time_init = time.time()
     gfun.fit(lX, ly)
-    time_tr = round(time() - time_tr, 3)
+    time_tr = round(time.time() - time_init, 3)
     print(f'Training completed in {time_tr} seconds!')

     # Testing ----------------------------------------
     print('\n[Testing Generalized Funnelling]')
-    time_te = time()
+    time_te = time.time()
     ly_ = gfun.predict(lXte)
     l_eval = evaluate(ly_true=lyte, ly_pred=ly_)
-    time_te = round(time() - time_te, 3)
+    time_te = round(time.time() - time_te, 3)
     print(f'Testing completed in {time_te} seconds!')

     # Logging ---------------------------------------
@@ -101,7 +99,7 @@ def main(args):
                    notes='')

     print('Averages: MF1, mF1, MK, mK', np.round(np.mean(np.array(metrics), axis=0), 3))
-    overall_time = round(time() - time_init, 3)
+    overall_time = round(time.time() - time_init, 3)
     exit(f'\nExecuted in: {overall_time} seconds!')


@@ -112,7 +110,7 @@ if __name__ == '__main__':
     parser.add_argument('-o', '--output', dest='csv_dir',
-                        help='Result file (default ../csv_log/gfun_results.csv)',
+                        help='Result file (default ../csv_logs/gfun/gfun_results.csv)',
                         type=str,
-                        default='csv_logs/gfun/gfun_results.csv')
+                        default='../csv_logs/gfun/gfun_results.csv')

     parser.add_argument('-x', '--post_embedder', dest='post_embedder', action='store_true',
                         help='deploy posterior probabilities embedder to compute document embeddings',
@@ -138,7 +136,7 @@ if __name__ == '__main__':
                         help='Optimize SVMs C hyperparameter',
                         default=False)

-    parser.add_argument('-n', '--nepochs', dest='nepochs', type=str,
+    parser.add_argument('-n', '--nepochs', dest='nepochs', type=int,
                         help='Number of max epochs to train Recurrent embedder (i.e., -g)')

     parser.add_argument('-j', '--n_jobs', dest='n_jobs', type=int,
diff --git a/src/requirements.txt b/requirements.txt
similarity index 100%
rename from src/requirements.txt
rename to requirements.txt
diff --git a/src/run.sh b/run.sh
similarity index 100%
rename from src/run.sh
rename to run.sh
diff --git a/src/data/datamodule.py b/src/data/datamodule.py
index da6ec92..bf874c7 100644
--- a/src/data/datamodule.py
+++ b/src/data/datamodule.py
@@ -135,15 +135,15 @@ class RecurrentDataModule(pl.LightningDataModule):
                                                 lPad_index=self.multilingualIndex.l_pad())

     def train_dataloader(self):
-        return DataLoader(self.training_dataset, batch_size=self.batchsize, num_workers=self.n_jobs,
+        return DataLoader(self.training_dataset, batch_size=self.batchsize, num_workers=N_WORKERS,
                           collate_fn=self.training_dataset.collate_fn)

     def val_dataloader(self):
-        return DataLoader(self.val_dataset, batch_size=self.batchsize, num_workers=self.n_jobs,
+        return DataLoader(self.val_dataset, batch_size=self.batchsize, num_workers=N_WORKERS,
                          collate_fn=self.val_dataset.collate_fn)

     def test_dataloader(self):
-        return DataLoader(self.test_dataset, batch_size=self.batchsize, num_workers=self.n_jobs,
+        return DataLoader(self.test_dataset, batch_size=self.batchsize, num_workers=N_WORKERS,
                           collate_fn=self.test_dataset.collate_fn)


diff --git a/src/data/dataset_builder.py b/src/data/dataset_builder.py
index 0e91316..90760cb 100644
--- a/src/data/dataset_builder.py
+++ b/src/data/dataset_builder.py
@@ -1,5 +1,4 @@
 import itertools
-import pickle
 import re

 from os.path import exists
@@ -12,10 +11,10 @@ from sklearn.model_selection import train_test_split
 from sklearn.preprocessing import MultiLabelBinarizer
 from tqdm import tqdm

-from data.languages import NLTK_LANGMAP, RCV2_LANGS_WITH_NLTK_STEMMING
-from data.reader.jrcacquis_reader import *
-from data.reader.rcv_reader import fetch_RCV1, fetch_RCV2
-from data.text_preprocessor import NLTKStemTokenizer, preprocess_documents
+from src.data.languages import NLTK_LANGMAP, RCV2_LANGS_WITH_NLTK_STEMMING
+from src.data.reader.jrcacquis_reader import *
+from src.data.reader.rcv_reader import fetch_RCV1, fetch_RCV2
+from src.data.text_preprocessor import NLTKStemTokenizer, preprocess_documents


 class MultilingualDataset:
diff --git a/src/data/reader/jrcacquis_reader.py b/src/data/reader/jrcacquis_reader.py
index e911996..e1e3bc2 100644
--- a/src/data/reader/jrcacquis_reader.py
+++ b/src/data/reader/jrcacquis_reader.py
@@ -14,9 +14,9 @@ import rdflib
 from rdflib.namespace import RDF, SKOS
 from sklearn.datasets import get_data_home

-from data.languages import JRC_LANGS
-from data.languages import lang_set
-from util.file import download_file, list_dirs, list_files
+from src.data.languages import JRC_LANGS
+from src.data.languages import lang_set
+from src.util.file import download_file, list_dirs, list_files

 """
 JRC Acquis' Nomenclature:
diff --git a/src/data/reader/rcv_reader.py b/src/data/reader/rcv_reader.py
index b3db098..dc2462e 100644
--- a/src/data/reader/rcv_reader.py
+++ b/src/data/reader/rcv_reader.py
@@ -5,8 +5,8 @@ from zipfile import ZipFile

 import numpy as np

-from util.file import download_file_if_not_exists
-from util.file import list_files
+from src.util.file import download_file_if_not_exists
+from src.util.file import list_files

 """
 RCV2's Nomenclature:
diff --git a/src/data/reader/wikipedia_tools.py b/src/data/reader/wikipedia_tools.py
index 9558fb6..6ae89ff 100644
--- a/src/data/reader/wikipedia_tools.py
+++ b/src/data/reader/wikipedia_tools.py
@@ -11,7 +11,6 @@ from os.path import join
 from xml.sax.saxutils import escape

 import numpy as np
-
-from util.file import list_dirs, list_files
+from src.util.file import list_dirs, list_files

 policies = ["IN_ALL_LANGS", "IN_ANY_LANG"]
diff --git a/src/data/text_preprocessor.py b/src/data/text_preprocessor.py
index fcfddba..183df56 100644
--- a/src/data/text_preprocessor.py
+++ b/src/data/text_preprocessor.py
@@ -2,7 +2,7 @@ from nltk import word_tokenize
 from nltk.corpus import stopwords
 from nltk.stem import SnowballStemmer

-from data.languages import NLTK_LANGMAP
+from src.data.languages import NLTK_LANGMAP


 def preprocess_documents(documents, lang):
diff --git a/src/funnelling.py b/src/funnelling.py
index 812a937..ba2be1b 100644
--- a/src/funnelling.py
+++ b/src/funnelling.py
@@ -1,6 +1,6 @@
-from models.learners import *
-from util.common import _normalize
-from view_generators import VanillaFunGen
+from src.models.learners import *
+from src.util.common import _normalize
+from src.view_generators import VanillaFunGen


 class DocEmbedderList:
diff --git a/src/models/learners.py b/src/models/learners.py
index 2654109..46737c6 100644
--- a/src/models/learners.py
+++ b/src/models/learners.py
@@ -7,7 +7,7 @@ from sklearn.model_selection import GridSearchCV
 from sklearn.multiclass import OneVsRestClassifier
 from sklearn.svm import SVC

-from util.standardizer import StandardizeTransformer
+from src.util.standardizer import StandardizeTransformer


 def get_learner(calibrate=False, kernel='linear', C=1):
diff --git a/src/models/lstm_class.py b/src/models/lstm_class.py
index 7f2cf59..cd4000b 100755
--- a/src/models/lstm_class.py
+++ b/src/models/lstm_class.py
@@ -1,7 +1,6 @@
 #taken from https://github.com/prakashpandey9/Text-Classification-Pytorch/blob/master/models/LSTM.py
-from torch.autograd import Variable
-
-from models.helpers import *
+from src.models.helpers import *
+from torch.autograd import Variable


 class RNNMultilingualClassifier(nn.Module):
diff --git a/src/models/pl_bert.py b/src/models/pl_bert.py
index afb28b5..a9b669f 100644
--- a/src/models/pl_bert.py
+++ b/src/models/pl_bert.py
@@ -3,8 +3,8 @@ import torch
 from torch.optim.lr_scheduler import StepLR
 from transformers import BertForSequenceClassification, AdamW

-from util.common import define_pad_length, pad
-from util.pl_metrics import CustomF1, CustomK
+from src.util.common import define_pad_length, pad
+from src.util.pl_metrics import CustomF1, CustomK


 class BertModel(pl.LightningModule):
diff --git a/src/models/pl_gru.py b/src/models/pl_gru.py
index afb12e6..4adb148 100644
--- a/src/models/pl_gru.py
+++ b/src/models/pl_gru.py
@@ -7,9 +7,9 @@ from torch.autograd import Variable
 from torch.optim.lr_scheduler import StepLR
 from transformers import AdamW

-from models.helpers import init_embeddings
-from util.common import define_pad_length, pad
-from util.pl_metrics import CustomF1, CustomK
+from src.models.helpers import init_embeddings
+from src.util.common import define_pad_length, pad
+from src.util.pl_metrics import CustomF1, CustomK


 class RecurrentModel(pl.LightningModule):
diff --git a/src/util/common.py b/src/util/common.py
index 61ac52f..913014c 100644
--- a/src/util/common.py
+++ b/src/util/common.py
@@ -4,7 +4,7 @@ from sklearn.feature_extraction.text import TfidfVectorizer
 from sklearn.model_selection import train_test_split
 from sklearn.preprocessing import normalize

-from util.embeddings_manager import supervised_embeddings_tfidf
+from src.util.embeddings_manager import supervised_embeddings_tfidf


 class TfidfVectorizerMultilingual:
diff --git a/src/util/embeddings_manager.py b/src/util/embeddings_manager.py
index 1d708fa..0526582 100644
--- a/src/util/embeddings_manager.py
+++ b/src/util/embeddings_manager.py
@@ -4,7 +4,7 @@ import numpy as np
 import torch
 from torchtext.vocab import Vectors

-from util.SIF_embed import remove_pc
+from src.util.SIF_embed import remove_pc


 class PretrainedEmbeddings(ABC):
diff --git a/src/util/evaluation.py b/src/util/evaluation.py
index 010d0e9..45b8b2b 100644
--- a/src/util/evaluation.py
+++ b/src/util/evaluation.py
@@ -1,7 +1,6 @@
-import numpy as np
 from joblib import Parallel, delayed

-from util.metrics import *
+from src.util.metrics import *


 def evaluation_metrics(y, y_):
diff --git a/src/util/pl_metrics.py b/src/util/pl_metrics.py
index bf8aa99..765a6a2 100644
--- a/src/util/pl_metrics.py
+++ b/src/util/pl_metrics.py
@@ -1,7 +1,7 @@
 import torch
 from pytorch_lightning.metrics import Metric

-from util.common import is_false, is_true
+from src.util.common import is_false, is_true


 def _update(pred, target, device):
diff --git a/src/view_generators.py b/src/view_generators.py
index 384ec76..b0f70bf 100644
--- a/src/view_generators.py
+++ b/src/view_generators.py
@@ -20,13 +20,13 @@
-from time import time
+import time

 from pytorch_lightning import Trainer
 from pytorch_lightning.loggers import TensorBoardLogger

-from data.datamodule import RecurrentDataModule, BertDataModule, tokenize
-from models.learners import *
-from models.pl_bert import BertModel
-from models.pl_gru import RecurrentModel
-from util.common import TfidfVectorizerMultilingual, _normalize
-from util.embeddings_manager import MuseLoader, XdotM, wce_matrix
+from src.data.datamodule import RecurrentDataModule, BertDataModule, tokenize
+from src.models.learners import *
+from src.models.pl_bert import BertModel
+from src.models.pl_gru import RecurrentModel
+from src.util.common import TfidfVectorizerMultilingual, _normalize
+from src.util.embeddings_manager import MuseLoader, XdotM, wce_matrix


 class ViewGen(ABC):
@@ -232,7 +232,7 @@ class RecurrentGen(ViewGen):
         self.multilingualIndex.train_val_split(val_prop=0.2, max_val=2000, seed=1)
         self.multilingualIndex.embedding_matrices(self.pretrained, supervised=self.wce)
         self.model = self._init_model()
-        self.logger = TensorBoardLogger(save_dir='tb_logs', name='rnn', default_hp_metric=False)
+        self.logger = TensorBoardLogger(save_dir='../tb_logs', name='rnn', default_hp_metric=False)
         # self.logger = CSVLogger(save_dir='csv_logs', name='rnn_dev')

     def _init_model(self):
@@ -293,9 +293,9 @@ class RecurrentGen(ViewGen):
         data = self.multilingualIndex.l_devel_index()
         self.model.to('cuda' if self.gpus else 'cpu')
         self.model.eval()
-        time_init = time()
+        time_init = time.time()
         l_embeds = self.model.encode(data, l_pad, batch_size=256)
-        transform_time = round(time() - time_init, 3)
+        transform_time = round(time.time() - time_init, 3)
         print(f'Executed! Transform took: {transform_time}')
         return l_embeds

@@ -328,7 +328,7 @@ class BertGen(ViewGen):
         self.n_jobs = n_jobs
         self.stored_path = stored_path
         self.model = self._init_model()
-        self.logger = TensorBoardLogger(save_dir='tb_logs', name='bert', default_hp_metric=False)
+        self.logger = TensorBoardLogger(save_dir='../tb_logs', name='bert', default_hp_metric=False)

     def _init_model(self):
         output_size = self.multilingualIndex.get_target_dim()
@@ -362,14 +362,12 @@ class BertGen(ViewGen):
         data = tokenize(data, max_len=512)
         self.model.to('cuda' if self.gpus else 'cpu')
         self.model.eval()
-        time_init = time()
+        time_init = time.time()
         l_emebds = self.model.encode(data, batch_size=64)
-        transform_time = round(time() - time_init, 3)
+        transform_time = round(time.time() - time_init, 3)
         print(f'Executed! Transform took: {transform_time}')
         return l_emebds

     def fit_transform(self, lX, ly):
         # we can assume that we have already indexed data for transform() since we are first calling fit()
         return self.fit(lX, ly).transform(lX)
-
-