diff --git a/refactor/main.py b/refactor/main.py
index bab9189..d043d76 100644
--- a/refactor/main.py
+++ b/refactor/main.py
@@ -46,7 +46,6 @@ def main(args):
 
     if args.bert_embedder:
         bertEmbedder = BertGen(multilingualIndex, batch_size=4, nepochs=10, gpus=args.gpus, n_jobs=args.n_jobs)
-        bertEmbedder.transform(lX)
         embedder_list.append(bertEmbedder)
 
     # Init DocEmbedderList (i.e., first-tier learners or view generators) and metaclassifier
diff --git a/refactor/models/pl_bert.py b/refactor/models/pl_bert.py
index 67f37f4..48f5b9a 100644
--- a/refactor/models/pl_bert.py
+++ b/refactor/models/pl_bert.py
@@ -22,8 +22,7 @@ class BertModel(pl.LightningModule):
         self.macroF1 = CustomF1(num_classes=output_size, average='macro', device=self.gpus)
         self.microK = CustomK(num_classes=output_size, average='micro', device=self.gpus)
         self.macroK = CustomK(num_classes=output_size, average='macro', device=self.gpus)
-        # Language specific metrics - I am not really sure if they should be initialized
-        # independently or we can use the metrics init above... # TODO: check it
+        # Language specific metrics to compute metrics at epoch level
         self.lang_macroF1 = CustomF1(num_classes=output_size, average='macro', device=self.gpus)
         self.lang_microF1 = CustomF1(num_classes=output_size, average='micro', device=self.gpus)
         self.lang_macroK = CustomF1(num_classes=output_size, average='macro', device=self.gpus)
@@ -71,7 +70,6 @@ class BertModel(pl.LightningModule):
         langs = set(langs)
         # outputs is a of n dicts of m elements, where n is equal to the number of epoch steps and m is batchsize.
         # here we save epoch level metric values and compute them specifically for each language
-        # TODO: make this a function (reused in pl_gru epoch_end)
         res_macroF1 = {lang: [] for lang in langs}
         res_microF1 = {lang: [] for lang in langs}
         res_macroK = {lang: [] for lang in langs}
diff --git a/refactor/models/pl_gru.py b/refactor/models/pl_gru.py
index ca4f8da..eaf7304 100644
--- a/refactor/models/pl_gru.py
+++ b/refactor/models/pl_gru.py
@@ -41,8 +41,7 @@ class RecurrentModel(pl.LightningModule):
         self.macroF1 = CustomF1(num_classes=output_size, average='macro', device=self.gpus)
         self.microK = CustomK(num_classes=output_size, average='micro', device=self.gpus)
         self.macroK = CustomK(num_classes=output_size, average='macro', device=self.gpus)
-        # Language specific metrics - I am not really sure if they should be initialized
-        # independently or we can use the metrics init above... # TODO: check it
+        # Language specific metrics to compute metrics at epoch level
         self.lang_macroF1 = CustomF1(num_classes=output_size, average='macro', device=self.gpus)
         self.lang_microF1 = CustomF1(num_classes=output_size, average='micro', device=self.gpus)
         self.lang_macroK = CustomF1(num_classes=output_size, average='macro', device=self.gpus)
@@ -110,7 +109,6 @@ class RecurrentModel(pl.LightningModule):
     def encode(self, lX, l_pad, batch_size=128):
         """
         Returns encoded data (i.e, RNN hidden state at second feed-forward layer - linear1). Dimensionality is 512.
-        # TODO: does not run on gpu..
         :param lX:
         :param l_pad:
         :param batch_size:
@@ -167,7 +165,6 @@ class RecurrentModel(pl.LightningModule):
     def training_epoch_end(self, outputs):
         # outputs is a of n dicts of m elements, where n is equal to the number of epoch steps and m is batchsize.
         # here we save epoch level metric values and compute them specifically for each language
-        # TODO: this is horrible...
         res_macroF1 = {lang: [] for lang in self.langs}
         res_microF1 = {lang: [] for lang in self.langs}
         res_macroK = {lang: [] for lang in self.langs}
diff --git a/refactor/requirements.txt b/refactor/requirements.txt
new file mode 100644
index 0000000..4546a4a
--- /dev/null
+++ b/refactor/requirements.txt
@@ -0,0 +1,12 @@
+transformers==2.11.0
+pandas==0.25.3
+numpy==1.17.4
+joblib==0.14.0
+tqdm==4.50.2
+pytorch_lightning==1.1.2
+torch==1.3.1
+nltk==3.4.5
+scipy==1.3.3
+rdflib==4.2.2
+torchtext==0.4.0
+scikit_learn==0.24.1
diff --git a/refactor/util/pl_metrics.py b/refactor/util/pl_metrics.py
index 6781d09..9b44eb0 100644
--- a/refactor/util/pl_metrics.py
+++ b/refactor/util/pl_metrics.py
@@ -102,10 +102,10 @@ class CustomK(Metric):
         specificity, recall = 0., 0.
         absolute_negatives = self.true_negative.sum() + self.false_positive.sum()
         if absolute_negatives != 0:
-            specificity = self.true_negative.sum()/absolute_negatives  # Todo check if it is float
+            specificity = self.true_negative.sum()/absolute_negatives
         absolute_positives = self.true_positive.sum() + self.false_negative.sum()
         if absolute_positives != 0:
-            recall = self.true_positive.sum()/absolute_positives  # Todo check if it is float
+            recall = self.true_positive.sum()/absolute_positives
 
         if absolute_positives == 0:
             return 2. * specificity - 1
@@ -125,10 +125,10 @@ class CustomK(Metric):
             specificity, recall = 0., 0.
             absolute_negatives = class_tn + class_fp
             if absolute_negatives != 0:
-                specificity = class_tn / absolute_negatives  # Todo check if it is float
+                specificity = class_tn / absolute_negatives
             absolute_positives = class_tp + class_fn
             if absolute_positives != 0:
-                recall = class_tp / absolute_positives  # Todo check if it is float
+                recall = class_tp / absolute_positives
 
             if absolute_positives == 0:
                 class_specific.append(2. * specificity - 1)
diff --git a/refactor/view_generators.py b/refactor/view_generators.py
index 2d82a20..e366d7d 100644
--- a/refactor/view_generators.py
+++ b/refactor/view_generators.py
@@ -1,18 +1,19 @@
 """
 This module contains the view generators that take care of computing the view specific document embeddings:
-- VanillaFunGen (-X) cast document representations encoded via TFIDF into posterior probabilities by means of SVM.
+- VanillaFunGen (-x) cast document representations encoded via TFIDF into posterior probabilities by means of SVM.
 
-- WordClassGen (-W): generates document representation via Word-Class-Embeddings.
+- WordClassGen (-w): generates document representation via Word-Class-Embeddings.
 Document embeddings are obtained via weighted sum of document's constituent embeddings.
 
-- MuseGen (-M):
+- MuseGen (-m):
 generates document representation via MUSE embeddings.
+Document embeddings are obtained via weighted sum of document's constituent embeddings.
 
-- RecurrentGen (-G): generates document embedding by means of a Gated Recurrent Units. The model can be
+- RecurrentGen (-g): generates document embedding by means of a Gated Recurrent Units. The model can be
 initialized with different (multilingual/aligned) word representations (e.g., MUSE, WCE, ecc.,).
 Output dimension is (n_docs, 512).
 
-- View generator (-B): generates document embedding via mBERT model.
+- View generator (-b): generates document embedding via mBERT model.
 """
 from abc import ABC, abstractmethod
 from models.learners import *
@@ -153,9 +154,6 @@ class WordClassGen(ViewGen):
 
 
 class RecurrentGen(ViewGen):
-    # TODO: save model https://forums.pytorchlightning.ai/t/how-to-save-hparams-when-not-provided-as-argument-apparently-assigning-to-hparams-is-not-recomended/339/5
-    # Problem: we are passing lPretrained to init the RecurrentModel -> incredible slow at saving (checkpoint).
-    # if we do not save it is impossible to init RecurrentModel by calling RecurrentModel.load_from_checkpoint()
     def __init__(self, multilingualIndex, pretrained_embeddings, wce, batch_size=512, nepochs=50, gpus=0,
                  n_jobs=-1, stored_path=None):
         """