From 515acae15b5159bde18f2f13d2f0d97ec83f73e9 Mon Sep 17 00:00:00 2001 From: andrea Date: Thu, 19 Nov 2020 14:30:10 +0100 Subject: [PATCH] rsc branch; load pre-computed VGs' output if already stored in memory --- src/learning/transformers.py | 99 +++++++++++++++++++++--------------- src/main_gFun.py | 41 +++++++++++---- src/models/mBert.py | 9 ++-- src/util/common.py | 1 - 4 files changed, 92 insertions(+), 58 deletions(-) diff --git a/src/learning/transformers.py b/src/learning/transformers.py index 06124c1..3e1fed3 100644 --- a/src/learning/transformers.py +++ b/src/learning/transformers.py @@ -13,9 +13,10 @@ from scipy.sparse import csr_matrix from models.mBert import * from models.lstm_class import * from util.csv_log import CSVLog -from util.file import get_file_name +from util.file import get_file_name, create_if_not_exist, exists from util.early_stop import EarlyStopping from util.common import * +import pickle import time @@ -54,7 +55,6 @@ class FeatureWeight: elif self.agg == 'mean': F = tsr_matrix.mean(axis=0) self.lF[l] = F - self.fitted = True return self @@ -71,7 +71,7 @@ class FeatureWeight: class PosteriorProbabilitiesEmbedder: - def __init__(self, first_tier_learner, first_tier_parameters=None, l2=True, n_jobs=-1): + def __init__(self, first_tier_learner, first_tier_parameters=None, l2=True, n_jobs=-1, is_training=True, storing_path='../dumps/'): self.fist_tier_learner = first_tier_learner self.fist_tier_parameters = first_tier_parameters self.l2 = l2 @@ -80,8 +80,13 @@ class PosteriorProbabilitiesEmbedder: self.fist_tier_learner, self.fist_tier_parameters, n_jobs=n_jobs ) self.requires_tfidf = True + self.storing_path = storing_path + self.is_training = is_training def fit(self, lX, lY, lV=None, called_by_viewgen=False): + if exists(self.storing_path + '/tr') or exists(self.storing_path + '/te'): + print(f'NB: Avoid fitting {self.storing_path.split("/")[2]} since we have already pre-computed results') + return self if not called_by_viewgen: # Avoid printing if method is called by another View Gen (e.g., GRU ViewGen) print('### Posterior Probabilities View Generator (X)') @@ -90,8 +95,22 @@ class PosteriorProbabilitiesEmbedder: return self def transform(self, lX): + # if dir exist, load and return already computed results + _endpoint = 'tr' if self.is_training else 'te' + _actual_path = self.storing_path + '/' + _endpoint + if exists(_actual_path): + print('NB: loading pre-computed results!') + with open(_actual_path + '/X.pickle', 'rb') as infile: + self.is_training = False + return pickle.load(infile) + lZ = self.predict_proba(lX) lZ = _normalize(lZ, self.l2) + # create dir and dump computed results + create_if_not_exist(_actual_path) + with open(_actual_path + '/X.pickle', 'wb') as outfile: + pickle.dump(lZ, outfile) + self.is_training = False return lZ def fit_transform(self, lX, ly=None, lV=None): @@ -105,10 +124,8 @@ class PosteriorProbabilitiesEmbedder: def predict_proba(self, lX, ly=None): print(f'generating posterior probabilities for {sum([X.shape[0] for X in lX.values()])} documents') - return self.doc_projector.predict_proba(lX) - - def _get_output_dim(self): - return len(self.doc_projector.model['da'].model.classes_) + lZ = self.doc_projector.predict_proba(lX) + return lZ class MuseEmbedder: @@ -222,8 +239,8 @@ class MBertEmbedder: tr_dataset = TrainingDataset(l_split_tr, l_split_tr_target) va_dataset = TrainingDataset(l_split_va, l_split_val_target) - tr_dataloader = DataLoader(tr_dataset, batch_size=4, shuffle=True) - va_dataloader = DataLoader(va_dataset, 
batch_size=2, shuffle=True) + tr_dataloader = DataLoader(tr_dataset, batch_size=64, shuffle=True) + va_dataloader = DataLoader(va_dataset, batch_size=64, shuffle=True) nC = tr_dataset.get_nclasses() model = get_model(nC) @@ -272,7 +289,7 @@ class MBertEmbedder: l_tokenized_X = do_tokenization(lX, max_len=512, verbose=True) feat_dataset = ExtractorDataset(l_tokenized_X) feat_lang_ids = feat_dataset.lang_ids - dataloader = DataLoader(feat_dataset, batch_size=64) + dataloader = DataLoader(feat_dataset, batch_size=64) # TODO reduced batch size in JRC experiments all_batch_embeddings, id2lang = feature_extractor(dataloader, feat_lang_ids, self.model) return all_batch_embeddings @@ -326,15 +343,8 @@ class RecurrentEmbedder: self.early_stop = EarlyStopping(self.model, optimizer=self.optim, patience=self.patience, checkpoint=f'{self.checkpoint_dir}/gru_viewgen_-{get_file_name(self.options.dataset)}') - - # Init SVM in order to recast (vstacked) document embeddings to vectors of Posterior Probabilities - self.posteriorEmbedder = MetaClassifier( - SVC(kernel='rbf', gamma='auto', probability=True, cache_size=1000, random_state=1), n_jobs=options.n_jobs) - - def fit(self, lX, ly, lV=None, batch_size=64, nepochs=200, val_epochs=1): + def fit(self, lX, ly, lV=None, batch_size=128, nepochs=200, val_epochs=1): print('### Gated Recurrent Unit View Generator (G)') - # self.multilingual_index.get_indexed(lX, self.lpretrained_vocabulary) - # could be better to init model here at first .fit() call! if self.model is None: print('TODO: Init model!') if not self.is_trained: @@ -358,7 +368,7 @@ class RecurrentEmbedder: tinit=tinit, logfile=logfile, criterion=self.criterion, optim=self.optim, epoch=epoch, method_name=method_name, opt=self.options, ltrain_posteriors=None, ltrain_bert=None) - self.lr_scheduler.step() # reduces the learning rate # TODO arg epoch? + self.lr_scheduler.step() # validation step macrof1 = test_gru(self.model, batcher_eval, l_val_index, None, None, l_val_target, tinit, epoch, @@ -384,21 +394,15 @@ class RecurrentEmbedder: ltrain_bert=None) self.is_trained = True - # Generate document embeddings in order to fit an SVM to recast them as vector for Posterior Probabilities - # lX = self._get_doc_embeddings(lX) - lX = self._get_doc_embeddings(self.multilingual_index.l_devel_index()) - # Fit a ''multi-lingual'' SVM on the generated doc embeddings - self.posteriorEmbedder.fit(lX, ly) return self def transform(self, lX, batch_size=64): lX = self.multilingual_index.get_indexed(lX, self.lpretrained_vocabulary) lX = self._get_doc_embeddings(lX) - return self.posteriorEmbedder.predict_proba(lX) + return lX def fit_transform(self, lX, ly, lV=None): - # TODO - return 0 + return self.fit(lX, ly).transform(lX) def _get_doc_embeddings(self, lX, batch_size=64): assert self.is_trained, 'Model is not trained, cannot call transform before fitting the model!' @@ -418,7 +422,7 @@ class RecurrentEmbedder: # loads the MUSE embeddings if requested, or returns empty dictionaries otherwise def _load_pretrained_embeddings(self, we_path, langs): - lpretrained = lpretrained_vocabulary = self._none_dict(langs) # TODO ? 
+ lpretrained = lpretrained_vocabulary = self._none_dict(langs) lpretrained = load_muse_embeddings(we_path, langs, n_jobs=-1) lpretrained_vocabulary = {l: lpretrained[l].vocabulary() for l in langs} return lpretrained, lpretrained_vocabulary @@ -495,26 +499,15 @@ class DocEmbedderList: return self.embedders[0].transform(lX) langs = sorted(lX.keys()) - lZparts = {l: None for l in langs} - # min_dim = min([transformer._get_output_dim() for transformer in self.embedders]) - min_dim = 73 # TODO <---- this should be the number of target classes - for transformer in self.embedders: _lX = lX if transformer.requires_tfidf: _lX = tfidf lZ = transformer.transform(_lX) - nC = min([lZ[lang].shape[1] for lang in langs]) for l in langs: Z = lZ[l] - if Z.shape[1] > min_dim: - print( - f'Space Z matrix has more dimensions ({Z.shape[1]}) than the smallest representation {min_dim}.' - f'Applying PCA(n_components={min_dim})') - pca = PCA(n_components=min_dim) - Z = pca.fit(Z).transform(Z) if lZparts[l] is None: lZparts[l] = Z else: @@ -535,7 +528,7 @@ class DocEmbedderList: class FeatureSet2Posteriors: - def __init__(self, transformer, requires_tfidf=False, l2=True, n_jobs=-1): + def __init__(self, transformer, method_id, requires_tfidf=False, l2=True, n_jobs=-1, storing_path='../dumps/'): self.transformer = transformer self.l2 = l2 self.n_jobs = n_jobs @@ -543,7 +536,15 @@ class FeatureSet2Posteriors: SVC(kernel='rbf', gamma='auto', probability=True, cache_size=1000, random_state=1), n_jobs=n_jobs) self.requires_tfidf = requires_tfidf + self.storing_path = storing_path + self.is_training = True + self.method_id = method_id + def fit(self, lX, ly, lV=None): + if exists(self.storing_path + '/tr') or exists(self.storing_path + '/te'): + print(f'NB: Avoid fitting {self.storing_path.split("/")[2]} since we have already pre-computed results') + return self + if lV is None and hasattr(self.transformer, 'lV'): lV = self.transformer.lV lZ = self.transformer.fit_transform(lX, ly, lV) @@ -551,8 +552,22 @@ class FeatureSet2Posteriors: return self def transform(self, lX): + # if dir exist, load and return already computed results + _endpoint = 'tr' if self.is_training else 'te' + _actual_path = self.storing_path + '/' + _endpoint + if exists(_actual_path): + print('NB: loading pre-computed results!') + with open(_actual_path + '/' + self.method_id + '.pickle', 'rb') as infile: + self.is_training = False + return pickle.load(infile) + lP = self.predict_proba(lX) lP = _normalize(lP, self.l2) + # create dir and dump computed results + create_if_not_exist(_actual_path) + with open(_actual_path + '/' + self.method_id + '.pickle', 'wb') as outfile: + pickle.dump(lP, outfile) + self.is_training = False return lP def fit_transform(self, lX, ly, lV): @@ -691,7 +706,7 @@ def word_class_embedding_matrix(X, Y, max_label_space=300): def XdotM(X, M, sif): E = X.dot(M) if sif: - print("removing pc...") + # print("removing pc...") E = remove_pc(E, npc=1) return E @@ -714,7 +729,7 @@ class BatchGRU: def batchify(self, l_index, l_post, l_bert, llabels, extractor=False): langs = self.languages - l_num_samples = {l:len(l_index[l]) for l in langs} + l_num_samples = {l: len(l_index[l]) for l in langs} max_samples = max(l_num_samples.values()) n_batches = max_samples // self.batchsize + 1 * (max_samples % self.batchsize > 0) diff --git a/src/main_gFun.py b/src/main_gFun.py index 2ad0d30..7d52c48 100644 --- a/src/main_gFun.py +++ b/src/main_gFun.py @@ -28,7 +28,7 @@ if __name__ == '__main__': op.gruViewGenerator, op.gruMUSE, op.gruWCE, op.agg, 
op.allprob) print(f'Method: gFun{method_name}\nDataset: {dataset_name}') print('-'*50) - + # set zscore range - is slice(0, 0) mean will be equal to 0 and std to 1, thus normalization will have no effect standardize_range = slice(0, 0) if op.zscore: @@ -36,7 +36,7 @@ if __name__ == '__main__': # load dataset data = MultilingualDataset.load(dataset) - # data.set_view(languages=['nl', 'it']) # TODO: DEBUG SETTING + data.set_view(languages=['nl', 'it']) # TODO: DEBUG SETTING data.show_dimensions() lXtr, lytr = data.training() lXte, lyte = data.test() @@ -56,18 +56,26 @@ if __name__ == '__main__': View Generator (-X): cast document representations encoded via TFIDF into posterior probabilities by means of a set of SVM. """ + # Check if we already have VG outputs from previous runs + VG_name = 'X' + storing_path = f'../dumps/{VG_name}/{dataset_name.split(".")[0]}' + exist = exists(storing_path) doc_embedder.append(PosteriorProbabilitiesEmbedder(first_tier_learner=get_learner(calibrate=True, kernel='linear', - C=op.set_c), l2=l2)) + C=op.set_c), + l2=l2, storing_path=storing_path)) if op.supervised: """ View Generator (-W): generates document representation via Word-Class-Embeddings. Document embeddings are obtained via weighted sum of document's constituent embeddings. """ + VG_name = 'W' + storing_path = f'../dumps/{VG_name}/{dataset_name.split(".")[0]}' + exist = exists(storing_path) wce = WordClassEmbedder(max_label_space=op.max_labels_S, l2=l2, featureweight=feat_weighting, sif=op.sif) if op.allprob: - wce = FeatureSet2Posteriors(wce, requires_tfidf=True, l2=l2) + wce = FeatureSet2Posteriors(wce, method_id=VG_name, requires_tfidf=True, l2=l2, storing_path=storing_path) doc_embedder.append(wce) if op.pretrained: @@ -75,30 +83,41 @@ if __name__ == '__main__': View Generator (-M): generates document representation via MUSE embeddings (Fasttext multilingual word embeddings). Document embeddings are obtained via weighted sum of document's constituent embeddings. """ + VG_name = 'M' + storing_path = f'../dumps/{VG_name}/{dataset_name.split(".")[0]}' + exist = exists(storing_path) muse = MuseEmbedder(op.we_path, l2=l2, featureweight=feat_weighting, sif=op.sif) if op.allprob: - muse = FeatureSet2Posteriors(muse, requires_tfidf=True, l2=l2) + muse = FeatureSet2Posteriors(muse, method_id=VG_name, requires_tfidf=True, l2=l2, storing_path=storing_path) doc_embedder.append(muse) if op.gruViewGenerator: """ View Generator (-G): generates document embedding by means of a Gated Recurrent Units. The model can be - initialized with different (multilingual/aligned) word representations (e.g., MUSE, WCE, ecc.,). Such - document embeddings are then casted into vectors of posterior probabilities via a set of SVM. - NB: --allprob won't have any effect on this View Gen since output is already encoded as post prob + initialized with different (multilingual/aligned) word representations (e.g., MUSE, WCE, ecc.,). + Output dimension is (n_docs, 512). If --allprob output will be casted to posterior prob space via SVM. 
""" + VG_name = 'G' + VG_name += '_muse' if op.gruMUSE else '' + VG_name += '_wce' if op.gruWCE else '' + storing_path = f'../dumps/{VG_name}/{dataset_name.split(".")[0]}' rnn_embedder = RecurrentEmbedder(pretrained=op.gruMUSE, supervised=op.gruWCE, multilingual_dataset=data, options=op, model_path=op.gru_path) + if op.allprob: + rnn_embedder = FeatureSet2Posteriors(rnn_embedder, method_id=VG_name, requires_tfidf=False, + storing_path=storing_path) doc_embedder.append(rnn_embedder) if op.mbert: """ View generator (-B): generates document embedding via mBERT model. """ - mbert = MBertEmbedder(path_to_model=op.bert_path, - nC=data.num_categories()) + VG_name = 'B' + storing_path = f'../dumps/{VG_name}/{dataset_name.split(".")[0]}' + + mbert = MBertEmbedder(path_to_model=op.bert_path, nC=data.num_categories()) if op.allprob: - mbert = FeatureSet2Posteriors(mbert, l2=l2) + mbert = FeatureSet2Posteriors(mbert, method_id=VG_name, l2=l2, storing_path=storing_path) doc_embedder.append(mbert) # metaclassifier diff --git a/src/models/mBert.py b/src/models/mBert.py index e06746c..5c53f55 100644 --- a/src/models/mBert.py +++ b/src/models/mBert.py @@ -5,6 +5,7 @@ from transformers import BertForSequenceClassification, BertTokenizer, AdamW, Be from sklearn.model_selection import train_test_split from util.evaluation import * from time import time +from util.common import show_gpu def predict(logits, classification_type='multilabel'): @@ -21,7 +22,6 @@ def predict(logits, classification_type='multilabel'): class TrainingDataset(Dataset): """ data: dict of lang specific tokenized data - labels: dict of lang specific targets """ def __init__(self, data, labels): @@ -156,7 +156,7 @@ def do_tokenization(l_dataset, max_len=512, verbose=True): def train(model, train_dataloader, epoch, criterion, optim, method_name, tinit, logfile, log_interval=10): # _dataset_path = opt.dataset.split('/')[-1].split('_') # dataset_id = _dataset_path[0] + _dataset_path[-1] - dataset_id = 'TODO fix this!' + dataset_id = 'TODO fix this!' # TODO loss_history = [] model.train() @@ -231,12 +231,13 @@ def feature_extractor(data, lang_ids, model): Hidden State = Tuple of torch.FloatTensor (one for the output of the embeddings + one for the output of each layer) of shape (batch_size, sequence_length, hidden_size) """ + show_gpu('Before Training') all_batch_embeddings = {} id2lang = {v: k for k, v in lang_ids.items()} with torch.no_grad(): for batch, lang_idx in data: - # for batch, target, lang_idx in data: out = model(batch.cuda()) + # show_gpu('After Batch Prediction') last_hidden_state = out[1][-1] batch_embeddings = last_hidden_state[:, 0, :] for i, l_idx in enumerate(lang_idx.numpy()): @@ -245,5 +246,5 @@ def feature_extractor(data, lang_ids, model): else: all_batch_embeddings[id2lang[l_idx]] = np.vstack((all_batch_embeddings[id2lang[l_idx]], batch_embeddings[i].detach().cpu().numpy())) - + show_gpu('After Full Prediction') return all_batch_embeddings, id2lang diff --git a/src/util/common.py b/src/util/common.py index 219931a..9c62241 100755 --- a/src/util/common.py +++ b/src/util/common.py @@ -4,7 +4,6 @@ from sklearn.feature_extraction.text import TfidfVectorizer from sklearn.svm import SVC from sklearn.model_selection import train_test_split from embeddings.supervised import get_supervised_embeddings -# from learning.transformers import PosteriorProbabilitiesEmbedder, TfidfVectorizerMultilingual import numpy as np from tqdm import tqdm import torch