From 94bfe6a0369c272a968204780054a99fbb795516 Mon Sep 17 00:00:00 2001 From: andrea Date: Tue, 27 Oct 2020 15:08:39 +0100 Subject: [PATCH 01/55] fixed batcher --- src/learning/transformers.py | 40 ++++++++++++++++++------------------ src/util/common.py | 21 ++++++++++++++++--- 2 files changed, 38 insertions(+), 23 deletions(-) diff --git a/src/learning/transformers.py b/src/learning/transformers.py index 0032460..75d9888 100644 --- a/src/learning/transformers.py +++ b/src/learning/transformers.py @@ -301,15 +301,16 @@ class RecurrentEmbedder: self.test_each = test_each self.options = options self.seed = options.seed + self.model_path = model_path self.is_trained = False ## INIT MODEL for training self.lXtr, self.lytr = self.multilingual_dataset.training(target_as_csr=True) self.lXte, self.lyte = self.multilingual_dataset.test(target_as_csr=True) self.nC = self.lyte[self.langs[0]].shape[1] - lpretrained, lpretrained_vocabulary = self._load_pretrained_embeddings(self.we_path, self.langs) + lpretrained, self.lpretrained_vocabulary = self._load_pretrained_embeddings(self.we_path, self.langs) self.multilingual_index = MultilingualIndex() - self.multilingual_index.index(self.lXtr, self.lytr, self.lXte, lpretrained_vocabulary) + self.multilingual_index.index(self.lXtr, self.lytr, self.lXte, self.lpretrained_vocabulary) self.multilingual_index.train_val_split(val_prop=0.2, max_val=2000, seed=self.seed) self.multilingual_index.embedding_matrices(lpretrained, self.supervised) @@ -324,12 +325,15 @@ class RecurrentEmbedder: self.lr_scheduler = StepLR(self.optim, step_size=25, gamma=0.5) self.early_stop = EarlyStopping(self.model, optimizer=self.optim, patience=self.patience, checkpoint=f'{self.checkpoint_dir}/gru_viewgen_-{get_file_name(self.options.dataset)}') + + # Init SVM in order to recast (vstacked) document embeddings to vectors of Posterior Probabilities self.posteriorEmbedder = MetaClassifier( SVC(kernel='rbf', gamma='auto', probability=True, cache_size=1000, random_state=1), n_jobs=options.n_jobs) def fit(self, lX, ly, lV=None, batch_size=64, nepochs=200, val_epochs=1): print('### Gated Recurrent Unit View Generator (G)') + # self.multilingual_index.get_indexed(lX, self.lpretrained_vocabulary) # could be better to init model here at first .fit() call! if self.model is None: print('TODO: Init model!') @@ -381,12 +385,14 @@ class RecurrentEmbedder: self.is_trained = True # Generate document embeddings in order to fit an SVM to recast them as vector for Posterior Probabilities - lX = self._get_doc_embeddings(lX) + # lX = self._get_doc_embeddings(lX) + lX = self._get_doc_embeddings(self.multilingual_index.l_devel_index()) # Fit a ''multi-lingual'' SVM on the generated doc embeddings self.posteriorEmbedder.fit(lX, ly) return self def transform(self, lX, batch_size=64): + lX = self.multilingual_index.get_indexed(lX, self.lpretrained_vocabulary) lX = self._get_doc_embeddings(lX) return self.posteriorEmbedder.predict_proba(lX) @@ -397,28 +403,22 @@ class RecurrentEmbedder: def _get_doc_embeddings(self, lX, batch_size=64): assert self.is_trained, 'Model is not trained, cannot call transform before fitting the model!' 
print('Generating document embeddings via GRU') - lX = {} - ly = {} - batcher_transform = BatchGRU(batch_size, batches_per_epoch=batch_size, languages=self.langs, - lpad=self.multilingual_index.l_pad()) + _lX = {} - l_devel_index = self.multilingual_index.l_devel_index() l_devel_target = self.multilingual_index.l_devel_target() - for idx, (batch, post, bert_emb, target, lang) in enumerate( - batcher_transform.batchify(l_devel_index, None, None, l_devel_target)): - if lang not in lX.keys(): - lX[lang] = self.model.get_embeddings(batch, lang) - ly[lang] = target.cpu().detach().numpy() + for idx, (batch, post, target, lang) in enumerate(batchify(lX, None, l_devel_target, + batch_size, self.multilingual_index.l_pad())): + if lang not in _lX.keys(): + _lX[lang] = self.model.get_embeddings(batch, lang) else: - lX[lang] = np.concatenate((lX[lang], self.model.get_embeddings(batch, lang)), axis=0) - ly[lang] = np.concatenate((ly[lang], target.cpu().detach().numpy()), axis=0) + _lX[lang] = np.concatenate((_lX[lang], self.model.get_embeddings(batch, lang)), axis=0) - return lX + return _lX # loads the MUSE embeddings if requested, or returns empty dictionaries otherwise def _load_pretrained_embeddings(self, we_path, langs): - lpretrained = lpretrained_vocabulary = self._none_dict(langs) # TODO ? + lpretrained = lpretrained_vocabulary = self._none_dict(langs) # TODO ? lpretrained = load_muse_embeddings(we_path, langs, n_jobs=-1) lpretrained_vocabulary = {l: lpretrained[l].vocabulary() for l in langs} return lpretrained, lpretrained_vocabulary @@ -703,14 +703,14 @@ class BatchGRU: self.batchsize = batchsize self.batches_per_epoch = batches_per_epoch self.languages = languages - self.lpad=lpad - self.max_pad_length=max_pad_length + self.lpad = lpad + self.max_pad_length = max_pad_length self.init_offset() def init_offset(self): self.offset = {lang: 0 for lang in self.languages} - def batchify(self, l_index, l_post, l_bert, llabels): + def batchify(self, l_index, l_post, l_bert, llabels, extractor=False): langs = self.languages l_num_samples = {l:len(l_index[l]) for l in langs} diff --git a/src/util/common.py b/src/util/common.py index 88134d3..219931a 100755 --- a/src/util/common.py +++ b/src/util/common.py @@ -180,12 +180,27 @@ class MultilingualIndex: self.l_index[l] = Index(l_devel_raw[l], l_devel_target[l], l_test_raw[l], l) self.l_index[l].index(l_pretrained_vocabulary[l], l_analyzer[l], l_vocabulary[l]) + def get_indexed(self, l_texts, pretrained_vocabulary=None): + assert len(self.l_index) != 0, 'Cannot index data without first index call to multilingual index!' 
+ l_indexed = {} + for l, texts in l_texts.items(): + if l in self.langs: + word2index = self.l_index[l].word2index + known_words = set(word2index.keys()) + if pretrained_vocabulary[l] is not None: + known_words.update(pretrained_vocabulary[l]) + l_indexed[l] = index(texts, + vocab=word2index, + known_words=known_words, + analyzer=self.l_vectorizer.get_analyzer(l), + unk_index=word2index['UNKTOKEN'], + out_of_vocabulary=dict()) + return l_indexed + def train_val_split(self, val_prop=0.2, max_val=2000, seed=42): for l,index in self.l_index.items(): index.train_val_split(val_prop, max_val, seed=seed) - - def embedding_matrices(self, lpretrained, supervised): lXtr = self.get_lXtr() if supervised else none_dict(self.langs) lYtr = self.l_train_target() if supervised else none_dict(self.langs) @@ -385,7 +400,7 @@ class Batch: def init_offset(self): self.offset = {lang: 0 for lang in self.languages} - def batchify(self, l_index, l_post, l_bert, llabels): # TODO: add bert embedding here... + def batchify(self, l_index, l_post, l_bert, llabels): langs = self.languages l_num_samples = {l:len(l_index[l]) for l in langs} From 20dca61e2225fe0dd6e23cda39eb35b83eb02674 Mon Sep 17 00:00:00 2001 From: andrea Date: Tue, 27 Oct 2020 22:55:25 +0100 Subject: [PATCH 02/55] minor fixes --- src/learning/transformers.py | 4 +++- src/main_gFun.py | 7 +++---- src/util/parser_options.py | 6 +++--- 3 files changed, 9 insertions(+), 8 deletions(-) diff --git a/src/learning/transformers.py b/src/learning/transformers.py index 75d9888..c9a2709 100644 --- a/src/learning/transformers.py +++ b/src/learning/transformers.py @@ -822,10 +822,12 @@ def clip_gradient(model, clip_value=1e-1): def init_logfile_nn(method_name, opt): + import os logfile = CSVLog(opt.logfile_gru, ['dataset', 'method', 'epoch', 'measure', 'value', 'run', 'timelapse']) logfile.set_default('dataset', opt.dataset) logfile.set_default('run', opt.seed) - logfile.set_default('method', method_name) + logfile.set_default('method', get_method_name(os.path.basename(opt.dataset), opt.posteriors, opt.supervised, opt.pretrained, opt.mbert, + opt.gruViewGenerator, opt.gruMUSE, opt.gruWCE, opt.agg, opt.allprob)) assert opt.force or not logfile.already_calculated(), f'results for dataset {opt.dataset} method {method_name} ' \ f'and run {opt.seed} already calculated' return logfile diff --git a/src/main_gFun.py b/src/main_gFun.py index c671ecd..2ad0d30 100644 --- a/src/main_gFun.py +++ b/src/main_gFun.py @@ -14,7 +14,8 @@ if __name__ == '__main__': assert not (op.set_c != 1. and op.optimc), 'Parameter C cannot be defined along with optim_c option' assert op.posteriors or op.supervised or op.pretrained or op.mbert or op.gruViewGenerator, \ 'empty set of document embeddings is not allowed' - assert (op.gruWCE or op.gruMUSE) and op.gruViewGenerator, 'Initializing Gated Recurrent embedding layer without ' \ + if op.gruViewGenerator: + assert op.gruWCE or op.gruMUSE, 'Initializing Gated Recurrent embedding layer without ' \ 'explicit initialization of GRU View Generator' l2 = op.l2 @@ -35,7 +36,7 @@ if __name__ == '__main__': # load dataset data = MultilingualDataset.load(dataset) - data.set_view(languages=['nl', 'it']) # TODO: DEBUG SETTING + # data.set_view(languages=['nl', 'it']) # TODO: DEBUG SETTING data.show_dimensions() lXtr, lytr = data.training() lXte, lyte = data.test() @@ -86,7 +87,6 @@ if __name__ == '__main__': document embeddings are then casted into vectors of posterior probabilities via a set of SVM. 
NB: --allprob won't have any effect on this View Gen since output is already encoded as post prob """ - op.gru_path = '/home/andreapdr/funneling_pdr/checkpoint/gru_viewgen_-rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle' # TODO DEBUG rnn_embedder = RecurrentEmbedder(pretrained=op.gruMUSE, supervised=op.gruWCE, multilingual_dataset=data, options=op, model_path=op.gru_path) doc_embedder.append(rnn_embedder) @@ -95,7 +95,6 @@ if __name__ == '__main__': """ View generator (-B): generates document embedding via mBERT model. """ - op.bert_path = '/home/andreapdr/funneling_pdr/hug_checkpoint/mBERT-rcv1-2_run0' # TODO DEBUG mbert = MBertEmbedder(path_to_model=op.bert_path, nC=data.num_categories()) if op.allprob: diff --git a/src/util/parser_options.py b/src/util/parser_options.py index 0e751bd..fb5b4c0 100644 --- a/src/util/parser_options.py +++ b/src/util/parser_options.py @@ -24,7 +24,7 @@ parser.add_option('-G', dest='gruViewGenerator', action='store_true', parser.add_option("--l2", dest="l2", action='store_true', help="Activates l2 normalization as a post-processing for the document embedding views", - default=False) + default=True) parser.add_option("--allprob", dest="allprob", action='store_true', help="All views are generated as posterior probabilities. This affects the supervised and pretrained" @@ -51,10 +51,10 @@ parser.add_option("-p", "--pca", dest="max_labels_S", type=int, default=300) parser.add_option("-r", "--remove-pc", dest="sif", action='store_true', - help="Remove common component when computing dot product of word embedding matrices", default=False) + help="Remove common component when computing dot product of word embedding matrices", default=True) parser.add_option("-z", "--zscore", dest="zscore", action='store_true', - help="Z-score normalize matrices (WCE and MUSE)", default=False) + help="Z-score normalize matrices (WCE and MUSE)", default=True) parser.add_option("-a", "--agg", dest="agg", action='store_true', help="Set aggregation function of the common Z-space to average (Default: concatenation)", From 8af763b130eb65bdec9452276a1e0fd958f9664e Mon Sep 17 00:00:00 2001 From: andrea Date: Thu, 29 Oct 2020 12:55:07 +0100 Subject: [PATCH 03/55] fixed transform_mean() when dealing with only one VG; modified default settings (new default: aggregation: mean, all_prob:True) --- src/learning/transformers.py | 2 ++ src/util/parser_options.py | 4 ++-- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/src/learning/transformers.py b/src/learning/transformers.py index c9a2709..06124c1 100644 --- a/src/learning/transformers.py +++ b/src/learning/transformers.py @@ -490,6 +490,8 @@ class DocEmbedderList: def transform_mean(self, lX, tfidf): if len(self.embedders) == 1: + if self.embedders[0].requires_tfidf: + lX = tfidf return self.embedders[0].transform(lX) langs = sorted(lX.keys()) diff --git a/src/util/parser_options.py b/src/util/parser_options.py index fb5b4c0..730026f 100644 --- a/src/util/parser_options.py +++ b/src/util/parser_options.py @@ -29,7 +29,7 @@ parser.add_option("--l2", dest="l2", action='store_true', parser.add_option("--allprob", dest="allprob", action='store_true', help="All views are generated as posterior probabilities. 
This affects the supervised and pretrained" "embeddings, for which a calibrated classifier is generated, which generates the posteriors", - default=False) + default=True) parser.add_option("--feat-weight", dest="feat_weight", help="Term weighting function to weight the averaged embeddings", type=str, default='tfidf') @@ -58,7 +58,7 @@ parser.add_option("-z", "--zscore", dest="zscore", action='store_true', parser.add_option("-a", "--agg", dest="agg", action='store_true', help="Set aggregation function of the common Z-space to average (Default: concatenation)", - default=False) + default=True) # ------------------------------------------------------------------------------------ From 515acae15b5159bde18f2f13d2f0d97ec83f73e9 Mon Sep 17 00:00:00 2001 From: andrea Date: Thu, 19 Nov 2020 14:30:10 +0100 Subject: [PATCH 04/55] rsc branch; load pre-computed VGs' output if already stored in memory --- src/learning/transformers.py | 99 +++++++++++++++++++++--------------- src/main_gFun.py | 41 +++++++++++---- src/models/mBert.py | 9 ++-- src/util/common.py | 1 - 4 files changed, 92 insertions(+), 58 deletions(-) diff --git a/src/learning/transformers.py b/src/learning/transformers.py index 06124c1..3e1fed3 100644 --- a/src/learning/transformers.py +++ b/src/learning/transformers.py @@ -13,9 +13,10 @@ from scipy.sparse import csr_matrix from models.mBert import * from models.lstm_class import * from util.csv_log import CSVLog -from util.file import get_file_name +from util.file import get_file_name, create_if_not_exist, exists from util.early_stop import EarlyStopping from util.common import * +import pickle import time @@ -54,7 +55,6 @@ class FeatureWeight: elif self.agg == 'mean': F = tsr_matrix.mean(axis=0) self.lF[l] = F - self.fitted = True return self @@ -71,7 +71,7 @@ class FeatureWeight: class PosteriorProbabilitiesEmbedder: - def __init__(self, first_tier_learner, first_tier_parameters=None, l2=True, n_jobs=-1): + def __init__(self, first_tier_learner, first_tier_parameters=None, l2=True, n_jobs=-1, is_training=True, storing_path='../dumps/'): self.fist_tier_learner = first_tier_learner self.fist_tier_parameters = first_tier_parameters self.l2 = l2 @@ -80,8 +80,13 @@ class PosteriorProbabilitiesEmbedder: self.fist_tier_learner, self.fist_tier_parameters, n_jobs=n_jobs ) self.requires_tfidf = True + self.storing_path = storing_path + self.is_training = is_training def fit(self, lX, lY, lV=None, called_by_viewgen=False): + if exists(self.storing_path + '/tr') or exists(self.storing_path + '/te'): + print(f'NB: Avoid fitting {self.storing_path.split("/")[2]} since we have already pre-computed results') + return self if not called_by_viewgen: # Avoid printing if method is called by another View Gen (e.g., GRU ViewGen) print('### Posterior Probabilities View Generator (X)') @@ -90,8 +95,22 @@ class PosteriorProbabilitiesEmbedder: return self def transform(self, lX): + # if dir exist, load and return already computed results + _endpoint = 'tr' if self.is_training else 'te' + _actual_path = self.storing_path + '/' + _endpoint + if exists(_actual_path): + print('NB: loading pre-computed results!') + with open(_actual_path + '/X.pickle', 'rb') as infile: + self.is_training = False + return pickle.load(infile) + lZ = self.predict_proba(lX) lZ = _normalize(lZ, self.l2) + # create dir and dump computed results + create_if_not_exist(_actual_path) + with open(_actual_path + '/X.pickle', 'wb') as outfile: + pickle.dump(lZ, outfile) + self.is_training = False return lZ def fit_transform(self, lX, ly=None, 
lV=None): @@ -105,10 +124,8 @@ class PosteriorProbabilitiesEmbedder: def predict_proba(self, lX, ly=None): print(f'generating posterior probabilities for {sum([X.shape[0] for X in lX.values()])} documents') - return self.doc_projector.predict_proba(lX) - - def _get_output_dim(self): - return len(self.doc_projector.model['da'].model.classes_) + lZ = self.doc_projector.predict_proba(lX) + return lZ class MuseEmbedder: @@ -222,8 +239,8 @@ class MBertEmbedder: tr_dataset = TrainingDataset(l_split_tr, l_split_tr_target) va_dataset = TrainingDataset(l_split_va, l_split_val_target) - tr_dataloader = DataLoader(tr_dataset, batch_size=4, shuffle=True) - va_dataloader = DataLoader(va_dataset, batch_size=2, shuffle=True) + tr_dataloader = DataLoader(tr_dataset, batch_size=64, shuffle=True) + va_dataloader = DataLoader(va_dataset, batch_size=64, shuffle=True) nC = tr_dataset.get_nclasses() model = get_model(nC) @@ -272,7 +289,7 @@ class MBertEmbedder: l_tokenized_X = do_tokenization(lX, max_len=512, verbose=True) feat_dataset = ExtractorDataset(l_tokenized_X) feat_lang_ids = feat_dataset.lang_ids - dataloader = DataLoader(feat_dataset, batch_size=64) + dataloader = DataLoader(feat_dataset, batch_size=64) # TODO reduced batch size in JRC experiments all_batch_embeddings, id2lang = feature_extractor(dataloader, feat_lang_ids, self.model) return all_batch_embeddings @@ -326,15 +343,8 @@ class RecurrentEmbedder: self.early_stop = EarlyStopping(self.model, optimizer=self.optim, patience=self.patience, checkpoint=f'{self.checkpoint_dir}/gru_viewgen_-{get_file_name(self.options.dataset)}') - - # Init SVM in order to recast (vstacked) document embeddings to vectors of Posterior Probabilities - self.posteriorEmbedder = MetaClassifier( - SVC(kernel='rbf', gamma='auto', probability=True, cache_size=1000, random_state=1), n_jobs=options.n_jobs) - - def fit(self, lX, ly, lV=None, batch_size=64, nepochs=200, val_epochs=1): + def fit(self, lX, ly, lV=None, batch_size=128, nepochs=200, val_epochs=1): print('### Gated Recurrent Unit View Generator (G)') - # self.multilingual_index.get_indexed(lX, self.lpretrained_vocabulary) - # could be better to init model here at first .fit() call! if self.model is None: print('TODO: Init model!') if not self.is_trained: @@ -358,7 +368,7 @@ class RecurrentEmbedder: tinit=tinit, logfile=logfile, criterion=self.criterion, optim=self.optim, epoch=epoch, method_name=method_name, opt=self.options, ltrain_posteriors=None, ltrain_bert=None) - self.lr_scheduler.step() # reduces the learning rate # TODO arg epoch? 
+ self.lr_scheduler.step() # validation step macrof1 = test_gru(self.model, batcher_eval, l_val_index, None, None, l_val_target, tinit, epoch, @@ -384,21 +394,15 @@ class RecurrentEmbedder: ltrain_bert=None) self.is_trained = True - # Generate document embeddings in order to fit an SVM to recast them as vector for Posterior Probabilities - # lX = self._get_doc_embeddings(lX) - lX = self._get_doc_embeddings(self.multilingual_index.l_devel_index()) - # Fit a ''multi-lingual'' SVM on the generated doc embeddings - self.posteriorEmbedder.fit(lX, ly) return self def transform(self, lX, batch_size=64): lX = self.multilingual_index.get_indexed(lX, self.lpretrained_vocabulary) lX = self._get_doc_embeddings(lX) - return self.posteriorEmbedder.predict_proba(lX) + return lX def fit_transform(self, lX, ly, lV=None): - # TODO - return 0 + return self.fit(lX, ly).transform(lX) def _get_doc_embeddings(self, lX, batch_size=64): assert self.is_trained, 'Model is not trained, cannot call transform before fitting the model!' @@ -418,7 +422,7 @@ class RecurrentEmbedder: # loads the MUSE embeddings if requested, or returns empty dictionaries otherwise def _load_pretrained_embeddings(self, we_path, langs): - lpretrained = lpretrained_vocabulary = self._none_dict(langs) # TODO ? + lpretrained = lpretrained_vocabulary = self._none_dict(langs) lpretrained = load_muse_embeddings(we_path, langs, n_jobs=-1) lpretrained_vocabulary = {l: lpretrained[l].vocabulary() for l in langs} return lpretrained, lpretrained_vocabulary @@ -495,26 +499,15 @@ class DocEmbedderList: return self.embedders[0].transform(lX) langs = sorted(lX.keys()) - lZparts = {l: None for l in langs} - # min_dim = min([transformer._get_output_dim() for transformer in self.embedders]) - min_dim = 73 # TODO <---- this should be the number of target classes - for transformer in self.embedders: _lX = lX if transformer.requires_tfidf: _lX = tfidf lZ = transformer.transform(_lX) - nC = min([lZ[lang].shape[1] for lang in langs]) for l in langs: Z = lZ[l] - if Z.shape[1] > min_dim: - print( - f'Space Z matrix has more dimensions ({Z.shape[1]}) than the smallest representation {min_dim}.' 
- f'Applying PCA(n_components={min_dim})') - pca = PCA(n_components=min_dim) - Z = pca.fit(Z).transform(Z) if lZparts[l] is None: lZparts[l] = Z else: @@ -535,7 +528,7 @@ class DocEmbedderList: class FeatureSet2Posteriors: - def __init__(self, transformer, requires_tfidf=False, l2=True, n_jobs=-1): + def __init__(self, transformer, method_id, requires_tfidf=False, l2=True, n_jobs=-1, storing_path='../dumps/'): self.transformer = transformer self.l2 = l2 self.n_jobs = n_jobs @@ -543,7 +536,15 @@ class FeatureSet2Posteriors: SVC(kernel='rbf', gamma='auto', probability=True, cache_size=1000, random_state=1), n_jobs=n_jobs) self.requires_tfidf = requires_tfidf + self.storing_path = storing_path + self.is_training = True + self.method_id = method_id + def fit(self, lX, ly, lV=None): + if exists(self.storing_path + '/tr') or exists(self.storing_path + '/te'): + print(f'NB: Avoid fitting {self.storing_path.split("/")[2]} since we have already pre-computed results') + return self + if lV is None and hasattr(self.transformer, 'lV'): lV = self.transformer.lV lZ = self.transformer.fit_transform(lX, ly, lV) @@ -551,8 +552,22 @@ class FeatureSet2Posteriors: return self def transform(self, lX): + # if dir exist, load and return already computed results + _endpoint = 'tr' if self.is_training else 'te' + _actual_path = self.storing_path + '/' + _endpoint + if exists(_actual_path): + print('NB: loading pre-computed results!') + with open(_actual_path + '/' + self.method_id + '.pickle', 'rb') as infile: + self.is_training = False + return pickle.load(infile) + lP = self.predict_proba(lX) lP = _normalize(lP, self.l2) + # create dir and dump computed results + create_if_not_exist(_actual_path) + with open(_actual_path + '/' + self.method_id + '.pickle', 'wb') as outfile: + pickle.dump(lP, outfile) + self.is_training = False return lP def fit_transform(self, lX, ly, lV): @@ -691,7 +706,7 @@ def word_class_embedding_matrix(X, Y, max_label_space=300): def XdotM(X, M, sif): E = X.dot(M) if sif: - print("removing pc...") + # print("removing pc...") E = remove_pc(E, npc=1) return E @@ -714,7 +729,7 @@ class BatchGRU: def batchify(self, l_index, l_post, l_bert, llabels, extractor=False): langs = self.languages - l_num_samples = {l:len(l_index[l]) for l in langs} + l_num_samples = {l: len(l_index[l]) for l in langs} max_samples = max(l_num_samples.values()) n_batches = max_samples // self.batchsize + 1 * (max_samples % self.batchsize > 0) diff --git a/src/main_gFun.py b/src/main_gFun.py index 2ad0d30..7d52c48 100644 --- a/src/main_gFun.py +++ b/src/main_gFun.py @@ -28,7 +28,7 @@ if __name__ == '__main__': op.gruViewGenerator, op.gruMUSE, op.gruWCE, op.agg, op.allprob) print(f'Method: gFun{method_name}\nDataset: {dataset_name}') print('-'*50) - + # set zscore range - is slice(0, 0) mean will be equal to 0 and std to 1, thus normalization will have no effect standardize_range = slice(0, 0) if op.zscore: @@ -36,7 +36,7 @@ if __name__ == '__main__': # load dataset data = MultilingualDataset.load(dataset) - # data.set_view(languages=['nl', 'it']) # TODO: DEBUG SETTING + data.set_view(languages=['nl', 'it']) # TODO: DEBUG SETTING data.show_dimensions() lXtr, lytr = data.training() lXte, lyte = data.test() @@ -56,18 +56,26 @@ if __name__ == '__main__': View Generator (-X): cast document representations encoded via TFIDF into posterior probabilities by means of a set of SVM. 
""" + # Check if we already have VG outputs from previous runs + VG_name = 'X' + storing_path = f'../dumps/{VG_name}/{dataset_name.split(".")[0]}' + exist = exists(storing_path) doc_embedder.append(PosteriorProbabilitiesEmbedder(first_tier_learner=get_learner(calibrate=True, kernel='linear', - C=op.set_c), l2=l2)) + C=op.set_c), + l2=l2, storing_path=storing_path)) if op.supervised: """ View Generator (-W): generates document representation via Word-Class-Embeddings. Document embeddings are obtained via weighted sum of document's constituent embeddings. """ + VG_name = 'W' + storing_path = f'../dumps/{VG_name}/{dataset_name.split(".")[0]}' + exist = exists(storing_path) wce = WordClassEmbedder(max_label_space=op.max_labels_S, l2=l2, featureweight=feat_weighting, sif=op.sif) if op.allprob: - wce = FeatureSet2Posteriors(wce, requires_tfidf=True, l2=l2) + wce = FeatureSet2Posteriors(wce, method_id=VG_name, requires_tfidf=True, l2=l2, storing_path=storing_path) doc_embedder.append(wce) if op.pretrained: @@ -75,30 +83,41 @@ if __name__ == '__main__': View Generator (-M): generates document representation via MUSE embeddings (Fasttext multilingual word embeddings). Document embeddings are obtained via weighted sum of document's constituent embeddings. """ + VG_name = 'M' + storing_path = f'../dumps/{VG_name}/{dataset_name.split(".")[0]}' + exist = exists(storing_path) muse = MuseEmbedder(op.we_path, l2=l2, featureweight=feat_weighting, sif=op.sif) if op.allprob: - muse = FeatureSet2Posteriors(muse, requires_tfidf=True, l2=l2) + muse = FeatureSet2Posteriors(muse, method_id=VG_name, requires_tfidf=True, l2=l2, storing_path=storing_path) doc_embedder.append(muse) if op.gruViewGenerator: """ View Generator (-G): generates document embedding by means of a Gated Recurrent Units. The model can be - initialized with different (multilingual/aligned) word representations (e.g., MUSE, WCE, ecc.,). Such - document embeddings are then casted into vectors of posterior probabilities via a set of SVM. - NB: --allprob won't have any effect on this View Gen since output is already encoded as post prob + initialized with different (multilingual/aligned) word representations (e.g., MUSE, WCE, ecc.,). + Output dimension is (n_docs, 512). If --allprob output will be casted to posterior prob space via SVM. """ + VG_name = 'G' + VG_name += '_muse' if op.gruMUSE else '' + VG_name += '_wce' if op.gruWCE else '' + storing_path = f'../dumps/{VG_name}/{dataset_name.split(".")[0]}' rnn_embedder = RecurrentEmbedder(pretrained=op.gruMUSE, supervised=op.gruWCE, multilingual_dataset=data, options=op, model_path=op.gru_path) + if op.allprob: + rnn_embedder = FeatureSet2Posteriors(rnn_embedder, method_id=VG_name, requires_tfidf=False, + storing_path=storing_path) doc_embedder.append(rnn_embedder) if op.mbert: """ View generator (-B): generates document embedding via mBERT model. 
""" - mbert = MBertEmbedder(path_to_model=op.bert_path, - nC=data.num_categories()) + VG_name = 'B' + storing_path = f'../dumps/{VG_name}/{dataset_name.split(".")[0]}' + + mbert = MBertEmbedder(path_to_model=op.bert_path, nC=data.num_categories()) if op.allprob: - mbert = FeatureSet2Posteriors(mbert, l2=l2) + mbert = FeatureSet2Posteriors(mbert, method_id=VG_name, l2=l2, storing_path=storing_path) doc_embedder.append(mbert) # metaclassifier diff --git a/src/models/mBert.py b/src/models/mBert.py index e06746c..5c53f55 100644 --- a/src/models/mBert.py +++ b/src/models/mBert.py @@ -5,6 +5,7 @@ from transformers import BertForSequenceClassification, BertTokenizer, AdamW, Be from sklearn.model_selection import train_test_split from util.evaluation import * from time import time +from util.common import show_gpu def predict(logits, classification_type='multilabel'): @@ -21,7 +22,6 @@ def predict(logits, classification_type='multilabel'): class TrainingDataset(Dataset): """ data: dict of lang specific tokenized data - labels: dict of lang specific targets """ def __init__(self, data, labels): @@ -156,7 +156,7 @@ def do_tokenization(l_dataset, max_len=512, verbose=True): def train(model, train_dataloader, epoch, criterion, optim, method_name, tinit, logfile, log_interval=10): # _dataset_path = opt.dataset.split('/')[-1].split('_') # dataset_id = _dataset_path[0] + _dataset_path[-1] - dataset_id = 'TODO fix this!' + dataset_id = 'TODO fix this!' # TODO loss_history = [] model.train() @@ -231,12 +231,13 @@ def feature_extractor(data, lang_ids, model): Hidden State = Tuple of torch.FloatTensor (one for the output of the embeddings + one for the output of each layer) of shape (batch_size, sequence_length, hidden_size) """ + show_gpu('Before Training') all_batch_embeddings = {} id2lang = {v: k for k, v in lang_ids.items()} with torch.no_grad(): for batch, lang_idx in data: - # for batch, target, lang_idx in data: out = model(batch.cuda()) + # show_gpu('After Batch Prediction') last_hidden_state = out[1][-1] batch_embeddings = last_hidden_state[:, 0, :] for i, l_idx in enumerate(lang_idx.numpy()): @@ -245,5 +246,5 @@ def feature_extractor(data, lang_ids, model): else: all_batch_embeddings[id2lang[l_idx]] = np.vstack((all_batch_embeddings[id2lang[l_idx]], batch_embeddings[i].detach().cpu().numpy())) - + show_gpu('After Full Prediction') return all_batch_embeddings, id2lang diff --git a/src/util/common.py b/src/util/common.py index 219931a..9c62241 100755 --- a/src/util/common.py +++ b/src/util/common.py @@ -4,7 +4,6 @@ from sklearn.feature_extraction.text import TfidfVectorizer from sklearn.svm import SVC from sklearn.model_selection import train_test_split from embeddings.supervised import get_supervised_embeddings -# from learning.transformers import PosteriorProbabilitiesEmbedder, TfidfVectorizerMultilingual import numpy as np from tqdm import tqdm import torch From a5322ba22708357ac88dd045ede104afb81cce65 Mon Sep 17 00:00:00 2001 From: andrea Date: Tue, 19 Jan 2021 10:31:57 +0100 Subject: [PATCH 05/55] refactoring --- src/learning/learners.py | 2 +- src/learning/transformers.py | 89 +++++++++++-------- src/main_gFun.py | 35 ++++---- src/models/mBert.py | 7 +- src/util/common.py | 50 +++-------- src/util/metrics.py | 1 - src/util/parser_options.py | 3 + src/util/results.py | 43 +++++++++ .../StandardizeTransformer.py | 1 + 9 files changed, 132 insertions(+), 99 deletions(-) diff --git a/src/learning/learners.py b/src/learning/learners.py index 89e3830..708eaad 100644 --- a/src/learning/learners.py +++ 
b/src/learning/learners.py @@ -148,7 +148,7 @@ class MonolingualClassifier: if isinstance(self.model, GridSearchCV): self.best_params_ = self.model.best_params_ print('best parameters: ', self.best_params_) - self.time=time.time()-tinit + self.time = time.time()-tinit return self def decision_function(self, X): diff --git a/src/learning/transformers.py b/src/learning/transformers.py index 3e1fed3..5a76740 100644 --- a/src/learning/transformers.py +++ b/src/learning/transformers.py @@ -84,9 +84,9 @@ class PosteriorProbabilitiesEmbedder: self.is_training = is_training def fit(self, lX, lY, lV=None, called_by_viewgen=False): - if exists(self.storing_path + '/tr') or exists(self.storing_path + '/te'): - print(f'NB: Avoid fitting {self.storing_path.split("/")[2]} since we have already pre-computed results') - return self + # if exists(self.storing_path + '/tr') or exists(self.storing_path + '/te'): + # print(f'NB: Avoid fitting {self.storing_path.split("/")[2]} since we have already pre-computed results') + # return self if not called_by_viewgen: # Avoid printing if method is called by another View Gen (e.g., GRU ViewGen) print('### Posterior Probabilities View Generator (X)') @@ -96,20 +96,20 @@ class PosteriorProbabilitiesEmbedder: def transform(self, lX): # if dir exist, load and return already computed results - _endpoint = 'tr' if self.is_training else 'te' - _actual_path = self.storing_path + '/' + _endpoint - if exists(_actual_path): - print('NB: loading pre-computed results!') - with open(_actual_path + '/X.pickle', 'rb') as infile: - self.is_training = False - return pickle.load(infile) + # _endpoint = 'tr' if self.is_training else 'te' + # _actual_path = self.storing_path + '/' + _endpoint + # if exists(_actual_path): + # print('NB: loading pre-computed results!') + # with open(_actual_path + '/X.pickle', 'rb') as infile: + # self.is_training = False + # return pickle.load(infile) lZ = self.predict_proba(lX) lZ = _normalize(lZ, self.l2) # create dir and dump computed results - create_if_not_exist(_actual_path) - with open(_actual_path + '/X.pickle', 'wb') as outfile: - pickle.dump(lZ, outfile) + # create_if_not_exist(_actual_path) + # with open(_actual_path + '/X.pickle', 'wb') as outfile: + # pickle.dump(lZ, outfile) self.is_training = False return lZ @@ -154,8 +154,7 @@ class MuseEmbedder: MUSE = self.MUSE lX = self.featureweight.transform(lX) XdotMUSE = Parallel(n_jobs=self.n_jobs)( - delayed(XdotM)(lX[lang], MUSE[lang], self.sif) for lang in self.langs - ) + delayed(XdotM)(lX[lang], MUSE[lang], self.sif) for lang in self.langs) lMuse = {l: XdotMUSE[i] for i, l in enumerate(self.langs)} lMuse = _normalize(lMuse, self.l2) return lMuse @@ -211,18 +210,22 @@ class WordClassEmbedder: class MBertEmbedder: def __init__(self, doc_embed_path=None, patience=10, checkpoint_dir='../hug_checkpoint/', path_to_model=None, - nC=None): + nC=None, avoid_loading=False): self.doc_embed_path = doc_embed_path self.patience = patience self.checkpoint_dir = checkpoint_dir self.fitted = False self.requires_tfidf = False - if path_to_model is None and nC is not None: + self.avoid_loading = avoid_loading + if path_to_model is None: self.model = None else: config = BertConfig.from_pretrained('bert-base-multilingual-cased', output_hidden_states=True, num_labels=nC) - self.model = BertForSequenceClassification.from_pretrained(path_to_model, config=config).cuda() + if self.avoid_loading: + self.model = None + else: + self.model = BertForSequenceClassification.from_pretrained(path_to_model, config=config).cuda() 
# TODO: setting model to None in order to avoid loading it onto gpu if we have already pre-computed results! self.fitted = True def fit(self, lX, ly, lV=None, seed=0, nepochs=200, lr=1e-5, val_epochs=1): @@ -235,7 +238,7 @@ class MBertEmbedder: l_tokenized_tr = do_tokenization(lX, max_len=512) l_split_tr, l_split_tr_target, l_split_va, l_split_val_target = get_tr_val_split(l_tokenized_tr, ly, val_prop=0.2, max_val=2000, - seed=seed) # TODO: seed + seed=seed) tr_dataset = TrainingDataset(l_split_tr, l_split_tr_target) va_dataset = TrainingDataset(l_split_va, l_split_val_target) @@ -289,7 +292,7 @@ class MBertEmbedder: l_tokenized_X = do_tokenization(lX, max_len=512, verbose=True) feat_dataset = ExtractorDataset(l_tokenized_X) feat_lang_ids = feat_dataset.lang_ids - dataloader = DataLoader(feat_dataset, batch_size=64) # TODO reduced batch size in JRC experiments + dataloader = DataLoader(feat_dataset, batch_size=64) all_batch_embeddings, id2lang = feature_extractor(dataloader, feat_lang_ids, self.model) return all_batch_embeddings @@ -301,7 +304,7 @@ class RecurrentEmbedder: def __init__(self, pretrained, supervised, multilingual_dataset, options, concat=False, lr=1e-3, we_path='../embeddings', hidden_size=512, sup_drop=0.5, posteriors=False, patience=10, - test_each=0, checkpoint_dir='../checkpoint', model_path=None): + test_each=0, checkpoint_dir='../checkpoint', model_path=None, n_jobs=-1): self.pretrained = pretrained self.supervised = supervised self.concat = concat @@ -319,6 +322,7 @@ class RecurrentEmbedder: self.options = options self.seed = options.seed self.model_path = model_path + self.n_jobs = n_jobs self.is_trained = False ## INIT MODEL for training @@ -398,32 +402,33 @@ class RecurrentEmbedder: def transform(self, lX, batch_size=64): lX = self.multilingual_index.get_indexed(lX, self.lpretrained_vocabulary) - lX = self._get_doc_embeddings(lX) + lX = self._get_doc_embeddings(lX, batch_size) return lX def fit_transform(self, lX, ly, lV=None): return self.fit(lX, ly).transform(lX) - def _get_doc_embeddings(self, lX, batch_size=64): + def _get_doc_embeddings(self, lX, batch_size): assert self.is_trained, 'Model is not trained, cannot call transform before fitting the model!' 
print('Generating document embeddings via GRU') _lX = {} l_devel_target = self.multilingual_index.l_devel_target() + # show_gpu('RNN init at extraction') for idx, (batch, post, target, lang) in enumerate(batchify(lX, None, l_devel_target, batch_size, self.multilingual_index.l_pad())): if lang not in _lX.keys(): _lX[lang] = self.model.get_embeddings(batch, lang) else: _lX[lang] = np.concatenate((_lX[lang], self.model.get_embeddings(batch, lang)), axis=0) - + # show_gpu('RNN after batch pred at extraction') return _lX # loads the MUSE embeddings if requested, or returns empty dictionaries otherwise def _load_pretrained_embeddings(self, we_path, langs): lpretrained = lpretrained_vocabulary = self._none_dict(langs) - lpretrained = load_muse_embeddings(we_path, langs, n_jobs=-1) + lpretrained = load_muse_embeddings(we_path, langs, n_jobs=self.n_jobs) lpretrained_vocabulary = {l: lpretrained[l].vocabulary() for l in langs} return lpretrained, lpretrained_vocabulary @@ -553,20 +558,20 @@ class FeatureSet2Posteriors: def transform(self, lX): # if dir exist, load and return already computed results - _endpoint = 'tr' if self.is_training else 'te' - _actual_path = self.storing_path + '/' + _endpoint - if exists(_actual_path): - print('NB: loading pre-computed results!') - with open(_actual_path + '/' + self.method_id + '.pickle', 'rb') as infile: - self.is_training = False - return pickle.load(infile) + # _endpoint = 'tr' if self.is_training else 'te' + # _actual_path = self.storing_path + '/' + _endpoint + # if exists(_actual_path): + # print('NB: loading pre-computed results!') + # with open(_actual_path + '/' + self.method_id + '.pickle', 'rb') as infile: + # self.is_training = False + # return pickle.load(infile) lP = self.predict_proba(lX) lP = _normalize(lP, self.l2) # create dir and dump computed results - create_if_not_exist(_actual_path) - with open(_actual_path + '/' + self.method_id + '.pickle', 'wb') as outfile: - pickle.dump(lP, outfile) + # create_if_not_exist(_actual_path) + # with open(_actual_path + '/' + self.method_id + '.pickle', 'wb') as outfile: + # pickle.dump(lP, outfile) self.is_training = False return lP @@ -637,8 +642,14 @@ class Funnelling: self.meta = meta self.n_jobs = meta.n_jobs - def fit(self, lX, ly): - tfidf_lX = self.vectorizer.fit_transform(lX, ly) + def fit(self, lX, ly, target_lang=None): + if target_lang is not None: + LX = lX.copy() + LX.update(target_lang) + self.vectorizer.fit(LX) + tfidf_lX = self.vectorizer.transform(lX) + else: + tfidf_lX = self.vectorizer.fit_transform(lX, ly) lV = self.vectorizer.vocabulary() print('## Fitting first-tier learners!') lZ = self.first_tier.fit_transform(lX, ly, lV, tfidf=tfidf_lX) @@ -774,6 +785,7 @@ def train_gru(model, batcher, ltrain_index, lytr, tinit, logfile, criterion, opt _dataset_path = opt.dataset.split('/')[-1].split('_') dataset_id = _dataset_path[0] + _dataset_path[-1] + # show_gpu('RNN init pre-training') loss_history = [] model.train() for idx, (batch, post, bert_emb, target, lang) in enumerate(batcher.batchify(ltrain_index, ltrain_posteriors, ltrain_bert, lytr)): @@ -783,6 +795,7 @@ def train_gru(model, batcher, ltrain_index, lytr, tinit, logfile, criterion, opt clip_gradient(model) optim.step() loss_history.append(loss.item()) + # show_gpu('RNN after batch prediction') if idx % log_interval == 0: interval_loss = np.mean(loss_history[-log_interval:]) @@ -810,7 +823,7 @@ def test_gru(model, batcher, ltest_index, ltest_posteriors, lte_bert, lyte, tini yte_stacked[lang].append(target.detach().cpu().numpy()) 
loss_history.append(loss) - ly = {l:np.vstack(yte_stacked[l]) for l in langs} + ly = {l:np.vstack(yte_stacked[l]) for l in langs} ly_ = {l:np.vstack(predictions[l]) for l in langs} l_eval = evaluate(ly, ly_) metrics = [] diff --git a/src/main_gFun.py b/src/main_gFun.py index 7d52c48..8694087 100644 --- a/src/main_gFun.py +++ b/src/main_gFun.py @@ -13,30 +13,31 @@ if __name__ == '__main__': assert exists(dataset), 'Unable to find file '+str(dataset) assert not (op.set_c != 1. and op.optimc), 'Parameter C cannot be defined along with optim_c option' assert op.posteriors or op.supervised or op.pretrained or op.mbert or op.gruViewGenerator, \ - 'empty set of document embeddings is not allowed' + 'empty set of document embeddings is not allowed' if op.gruViewGenerator: assert op.gruWCE or op.gruMUSE, 'Initializing Gated Recurrent embedding layer without ' \ - 'explicit initialization of GRU View Generator' + 'explicit initialization of GRU View Generator' l2 = op.l2 dataset_file = os.path.basename(dataset) results = PolylingualClassificationResults('../log/' + op.output) allprob = 'Prob' if op.allprob else '' - # renaming arguments to be printed on log method_name, dataset_name = get_method_name(dataset, op.posteriors, op.supervised, op.pretrained, op.mbert, op.gruViewGenerator, op.gruMUSE, op.gruWCE, op.agg, op.allprob) + print(f'Method: gFun{method_name}\nDataset: {dataset_name}') print('-'*50) - # set zscore range - is slice(0, 0) mean will be equal to 0 and std to 1, thus normalization will have no effect + n_jobs = -1 # TODO SETTING n_JOBS + standardize_range = slice(0, 0) if op.zscore: standardize_range = None # load dataset data = MultilingualDataset.load(dataset) - data.set_view(languages=['nl', 'it']) # TODO: DEBUG SETTING + # data.set_view(languages=['it']) # TODO: DEBUG SETTING data.show_dimensions() lXtr, lytr = data.training() lXte, lyte = data.test() @@ -63,7 +64,7 @@ if __name__ == '__main__': doc_embedder.append(PosteriorProbabilitiesEmbedder(first_tier_learner=get_learner(calibrate=True, kernel='linear', C=op.set_c), - l2=l2, storing_path=storing_path)) + l2=l2, storing_path=storing_path, n_jobs=n_jobs)) if op.supervised: """ @@ -73,9 +74,11 @@ if __name__ == '__main__': VG_name = 'W' storing_path = f'../dumps/{VG_name}/{dataset_name.split(".")[0]}' exist = exists(storing_path) - wce = WordClassEmbedder(max_label_space=op.max_labels_S, l2=l2, featureweight=feat_weighting, sif=op.sif) + wce = WordClassEmbedder(max_label_space=op.max_labels_S, l2=l2, featureweight=feat_weighting, + sif=op.sif, n_jobs=n_jobs) if op.allprob: - wce = FeatureSet2Posteriors(wce, method_id=VG_name, requires_tfidf=True, l2=l2, storing_path=storing_path) + wce = FeatureSet2Posteriors(wce, method_id=VG_name, requires_tfidf=True, l2=l2, storing_path=storing_path, + n_jobs=n_jobs) doc_embedder.append(wce) if op.pretrained: @@ -86,9 +89,10 @@ if __name__ == '__main__': VG_name = 'M' storing_path = f'../dumps/{VG_name}/{dataset_name.split(".")[0]}' exist = exists(storing_path) - muse = MuseEmbedder(op.we_path, l2=l2, featureweight=feat_weighting, sif=op.sif) + muse = MuseEmbedder(op.we_path, l2=l2, featureweight=feat_weighting, sif=op.sif, n_jobs=n_jobs) if op.allprob: - muse = FeatureSet2Posteriors(muse, method_id=VG_name, requires_tfidf=True, l2=l2, storing_path=storing_path) + muse = FeatureSet2Posteriors(muse, method_id=VG_name, requires_tfidf=True, l2=l2, storing_path=storing_path, + n_jobs=n_jobs) doc_embedder.append(muse) if op.gruViewGenerator: @@ -100,12 +104,12 @@ if __name__ == '__main__': VG_name 
= 'G' VG_name += '_muse' if op.gruMUSE else '' VG_name += '_wce' if op.gruWCE else '' - storing_path = f'../dumps/{VG_name}/{dataset_name.split(".")[0]}' + storing_path = 'Nope' # f'../dumps/{VG_name}/{dataset_name.split(".")[0]}' rnn_embedder = RecurrentEmbedder(pretrained=op.gruMUSE, supervised=op.gruWCE, multilingual_dataset=data, - options=op, model_path=op.gru_path) + options=op, model_path=None, n_jobs=n_jobs) if op.allprob: rnn_embedder = FeatureSet2Posteriors(rnn_embedder, method_id=VG_name, requires_tfidf=False, - storing_path=storing_path) + storing_path=storing_path, n_jobs=n_jobs) doc_embedder.append(rnn_embedder) if op.mbert: @@ -114,8 +118,9 @@ if __name__ == '__main__': """ VG_name = 'B' storing_path = f'../dumps/{VG_name}/{dataset_name.split(".")[0]}' + avoid_loading = False if op.avoid_loading else True # TODO research setting (set to false mBert will be loaded into gpu to get doc emebds (aka, only the first time for each run)) - mbert = MBertEmbedder(path_to_model=op.bert_path, nC=data.num_categories()) + mbert = MBertEmbedder(path_to_model=op.bert_path, nC=data.num_categories(), avoid_loading=avoid_loading) if op.allprob: mbert = FeatureSet2Posteriors(mbert, method_id=VG_name, l2=l2, storing_path=storing_path) doc_embedder.append(mbert) @@ -123,7 +128,7 @@ if __name__ == '__main__': # metaclassifier meta_parameters = None if op.set_c != -1 else [{'C': [1, 1e3, 1e2, 1e1, 1e-1]}] meta = MetaClassifier(meta_learner=get_learner(calibrate=False, kernel='rbf', C=op.set_c), - meta_parameters=get_params(op.optimc), standardize_range=standardize_range) + meta_parameters=get_params(op.optimc), standardize_range=standardize_range, n_jobs=n_jobs) # ensembling the modules classifier = Funnelling(vectorizer=tfidfvectorizer, first_tier=doc_embedder, meta=meta) diff --git a/src/models/mBert.py b/src/models/mBert.py index 5c53f55..56695a6 100644 --- a/src/models/mBert.py +++ b/src/models/mBert.py @@ -20,9 +20,6 @@ def predict(logits, classification_type='multilabel'): class TrainingDataset(Dataset): - """ - data: dict of lang specific tokenized data - """ def __init__(self, data, labels): self.langs = data.keys() @@ -231,7 +228,7 @@ def feature_extractor(data, lang_ids, model): Hidden State = Tuple of torch.FloatTensor (one for the output of the embeddings + one for the output of each layer) of shape (batch_size, sequence_length, hidden_size) """ - show_gpu('Before Training') + # show_gpu('Before Training') all_batch_embeddings = {} id2lang = {v: k for k, v in lang_ids.items()} with torch.no_grad(): @@ -246,5 +243,5 @@ def feature_extractor(data, lang_ids, model): else: all_batch_embeddings[id2lang[l_idx]] = np.vstack((all_batch_embeddings[id2lang[l_idx]], batch_embeddings[i].detach().cpu().numpy())) - show_gpu('After Full Prediction') + # show_gpu('After Full Prediction') return all_batch_embeddings, id2lang diff --git a/src/util/common.py b/src/util/common.py index 9c62241..48a0525 100755 --- a/src/util/common.py +++ b/src/util/common.py @@ -74,7 +74,7 @@ class Index: self.test_raw = test_raw def index(self, pretrained_vocabulary, analyzer, vocabulary): - self.word2index = dict(vocabulary) + self.word2index = dict(vocabulary) # word2idx known_words = set(self.word2index.keys()) if pretrained_vocabulary is not None: known_words.update(pretrained_vocabulary) @@ -207,44 +207,6 @@ class MultilingualIndex: index.compose_embedding_matrix(lpretrained[l], supervised, lXtr[l], lYtr[l]) self.sup_range = index.wce_range - # TODO circular import with transformers --> when generating posterior 
prob, we import PosteriorProbabilitiesEmbedder which is defined in transformers - # def posterior_probabilities(self, max_training_docs_by_lang=5000, store_posteriors=False, stored_post=False): - # # choose a maximum of "max_training_docs_by_lang" for training the calibrated SVMs - # timeit = time.time() - # lXtr = {l:Xtr for l,Xtr in self.get_lXtr().items()} - # lYtr = {l:Ytr for l,Ytr in self.l_train_target().items()} - # if not stored_post: - # for l in self.langs: - # n_elements = lXtr[l].shape[0] - # if n_elements > max_training_docs_by_lang: - # choice = np.random.permutation(n_elements)[:max_training_docs_by_lang] - # lXtr[l] = lXtr[l][choice] - # lYtr[l] = lYtr[l][choice] - # - # # train the posterior probabilities embedder - # print('[posteriors] training a calibrated SVM') - # learner = SVC(kernel='linear', probability=True, cache_size=1000, C=1, random_state=1, gamma='auto') - # prob_embedder = PosteriorProbabilitiesEmbedder(learner, l2=False) - # prob_embedder.fit(lXtr, lYtr) - # - # # transforms the training, validation, and test sets into posterior probabilities - # print('[posteriors] generating posterior probabilities') - # lPtr = prob_embedder.transform(self.get_lXtr()) - # lPva = prob_embedder.transform(self.get_lXva()) - # lPte = prob_embedder.transform(self.get_lXte()) - # # NB: Check splits indices ! - # if store_posteriors: - # import pickle - # with open('../dumps/posteriors_fulljrc.pkl', 'wb') as outfile: - # pickle.dump([lPtr, lPva, lPte], outfile) - # print(f'Successfully dumped posteriors!') - # else: - # import pickle - # with open('../dumps/posteriors_fulljrc.pkl', 'rb') as infile: - # lPtr, lPva, lPte = pickle.load(infile) - # print(f'Successfully loaded stored posteriors!') - # print(f'[posteriors] done in {time.time() - timeit}') - # return lPtr, lPva, lPte def bert_embeddings(self, bert_path, max_len=512, batch_size=64, stored_embeddings=False): show_gpu('GPU memory before initializing mBert model:') @@ -518,10 +480,12 @@ class TfidfVectorizerMultilingual: def fit(self, lX, ly=None): self.langs = sorted(lX.keys()) self.vectorizer = {l: TfidfVectorizer(**self.kwargs).fit(lX[l]) for l in self.langs} + # self.vectorizer = {l: TfidfVectorizer(**self.kwargs).fit(lX[l]) for l in lX.keys()} return self def transform(self, lX): return {l: self.vectorizer[l].transform(lX[l]) for l in self.langs} + # return {l: self.vectorizer[l].transform(lX[l]) for l in lX.keys()} def fit_transform(self, lX, ly=None): return self.fit(lX, ly).transform(lX) @@ -568,3 +532,11 @@ def get_method_name(dataset, posteriors, supervised, pretrained, mbert, gru, dataset_id = _dataset_path[0] + _dataset_path[-1] return _id, dataset_id + +def get_zscl_setting(langs): + settings = [] + for elem in langs: + for tar in langs: + if elem != tar: + settings.append((elem, tar)) + return settings \ No newline at end of file diff --git a/src/util/metrics.py b/src/util/metrics.py index 9f6bc24..ca688b7 100644 --- a/src/util/metrics.py +++ b/src/util/metrics.py @@ -1,5 +1,4 @@ import numpy as np -import numpy as np from scipy.sparse import lil_matrix, issparse from sklearn.metrics import f1_score, accuracy_score diff --git a/src/util/parser_options.py b/src/util/parser_options.py index 730026f..14d827c 100644 --- a/src/util/parser_options.py +++ b/src/util/parser_options.py @@ -60,6 +60,9 @@ parser.add_option("-a", "--agg", dest="agg", action='store_true', help="Set aggregation function of the common Z-space to average (Default: concatenation)", default=True) +parser.add_option("-l", dest="avoid_loading", 
action="store_true", + help="TODO", default=False) + # ------------------------------------------------------------------------------------ parser.add_option('--hidden', type=int, default=512, metavar='int', diff --git a/src/util/results.py b/src/util/results.py index ec66fc1..6526303 100644 --- a/src/util/results.py +++ b/src/util/results.py @@ -47,3 +47,46 @@ class PolylingualClassificationResults: def tell(self, msg): if self.verbose: print(msg) + + +class ZSCLResults: + def __init__(self, file, autoflush=True, verbose=False): + self.file = file + self.columns = ['method', + 'optimp', + 'source', + 'target', + 'id', + 'dataset', + 'time', + 'lang', + 'macrof1', + 'microf1', + 'macrok', + 'microk', + 'notes'] + self.autoflush = autoflush + self.verbose = verbose + if os.path.exists(file): + self.tell('Loading existing file from {}'.format(file)) + self.df = pd.read_csv(file, sep='\t') + else: + self.tell('File {} does not exist. Creating new frame.'.format(file)) + dir = os.path.dirname(self.file) + if dir and not os.path.exists(dir): os.makedirs(dir) + self.df = pd.DataFrame(columns=self.columns) + + def already_calculated(self, id): + return (self.df['id'] == id).any() + + def add_row(self, method, optimp, id, source, target, dataset, time, lang, macrof1, microf1, macrok=np.nan, microk=np.nan, notes=''): + s = pd.Series([method, optimp, id, source, target, dataset, time, lang, macrof1, microf1, macrok, microk, notes], index=self.columns) + self.df = self.df.append(s, ignore_index=True) + if self.autoflush: self.flush() + self.tell(s.to_string()) + + def flush(self): + self.df.to_csv(self.file, index=False, sep='\t') + + def tell(self, msg): + if self.verbose: print(msg) diff --git a/src/util_transformers/StandardizeTransformer.py b/src/util_transformers/StandardizeTransformer.py index 06e633e..e1a10cf 100644 --- a/src/util_transformers/StandardizeTransformer.py +++ b/src/util_transformers/StandardizeTransformer.py @@ -1,5 +1,6 @@ import numpy as np + class StandardizeTransformer: def __init__(self, axis=0, range=None): From 79eae9003bea99c39ce696cf119a017688578c17 Mon Sep 17 00:00:00 2001 From: andrea Date: Tue, 19 Jan 2021 10:34:01 +0100 Subject: [PATCH 06/55] refactoring --- src/data/__init__.py | 0 src/data/languages.py | 42 - src/data/reader/__init__.py | 0 src/data/reader/jrcacquis_reader.py | 321 ------- src/data/reader/rcv_reader.py | 225 ----- src/data/reader/wikipedia_tools.py | 304 ------ src/data/text_preprocessor.py | 33 - src/data/tsr_function__.py | 270 ------ src/dataset_builder.py | 710 -------------- src/embeddings/__init__.py | 0 src/embeddings/embeddings.py | 66 -- src/embeddings/pretrained.py | 102 --- src/embeddings/supervised.py | 74 -- src/experiment_scripts/10run_dl_jrc.sh | 11 - src/experiment_scripts/10run_dl_rcv.sh | 11 - src/experiment_scripts/10run_jrc.sh | 12 - .../10run_jrc_combinations.sh | 16 - src/experiment_scripts/10run_rcv.sh | 15 - .../10run_rcv_combinations.sh | 16 - src/experiment_scripts/extract_features.sh | 14 - src/experiment_scripts/main_deep_learning.py | 329 ------- src/experiment_scripts/main_embeddings_cls.py | 127 --- .../main_majorityvoting_cls.py | 155 ---- src/experiment_scripts/main_mbert.py | 390 -------- .../main_mbert_extractor.py | 110 --- .../main_qualitative_analysis.py | 49 - .../run_combinations_jrc.sh | 34 - .../run_combinations_rcv.sh | 31 - src/experiment_scripts/run_dl_jrc.sh | 31 - src/experiment_scripts/run_dl_rcv.sh | 30 - src/experiment_scripts/run_fulljrc_dl.sh | 16 - src/experiment_scripts/run_fullrcv_dl.sh | 20 - 
src/experiment_scripts/run_fun_bert_jrc.sh | 16 - src/experiment_scripts/run_fun_bert_rcv.sh | 16 - src/experiment_scripts/run_mbert_jrc.sh | 15 - src/experiment_scripts/run_mbert_rcv.sh | 15 - src/experiment_scripts/run_traditional_jrc.sh | 45 - src/experiment_scripts/run_traditional_rcv.sh | 45 - src/experiment_scripts/time_comparison.sh | 6 - src/learning/learners.py | 171 ---- src/learning/transformers.py | 863 ------------------ src/main_gFun.py | 166 ---- src/models/cnn_class_bu.py | 42 - src/models/helpers.py | 47 - src/models/lstm_class.py | 114 --- src/models/mBert.py | 247 ----- src/results/results_manager.py | 11 - src/util/SIF_embed.py | 56 -- src/util/common.py | 542 ----------- src/util/csv_log.py | 60 -- src/util/decompositions.py | 50 - src/util/early_stop.py | 71 -- src/util/evaluation.py | 102 --- src/util/file.py | 44 - src/util/metrics.py | 255 ------ src/util/parser_options.py | 94 -- src/util/results.py | 92 -- src/util/util.py | 29 - .../StandardizeTransformer.py | 32 - src/util_transformers/__init__.py | 0 src/util_transformers/clesa.py | 110 --- src/util_transformers/dci.py | 154 ---- src/util_transformers/riboc.py | 53 -- 63 files changed, 7127 deletions(-) delete mode 100644 src/data/__init__.py delete mode 100644 src/data/languages.py delete mode 100644 src/data/reader/__init__.py delete mode 100644 src/data/reader/jrcacquis_reader.py delete mode 100644 src/data/reader/rcv_reader.py delete mode 100644 src/data/reader/wikipedia_tools.py delete mode 100644 src/data/text_preprocessor.py delete mode 100755 src/data/tsr_function__.py delete mode 100644 src/dataset_builder.py delete mode 100644 src/embeddings/__init__.py delete mode 100644 src/embeddings/embeddings.py delete mode 100644 src/embeddings/pretrained.py delete mode 100755 src/embeddings/supervised.py delete mode 100644 src/experiment_scripts/10run_dl_jrc.sh delete mode 100644 src/experiment_scripts/10run_dl_rcv.sh delete mode 100644 src/experiment_scripts/10run_jrc.sh delete mode 100644 src/experiment_scripts/10run_jrc_combinations.sh delete mode 100644 src/experiment_scripts/10run_rcv.sh delete mode 100644 src/experiment_scripts/10run_rcv_combinations.sh delete mode 100644 src/experiment_scripts/extract_features.sh delete mode 100755 src/experiment_scripts/main_deep_learning.py delete mode 100644 src/experiment_scripts/main_embeddings_cls.py delete mode 100644 src/experiment_scripts/main_majorityvoting_cls.py delete mode 100644 src/experiment_scripts/main_mbert.py delete mode 100644 src/experiment_scripts/main_mbert_extractor.py delete mode 100644 src/experiment_scripts/main_qualitative_analysis.py delete mode 100644 src/experiment_scripts/run_combinations_jrc.sh delete mode 100644 src/experiment_scripts/run_combinations_rcv.sh delete mode 100644 src/experiment_scripts/run_dl_jrc.sh delete mode 100644 src/experiment_scripts/run_dl_rcv.sh delete mode 100644 src/experiment_scripts/run_fulljrc_dl.sh delete mode 100644 src/experiment_scripts/run_fullrcv_dl.sh delete mode 100644 src/experiment_scripts/run_fun_bert_jrc.sh delete mode 100644 src/experiment_scripts/run_fun_bert_rcv.sh delete mode 100644 src/experiment_scripts/run_mbert_jrc.sh delete mode 100644 src/experiment_scripts/run_mbert_rcv.sh delete mode 100644 src/experiment_scripts/run_traditional_jrc.sh delete mode 100644 src/experiment_scripts/run_traditional_rcv.sh delete mode 100644 src/experiment_scripts/time_comparison.sh delete mode 100644 src/learning/learners.py delete mode 100644 src/learning/transformers.py delete mode 100644 src/main_gFun.py 
delete mode 100644 src/models/cnn_class_bu.py delete mode 100755 src/models/helpers.py delete mode 100755 src/models/lstm_class.py delete mode 100644 src/models/mBert.py delete mode 100644 src/results/results_manager.py delete mode 100644 src/util/SIF_embed.py delete mode 100755 src/util/common.py delete mode 100755 src/util/csv_log.py delete mode 100644 src/util/decompositions.py delete mode 100755 src/util/early_stop.py delete mode 100644 src/util/evaluation.py delete mode 100644 src/util/file.py delete mode 100644 src/util/metrics.py delete mode 100644 src/util/parser_options.py delete mode 100644 src/util/results.py delete mode 100644 src/util/util.py delete mode 100644 src/util_transformers/StandardizeTransformer.py delete mode 100644 src/util_transformers/__init__.py delete mode 100644 src/util_transformers/clesa.py delete mode 100644 src/util_transformers/dci.py delete mode 100644 src/util_transformers/riboc.py diff --git a/src/data/__init__.py b/src/data/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/src/data/languages.py b/src/data/languages.py deleted file mode 100644 index 2d03d8e..0000000 --- a/src/data/languages.py +++ /dev/null @@ -1,42 +0,0 @@ -""" -bg = Bulgarian -cs = Czech -da = Danish -de = German -el = Greek -en = English -es = Spanish -et = Estonian -fi = Finnish -fr = French -hu = Hungarian -it = Italian -lt = Lithuanian -lv = Latvian -nl = Dutch -mt = Maltese -pl = Polish -pt = Portuguese -ro = Romanian -sk = Slovak -sl = Slovene -sv = Swedish -""" - -NLTK_LANGMAP = {'da': 'danish', 'nl': 'dutch', 'en': 'english', 'fi': 'finnish', 'fr': 'french', 'de': 'german', - 'hu': 'hungarian', 'it': 'italian', 'pt': 'portuguese', 'ro': 'romanian', 'es': 'spanish', 'sv': 'swedish'} - - -#top 10 languages in wikipedia order by the number of articles -#LANGS_10_MOST_WIKI = ['en','fr','sv','de','es','it','pt','nl','pl','ro'] - -#all languages in JRC-acquis v3 -JRC_LANGS = ['bg','cs','da','de','el','en','es','et','fi','fr','hu','it','lt','lv','mt','nl','pl','pt','ro','sk','sl','sv'] -JRC_LANGS_WITH_NLTK_STEMMING = ['da', 'nl', 'en', 'fi', 'fr', 'de', 'hu', 'it', 'pt', 'es', 'sv'] # 'romanian deleted for incompatibility issues' - -RCV2_LANGS = ['ru', 'de', 'fr', 'sv', 'no', 'da', 'pt', 'it', 'es', 'jp', 'htw', 'nl'] -RCV2_LANGS_WITH_NLTK_STEMMING = ['de', 'fr', 'sv', 'da', 'pt', 'it', 'es', 'nl'] - -lang_set = {'JRC_NLTK':JRC_LANGS_WITH_NLTK_STEMMING, 'JRC':JRC_LANGS, - 'RCV2_NLTK':RCV2_LANGS_WITH_NLTK_STEMMING, 'RCV2':RCV2_LANGS} - diff --git a/src/data/reader/__init__.py b/src/data/reader/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/src/data/reader/jrcacquis_reader.py b/src/data/reader/jrcacquis_reader.py deleted file mode 100644 index c0441ed..0000000 --- a/src/data/reader/jrcacquis_reader.py +++ /dev/null @@ -1,321 +0,0 @@ -from __future__ import print_function -import os, sys -from os.path import join -import tarfile -import xml.etree.ElementTree as ET -from sklearn.datasets import get_data_home -import pickle -from util.file import download_file, list_dirs, list_files -import rdflib -from rdflib.namespace import RDF, SKOS -from rdflib import URIRef -import zipfile -from data.languages import JRC_LANGS -from collections import Counter -from random import shuffle -from data.languages import lang_set - -""" -JRC Acquis' Nomenclature: -bg = Bulgarian -cs = Czech -da = Danish -de = German -el = Greek -en = English -es = Spanish -et = Estonian -fi = Finnish -fr = French -hu = Hungarian -it = Italian -lt = Lithuanian -lv = Latvian 
-nl = Dutch -mt = Maltese -pl = Polish -pt = Portuguese -ro = Romanian -sk = Slovak -sl = Slovene -sv = Swedish -""" - -class JRCAcquis_Document: - def __init__(self, id, name, lang, year, head, body, categories): - self.id = id - self.parallel_id = name - self.lang = lang - self.year = year - self.text = body if not head else head + "\n" + body - self.categories = categories - -# this is a workaround... for some reason, acutes are codified in a non-standard manner in titles -# however, it seems that the title is often appearing as the first paragraph in the text/body (with -# standard codification), so it might be preferable not to read the header after all (as here by default) -def _proc_acute(text): - for ch in ['a','e','i','o','u']: - text = text.replace('%'+ch+'acute%',ch) - return text - -def parse_document(file, year, head=False): - root = ET.parse(file).getroot() - - doc_name = root.attrib['n'] # e.g., '22006A0211(01)' - doc_lang = root.attrib['lang'] # e.g., 'es' - doc_id = root.attrib['id'] # e.g., 'jrc22006A0211_01-es' - doc_categories = [cat.text for cat in root.findall('.//teiHeader/profileDesc/textClass/classCode[@scheme="eurovoc"]')] - doc_head = _proc_acute(root.find('.//text/body/head').text) if head else '' - doc_body = '\n'.join([p.text for p in root.findall('.//text/body/div[@type="body"]/p')]) - - def raise_if_empty(field, from_file): - if isinstance(field, str): - if not field.strip(): - raise ValueError("Empty field in file %s" % from_file) - - raise_if_empty(doc_name, file) - raise_if_empty(doc_lang, file) - raise_if_empty(doc_id, file) - if head: raise_if_empty(doc_head, file) - raise_if_empty(doc_body, file) - - return JRCAcquis_Document(id=doc_id, name=doc_name, lang=doc_lang, year=year, head=doc_head, body=doc_body, categories=doc_categories) - -# removes documents without a counterpart in all other languages -def _force_parallel(doclist, langs): - n_langs = len(langs) - par_id_count = Counter([d.parallel_id for d in doclist]) - parallel_doc_ids = set([id for id,count in par_id_count.items() if count==n_langs]) - return [doc for doc in doclist if doc.parallel_id in parallel_doc_ids] - -def random_sampling_avoiding_parallel(doclist): - random_order = list(range(len(doclist))) - shuffle(random_order) - sampled_request = [] - parallel_ids = set() - for ind in random_order: - pid = doclist[ind].parallel_id - if pid not in parallel_ids: - sampled_request.append(doclist[ind]) - parallel_ids.add(pid) - print('random_sampling_no_parallel:: from {} documents to {} documents'.format(len(doclist), len(sampled_request))) - return sampled_request - - -#filters out documents which do not contain any category in the cat_filter list, and filter all labels not in cat_filter -def _filter_by_category(doclist, cat_filter): - if not isinstance(cat_filter, frozenset): - cat_filter = frozenset(cat_filter) - filtered = [] - for doc in doclist: - doc.categories = list(cat_filter & set(doc.categories)) - if doc.categories: - doc.categories.sort() - filtered.append(doc) - print("filtered %d documents out without categories in the filter list" % (len(doclist) - len(filtered))) - return filtered - -#filters out categories with less than cat_threshold documents (and filters documents containing those categories) -def _filter_by_frequency(doclist, cat_threshold): - cat_count = Counter() - for d in doclist: - cat_count.update(d.categories) - - freq_categories = [cat for cat,count in cat_count.items() if count>cat_threshold] - freq_categories.sort() - return _filter_by_category(doclist, 
freq_categories), freq_categories - -#select top most_frequent categories (and filters documents containing those categories) -def _most_common(doclist, most_frequent): - cat_count = Counter() - for d in doclist: - cat_count.update(d.categories) - - freq_categories = [cat for cat,count in cat_count.most_common(most_frequent)] - freq_categories.sort() - return _filter_by_category(doclist, freq_categories), freq_categories - -def _get_categories(request): - final_cats = set() - for d in request: - final_cats.update(d.categories) - return list(final_cats) - -def fetch_jrcacquis(langs=None, data_path=None, years=None, ignore_unclassified=True, cat_filter=None, cat_threshold=0, - parallel=None, most_frequent=-1, DOWNLOAD_URL_BASE ='http://optima.jrc.it/Acquis/JRC-Acquis.3.0/corpus/'): - - assert parallel in [None, 'force', 'avoid'], 'parallel mode not supported' - if not langs: - langs = JRC_LANGS - else: - if isinstance(langs, str): langs = [langs] - for l in langs: - if l not in JRC_LANGS: - raise ValueError('Language %s is not among the valid languages in JRC-Acquis v3' % l) - - if not data_path: - data_path = get_data_home() - - if not os.path.exists(data_path): - os.mkdir(data_path) - - request = [] - total_read = 0 - for l in langs: - file_name = 'jrc-'+l+'.tgz' - archive_path = join(data_path, file_name) - - if not os.path.exists(archive_path): - print("downloading language-specific dataset (once and for all) into %s" % data_path) - DOWNLOAD_URL = join(DOWNLOAD_URL_BASE, file_name) - download_file(DOWNLOAD_URL, archive_path) - print("untarring dataset...") - tarfile.open(archive_path, 'r:gz').extractall(data_path) - - documents_dir = join(data_path, l) - - print("Reading documents...") - read = 0 - for dir in list_dirs(documents_dir): - year = int(dir) - if years==None or year in years: - year_dir = join(documents_dir,dir) - pickle_name = join(data_path, 'jrc_' + l + '_' + dir + '.pickle') - if os.path.exists(pickle_name): - print("loading from file %s" % pickle_name) - l_y_documents = pickle.load(open(pickle_name, "rb")) - read += len(l_y_documents) - else: - l_y_documents = [] - all_documents = list_files(year_dir) - empty = 0 - for i,doc_file in enumerate(all_documents): - try: - jrc_doc = parse_document(join(year_dir, doc_file), year) - except ValueError: - jrc_doc = None - - if jrc_doc and (not ignore_unclassified or jrc_doc.categories): - l_y_documents.append(jrc_doc) - else: empty += 1 - if len(all_documents)>50 and ((i+1) % (len(all_documents)/50) == 0): - print('\r\tfrom %s: completed %d%%' % (year_dir, int((i+1)*100.0/len(all_documents))), end='') - read+=1 - print('\r\tfrom %s: completed 100%% read %d documents (discarded %d without categories or empty fields)\n' % (year_dir, i+1, empty), end='') - print("\t\t(Pickling object for future runs in %s)" % pickle_name) - pickle.dump(l_y_documents, open(pickle_name, 'wb'), pickle.HIGHEST_PROTOCOL) - request += l_y_documents - print("Read %d documents for language %s\n" % (read, l)) - total_read += read - print("Read %d documents in total" % (total_read)) - - if parallel=='force': - request = _force_parallel(request, langs) - elif parallel == 'avoid': - request = random_sampling_avoiding_parallel(request) - - final_cats = _get_categories(request) - - if cat_filter: - request = _filter_by_category(request, cat_filter) - final_cats = _get_categories(request) - if cat_threshold > 0: - request, final_cats = _filter_by_frequency(request, cat_threshold) - if most_frequent != -1 and len(final_cats) > most_frequent: - request, final_cats = 
_most_common(request, most_frequent) - - return request, final_cats - -def print_cat_analysis(request): - cat_count = Counter() - for d in request: - cat_count.update(d.categories) - print("Number of active categories: {}".format(len(cat_count))) - print(cat_count.most_common()) - -# inspects the Eurovoc thesaurus in order to select a subset of categories -# currently, only 'broadest' policy (i.e., take all categories with no parent category), and 'all' is implemented -def inspect_eurovoc(data_path, eurovoc_skos_core_concepts_filename='eurovoc_in_skos_core_concepts.rdf', - eurovoc_url="http://publications.europa.eu/mdr/resource/thesaurus/eurovoc-20160630-0/skos/eurovoc_in_skos_core_concepts.zip", - select="broadest"): - - fullpath_pickle = join(data_path, select+'_concepts.pickle') - if os.path.exists(fullpath_pickle): - print("Pickled object found in %s. Loading it." % fullpath_pickle) - return pickle.load(open(fullpath_pickle,'rb')) - - fullpath = join(data_path, eurovoc_skos_core_concepts_filename) - if not os.path.exists(fullpath): - print("Path %s does not exist. Trying to download the skos EuroVoc file from %s" % (data_path, eurovoc_url)) - download_file(eurovoc_url, fullpath) - print("Unzipping file...") - zipped = zipfile.ZipFile(data_path + '.zip', 'r') - zipped.extract("eurovoc_in_skos_core_concepts.rdf", data_path) - zipped.close() - - print("Parsing %s" %fullpath) - g = rdflib.Graph() - g.parse(location=fullpath, format="application/rdf+xml") - - if select == "all": - print("Selecting all concepts") - all_concepts = list(g.subjects(RDF.type, SKOS.Concept)) - all_concepts = [c.toPython().split('/')[-1] for c in all_concepts] - all_concepts.sort() - selected_concepts = all_concepts - elif select=="broadest": - print("Selecting broadest concepts (those without any other broader concept linked to it)") - all_concepts = set(g.subjects(RDF.type, SKOS.Concept)) - narrower_concepts = set(g.subjects(SKOS.broader, None)) - broadest_concepts = [c.toPython().split('/')[-1] for c in (all_concepts - narrower_concepts)] - broadest_concepts.sort() - selected_concepts = broadest_concepts - elif select=="leaves": - print("Selecting leaves concepts (those not linked as broader of any other concept)") - all_concepts = set(g.subjects(RDF.type, SKOS.Concept)) - broad_concepts = set(g.objects(None, SKOS.broader)) - leave_concepts = [c.toPython().split('/')[-1] for c in (all_concepts - broad_concepts)] - leave_concepts.sort() - selected_concepts = leave_concepts - else: - raise ValueError("Selection policy %s is not currently supported" % select) - - print("%d %s concepts found" % (len(selected_concepts), leave_concepts)) - print("Pickling concept list for faster further requests in %s" % fullpath_pickle) - pickle.dump(selected_concepts, open(fullpath_pickle, 'wb'), pickle.HIGHEST_PROTOCOL) - - return selected_concepts - -if __name__ == '__main__': - - def single_label_fragment(doclist): - single = [d for d in doclist if len(d.categories) < 2] - final_categories = set([d.categories[0] if d.categories else [] for d in single]) - print('{} single-label documents ({} categories) from the original {} documents'.format(len(single), - len(final_categories), - len(doclist))) - return single, list(final_categories) - - train_years = list(range(1986, 2006)) - test_years = [2006] - cat_policy = 'leaves' - most_common_cat = 300 - # JRC_DATAPATH = "/media/moreo/1TB Volume/Datasets/JRC_Acquis_v3" - JRC_DATAPATH = "/storage/andrea/FUNNELING/data/JRC_Acquis_v3" - langs = lang_set['JRC_NLTK'] - cat_list = 
inspect_eurovoc(JRC_DATAPATH, select=cat_policy) - sys.exit() - - training_docs, label_names = fetch_jrcacquis(langs=langs, data_path=JRC_DATAPATH, years=train_years,cat_filter=cat_list, cat_threshold=1, parallel=None,most_frequent=most_common_cat) - test_docs, label_namestest = fetch_jrcacquis(langs=langs, data_path=JRC_DATAPATH, years=test_years, cat_filter=label_names,parallel='force') - - print('JRC-train: {} documents, {} labels'.format(len(training_docs), len(label_names))) - print('JRC-test: {} documents, {} labels'.format(len(test_docs), len(label_namestest))) - - training_docs, label_names = single_label_fragment(training_docs) - test_docs, label_namestest = single_label_fragment(test_docs) - - print('JRC-train: {} documents, {} labels'.format(len(training_docs), len(label_names))) - print('JRC-test: {} documents, {} labels'.format(len(test_docs), len(label_namestest))) - - diff --git a/src/data/reader/rcv_reader.py b/src/data/reader/rcv_reader.py deleted file mode 100644 index cd4b416..0000000 --- a/src/data/reader/rcv_reader.py +++ /dev/null @@ -1,225 +0,0 @@ -from zipfile import ZipFile -import xml.etree.ElementTree as ET -from data.languages import RCV2_LANGS_WITH_NLTK_STEMMING, RCV2_LANGS -from util.file import list_files -from sklearn.datasets import get_data_home -import gzip -from os.path import join, exists -from util.file import download_file_if_not_exists -import re -from collections import Counter -import numpy as np -import sys - -""" -RCV2's Nomenclature: -ru = Russian -da = Danish -de = German -es = Spanish -lat = Spanish Latin-American (actually is also 'es' in the collection) -fr = French -it = Italian -nl = Dutch -pt = Portuguese -sv = Swedish -ja = Japanese -htw = Chinese -no = Norwegian -""" - -RCV1_TOPICHIER_URL = "http://www.ai.mit.edu/projects/jmlr/papers/volume5/lewis04a/a02-orig-topics-hierarchy/rcv1.topics.hier.orig" -RCV1PROC_BASE_URL= 'http://www.ai.mit.edu/projects/jmlr/papers/volume5/lewis04a/a12-token-files' -RCV1_BASE_URL = "http://www.daviddlewis.com/resources/testcollections/rcv1/" -RCV2_BASE_URL = "http://trec.nist.gov/data/reuters/reuters.html" - -rcv1_test_data_gz = ['lyrl2004_tokens_test_pt0.dat.gz', - 'lyrl2004_tokens_test_pt1.dat.gz', - 'lyrl2004_tokens_test_pt2.dat.gz', - 'lyrl2004_tokens_test_pt3.dat.gz'] - -rcv1_train_data_gz = ['lyrl2004_tokens_train.dat.gz'] - -rcv1_doc_cats_data_gz = 'rcv1-v2.topics.qrels.gz' - -RCV2_LANG_DIR = {'ru':'REUTE000', - 'de':'REUTE00A', - 'fr':'REUTE00B', - 'sv':'REUTE001', - 'no':'REUTE002', - 'da':'REUTE003', - 'pt':'REUTE004', - 'it':'REUTE005', - 'es':'REUTE006', - 'lat':'REUTE007', - 'jp':'REUTE008', - 'htw':'REUTE009', - 'nl':'REUTERS_'} - - -class RCV_Document: - - def __init__(self, id, text, categories, date='', lang=None): - self.id = id - self.date = date - self.lang = lang - self.text = text - self.categories = categories - - -class ExpectedLanguageException(Exception): pass -class IDRangeException(Exception): pass - - -nwords = [] - -def parse_document(xml_content, assert_lang=None, valid_id_range=None): - root = ET.fromstring(xml_content) - if assert_lang: - if assert_lang not in root.attrib.values(): - if assert_lang != 'jp' or 'ja' not in root.attrib.values(): # some documents are attributed to 'ja', others to 'jp' - raise ExpectedLanguageException('error: document of a different language') - - doc_id = root.attrib['itemid'] - if valid_id_range is not None: - if not valid_id_range[0] <= int(doc_id) <= valid_id_range[1]: - raise IDRangeException - - doc_categories = [cat.attrib['code'] for cat 
in - root.findall('.//metadata/codes[@class="bip:topics:1.0"]/code')] - - doc_date = root.attrib['date'] - doc_title = root.find('.//title').text - doc_headline = root.find('.//headline').text - doc_body = '\n'.join([p.text for p in root.findall('.//text/p')]) - - if not doc_body: - raise ValueError('Empty document') - - if doc_title is None: doc_title = '' - if doc_headline is None or doc_headline in doc_title: doc_headline = '' - text = '\n'.join([doc_title, doc_headline, doc_body]).strip() - - text_length = len(text.split()) - global nwords - nwords.append(text_length) - - return RCV_Document(id=doc_id, text=text, categories=doc_categories, date=doc_date, lang=assert_lang) - - -def fetch_RCV1(data_path, split='all'): - - assert split in ['train', 'test', 'all'], 'split should be "train", "test", or "all"' - - request = [] - labels = set() - read_documents = 0 - lang = 'en' - - training_documents = 23149 - test_documents = 781265 - - if split == 'all': - split_range = (2286, 810596) - expected = training_documents+test_documents - elif split == 'train': - split_range = (2286, 26150) - expected = training_documents - else: - split_range = (26151, 810596) - expected = test_documents - - global nwords - nwords=[] - for part in list_files(data_path): - if not re.match('\d+\.zip', part): continue - target_file = join(data_path, part) - assert exists(target_file), \ - "You don't seem to have the file "+part+" in " + data_path + ", and the RCV1 corpus can not be downloaded"+\ - " w/o a formal permission. Please, refer to " + RCV1_BASE_URL + " for more information." - zipfile = ZipFile(target_file) - for xmlfile in zipfile.namelist(): - xmlcontent = zipfile.open(xmlfile).read() - try: - doc = parse_document(xmlcontent, assert_lang=lang, valid_id_range=split_range) - labels.update(doc.categories) - request.append(doc) - read_documents += 1 - except ValueError: - print('\n\tskipping document {} with inconsistent language label: expected language {}'.format(part+'/'+xmlfile, lang)) - except (IDRangeException, ExpectedLanguageException) as e: - pass - print('\r[{}] read {} documents'.format(part, len(request)), end='') - if read_documents == expected: break - if read_documents == expected: break - print() - print('ave:{} std {} min {} max {}'.format(np.mean(nwords), np.std(nwords), np.min(nwords), np.max(nwords))) - return request, list(labels) - - -def fetch_RCV2(data_path, languages=None): - - if not languages: - languages = list(RCV2_LANG_DIR.keys()) - else: - assert set(languages).issubset(set(RCV2_LANG_DIR.keys())), 'languages not in scope' - - request = [] - labels = set() - global nwords - nwords=[] - for lang in languages: - path = join(data_path, RCV2_LANG_DIR[lang]) - lang_docs_read = 0 - for part in list_files(path): - target_file = join(path, part) - assert exists(target_file), \ - "You don't seem to have the file "+part+" in " + path + ", and the RCV2 corpus can not be downloaded"+\ - " w/o a formal permission. Please, refer to " + RCV2_BASE_URL + " for more information." 
- zipfile = ZipFile(target_file) - for xmlfile in zipfile.namelist(): - xmlcontent = zipfile.open(xmlfile).read() - try: - doc = parse_document(xmlcontent, assert_lang=lang) - labels.update(doc.categories) - request.append(doc) - lang_docs_read += 1 - except ValueError: - print('\n\tskipping document {} with inconsistent language label: expected language {}'.format(RCV2_LANG_DIR[lang]+'/'+part+'/'+xmlfile, lang)) - except (IDRangeException, ExpectedLanguageException) as e: - pass - print('\r[{}] read {} documents, {} for language {}'.format(RCV2_LANG_DIR[lang]+'/'+part, len(request), lang_docs_read, lang), end='') - print() - print('ave:{} std {} min {} max {}'.format(np.mean(nwords), np.std(nwords), np.min(nwords), np.max(nwords))) - return request, list(labels) - - -def fetch_topic_hierarchy(path, topics='all'): - assert topics in ['all', 'leaves'] - - download_file_if_not_exists(RCV1_TOPICHIER_URL, path) - hierarchy = {} - for line in open(path, 'rt'): - parts = line.strip().split() - parent,child = parts[1],parts[3] - if parent not in hierarchy: - hierarchy[parent]=[] - hierarchy[parent].append(child) - - del hierarchy['None'] - del hierarchy['Root'] - print(hierarchy) - - if topics=='all': - topics = set(hierarchy.keys()) - for parent in hierarchy.keys(): - topics.update(hierarchy[parent]) - return list(topics) - elif topics=='leaves': - parents = set(hierarchy.keys()) - childs = set() - for parent in hierarchy.keys(): - childs.update(hierarchy[parent]) - return list(childs.difference(parents)) - - diff --git a/src/data/reader/wikipedia_tools.py b/src/data/reader/wikipedia_tools.py deleted file mode 100644 index 83e11e3..0000000 --- a/src/data/reader/wikipedia_tools.py +++ /dev/null @@ -1,304 +0,0 @@ -from __future__ import print_function -# import ijson -# from ijson.common import ObjectBuilder -import os, sys -from os.path import join -from bz2 import BZ2File -import pickle -from util.file import list_dirs, list_files, makedirs_if_not_exist -from itertools import islice -import re -from xml.sax.saxutils import escape -import numpy as np - -policies = ["IN_ALL_LANGS", "IN_ANY_LANG"] - -""" -This file contains a set of tools for processing the Wikipedia multilingual documents. -In what follows, it is assumed that you have already downloaded a Wikipedia dump (https://dumps.wikimedia.org/) -and have processed each document to clean its text with one of these tools: - - https://github.com/aesuli/wikipediatools (Python 2) - - https://github.com/aesuli/wikipedia-extractor (Python 3) -It is also assumed you have downloaded the all-entities json file (e.g., https://dumps.wikimedia.org/wikidatawiki/entities/latest-all.json.bz2) - -These tools help you to: - - Process the huge json file as a stream, and create a multilingual map of corresponding titles for each language. - Setting policy = "IN_ALL_LANGS" will extract only titles which appear in all (AND) languages, whereas "IN_ANY_LANG" - extracts all titles appearing in at least one (OR) language (warning: this will create a huge dictionary). - Note: this version is quite slow. Although it is run only once, you might prefer to take a look at "Wikidata in BigQuery". - - Process the huge json file as a stream and create a simplified file which occupies much less space and is far faster to process. - - Use the multilingual map to extract, from the clean text versions, individual xml documents containing all - language-specific versions of the document.
- - Fetch the multilingual documents to create, for each of the specified languages, a list containing all documents, - in a way that the i-th element from any list refers to the same element in the respective language. -""" - -def _doc_generator(text_path, langs): - dotspace = re.compile(r'\.(?!\s)') - for l,lang in enumerate(langs): - print("Processing language <%s> (%d/%d)" % (lang, l, len(langs))) - lang_dir = join(text_path, lang) - split_dirs = list_dirs(lang_dir) - for sd,split_dir in enumerate(split_dirs): - print("\tprocessing split_dir <%s> (%d/%d)" % (split_dir, sd, len(split_dirs))) - split_files = list_files(join(lang_dir, split_dir)) - for sf,split_file in enumerate(split_files): - print("\t\tprocessing split_file <%s> (%d/%d)" % (split_file, sf, len(split_files))) - with BZ2File(join(lang_dir, split_dir, split_file), 'r', buffering=1024*1024) as fi: - while True: - doc_lines = list(islice(fi, 3)) - if doc_lines: - # some sentences are not followed by a space after the dot - doc_lines[1] = dotspace.sub('. ', doc_lines[1]) - # [workaround] I found the &nbsp; html symbol was not treated, and unescaping it now might not help... - doc_lines[1] = escape(doc_lines[1].replace("&nbsp;", " ")) - yield doc_lines, lang - else: break - -def _extract_title(doc_lines): - m = re.search('title="(.+?)"', doc_lines[0]) - if m: return m.group(1).decode('utf-8') - else: raise ValueError("Error in xml format: document head is %s" % doc_lines[0]) - -def _create_doc(target_file, id, doc, lang): - doc[0] = doc[0][:-2] + (' lang="%s">\n'%lang) - with open(target_file, 'w') as fo: - fo.write('<multidoc id="%s">\n'%id) - [fo.write(line) for line in doc] - fo.write('</multidoc>') - -def _append_doc(target_file, doc, lang): - doc[0] = doc[0][:-2] + (' lang="%s">\n' % lang) - with open(target_file, 'r', buffering=1024*1024) as fi: - lines = fi.readlines() - if doc[0] in lines[1::3]: - return - lines[-1:-1]=doc - with open(target_file, 'w', buffering=1024*1024) as fo: - [fo.write(line) for line in lines] - -def extract_multilingual_documents(inv_dict, langs, text_path, out_path): - if not os.path.exists(out_path): - os.makedirs(out_path) - for lang in langs: - if lang not in inv_dict: - raise ValueError("Lang %s is not in the dictionary" % lang) - - docs_created = len(list_files(out_path)) - print("%d multilingual documents found." % docs_created) - for doc,lang in _doc_generator(text_path, langs): - title = _extract_title(doc) - - if title in inv_dict[lang]: - #pass - ids = inv_dict[lang][title] - for id in ids: - target_file = join(out_path, id) + ".xml" - if os.path.exists(target_file): - _append_doc(target_file, doc, lang) - else: - _create_doc(target_file, id, doc, lang) - docs_created+=1 - else: - if not re.match('[A-Za-z]+', title): - print("Title <%s> for lang <%s> not in dictionary" % (title, lang)) - - - -def extract_multilingual_titles_from_simplefile(data_dir, filename, langs, policy="IN_ALL_LANGS", return_both=True): - simplified_file = join(data_dir,filename) - - if policy not in policies: - raise ValueError("Policy %s not supported." % policy) - print("extracting multilingual titles with policy %s (%s)" % (policy,' '.join(langs))) - - lang_prefix = list(langs) - lang_prefix.sort() - pickle_prefix = "extraction_" + "_".join(lang_prefix) + "." + policy - pickle_dict = join(data_dir, pickle_prefix+".multi_dict.pickle") - pickle_invdict = join(data_dir, pickle_prefix+".multi_invdict.pickle") - if os.path.exists(pickle_invdict): - if return_both and os.path.exists(pickle_dict): - print("Pickled files found in %s.
Loading both (direct and inverse dictionaries)." % data_dir) - return pickle.load(open(pickle_dict, 'rb')), pickle.load(open(pickle_invdict, 'rb')) - elif return_both==False: - print("Pickled file found in %s. Loading inverse dictionary only." % pickle_invdict) - return pickle.load(open(pickle_invdict, 'rb')) - - multiling_titles = {} - inv_dict = {lang:{} for lang in langs} - - def process_entry(line): - parts = line.strip().split('\t') - id = parts[0] - if id in multiling_titles: - raise ValueError("id <%s> already indexed" % id) - - titles = dict(((lang_title[:lang_title.find(':')],lang_title[lang_title.find(':')+1:].decode('utf-8')) for lang_title in parts[1:])) - for lang in titles.keys(): - if lang not in langs: - del titles[lang] - - if (policy == "IN_ALL_LANGS" and len(titles) == len(langs))\ - or (policy == "IN_ANY_LANG" and len(titles) > 0): - multiling_titles[id] = titles - for lang, title in titles.items(): - if title in inv_dict[lang]: - inv_dict[lang][title].append(id) - inv_dict[lang][title] = [id] - - with BZ2File(simplified_file, 'r', buffering=1024*1024*16) as fi: - completed = 0 - try: - for line in fi: - process_entry(line) - completed += 1 - if completed % 10 == 0: - print("\rCompleted %d\ttitles %d" % (completed,len(multiling_titles)), end="") - print("\rCompleted %d\t\ttitles %d" % (completed, len(multiling_titles)), end="\n") - except EOFError: - print("\nUnexpected file ending... saving anyway") - - print("Pickling dictionaries in %s" % data_dir) - pickle.dump(multiling_titles, open(pickle_dict,'wb'), pickle.HIGHEST_PROTOCOL) - pickle.dump(inv_dict, open(pickle_invdict, 'wb'), pickle.HIGHEST_PROTOCOL) - print("Done") - - return (multiling_titles, inv_dict) if return_both else inv_dict - - -# in https://dumps.wikimedia.org/wikidatawiki/entities/latest-all.json.bz2 -def simplify_json_file(data_dir, langs, policy="IN_ALL_LANGS", json_file = "latest-all.json.bz2"): - latest_all_json_file = join(data_dir,json_file) - - if policy not in policies: - raise ValueError("Policy %s not supported." % policy) - - print("extracting multilingual titles with policy %s (%s)" % (policy,' '.join(langs))) - - lang_prefix = list(langs) - lang_prefix.sort() - simple_titles_path = join(data_dir, "extraction_" + "_".join(lang_prefix) + "." 
+ policy) - - def process_entry(last, fo): - global written - id = last["id"] - titles = None - if policy == "IN_ALL_LANGS" and langs.issubset(last["labels"].keys()): - titles = {lang: last["labels"][lang]["value"] for lang in langs} - elif policy == "IN_ANY_LANG": - titles = {lang: last["labels"][lang]["value"] for lang in langs if lang in last["labels"]} - - if titles: - fo.write((id+'\t'+'\t'.join([lang+':'+titles[lang] for lang in titles.keys()])+'\n').encode('utf-8')) - return True - else: - return False - - written = 0 - with BZ2File(latest_all_json_file, 'r', buffering=1024*1024*16) as fi, \ - BZ2File(join(data_dir,simple_titles_path+".simple.bz2"),'w') as fo: - builder = ObjectBuilder() - completed = 0 - for event, value in ijson.basic_parse(fi, buf_size=1024*1024*16): - builder.event(event, value) - if len(builder.value)>1: - if process_entry(builder.value.pop(0), fo): written += 1 - completed += 1 - print("\rCompleted %d\ttitles %d" % (completed,written), end="") - print("") - - #process the last entry - process_entry(builder.value.pop(0)) - - return simple_titles_path - -""" -Reads all multi-lingual documents in a folder (see wikipedia_tools.py to generate them) and generates, for each of the -specified languages, a list contanining all its documents, so that the i-th element of any list refers to the language- -specific version of the same document. Documents are forced to contain version in all specified languages and to contain -a minimum number of words; otherwise it is discarded. -""" -class MinWordsNotReached(Exception): pass -class WrongDocumentFormat(Exception): pass - -def _load_multilang_doc(path, langs, min_words=100): - import xml.etree.ElementTree as ET - from xml.etree.ElementTree import Element, ParseError - try: - root = ET.parse(path).getroot() - doc = {} - for lang in langs: - doc_body = root.find('.//doc[@lang="' + lang + '"]') - if isinstance(doc_body, Element): - n_words = len(doc_body.text.split(' ')) - if n_words >= min_words: - doc[lang] = doc_body.text - else: - raise MinWordsNotReached - else: - raise WrongDocumentFormat - except ParseError: - raise WrongDocumentFormat - return doc - -#returns the multilingual documents mapped by language, and a counter with the number of documents readed -def fetch_wikipedia_multilingual(wiki_multi_path, langs, min_words=100, deletions=False, max_documents=-1, pickle_name=None): - if pickle_name and os.path.exists(pickle_name): - print("unpickling %s" % pickle_name) - return pickle.load(open(pickle_name, 'rb')) - - multi_docs = list_files(wiki_multi_path) - mling_documents = {l:[] for l in langs} - valid_documents = 0 - minwords_exception = 0 - wrongdoc_exception = 0 - for d,multi_doc in enumerate(multi_docs): - print("\rProcessed %d/%d documents, valid %d/%d, few_words=%d, few_langs=%d" % - (d, len(multi_docs), valid_documents, len(multi_docs), minwords_exception, wrongdoc_exception),end="") - doc_path = join(wiki_multi_path, multi_doc) - try: - m_doc = _load_multilang_doc(doc_path, langs, min_words) - valid_documents += 1 - for l in langs: - mling_documents[l].append(m_doc[l]) - except MinWordsNotReached: - minwords_exception += 1 - if deletions: os.remove(doc_path) - except WrongDocumentFormat: - wrongdoc_exception += 1 - if deletions: os.remove(doc_path) - if max_documents>0 and valid_documents>=max_documents: - break - - if pickle_name: - print("Pickling wikipedia documents object in %s" % pickle_name) - pickle.dump(mling_documents, open(pickle_name, 'wb'), pickle.HIGHEST_PROTOCOL) - - return mling_documents - 
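A minimal usage sketch of fetch_wikipedia_multilingual together with random_wiki_sample (defined just below): the output directory is a hypothetical placeholder for the folder written by extract_multilingual_documents, and the language list is only an example; it relies on the alignment guarantee stated in the docstring above (the i-th entry of every per-language list refers to the same document).

from data.reader.wikipedia_tools import fetch_wikipedia_multilingual, random_wiki_sample

# hypothetical location of the per-document .xml files produced by extract_multilingual_documents()
wiki_multi_path = '../Datasets/Wikipedia/multilingual_docs_JRC_NLTK'
langs = ['en', 'it', 'es']  # example subset of languages

# {lang: [text, ...]}; the i-th entry of every per-language list refers to the same document
l_wiki = fetch_wikipedia_multilingual(wiki_multi_path, langs, min_words=100)

# subsample to at most 5000 documents per language, keeping the lists aligned across languages
l_wiki = random_wiki_sample(l_wiki, 5000)
assert len(set(len(l_wiki[lang]) for lang in langs)) == 1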
-def random_wiki_sample(l_wiki, max_documents): - if max_documents == 0: return None - langs = list(l_wiki.keys()) - assert len(np.unique([len(l_wiki[l]) for l in langs])) == 1, 'documents across languages do not seem to be aligned' - ndocs_per_lang = len(l_wiki[langs[0]]) - if ndocs_per_lang > max_documents: - sel = set(np.random.choice(list(range(ndocs_per_lang)), max_documents, replace=False)) - for lang in langs: - l_wiki[lang] = [d for i, d in enumerate(l_wiki[lang]) if i in sel] - return l_wiki - - -if __name__ == "__main__": - - wikipedia_home = "../Datasets/Wikipedia" - - from data.languages import JRC_LANGS_WITH_NLTK_STEMMING as langs - langs = frozenset(langs) - - simple_titles_path = simplify_json_file(wikipedia_home, langs, policy="IN_ALL_LANGS", json_file="latest-all.json.bz2") - _, inv_dict = extract_multilingual_titles_from_simplefile(wikipedia_home, simple_titles_path, langs, policy='IN_ALL_LANGS') - extract_multilingual_documents(inv_dict, langs, join(wikipedia_home,'text'), - out_path=join(wikipedia_home, 'multilingual_docs_JRC_NLTK')) - - diff --git a/src/data/text_preprocessor.py b/src/data/text_preprocessor.py deleted file mode 100644 index 1a6e3ae..0000000 --- a/src/data/text_preprocessor.py +++ /dev/null @@ -1,33 +0,0 @@ -from nltk.corpus import stopwords -from data.languages import NLTK_LANGMAP -from nltk import word_tokenize -from nltk.stem import SnowballStemmer - - -def preprocess_documents(documents, lang): - tokens = NLTKStemTokenizer(lang, verbose=True) - sw = stopwords.words(NLTK_LANGMAP[lang]) - return [' '.join([w for w in tokens(doc) if w not in sw]) for doc in documents] - - -class NLTKStemTokenizer(object): - - def __init__(self, lang, verbose=False): - if lang not in NLTK_LANGMAP: - raise ValueError('Language %s is not supported in NLTK' % lang) - self.verbose=verbose - self.called = 0 - self.wnl = SnowballStemmer(NLTK_LANGMAP[lang]) - self.cache = {} - - def __call__(self, doc): - self.called += 1 - if self.verbose: - print("\r\t\t[documents processed %d]" % (self.called), end="") - tokens = word_tokenize(doc) - stems = [] - for t in tokens: - if t not in self.cache: - self.cache[t] = self.wnl.stem(t) - stems.append(self.cache[t]) - return stems \ No newline at end of file diff --git a/src/data/tsr_function__.py b/src/data/tsr_function__.py deleted file mode 100755 index 0af8690..0000000 --- a/src/data/tsr_function__.py +++ /dev/null @@ -1,270 +0,0 @@ -import math -import numpy as np -from scipy.stats import t -from joblib import Parallel, delayed -from scipy.sparse import csr_matrix, csc_matrix - - -def get_probs(tpr, fpr, pc): - # tpr = p(t|c) = p(tp)/p(c) = p(tp)/(p(tp)+p(fn)) - # fpr = p(t|_c) = p(fp)/p(_c) = p(fp)/(p(fp)+p(tn)) - pnc = 1.0 - pc - tp = tpr * pc - fn = pc - tp - fp = fpr * pnc - tn = pnc - fp - return ContTable(tp=tp, fn=fn, fp=fp, tn=tn) - - -def apply_tsr(tpr, fpr, pc, tsr): - cell = get_probs(tpr, fpr, pc) - return tsr(cell) - - -def positive_information_gain(cell): - if cell.tpr() < cell.fpr(): - return 0.0 - else: - return information_gain(cell) - - -def posneg_information_gain(cell): - ig = information_gain(cell) - if cell.tpr() < cell.fpr(): - return -ig - else: - return ig - - -def __ig_factor(p_tc, p_t, p_c): - den = p_t * p_c - if den != 0.0 and p_tc != 0: - return p_tc * math.log(p_tc / den, 2) - else: - return 0.0 - - -def information_gain(cell): - return __ig_factor(cell.p_tp(), cell.p_f(), cell.p_c()) + \ - __ig_factor(cell.p_fp(), cell.p_f(), cell.p_not_c()) +\ - __ig_factor(cell.p_fn(), cell.p_not_f(), cell.p_c()) + 
\ - __ig_factor(cell.p_tn(), cell.p_not_f(), cell.p_not_c()) - - -def information_gain_mod(cell): - return (__ig_factor(cell.p_tp(), cell.p_f(), cell.p_c()) + __ig_factor(cell.p_tn(), cell.p_not_f(), cell.p_not_c())) \ - - (__ig_factor(cell.p_fp(), cell.p_f(), cell.p_not_c()) + __ig_factor(cell.p_fn(), cell.p_not_f(), cell.p_c())) - - -def pointwise_mutual_information(cell): - return __ig_factor(cell.p_tp(), cell.p_f(), cell.p_c()) - - -def gain_ratio(cell): - pc = cell.p_c() - pnc = 1.0 - pc - norm = pc * math.log(pc, 2) + pnc * math.log(pnc, 2) - return information_gain(cell) / (-norm) - - -def chi_square(cell): - den = cell.p_f() * cell.p_not_f() * cell.p_c() * cell.p_not_c() - if den==0.0: return 0.0 - num = gss(cell)**2 - return num / den - - -def relevance_frequency(cell): - a = cell.tp - c = cell.fp - if c == 0: c = 1 - return math.log(2.0 + (a * 1.0 / c), 2) - - -def idf(cell): - if cell.p_f()>0: - return math.log(1.0 / cell.p_f()) - return 0.0 - - -def gss(cell): - return cell.p_tp()*cell.p_tn() - cell.p_fp()*cell.p_fn() - - -def conf_interval(xt, n): - if n>30: - z2 = 3.84145882069 # norm.ppf(0.5+0.95/2.0)**2 - else: - z2 = t.ppf(0.5 + 0.95 / 2.0, df=max(n-1,1)) ** 2 - p = (xt + 0.5 * z2) / (n + z2) - amplitude = 0.5 * z2 * math.sqrt((p * (1.0 - p)) / (n + z2)) - return p, amplitude - -def strength(minPosRelFreq, minPos, maxNeg): - if minPos > maxNeg: - return math.log(2.0 * minPosRelFreq, 2.0) - else: - return 0.0 - - -#set cancel_features=True to allow some features to be weighted as 0 (as in the original article) -#however, for some extremely imbalanced dataset caused all documents to be 0 -def conf_weight(cell, cancel_features=False): - c = cell.get_c() - not_c = cell.get_not_c() - tp = cell.tp - fp = cell.fp - - pos_p, pos_amp = conf_interval(tp, c) - neg_p, neg_amp = conf_interval(fp, not_c) - - min_pos = pos_p-pos_amp - max_neg = neg_p+neg_amp - den = (min_pos + max_neg) - minpos_relfreq = min_pos / (den if den != 0 else 1) - - str_tplus = strength(minpos_relfreq, min_pos, max_neg); - - if str_tplus == 0 and not cancel_features: - return 1e-20 - - return str_tplus; - - -class ContTable: - - def __init__(self, tp=0, tn=0, fp=0, fn=0): - self.tp=tp - self.tn=tn - self.fp=fp - self.fn=fn - - def get_d(self): return self.tp + self.tn + self.fp + self.fn - - def get_c(self): return self.tp + self.fn - - def get_not_c(self): return self.tn + self.fp - - def get_f(self): return self.tp + self.fp - - def get_not_f(self): return self.tn + self.fn - - def p_c(self): return (1.0*self.get_c())/self.get_d() - - def p_not_c(self): return 1.0-self.p_c() - - def p_f(self): return (1.0*self.get_f())/self.get_d() - - def p_not_f(self): return 1.0-self.p_f() - - def p_tp(self): return (1.0*self.tp) / self.get_d() - - def p_tn(self): return (1.0*self.tn) / self.get_d() - - def p_fp(self): return (1.0*self.fp) / self.get_d() - - def p_fn(self): return (1.0*self.fn) / self.get_d() - - def tpr(self): - c = 1.0*self.get_c() - return self.tp / c if c > 0.0 else 0.0 - - def fpr(self): - _c = 1.0*self.get_not_c() - return self.fp / _c if _c > 0.0 else 0.0 - - -def round_robin_selection(X, Y, k, tsr_function=positive_information_gain): - print(f'[selectiong {k} terms]') - nC = Y.shape[1] - FC = get_tsr_matrix(get_supervised_matrix(X, Y), tsr_function).T - best_features_idx = np.argsort(-FC, axis=0).flatten() - tsr_values = FC.flatten() - selected_indexes_set = set() - selected_indexes = list() - selected_value = list() - from_category = list() - round_robin = iter(best_features_idx) - values_iter = 
iter(tsr_values) - round=0 - while len(selected_indexes) < k: - term_idx = next(round_robin) - term_val = next(values_iter) - if term_idx not in selected_indexes_set: - selected_indexes_set.add(term_idx) - selected_indexes.append(term_idx) - selected_value.append(term_val) - from_category.append(round) - round = (round + 1) % nC - return np.asarray(selected_indexes, dtype=int), np.asarray(selected_value, dtype=float), np.asarray(from_category) - - -def feature_label_contingency_table(positive_document_indexes, feature_document_indexes, nD): - tp_ = len(positive_document_indexes & feature_document_indexes) - fp_ = len(feature_document_indexes - positive_document_indexes) - fn_ = len(positive_document_indexes - feature_document_indexes) - tn_ = nD - (tp_ + fp_ + fn_) - return ContTable(tp=tp_, tn=tn_, fp=fp_, fn=fn_) - - -def category_tables(feature_sets, category_sets, c, nD, nF): - return [feature_label_contingency_table(category_sets[c], feature_sets[f], nD) for f in range(nF)] - - -""" -Computes the nC x nF supervised matrix M where Mcf is the 4-cell contingency table for feature f and class c. -Efficiency O(nF x nC x log(S)) where S is the sparse factor -""" -def get_supervised_matrix(coocurrence_matrix, label_matrix, n_jobs=-1): - nD, nF = coocurrence_matrix.shape - nD2, nC = label_matrix.shape - - if nD != nD2: - raise ValueError('Number of rows in coocurrence matrix shape %s and label matrix shape %s is not consistent' % - (coocurrence_matrix.shape,label_matrix.shape)) - - def nonzero_set(matrix, col): - return set(matrix[:, col].nonzero()[0]) - - if isinstance(coocurrence_matrix, csr_matrix): - coocurrence_matrix = csc_matrix(coocurrence_matrix) - feature_sets = [nonzero_set(coocurrence_matrix, f) for f in range(nF)] - category_sets = [nonzero_set(label_matrix, c) for c in range(nC)] - cell_matrix = Parallel(n_jobs=n_jobs, backend="threading")(delayed(category_tables)(feature_sets, category_sets, c, nD, nF) for c in range(nC)) - return np.array(cell_matrix) - -# obtains the matrix T where Tcf=tsr(f,c) is the tsr score for category c and feature f -def get_tsr_matrix(cell_matrix, tsr_score_funtion): - nC,nF = cell_matrix.shape - tsr_matrix = [[tsr_score_funtion(cell_matrix[c,f]) for f in range(nF)] for c in range(nC)] - return np.array(tsr_matrix) - - -""" The Fisher-score [1] is not computed on the 4-cell contingency table, but can -take as input any real-valued feature column (e.g., tf-idf weights). -feat is the feature vector, and c is a binary classification vector. -This implementation covers only the binary case, while the formula is defined for multiclass -single-label scenarios, for which the version [2] might be preferred. -[1] R.O. Duda, P.E. Hart, and D.G. Stork. Pattern classification. Wiley-interscience, 2012. -[2] Gu, Q., Li, Z., & Han, J. (2012). Generalized fisher score for feature selection. arXiv preprint arXiv:1202.3725. 
-""" -def fisher_score_binary(feat, c): - neg = np.ones_like(c) - c - - npos = np.sum(c) - nneg = np.sum(neg) - - mupos = np.mean(feat[c == 1]) - muneg = np.mean(feat[neg == 1]) - mu = np.mean(feat) - - stdpos = np.std(feat[c == 1]) - stdneg = np.std(feat[neg == 1]) - - num = npos * ((mupos - mu) ** 2) + nneg * ((muneg - mu) ** 2) - den = npos * (stdpos ** 2) + nneg * (stdneg ** 2) - - if den>0: - return num / den - else: - return num diff --git a/src/dataset_builder.py b/src/dataset_builder.py deleted file mode 100644 index b9650c7..0000000 --- a/src/dataset_builder.py +++ /dev/null @@ -1,710 +0,0 @@ -from os.path import join, exists -from nltk.corpus import stopwords -from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer -from sklearn.preprocessing import MultiLabelBinarizer -from data.reader.jrcacquis_reader import * -from data.languages import lang_set, NLTK_LANGMAP, RCV2_LANGS_WITH_NLTK_STEMMING -from data.reader.rcv_reader import fetch_RCV1, fetch_RCV2, fetch_topic_hierarchy -from data.text_preprocessor import NLTKStemTokenizer, preprocess_documents -import pickle -import numpy as np -from sklearn.model_selection import train_test_split -from scipy.sparse import issparse -import itertools -from tqdm import tqdm -import re -from scipy.sparse import csr_matrix - - -class MultilingualDataset: - """ - A multilingual dataset is a dictionary of training and test documents indexed by language code. - Train and test sets are represented as tuples of the type (X,Y,ids), where X is a matrix representation of the - documents (e.g., a document-by-term sparse csr_matrix), Y is a document-by-label binary np.array indicating the - labels of each document, and ids is a list of document-identifiers from the original collection. - """ - - def __init__(self): - self.dataset_name = "" - self.multiling_dataset = {} - - def add(self, lang, Xtr, Ytr, Xte, Yte, tr_ids=None, te_ids=None): - self.multiling_dataset[lang] = ((Xtr, Ytr, tr_ids), (Xte, Yte, te_ids)) - - def save(self, file): - self.sort_indexes() - pickle.dump(self, open(file, 'wb'), pickle.HIGHEST_PROTOCOL) - return self - - def __getitem__(self, item): - if item in self.langs(): - return self.multiling_dataset[item] - return None - - @classmethod - def load(cls, file): - data = pickle.load(open(file, 'rb')) - data.sort_indexes() - return data - - @classmethod - def load_ids(cls, file): - data = pickle.load(open(file, 'rb')) - tr_ids = {lang:tr_ids for (lang,((_,_,tr_ids), (_,_,_))) in data.multiling_dataset.items()} - te_ids = {lang: te_ids for (lang, ((_, _, _), (_, _, te_ids))) in data.multiling_dataset.items()} - return tr_ids, te_ids - - def sort_indexes(self): - for (lang, ((Xtr,_,_),(Xte,_,_))) in self.multiling_dataset.items(): - if issparse(Xtr): Xtr.sort_indices() - if issparse(Xte): Xte.sort_indices() - - def set_view(self, categories=None, languages=None): - if categories is not None: - if isinstance(categories, int): - categories = np.array([categories]) - elif isinstance(categories, list): - categories = np.array(categories) - self.categories_view = categories - if languages is not None: - self.languages_view = languages - - def training(self, mask_numbers=False, target_as_csr=False): - return self.lXtr(mask_numbers), self.lYtr(as_csr=target_as_csr) - - def test(self, mask_numbers=False, target_as_csr=False): - return self.lXte(mask_numbers), self.lYte(as_csr=target_as_csr) - - def lXtr(self, mask_numbers=False): - proc = lambda x:_mask_numbers(x) if mask_numbers else x - # return {lang: Xtr for (lang, ((Xtr, 
_, _), _)) in self.multiling_dataset.items() if lang in self.langs()} - return {lang:proc(Xtr) for (lang, ((Xtr,_,_),_)) in self.multiling_dataset.items() if lang in self.langs()} - - def lXte(self, mask_numbers=False): - proc = lambda x: _mask_numbers(x) if mask_numbers else x - # return {lang: Xte for (lang, (_, (Xte, _, _))) in self.multiling_dataset.items() if lang in self.langs()} - return {lang:proc(Xte) for (lang, (_,(Xte,_,_))) in self.multiling_dataset.items() if lang in self.langs()} - - def lYtr(self, as_csr=False): - lY = {lang:self.cat_view(Ytr) for (lang, ((_,Ytr,_),_)) in self.multiling_dataset.items() if lang in self.langs()} - if as_csr: - lY = {l:csr_matrix(Y) for l,Y in lY.items()} - return lY - - def lYte(self, as_csr=False): - lY = {lang:self.cat_view(Yte) for (lang, (_,(_,Yte,_))) in self.multiling_dataset.items() if lang in self.langs()} - if as_csr: - lY = {l:csr_matrix(Y) for l,Y in lY.items()} - return lY - - def cat_view(self, Y): - if hasattr(self, 'categories_view'): - return Y[:,self.categories_view] - else: - return Y - - def langs(self): - if hasattr(self, 'languages_view'): - langs = self.languages_view - else: - langs = sorted(self.multiling_dataset.keys()) - return langs - - def num_categories(self): - return self.lYtr()[self.langs()[0]].shape[1] - - def show_dimensions(self): - def shape(X): - return X.shape if hasattr(X, 'shape') else len(X) - for (lang, ((Xtr, Ytr, IDtr), (Xte, Yte, IDte))) in self.multiling_dataset.items(): - if lang not in self.langs(): continue - print("Lang {}, Xtr={}, ytr={}, Xte={}, yte={}".format(lang, shape(Xtr), self.cat_view(Ytr).shape, shape(Xte), self.cat_view(Yte).shape)) - - def show_category_prevalences(self): - nC = self.num_categories() - accum_tr = np.zeros(nC, dtype=np.int) - accum_te = np.zeros(nC, dtype=np.int) - in_langs = np.zeros(nC, dtype=np.int) # count languages with at least one positive example (per category) - for (lang, ((Xtr, Ytr, IDtr), (Xte, Yte, IDte))) in self.multiling_dataset.items(): - if lang not in self.langs(): continue - prev_train = np.sum(self.cat_view(Ytr), axis=0) - prev_test = np.sum(self.cat_view(Yte), axis=0) - accum_tr += prev_train - accum_te += prev_test - in_langs += (prev_train>0)*1 - print(lang+'-train', prev_train) - print(lang+'-test', prev_test) - print('all-train', accum_tr) - print('all-test', accum_te) - - return accum_tr, accum_te, in_langs - - def set_labels(self, labels): - self.labels = labels - -def _mask_numbers(data): - mask_moredigit = re.compile(r'\s[\+-]?\d{5,}([\.,]\d*)*\b') - mask_4digit = re.compile(r'\s[\+-]?\d{4}([\.,]\d*)*\b') - mask_3digit = re.compile(r'\s[\+-]?\d{3}([\.,]\d*)*\b') - mask_2digit = re.compile(r'\s[\+-]?\d{2}([\.,]\d*)*\b') - mask_1digit = re.compile(r'\s[\+-]?\d{1}([\.,]\d*)*\b') - masked = [] - for text in tqdm(data, desc='masking numbers'): - text = ' ' + text - text = mask_moredigit.sub(' MoreDigitMask', text) - text = mask_4digit.sub(' FourDigitMask', text) - text = mask_3digit.sub(' ThreeDigitMask', text) - text = mask_2digit.sub(' TwoDigitMask', text) - text = mask_1digit.sub(' OneDigitMask', text) - masked.append(text.replace('.','').replace(',','').strip()) - return masked - - - - -# ---------------------------------------------------------------------------------------------------------------------- -# Helpers -# ---------------------------------------------------------------------------------------------------------------------- -def get_active_labels(doclist): - cat_list = set() - for d in doclist: - cat_list.update(d.categories) 
- return list(cat_list) - -def filter_by_categories(doclist, keep_categories): - catset = frozenset(keep_categories) - for d in doclist: - d.categories = list(set(d.categories).intersection(catset)) - -def __years_to_str(years): - if isinstance(years, list): - if len(years) > 1: - return str(years[0])+'-'+str(years[-1]) - return str(years[0]) - return str(years) - - -# ---------------------------------------------------------------------------------------------------------------------- -# Matrix builders -# ---------------------------------------------------------------------------------------------------------------------- -def build_independent_matrices(dataset_name, langs, training_docs, test_docs, label_names, wiki_docs=[], preprocess=True): - """ - Builds the document-by-term weighted matrices for each language. Representations are independent of each other, - i.e., each language-specific matrix lies in a dedicate feature space. - :param dataset_name: the name of the dataset (str) - :param langs: list of languages (str) - :param training_docs: map {lang:doc-list} where each doc is a tuple (text, categories, id) - :param test_docs: map {lang:doc-list} where each doc is a tuple (text, categories, id) - :param label_names: list of names of labels (str) - :param wiki_docs: doc-list (optional), if specified, project all wiki docs in the feature spaces built for the languages - :param preprocess: whether or not to apply language-specific text preprocessing (stopword removal and stemming) - :return: a MultilingualDataset. If wiki_docs has been specified, a dictionary lW is also returned, which indexes - by language the processed wikipedia documents in their respective language-specific feature spaces - """ - - mlb = MultiLabelBinarizer() - mlb.fit([label_names]) - - lW = {} - - multilingual_dataset = MultilingualDataset() - multilingual_dataset.dataset_name = dataset_name - multilingual_dataset.set_labels(mlb.classes_) - for lang in langs: - print("\nprocessing %d training, %d test, %d wiki for language <%s>" % - (len(training_docs[lang]), len(test_docs[lang]), len(wiki_docs[lang]) if wiki_docs else 0, lang)) - - tr_data, tr_labels, IDtr = zip(*training_docs[lang]) - te_data, te_labels, IDte = zip(*test_docs[lang]) - - if preprocess: - tfidf = TfidfVectorizer(strip_accents='unicode', min_df=3, sublinear_tf=True, - tokenizer=NLTKStemTokenizer(lang, verbose=True), - stop_words=stopwords.words(NLTK_LANGMAP[lang])) - else: - tfidf = TfidfVectorizer(strip_accents='unicode', min_df=3, sublinear_tf=True) - - Xtr = tfidf.fit_transform(tr_data) - Xte = tfidf.transform(te_data) - if wiki_docs: - lW[lang] = tfidf.transform(wiki_docs[lang]) - - Ytr = mlb.transform(tr_labels) - Yte = mlb.transform(te_labels) - - multilingual_dataset.add(lang, Xtr, Ytr, Xte, Yte, IDtr, IDte) - - multilingual_dataset.show_dimensions() - multilingual_dataset.show_category_prevalences() - - if wiki_docs: - return multilingual_dataset, lW - else: - return multilingual_dataset - - -# creates a MultilingualDataset where matrices shares a single yuxtaposed feature space -def build_juxtaposed_matrices(dataset_name, langs, training_docs, test_docs, label_names, preprocess=True): - """ - Builds the document-by-term weighted matrices for each language. Representations are not independent of each other, - since all of them lie on the same yuxtaposed feature space. 
- :param dataset_name: the name of the dataset (str) - :param langs: list of languages (str) - :param training_docs: map {lang:doc-list} where each doc is a tuple (text, categories, id) - :param test_docs: map {lang:doc-list} where each doc is a tuple (text, categories, id) - :param label_names: list of names of labels (str) - :param preprocess: whether or not to apply language-specific text preprocessing (stopword removal and stemming) - :return: a MultilingualDataset. If wiki_docs has been specified, a dictionary lW is also returned, which indexes - by language the processed wikipedia documents in their respective language-specific feature spaces - """ - - multiling_dataset = MultilingualDataset() - multiling_dataset.dataset_name = dataset_name - - mlb = MultiLabelBinarizer() - mlb.fit([label_names]) - - multiling_dataset.set_labels(mlb.classes_) - - tr_data_stack = [] - for lang in langs: - print("\nprocessing %d training and %d test for language <%s>" % (len(training_docs[lang]), len(test_docs[lang]), lang)) - tr_data, tr_labels, tr_ID = zip(*training_docs[lang]) - te_data, te_labels, te_ID = zip(*test_docs[lang]) - if preprocess: - tr_data = preprocess_documents(tr_data, lang) - te_data = preprocess_documents(te_data, lang) - tr_data_stack.extend(tr_data) - multiling_dataset.add(lang, tr_data, tr_labels, te_data, te_labels, tr_ID, te_ID) - - tfidf = TfidfVectorizer(strip_accents='unicode', min_df=3, sublinear_tf=True) - tfidf.fit(tr_data_stack) - - for lang in langs: - print("\nweighting documents for language <%s>" % (lang)) - (tr_data, tr_labels, tr_ID), (te_data, te_labels, te_ID) = multiling_dataset[lang] - Xtr = tfidf.transform(tr_data) - Xte = tfidf.transform(te_data) - Ytr = mlb.transform(tr_labels) - Yte = mlb.transform(te_labels) - multiling_dataset.add(lang,Xtr,Ytr,Xte,Yte,tr_ID,te_ID) - - multiling_dataset.show_dimensions() - return multiling_dataset - - -# ---------------------------------------------------------------------------------------------------------------------- -# Methods to recover the original documents from the MultilingualDataset's ids -# ---------------------------------------------------------------------------------------------------------------------- -""" -This method has been added a posteriori, to create document embeddings using the polylingual embeddings of the recent -article 'Word Translation without Parallel Data'; basically, it takes one of the splits and retrieves the RCV documents -from the doc ids and then pickles an object (tr_docs, te_docs, label_names) in the outpath -""" -def retrieve_rcv_documents_from_dataset(datasetpath, rcv1_data_home, rcv2_data_home, outpath): - - tr_ids, te_ids = MultilingualDataset.load_ids(datasetpath) - assert tr_ids.keys() == te_ids.keys(), 'inconsistent keys tr vs te' - langs = list(tr_ids.keys()) - - print('fetching the datasets') - rcv1_documents, labels_rcv1 = fetch_RCV1(rcv1_data_home, split='train') - rcv2_documents, labels_rcv2 = fetch_RCV2(rcv2_data_home, [l for l in langs if l != 'en']) - - filter_by_categories(rcv1_documents, labels_rcv2) - filter_by_categories(rcv2_documents, labels_rcv1) - - label_names = get_active_labels(rcv1_documents + rcv2_documents) - print('Active labels in RCV1/2 {}'.format(len(label_names))) - - print('rcv1: {} train, {} test, {} categories'.format(len(rcv1_documents), 0, len(label_names))) - print('rcv2: {} documents'.format(len(rcv2_documents)), Counter([doc.lang for doc in rcv2_documents])) - - all_docs = rcv1_documents + rcv2_documents - mlb = MultiLabelBinarizer() - 
mlb.fit([label_names]) - - dataset = MultilingualDataset() - for lang in langs: - analyzer = CountVectorizer(strip_accents='unicode', min_df=3, - stop_words=stopwords.words(NLTK_LANGMAP[lang])).build_analyzer() - - Xtr,Ytr,IDtr = zip(*[(d.text,d.categories,d.id) for d in all_docs if d.lang == lang and d.id in tr_ids[lang]]) - Xte,Yte,IDte = zip(*[(d.text,d.categories,d.id) for d in all_docs if d.lang == lang and d.id in te_ids[lang]]) - Xtr = [' '.join(analyzer(d)) for d in Xtr] - Xte = [' '.join(analyzer(d)) for d in Xte] - Ytr = mlb.transform(Ytr) - Yte = mlb.transform(Yte) - dataset.add(lang, Xtr, Ytr, Xte, Yte, IDtr, IDte) - - dataset.save(outpath) - -""" -Same thing but for JRC-Acquis -""" -def retrieve_jrc_documents_from_dataset(datasetpath, jrc_data_home, train_years, test_years, cat_policy, most_common_cat, outpath): - - tr_ids, te_ids = MultilingualDataset.load_ids(datasetpath) - assert tr_ids.keys() == te_ids.keys(), 'inconsistent keys tr vs te' - langs = list(tr_ids.keys()) - - print('fetching the datasets') - - cat_list = inspect_eurovoc(jrc_data_home, select=cat_policy) - training_docs, label_names = fetch_jrcacquis(langs=langs, data_path=jrc_data_home, years=train_years, - cat_filter=cat_list, cat_threshold=1, parallel=None, - most_frequent=most_common_cat) - test_docs, _ = fetch_jrcacquis(langs=langs, data_path=jrc_data_home, years=test_years, cat_filter=label_names, - parallel='force') - - def filter_by_id(doclist, ids): - ids_set = frozenset(itertools.chain.from_iterable(ids.values())) - return [x for x in doclist if (x.parallel_id+'__'+x.id) in ids_set] - - training_docs = filter_by_id(training_docs, tr_ids) - test_docs = filter_by_id(test_docs, te_ids) - - print('jrc: {} train, {} test, {} categories'.format(len(training_docs), len(test_docs), len(label_names))) - - mlb = MultiLabelBinarizer() - mlb.fit([label_names]) - - dataset = MultilingualDataset() - for lang in langs: - analyzer = CountVectorizer(strip_accents='unicode', min_df=3, - stop_words=stopwords.words(NLTK_LANGMAP[lang])).build_analyzer() - - Xtr,Ytr,IDtr = zip(*[(d.text,d.categories,d.parallel_id+'__'+d.id) for d in training_docs if d.lang == lang]) - Xte,Yte,IDte = zip(*[(d.text,d.categories,d.parallel_id+'__'+d.id) for d in test_docs if d.lang == lang]) - Xtr = [' '.join(analyzer(d)) for d in Xtr] - Xte = [' '.join(analyzer(d)) for d in Xte] - Ytr = mlb.transform(Ytr) - Yte = mlb.transform(Yte) - dataset.add(lang, Xtr, Ytr, Xte, Yte, IDtr, IDte) - - dataset.save(outpath) - -# ---------------------------------------------------------------------------------------------------------------------- -# Dataset Generators -# ---------------------------------------------------------------------------------------------------------------------- -def prepare_jrc_datasets(jrc_data_home, wiki_data_home, langs, train_years, test_years, cat_policy, most_common_cat=-1, max_wiki=5000, run=0): - from data.reader.wikipedia_tools import fetch_wikipedia_multilingual, random_wiki_sample - - - """ - Prepare all datasets for JRC-Acquis. The datasets include the "feature-independent" version, the - "feature-yuxtaposed" version, the monolingual version for the UpperBound, and the processed wikipedia matrices. 
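The id-based retrieval above reduces to one pattern: flatten the per-language id dictionary into a single frozenset and keep only the documents whose reconstructed id belongs to it. A self-contained sketch, with a hypothetical Doc namedtuple standing in for the RCV/JRC document objects:

import itertools
from collections import namedtuple

# hypothetical stand-in for the document objects returned by the fetchers
Doc = namedtuple('Doc', 'id parallel_id lang text categories')

def filter_by_id(doclist, ids_by_lang):
    # ids_by_lang: {lang: iterable of 'parallelid__docid' strings}
    ids_set = frozenset(itertools.chain.from_iterable(ids_by_lang.values()))
    return [d for d in doclist if d.parallel_id + '__' + d.id in ids_set]

docs = [Doc('31992R1234', 'p1', 'en', 'some text', ['c1']),
        Doc('31992R5678', 'p2', 'it', 'del testo', ['c2'])]
kept = filter_by_id(docs, {'en': ['p1__31992R1234']})
assert [d.id for d in kept] == ['31992R1234']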
- In all cases, training documents are strictly non-parallel, and test documents are strictly parallel - :param jrc_data_home: path to the raw JRC-Acquis documents (it will be downloaded if not found), and the path where - all splits will be generated - :param wiki_data_home: path to the wikipedia dump (see data/readers/wikipedia_tools.py) - :param langs: the list of languages to consider (as defined in data/languages.py) - :param train_years: a list of ints containing the years to be considered as training documents - :param test_years: a list of ints containing the years to be considered as test documents - :param cat_policy: a string indicating which category selection policy to apply. Valid policies are, e.g., "all" - (select all categories), "broadest" (select only the broadest concepts in the taxonomy), or "leaves" (select the - leaves concepts in the taxonomy). See inspect_eurovoc from data/reader/jrcacquis_reader.py for more details - :param most_common_cat: the maximum number of most common categories to consider, or -1 to keep them all - :param max_wiki: the maximum number of wikipedia documents to consider (default 5000) - :param run: a numeric label naming the random split (useful to keep track of different runs) - :return: None - """ - - name = 'JRCacquis' - run = '_run' + str(run) - config_name = 'jrc_nltk_' + __years_to_str(train_years) + \ - 'vs' + __years_to_str(test_years) + \ - '_' + cat_policy + \ - ('_top' + str(most_common_cat) if most_common_cat!=-1 else '') + \ - '_noparallel_processed' - - indep_path = join(jrc_data_home, config_name + run + '.pickle') - upper_path = join(jrc_data_home, config_name + run + '_upper.pickle') - yuxta_path = join(jrc_data_home, config_name + run + '_yuxtaposed.pickle') - wiki_path = join(jrc_data_home, config_name + run + '.wiki.pickle') - wiki_docs_path = join(jrc_data_home, config_name + '.wiki.raw.pickle') - - cat_list = inspect_eurovoc(jrc_data_home, select=cat_policy) - training_docs, label_names = fetch_jrcacquis(langs=langs, data_path=jrc_data_home, years=train_years, - cat_filter=cat_list, cat_threshold=1, parallel=None, - most_frequent=most_common_cat) - test_docs, _ = fetch_jrcacquis(langs=langs, data_path=jrc_data_home, years=test_years, cat_filter=label_names, - parallel='force') - - print('Generating feature-independent dataset...') - training_docs_no_parallel = random_sampling_avoiding_parallel(training_docs) - - def _group_by_lang(doc_list, langs): - return {lang: [(d.text, d.categories, d.parallel_id + '__' + d.id) for d in doc_list if d.lang == lang] - for lang in langs} - - training_docs = _group_by_lang(training_docs, langs) - training_docs_no_parallel = _group_by_lang(training_docs_no_parallel, langs) - test_docs = _group_by_lang(test_docs, langs) - if not exists(indep_path): - wiki_docs=None - if max_wiki>0: - if not exists(wiki_docs_path): - wiki_docs = fetch_wikipedia_multilingual(wiki_data_home, langs, min_words=50, deletions=False) - wiki_docs = random_wiki_sample(wiki_docs, max_wiki) - pickle.dump(wiki_docs, open(wiki_docs_path, 'wb'), pickle.HIGHEST_PROTOCOL) - else: - wiki_docs = pickle.load(open(wiki_docs_path, 'rb')) - wiki_docs = random_wiki_sample(wiki_docs, max_wiki) - - if wiki_docs: - lang_data, wiki_docs = build_independent_matrices(name, langs, training_docs_no_parallel, test_docs, label_names, wiki_docs) - pickle.dump(wiki_docs, open(wiki_path, 'wb'), pickle.HIGHEST_PROTOCOL) - else: - lang_data = build_independent_matrices(name, langs, training_docs_no_parallel, test_docs, label_names) - - 
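The wikipedia handling above follows a compute-once/cache pattern: sample the raw documents on the first run, pickle them next to the split, and reload the pickle afterwards. A small helper that captures the same pattern (cached() and its arguments are illustrative names, not part of the repository):

import os
import pickle

def cached(path, compute):
    """Return the object pickled at path, computing and dumping it on the first call."""
    if os.path.exists(path):
        with open(path, 'rb') as f:
            return pickle.load(f)
    obj = compute()
    with open(path, 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)
    return obj

# illustrative usage with the fetchers defined in this module:
# wiki_docs = cached(wiki_docs_path,
#                    lambda: random_wiki_sample(
#                        fetch_wikipedia_multilingual(wiki_data_home, langs, min_words=50, deletions=False),
#                        max_wiki))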
lang_data.save(indep_path) - - print('Generating upper-bound (English-only) dataset...') - if not exists(upper_path): - training_docs_eng_only = {'en':training_docs['en']} - test_docs_eng_only = {'en':test_docs['en']} - build_independent_matrices(name, ['en'], training_docs_eng_only, test_docs_eng_only, label_names).save(upper_path) - - print('Generating yuxtaposed dataset...') - if not exists(yuxta_path): - build_juxtaposed_matrices(name, langs, training_docs_no_parallel, test_docs, label_names).save(yuxta_path) - - -def prepare_rcv_datasets(outpath, rcv1_data_home, rcv2_data_home, wiki_data_home, langs, - train_for_lang=1000, test_for_lang=1000, max_wiki=5000, preprocess=True, run=0): - from data.reader.wikipedia_tools import fetch_wikipedia_multilingual, random_wiki_sample - """ - Prepare all datasets for RCV1/RCV2. The datasets include the "feature-independent" version, the - "feature-yuxtaposed" version, the monolingual version for the UpperBound, and the processed wikipedia matrices. - - :param outpath: path where all splits will be dumped - :param rcv1_data_home: path to the RCV1-v2 dataset (English only) - :param rcv2_data_home: path to the RCV2 dataset (all languages other than English) - :param wiki_data_home: path to the wikipedia dump (see data/readers/wikipedia_tools.py) - :param langs: the list of languages to consider (as defined in data/languages.py) - :param train_for_lang: maximum number of training documents per language - :param test_for_lang: maximum number of test documents per language - :param max_wiki: the maximum number of wikipedia documents to consider (default 5000) - :param preprocess: whether or not to apply language-specific preprocessing (stopwords removal and stemming) - :param run: a numeric label naming the random split (useful to keep track of different runs) - :return: None - """ - - assert 'en' in langs, 'English is not in requested languages, but is needed for some datasets' - assert len(langs)>1, 'the multilingual dataset cannot be built with only one dataset' - assert not preprocess or set(langs).issubset(set(RCV2_LANGS_WITH_NLTK_STEMMING+['en'])), \ - "languages not in RCV1-v2/RCV2 scope or not in valid for NLTK's processing" - - name = 'RCV1/2' - run = '_run' + str(run) - config_name = 'rcv1-2_nltk_trByLang'+str(train_for_lang)+'_teByLang'+str(test_for_lang)+\ - ('_processed' if preprocess else '_raw') - - indep_path = join(outpath, config_name + run + '.pickle') - upper_path = join(outpath, config_name + run +'_upper.pickle') - yuxta_path = join(outpath, config_name + run +'_yuxtaposed.pickle') - wiki_path = join(outpath, config_name + run + '.wiki.pickle') - wiki_docs_path = join(outpath, config_name + '.wiki.raw.pickle') - - print('fetching the datasets') - rcv1_documents, labels_rcv1 = fetch_RCV1(rcv1_data_home, split='train') - rcv2_documents, labels_rcv2 = fetch_RCV2(rcv2_data_home, [l for l in langs if l!='en']) - filter_by_categories(rcv1_documents, labels_rcv2) - filter_by_categories(rcv2_documents, labels_rcv1) - - label_names = get_active_labels(rcv1_documents+rcv2_documents) - print('Active labels in RCV1/2 {}'.format(len(label_names))) - - print('rcv1: {} train, {} test, {} categories'.format(len(rcv1_documents), 0, len(label_names))) - print('rcv2: {} documents'.format(len(rcv2_documents)), Counter([doc.lang for doc in rcv2_documents])) - - lang_docs = {lang: [d for d in rcv1_documents + rcv2_documents if d.lang == lang] for lang in langs} - - # for the upper bound there are no parallel versions, so for the English case, we take as 
many documents as there - # would be in the multilingual case -- then we will extract from them only train_for_lang for the other cases - print('Generating upper-bound (English-only) dataset...') - train, test = train_test_split(lang_docs['en'], train_size=train_for_lang*len(langs), test_size=test_for_lang, shuffle=True) - train_lang_doc_map = {'en':[(d.text, d.categories, d.id) for d in train]} - test_lang_doc_map = {'en':[(d.text, d.categories, d.id) for d in test]} - build_independent_matrices(name, ['en'], train_lang_doc_map, test_lang_doc_map, label_names).save(upper_path) - - train_lang_doc_map['en'] = train_lang_doc_map['en'][:train_for_lang] - for lang in langs: - if lang=='en': continue # already split - test_take = min(test_for_lang, len(lang_docs[lang])-train_for_lang) - train, test = train_test_split(lang_docs[lang], train_size=train_for_lang, test_size=test_take, shuffle=True) - train_lang_doc_map[lang] = [(d.text, d.categories, d.id) for d in train] - test_lang_doc_map[lang] = [(d.text, d.categories, d.id) for d in test] - - print('Generating feature-independent dataset...') - wiki_docs=None - if max_wiki>0: - if not exists(wiki_docs_path): - wiki_docs = fetch_wikipedia_multilingual(wiki_data_home, langs, min_words=50, deletions=False) - wiki_docs = random_wiki_sample(wiki_docs, max_wiki) - pickle.dump(wiki_docs, open(wiki_docs_path, 'wb'), pickle.HIGHEST_PROTOCOL) - else: - wiki_docs = pickle.load(open(wiki_docs_path, 'rb')) - wiki_docs = random_wiki_sample(wiki_docs, max_wiki) - - if wiki_docs: - lang_data, wiki_docs_matrix = build_independent_matrices(name, langs, train_lang_doc_map, test_lang_doc_map, label_names, wiki_docs, preprocess) - pickle.dump(wiki_docs_matrix, open(wiki_path, 'wb'), pickle.HIGHEST_PROTOCOL) - else: - lang_data = build_independent_matrices(name, langs, train_lang_doc_map, test_lang_doc_map, label_names, wiki_docs, preprocess) - - lang_data.save(indep_path) - - print('Generating yuxtaposed dataset...') - build_juxtaposed_matrices(name, langs, train_lang_doc_map, test_lang_doc_map, label_names, preprocess).save(yuxta_path) - - -# ---------------------------------------------------------------------------------------------------------------------- -# Methods to generate full RCV and JRC datasets -# ---------------------------------------------------------------------------------------------------------------------- -def full_rcv_(rcv1_data_home, rcv2_data_home, outpath, langs): - - - print('fetching the datasets') - rcv1_train_documents, labels_rcv1 = fetch_RCV1(rcv1_data_home, split='train') - rcv1_test_documents, labels_rcv1_test = fetch_RCV1(rcv1_data_home, split='test') - rcv2_documents, labels_rcv2 = fetch_RCV2(rcv2_data_home, [l for l in langs if l != 'en']) - - filter_by_categories(rcv1_train_documents, labels_rcv2) - filter_by_categories(rcv1_test_documents, labels_rcv2) - filter_by_categories(rcv2_documents, labels_rcv1) - - label_names = get_active_labels(rcv1_train_documents + rcv2_documents) - print('Active labels in RCV1/2 {}'.format(len(label_names))) - - print('rcv1: {} train, {} test, {} categories'.format(len(rcv1_train_documents), len(rcv1_test_documents), len(label_names))) - print('rcv2: {} documents'.format(len(rcv2_documents)), Counter([doc.lang for doc in rcv2_documents])) - - mlb = MultiLabelBinarizer() - mlb.fit([label_names]) - - all_docs = rcv1_train_documents + rcv1_test_documents + rcv2_documents - lang_docs = {lang: [d for d in all_docs if d.lang == lang] for lang in langs} - - def get_ids(doclist): - return 
frozenset([d.id for d in doclist]) - - tr_ids = {'en': get_ids(rcv1_train_documents)} - te_ids = {'en': get_ids(rcv1_test_documents)} - for lang in langs: - if lang == 'en': continue - tr_ids[lang], te_ids[lang] = train_test_split([d.id for d in lang_docs[lang]], test_size=.3) - - dataset = MultilingualDataset() - dataset.dataset_name = 'RCV1/2-full' - for lang in langs: - print(f'processing {lang} with {len(tr_ids[lang])} training documents and {len(te_ids[lang])} documents') - analyzer = CountVectorizer( - strip_accents='unicode', min_df=3, stop_words=stopwords.words(NLTK_LANGMAP[lang]) - ).build_analyzer() - - Xtr,Ytr,IDtr = zip(*[(d.text,d.categories,d.id) for d in lang_docs[lang] if d.id in tr_ids[lang]]) - Xte,Yte,IDte = zip(*[(d.text,d.categories,d.id) for d in lang_docs[lang] if d.id in te_ids[lang]]) - Xtr = [' '.join(analyzer(d)) for d in Xtr] - Xte = [' '.join(analyzer(d)) for d in Xte] - Ytr = mlb.transform(Ytr) - Yte = mlb.transform(Yte) - dataset.add(lang, _mask_numbers(Xtr), Ytr, _mask_numbers(Xte), Yte, IDtr, IDte) - - dataset.save(outpath) - - -def full_jrc_(jrc_data_home, langs, train_years, test_years, outpath, cat_policy='all', most_common_cat=300): - - print('fetching the datasets') - cat_list = inspect_eurovoc(jrc_data_home, select=cat_policy) - training_docs, label_names = fetch_jrcacquis( - langs=langs, data_path=jrc_data_home, years=train_years, cat_filter=cat_list, cat_threshold=1, parallel=None, most_frequent=most_common_cat - ) - test_docs, _ = fetch_jrcacquis( - langs=langs, data_path=jrc_data_home, years=test_years, cat_filter=label_names, parallel='force' - ) - - def _group_by_lang(doc_list, langs): - return {lang: [d for d in doc_list if d.lang == lang] for lang in langs} - - training_docs = _group_by_lang(training_docs, langs) - test_docs = _group_by_lang(test_docs, langs) - - mlb = MultiLabelBinarizer() - mlb.fit([label_names]) - - dataset = MultilingualDataset() - data.dataset_name = 'JRC-Acquis-full' - for lang in langs: - analyzer = CountVectorizer( - strip_accents='unicode', min_df=3, stop_words=stopwords.words(NLTK_LANGMAP[lang]) - ).build_analyzer() - - Xtr, Ytr, IDtr = zip(*[(d.text, d.categories, d.parallel_id + '__' + d.id) for d in training_docs[lang] if d.lang == lang]) - Xte, Yte, IDte = zip(*[(d.text, d.categories, d.parallel_id + '__' + d.id) for d in test_docs[lang] if d.lang == lang]) - Xtr = [' '.join(analyzer(d)) for d in Xtr] - Xte = [' '.join(analyzer(d)) for d in Xte] - Ytr = mlb.transform(Ytr) - Yte = mlb.transform(Yte) - dataset.add(lang, _mask_numbers(Xtr), Ytr, _mask_numbers(Xte), Yte, IDtr, IDte) - - dataset.save(outpath) - - -#----------------------------------------------------------------------------------------------------------------------- -# MAIN BUILDER -#----------------------------------------------------------------------------------------------------------------------- - -if __name__=='__main__': - import sys - RCV1_PATH = '../Datasets/RCV1-v2/unprocessed_corpus' - RCV2_PATH = '../Datasets/RCV2' - JRC_DATAPATH = "../Datasets/JRC_Acquis_v3" - full_rcv_(RCV1_PATH, RCV2_PATH, outpath='../rcv2/rcv1-2_doclist_full_processed.pickle', langs=RCV2_LANGS_WITH_NLTK_STEMMING + ['en']) - # full_jrc_(JRC_DATAPATH, lang_set['JRC_NLTK'], train_years=list(range(1958, 2006)), test_years=[2006], outpath='../jrc_acquis/jrc_doclist_1958-2005vs2006_all_top300_full_processed.pickle', cat_policy='all', most_common_cat=300) - sys.exit(0) - - # datasetpath = '../jrc_acquis/jrc_doclist_1958-2005vs2006_all_top300_full_processed.pickle' # 
'../rcv2/rcv1-2_doclist_full_processed.pickle' - # data = MultilingualDataset.load(datasetpath) - # data.dataset_name='JRC-Acquis-full'#'RCV1/2-full' - # for lang in RCV2_LANGS_WITH_NLTK_STEMMING + ['en']: - # (Xtr, ytr, idtr), (Xte, yte, idte) = data.multiling_dataset[lang] - # data.multiling_dataset[lang] = ((_mask_numbers(Xtr), ytr, idtr), (_mask_numbers(Xte), yte, idte)) - # data.save('../jrc_acquis/jrc_doclist_1958-2005vs2006_all_top300_full_processed.pickle')#'../rcv2/rcv1-2_doclist_full_processed_2.pickle') - # sys.exit(0) - - assert len(sys.argv) == 5, "wrong number of arguments; required: " \ - " " - - JRC_DATAPATH = sys.argv[1] # "../Datasets/JRC_Acquis_v3" - RCV1_PATH = sys.argv[2] #'../Datasets/RCV1-v2/unprocessed_corpus' - RCV2_PATH = sys.argv[3] #'../Datasets/RCV2' - WIKI_DATAPATH = sys.argv[4] #"../Datasets/Wikipedia/multilingual_docs_JRC_NLTK" - - langs = lang_set['JRC_NLTK'] - max_wiki = 5000 - - for run in range(0,10): - print('Building JRC-Acquis datasets run', run) - prepare_jrc_datasets(JRC_DATAPATH, WIKI_DATAPATH, langs, - train_years=list(range(1958, 2006)), test_years=[2006], max_wiki=max_wiki, - cat_policy='all', most_common_cat=300, run=run) - - print('Building RCV1-v2/2 datasets run', run) - prepare_rcv_datasets(RCV2_PATH, RCV1_PATH, RCV2_PATH, WIKI_DATAPATH, RCV2_LANGS_WITH_NLTK_STEMMING + ['en'], - train_for_lang=1000, test_for_lang=1000, max_wiki=max_wiki, run=run) - - # uncomment this code if you want to retrieve the original documents to generate the data splits for PLE - # (make sure you have not modified the above parameters, or adapt the following paths accordingly...) - # datasetpath = join(RCV2_PATH,'rcv1-2_nltk_trByLang1000_teByLang1000_processed_run{}.pickle'.format(run)) - # outpath = datasetpath.replace('_nltk_','_doclist_') - # retrieve_rcv_documents_from_dataset(datasetpath, RCV1_PATH, RCV2_PATH, outpath) - - # datasetpath = join(JRC_DATAPATH, 'jrc_nltk_1958-2005vs2006_all_top300_noparallel_processed_run{}.pickle'.format(run)) - # outpath = datasetpath.replace('_nltk_', '_doclist_') - # retrieve_jrc_documents_from_dataset(datasetpath, JRC_DATAPATH, train_years=list(range(1958, 2006)), test_years=[2006], cat_policy='all', most_common_cat=300, outpath=outpath) - - - diff --git a/src/embeddings/__init__.py b/src/embeddings/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/src/embeddings/embeddings.py b/src/embeddings/embeddings.py deleted file mode 100644 index 27367e9..0000000 --- a/src/embeddings/embeddings.py +++ /dev/null @@ -1,66 +0,0 @@ -import os -from torchtext.vocab import Vectors -import torch -from abc import ABC, abstractmethod -from util.SIF_embed import * - - -class PretrainedEmbeddings(ABC): - - def __init__(self): - super().__init__() - - @abstractmethod - def vocabulary(self): pass - - @abstractmethod - def dim(self): pass - - @classmethod - def reindex(cls, words, word2index): - if isinstance(words, dict): - words = list(zip(*sorted(words.items(), key=lambda x: x[1])))[0] - - source_idx, target_idx = [], [] - for i, word in enumerate(words): - if word not in word2index: continue - j = word2index[word] - source_idx.append(i) - target_idx.append(j) - source_idx = np.asarray(source_idx) - target_idx = np.asarray(target_idx) - return source_idx, target_idx - - -class FastTextWikiNews(Vectors): - - url_base = 'Cant auto-download MUSE embeddings' - path = '../embeddings/wiki.multi.{}.vec' - _name = '/wiki.multi.{}.vec' - - def __init__(self, cache, language="en", **kwargs): - url = self.url_base.format(language) - 
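PretrainedEmbeddings.reindex above aligns a dataset vocabulary with the rows of a pretrained matrix so that known words are copied over and out-of-vocabulary words keep zero vectors. A toy illustration of that alignment (the three-word "pretrained" space below is invented):

import numpy as np
import torch

def reindex(words, word2index):
    source_idx, target_idx = [], []
    for i, word in enumerate(words):
        if word not in word2index:
            continue
        source_idx.append(i)
        target_idx.append(word2index[word])
    return np.asarray(source_idx), np.asarray(target_idx)

stoi = {'cat': 0, 'dog': 1, 'fish': 2}   # invented pretrained vocabulary
vectors = torch.randn(3, 4)              # invented pretrained vectors (dim=4)

vocab = ['dog', 'platypus', 'cat']       # dataset vocabulary; 'platypus' is OOV
src, tgt = reindex(vocab, stoi)
extraction = torch.zeros(len(vocab), 4)  # OOV rows remain all-zero
extraction[src] = vectors[tgt]
print(extraction)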
name = cache + self._name.format(language) - super(FastTextWikiNews, self).__init__(name, cache=cache, url=url, **kwargs) - - -class FastTextMUSE(PretrainedEmbeddings): - def __init__(self, path, lang, limit=None): - super().__init__() - assert os.path.exists(path), print(f'pre-trained vectors not found in {path}') - self.embed = FastTextWikiNews(path, lang, max_vectors=limit) - - def vocabulary(self): - return set(self.embed.stoi.keys()) - - def dim(self): - return self.embed.dim - - def extract(self, words): - source_idx, target_idx = PretrainedEmbeddings.reindex(words, self.embed.stoi) - extraction = torch.zeros((len(words), self.dim())) - extraction[source_idx] = self.embed.vectors[target_idx] - return extraction - - - diff --git a/src/embeddings/pretrained.py b/src/embeddings/pretrained.py deleted file mode 100644 index 026823e..0000000 --- a/src/embeddings/pretrained.py +++ /dev/null @@ -1,102 +0,0 @@ -from abc import ABC, abstractmethod -import torch, torchtext -# import gensim -# import os -import numpy as np - - -# class KeyedVectors: -# -# def __init__(self, word2index, weights): -# assert len(word2index)==weights.shape[0], 'wrong number of dimensions' -# index2word = {i:w for w,i in word2index.items()} -# assert len([i for i in range(len(index2word)) if i not in index2word])==0, 'gaps in indexing not allowed' -# self.word2index = word2index -# self.index2word = index2word -# self.weights = weights -# -# def extract(self, words): -# dim = self.weights.shape[1] -# v_size = len(words) -# -# source_idx, target_idx = [], [] -# for i,word in enumerate(words): -# if word not in self.word2index: continue -# j = self.word2index[word] -# source_idx.append(i) -# target_idx.append(j) -# -# extraction = np.zeros((v_size, dim)) -# extraction[np.asarray(source_idx)] = self.weights[np.asarray(target_idx)] -# -# return extraction - - -# class PretrainedEmbeddings(ABC): -# -# def __init__(self): -# super().__init__() -# -# @abstractmethod -# def vocabulary(self): pass -# -# @abstractmethod -# def dim(self): pass -# -# @classmethod -# def reindex(cls, words, word2index): -# source_idx, target_idx = [], [] -# for i, word in enumerate(words): -# if word not in word2index: continue -# j = word2index[word] -# source_idx.append(i) -# target_idx.append(j) -# source_idx = np.asarray(source_idx) -# target_idx = np.asarray(target_idx) -# return source_idx, target_idx - - -# class GloVe(PretrainedEmbeddings): -# -# def __init__(self, setname='840B', path='./vectors_cache', max_vectors=None): -# super().__init__() -# print(f'Loading GloVe pretrained vectors from torchtext') -# self.embed = torchtext.vocab.GloVe(setname, cache=path, max_vectors=max_vectors) -# print('Done') -# -# def vocabulary(self): -# return set(self.embed.stoi.keys()) -# -# def dim(self): -# return self.embed.dim -# -# def extract(self, words): -# source_idx, target_idx = PretrainedEmbeddings.reindex(words, self.embed.stoi) -# extraction = torch.zeros((len(words), self.dim())) -# extraction[source_idx] = self.embed.vectors[target_idx] -# return extraction - - -# class Word2Vec(PretrainedEmbeddings): -# -# def __init__(self, path, limit=None): -# super().__init__() -# print(f'Loading word2vec pretrained vectors from {path}') -# assert os.path.exists(path), print(f'pre-trained keyed vectors not found in {path}') -# self.embed = gensim.models.KeyedVectors.load_word2vec_format(path, binary=True, limit=limit) -# self.word2index={w:i for i,w in enumerate(self.embed.index2word)} -# print('Done') -# -# def vocabulary(self): -# return 
set(self.word2index.keys()) -# -# def dim(self): -# return self.embed.vector_size -# -# def extract(self, words): -# source_idx, target_idx = PretrainedEmbeddings.reindex(words, self.word2index) -# extraction = np.zeros((len(words), self.dim())) -# extraction[source_idx] = self.embed.vectors[target_idx] -# extraction = torch.from_numpy(extraction).float() -# return extraction - diff --git a/src/embeddings/supervised.py b/src/embeddings/supervised.py deleted file mode 100755 index f84793e..0000000 --- a/src/embeddings/supervised.py +++ /dev/null @@ -1,74 +0,0 @@ -from data.tsr_function__ import get_supervised_matrix, get_tsr_matrix, information_gain, chi_square -import numpy as np - - -def zscores(x, axis=0): #scipy.stats.zscores does not avoid division by 0, which can indeed occur - std = np.clip(np.std(x, ddof=1, axis=axis), 1e-5, None) - mean = np.mean(x, axis=axis) - return (x - mean) / std - - -def supervised_embeddings_tfidf(X,Y): - tfidf_norm = X.sum(axis=0) - tfidf_norm[tfidf_norm==0] = 1 - F = (X.T).dot(Y) / tfidf_norm.T - return F - - -def supervised_embeddings_ppmi(X,Y): - Xbin = X>0 - D = X.shape[0] - Pxy = (Xbin.T).dot(Y)/D - Px = Xbin.sum(axis=0)/D - Py = Y.sum(axis=0)/D - F = np.asarray(Pxy/(Px.T*Py)) - F = np.maximum(F, 1.0) - F = np.log(F) - return F - - -def supervised_embeddings_tsr(X,Y, tsr_function=information_gain, max_documents=25000): - D = X.shape[0] - if D>max_documents: - print(f'sampling {max_documents}') - random_sample = np.random.permutation(D)[:max_documents] - X = X[random_sample] - Y = Y[random_sample] - cell_matrix = get_supervised_matrix(X, Y) - F = get_tsr_matrix(cell_matrix, tsr_score_funtion=tsr_function).T - return F - - -def get_supervised_embeddings(X, Y, reduction, max_label_space=300, voc=None, lang='None', binary_structural_problems=-1, method='dotn', dozscore=True): - if max_label_space != 0: - print('computing supervised embeddings...') - nC = Y.shape[1] - - if method=='ppmi': - F = supervised_embeddings_ppmi(X, Y) - elif method == 'dotn': - F = supervised_embeddings_tfidf(X, Y) - elif method == 'ig': - F = supervised_embeddings_tsr(X, Y, information_gain) - elif method == 'chi2': - F = supervised_embeddings_tsr(X, Y, chi_square) - - if dozscore: - F = zscores(F, axis=0) - - # Dumping F-matrix for further studies - dump_it = False - if dump_it: - with open(f'/home/andreapdr/funneling_pdr/src/dumps/WCE_{lang}.tsv', 'w') as outfile: - np.savetxt(outfile, F, delimiter='\t') - with open(f'/home/andreapdr/funneling_pdr/src/dumps/dict_WCE_{lang}.tsv', 'w') as outfile: - for token in voc.keys(): - outfile.write(token+'\n') - - return F - - - - - - diff --git a/src/experiment_scripts/10run_dl_jrc.sh b/src/experiment_scripts/10run_dl_jrc.sh deleted file mode 100644 index ce04aa8..0000000 --- a/src/experiment_scripts/10run_dl_jrc.sh +++ /dev/null @@ -1,11 +0,0 @@ -#!/usr/bin/env bash - -dataset_path=/home/moreo/CLESA/jrc_acquis/jrc_doclist_1958-2005vs2006_all_top300_noparallel_processed_run -logfile=../log/log10run_dl_jrc.csv - -runs='0 1 2 3 4 5 6 7 8 9' -for run in $runs -do - dataset=$dataset_path$run.pickle - python main_deep_learning.py $dataset --log-file $logfile --pretrained --supervised --posteriors --tunable --plotmode --test-each 20 -done \ No newline at end of file diff --git a/src/experiment_scripts/10run_dl_rcv.sh b/src/experiment_scripts/10run_dl_rcv.sh deleted file mode 100644 index 51ca64b..0000000 --- a/src/experiment_scripts/10run_dl_rcv.sh +++ /dev/null @@ -1,11 +0,0 @@ -#!/usr/bin/env bash - 
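The 'dotn' word-class embeddings removed above (supervised_embeddings_tfidf) are the term-by-class matrix X^T Y with each row normalized by the term's total tf-idf mass. A dense toy version of that computation (the repository operates on scipy sparse matrices, hence the slightly different transposes there):

import numpy as np

# invented tf-idf matrix: 4 documents x 3 terms, and a 2-class label matrix
X = np.array([[0.5, 0.0, 0.2],
              [0.0, 0.3, 0.0],
              [0.4, 0.0, 0.0],
              [0.0, 0.6, 0.1]])
Y = np.array([[1, 0],
              [0, 1],
              [1, 0],
              [0, 1]])

tfidf_norm = X.sum(axis=0)               # total weight of each term over the collection
tfidf_norm[tfidf_norm == 0] = 1          # guard against division by zero for unused terms
F = X.T.dot(Y) / tfidf_norm[:, None]     # terms x classes: share of each term's mass per class
print(F)                                 # row i is the word-class embedding of term i

In get_supervised_embeddings above, these rows are then optionally z-scored column-wise before being used as an embedding matrix.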
-dataset_path=/home/moreo/CLESA/rcv2/rcv1-2_doclist_trByLang1000_teByLang1000_processed_run -logfile=../log/log10run_dl_rcv.csv - -runs='0 1 2 3 4 5 6 7 8 9' -for run in $runs -do - dataset=$dataset_path$run.pickle - python main_deep_learning.py $dataset --log-file $logfile --pretrained --supervised --posteriors --tunable --plotmode --test-each 20 -done diff --git a/src/experiment_scripts/10run_jrc.sh b/src/experiment_scripts/10run_jrc.sh deleted file mode 100644 index 37e3333..0000000 --- a/src/experiment_scripts/10run_jrc.sh +++ /dev/null @@ -1,12 +0,0 @@ -dataset=/home/moreo/CLESA/jrc_acquis/jrc_doclist_1958-2005vs2006_all_top300_noparallel_processed_run0.pickle -logfile=./results/10run_jrc_final_results.csv - -runs='0 1 2 3 4 5 6 7 8 9' -for run in $runs -do - dataset=$dataset_path$run.pickle - python main_multimodal_cls.py $dataset -o $logfile -P -z -c --l2 - python main_multimodal_cls.py $dataset -o $logfile -S -z -c --l2 - python main_multimodal_cls.py $dataset -o $logfile -U -z -c --l2 - -done diff --git a/src/experiment_scripts/10run_jrc_combinations.sh b/src/experiment_scripts/10run_jrc_combinations.sh deleted file mode 100644 index 156a0a5..0000000 --- a/src/experiment_scripts/10run_jrc_combinations.sh +++ /dev/null @@ -1,16 +0,0 @@ -dataset_path=/home/moreo/CLESA/jrc_acquis/jrc_doclist_1958-2005vs2006_all_top300_noparallel_processed_run -logfile=./results/funnelling_10run_jrc_CIKM.csv - -runs='6 7 8 9' #0 1 2 3 4 5 -for run in $runs -do - dataset=$dataset_path$run.pickle - #python main_gFun.py $dataset -o $logfile -P -U -S -c -r -z --l2 --allprob # last combination for CIKM 3 Pr(views) concatenated (done up to run5) - python main_multimodal_cls.py $dataset -o $logfile -P -U -S -c -r -z --l2 --allprob # last combination for CIKM 3 views concatenated - #python main_gFun.py $dataset -o $logfile -P -U -S -c -r -a -z --l2 --allprob - #python main_gFun.py $dataset -o $logfile -P -U -c -r -a -z --l2 --allprob - #python main_gFun.py $dataset -o $logfile -P -S -c -r -z --l2 --allprob - #python main_gFun.py $dataset -o $logfile -P -U -c -r -z --l2 --allprob - #python main_gFun.py $dataset -o $logfile -c -P -U -r -z --l2 - #python main_gFun.py $dataset -o $logfile -c -P -U -S -r -z --l2 -done \ No newline at end of file diff --git a/src/experiment_scripts/10run_rcv.sh b/src/experiment_scripts/10run_rcv.sh deleted file mode 100644 index 9d49f94..0000000 --- a/src/experiment_scripts/10run_rcv.sh +++ /dev/null @@ -1,15 +0,0 @@ -dataset_path=/home/moreo/CLESA/rcv2/rcv1-2_doclist_trByLang1000_teByLang1000_processed_run -logfile=./results/10run_rcv_final_results.csv - -runs='0 1 2 3 4 5 6 7 8 9' - -for run in $runs -do - dataset=$dataset_path$run.pickle - python main_multimodal_cls.py $dataset -o $logfile -P -z -c --l2 - python main_multimodal_cls.py $dataset -o $logfile -S -z -c --l2 - python main_multimodal_cls.py $dataset -o $logfile -U -z -c --l2 - -done - - diff --git a/src/experiment_scripts/10run_rcv_combinations.sh b/src/experiment_scripts/10run_rcv_combinations.sh deleted file mode 100644 index b5d8a3b..0000000 --- a/src/experiment_scripts/10run_rcv_combinations.sh +++ /dev/null @@ -1,16 +0,0 @@ -dataset_path=/home/moreo/CLESA/rcv2/rcv1-2_doclist_trByLang1000_teByLang1000_processed_run -logfile=./results/funnelling_10run_rcv_CIKM_allprob_concatenated.csv - -runs='0 1 2 3 4 5 6 7 8 9' -for run in $runs -do - dataset=$dataset_path$run.pickle - #python main_gFun.py $dataset -o $logfile -P -U -S -c -r -z --l2 --allprob # last combination for CIKM 3 Pr(views) concatenated - python 
main_multimodal_cls.py $dataset -o $logfile -P -U -S -c -r -z --l2 --allprob # last combination for CIKM 3 views concatenated - #python main_gFun.py $dataset -o $logfile -P -U -c -r -a -z --l2 --allprob - #python main_gFun.py $dataset -o $logfile -P -U -S -c -r -a -z --l2 --allprob - #python main_gFun.py $dataset -o $logfile -P -S -c -r -z --l2 --allprob - #python main_gFun.py $dataset -o $logfile -P -U -c -r -z --l2 --allprob - #python main_gFun.py $dataset -o $logfile -c -P -U -r -z --l2 - #python main_gFun.py $dataset -o $logfile -c -P -U -S -r -z --l2 -done \ No newline at end of file diff --git a/src/experiment_scripts/extract_features.sh b/src/experiment_scripts/extract_features.sh deleted file mode 100644 index d0bd3ac..0000000 --- a/src/experiment_scripts/extract_features.sh +++ /dev/null @@ -1,14 +0,0 @@ -#!/usr/bin/env bash - -dataset_path=/home/moreo/CLESA/jrc_acquis/jrc_doclist_1958-2005vs2006_all_top300_noparallel_processed_run# - -runs='1 2 3 4 5 6 7 8 9' -for run in $runs -do - dataset=$dataset_path$run.pickle - modelpath=/home/andreapdr/funneling_pdr/hug_checkpoint/mBERT-jrc_run$runs - python main_mbert_extractor.py --dataset $dataset --modelpath $modelpath -done - -dataset=/home/moreo/CLESA/jrc_acquis/jrc_doclist_1958-2005vs2006_all_top300_full_processed.pickle -python main_mbert_extractor.py --dataset $dataset --modelpath $modelpath \ No newline at end of file diff --git a/src/experiment_scripts/main_deep_learning.py b/src/experiment_scripts/main_deep_learning.py deleted file mode 100755 index ee56054..0000000 --- a/src/experiment_scripts/main_deep_learning.py +++ /dev/null @@ -1,329 +0,0 @@ -import argparse -import torch.nn as nn -from torch.optim.lr_scheduler import StepLR -from dataset_builder import MultilingualDataset -from learning.transformers import load_muse_embeddings -from models.lstm_class import RNNMultilingualClassifier -from util.csv_log import CSVLog -from util.early_stop import EarlyStopping -from util.common import * -from util.file import create_if_not_exist -from time import time -from tqdm import tqdm -from util.evaluation import evaluate -from util.file import get_file_name -# import pickle - -allowed_nets = {'rnn'} - -# instantiates the net, initializes the model parameters, and sets embeddings trainable if requested -def init_Net(nC, multilingual_index, xavier_uniform=True): - net=opt.net - assert net in allowed_nets, f'{net} not supported, valid ones are={allowed_nets}' - - # instantiate the required net - if net=='rnn': - only_post = opt.posteriors and (not opt.pretrained) and (not opt.supervised) - if only_post: - print('working on ONLY POST mode') - model = RNNMultilingualClassifier( - output_size=nC, - hidden_size=opt.hidden, - lvocab_size=multilingual_index.l_vocabsize(), - learnable_length=opt.learnable, - lpretrained=multilingual_index.l_embeddings(), - drop_embedding_range=multilingual_index.sup_range, - drop_embedding_prop=opt.sup_drop, - post_probabilities=opt.posteriors, - only_post=only_post, - bert_embeddings=opt.mbert - ) - - # weight initialization - if xavier_uniform: - for p in model.parameters(): - if p.dim() > 1 and p.requires_grad: - nn.init.xavier_uniform_(p) - - if opt.tunable: - # this has to be performed *after* Xavier initialization is done, - # otherwise the pretrained embedding parameters will be overrided - model.finetune_pretrained() - - return model.cuda() - - -def set_method_name(): - method_name = f'{opt.net}(H{opt.hidden})' - if opt.pretrained: - method_name += f'-Muse' - if opt.supervised: - method_name += f'-WCE' 
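init_Net above applies Xavier initialization only to parameters with dim() > 1 that require gradients, and unfreezes the pretrained embeddings only afterwards so their rows are not overwritten. A minimal sketch of that ordering on a hypothetical two-layer model:

import torch.nn as nn

# hypothetical model: a frozen pretrained embedding table followed by a linear classifier
model = nn.Sequential(nn.Embedding(1000, 50), nn.Linear(50, 10))
model[0].weight.requires_grad = False        # pretrained rows start out frozen

for p in model.parameters():
    if p.dim() > 1 and p.requires_grad:
        nn.init.xavier_uniform_(p)           # touches the linear weight, skips the embeddings

model[0].weight.requires_grad = True         # make the embeddings tunable only after init

If the two steps were reversed, Xavier initialization would wipe the pretrained vectors, which is exactly what the comment in init_Net warns about.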
- if opt.posteriors: - method_name += f'-Posteriors' - if opt.mbert: - method_name += f'-mBert' - if (opt.pretrained or opt.supervised) and opt.tunable: - method_name += '-(trainable)' - else: - method_name += '-(static)' - if opt.learnable > 0: - method_name += f'-Learnable{opt.learnable}' - return method_name - - -def init_optimizer(model, lr): - return torch.optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), lr=lr, weight_decay=opt.weight_decay) - - -def init_logfile(method_name, opt): - logfile = CSVLog(opt.log_file, ['dataset', 'method', 'epoch', 'measure', 'value', 'run', 'timelapse']) - logfile.set_default('dataset', opt.dataset) - logfile.set_default('run', opt.seed) - logfile.set_default('method', method_name) - assert opt.force or not logfile.already_calculated(), f'results for dataset {opt.dataset} method {method_name} ' \ - f'and run {opt.seed} already calculated' - return logfile - - -# loads the MUSE embeddings if requested, or returns empty dictionaries otherwise -def load_pretrained_embeddings(we_path, langs): - lpretrained = lpretrained_vocabulary = none_dict(langs) - if opt.pretrained: - lpretrained = load_muse_embeddings(we_path, langs, n_jobs=-1) - lpretrained_vocabulary = {l: lpretrained[l].vocabulary() for l in langs} - return lpretrained, lpretrained_vocabulary - - -def get_lr(optimizer): - for param_group in optimizer.param_groups: - return param_group['lr'] - - -def train(model, batcher, ltrain_index, ltrain_posteriors, ltrain_bert, lytr, tinit, logfile, criterion, optim, epoch, method_name): - _dataset_path = opt.dataset.split('/')[-1].split('_') - dataset_id = _dataset_path[0] + _dataset_path[-1] - - loss_history = [] - model.train() - for idx, (batch, post, bert_emb, target, lang) in enumerate(batcher.batchify(ltrain_index, ltrain_posteriors, ltrain_bert, lytr)): - optim.zero_grad() - # _out = model(batch, post, bert_emb, lang) - loss = criterion(model(batch, post, bert_emb, lang), target) - loss.backward() - clip_gradient(model) - optim.step() - loss_history.append(loss.item()) - - if idx % opt.log_interval == 0: - interval_loss = np.mean(loss_history[-opt.log_interval:]) - print(f'{dataset_id} {method_name} Epoch: {epoch}, Step: {idx}, lr={get_lr(optim):.5f}, Training Loss: {interval_loss:.6f}') - - mean_loss = np.mean(interval_loss) - logfile.add_row(epoch=epoch, measure='tr_loss', value=mean_loss, timelapse=time() - tinit) - return mean_loss - - -def test(model, batcher, ltest_index, ltest_posteriors, lte_bert, lyte, tinit, epoch, logfile, criterion, measure_prefix): - - loss_history = [] - model.eval() - langs = sorted(ltest_index.keys()) - predictions = {l:[] for l in langs} - yte_stacked = {l:[] for l in langs} - batcher.init_offset() - for batch, post, bert_emb, target, lang in tqdm(batcher.batchify(ltest_index, ltest_posteriors, lte_bert, lyte), desc='evaluation: '): - logits = model(batch, post, bert_emb, lang) - loss = criterion(logits, target).item() - prediction = predict(logits) - predictions[lang].append(prediction) - yte_stacked[lang].append(target.detach().cpu().numpy()) - loss_history.append(loss) - - ly = {l:np.vstack(yte_stacked[l]) for l in langs} - ly_ = {l:np.vstack(predictions[l]) for l in langs} - l_eval = evaluate(ly, ly_) - metrics = [] - for lang in langs: - macrof1, microf1, macrok, microk = l_eval[lang] - metrics.append([macrof1, microf1, macrok, microk]) - if measure_prefix == 'te': - print(f'Lang {lang}: macro-F1={macrof1:.3f} micro-F1={microf1:.3f}') - Mf1, mF1, MK, mk = np.mean(np.array(metrics), axis=0) - 
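The train() function above follows the standard PyTorch step: zero the gradients, compute the multi-label loss, backpropagate, clip the gradients, and step the optimizer. A self-contained sketch of one such step (clip_gradient() in the repository is a utility whose exact behaviour is not shown here, so a plain norm clip is used instead):

import torch
import torch.nn as nn

model = nn.Linear(10, 5)                      # stand-in for the multilingual classifier
optim = torch.optim.Adam(model.parameters(), lr=1e-3, weight_decay=0)
criterion = nn.BCEWithLogitsLoss()

x = torch.randn(8, 10)                        # one fake batch of 8 documents
y = torch.randint(0, 2, (8, 5)).float()       # multi-label targets

optim.zero_grad()
loss = criterion(model(x), y)
loss.backward()
torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
optim.step()
print(f'training loss: {loss.item():.4f}')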
print(f'[{measure_prefix}] Averages: MF1, mF1, MK, mK [{Mf1:.5f}, {mF1:.5f}, {MK:.5f}, {mk:.5f}]') - - mean_loss = np.mean(loss_history) - logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-macro-F1', value=Mf1, timelapse=time() - tinit) - logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-micro-F1', value=mF1, timelapse=time() - tinit) - logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-macro-K', value=MK, timelapse=time() - tinit) - logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-micro-K', value=mk, timelapse=time() - tinit) - logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-loss', value=mean_loss, timelapse=time() - tinit) - - return Mf1 - - -# ---------------------------------------------------------------------------------------------------------------------- -def main(): - DEBUGGING = False - - method_name = set_method_name() - logfile = init_logfile(method_name, opt) - - # Loading the dataset - data = MultilingualDataset.load(opt.dataset) - # data.set_view(languages=['it', 'fr']) # Testing with less langs - data.show_dimensions() - langs = data.langs() - l_devel_raw, l_devel_target = data.training(target_as_csr=True) - l_test_raw, l_test_target = data.test(target_as_csr=True) - - # Loading the MUSE pretrained embeddings (only if requested) - lpretrained, lpretrained_vocabulary = load_pretrained_embeddings(opt.we_path, langs) - # lpretrained_vocabulary = none_dict(langs) # do not keep track of words known in pretrained embeddings vocabulary that are also present in test set - - # Data preparation: indexing / splitting / embedding matrices (pretrained + supervised) / posterior probs - multilingual_index = MultilingualIndex() - multilingual_index.index(l_devel_raw, l_devel_target, l_test_raw, lpretrained_vocabulary) - multilingual_index.train_val_split(val_prop=0.2, max_val=2000, seed=opt.seed) - multilingual_index.embedding_matrices(lpretrained, opt.supervised) - if opt.posteriors: - if DEBUGGING: - import pickle - with open('/home/andreapdr/funneling_pdr/dumps/posteriors_jrc_run0.pickle', 'rb') as infile: - data_post = pickle.load(infile) - lPtr = data_post[0] - lPva = data_post[1] - lPte = data_post[2] - print('## DEBUGGING MODE: loaded dumped posteriors for jrc run0') - else: - lPtr, lPva, lPte = multilingual_index.posterior_probabilities(max_training_docs_by_lang=5000) - else: - lPtr, lPva, lPte = None, None, None - - if opt.mbert: - _dataset_path = opt.dataset.split('/')[-1].split('_') - _model_folder = _dataset_path[0] + '_' + _dataset_path[-1].replace('.pickle', '') - # print(f'Model Folder: {_model_folder}') - - if DEBUGGING: - with open('/home/andreapdr/funneling_pdr/dumps/mBert_jrc_run0.pickle', 'rb') as infile: - data_embed = pickle.load(infile) - tr_bert_embeddings = data_embed[0] - va_bert_embeddings = data_embed[1] - te_bert_embeddings = data_embed[2] - print('## DEBUGGING MODE: loaded dumped mBert embeddings for jrc run0') - else: - tr_bert_embeddings, va_bert_embeddings, te_bert_embeddings \ - = multilingual_index.bert_embeddings(f'/home/andreapdr/funneling_pdr/hug_checkpoint/mBERT-{_model_folder}/') - else: - tr_bert_embeddings, va_bert_embeddings, te_bert_embeddings = None, None, None - - # Model initialization - model = init_Net(data.num_categories(), multilingual_index) - - optim = init_optimizer(model, lr=opt.lr) - criterion = torch.nn.BCEWithLogitsLoss().cuda() - lr_scheduler = StepLR(optim, step_size=25, gamma=0.5) - batcher_train = Batch(opt.batch_size, batches_per_epoch=10, languages=langs, lpad=multilingual_index.l_pad()) - 
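The StepLR schedule configured just above halves the learning rate every 25 epochs. A short sketch showing the resulting decay (the dummy parameter and empty loop body are only there to make the snippet runnable):

import torch
from torch.optim.lr_scheduler import StepLR

dummy = [torch.nn.Parameter(torch.zeros(1))]
optim = torch.optim.Adam(dummy, lr=1e-3)
scheduler = StepLR(optim, step_size=25, gamma=0.5)

for epoch in range(1, 101):
    optim.step()                 # a real training epoch would run before this call
    scheduler.step()
    if epoch % 25 == 0:
        print(epoch, optim.param_groups[0]['lr'])  # 0.0005, 0.00025, 0.000125, 6.25e-05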
batcher_eval = Batch(opt.batch_size, batches_per_epoch=-1, languages=langs, lpad=multilingual_index.l_pad()) - - tinit = time() - create_if_not_exist(opt.checkpoint_dir) - early_stop = EarlyStopping(model, optimizer=optim, patience=opt.patience, - checkpoint=f'{opt.checkpoint_dir}/{method_name}-{get_file_name(opt.dataset)}') - - l_train_index, l_train_target = multilingual_index.l_train() - l_val_index, l_val_target = multilingual_index.l_val() - l_test_index = multilingual_index.l_test_index() - - print('-'*80) - print('Start training') - for epoch in range(1, opt.nepochs + 1): - train(model, batcher_train, l_train_index, lPtr, tr_bert_embeddings, l_train_target, tinit, logfile, criterion, optim, epoch, method_name) - lr_scheduler.step() # reduces the learning rate - - # validation - macrof1 = test(model, batcher_eval, l_val_index, lPva, va_bert_embeddings, l_val_target, tinit, epoch, logfile, criterion, 'va') - early_stop(macrof1, epoch) - if opt.test_each>0: - if (opt.plotmode and (epoch==1 or epoch%opt.test_each==0)) or (not opt.plotmode and epoch%opt.test_each==0 and epoch0: - print(f'running last {opt.val_epochs} training epochs on the validation set') - for val_epoch in range(1, opt.val_epochs + 1): - batcher_train.init_offset() - train(model, batcher_train, l_val_index, lPva, va_bert_embeddings, l_val_target, tinit, logfile, criterion, optim, epoch+val_epoch, method_name) - - # final test - print('Training complete: testing') - test(model, batcher_eval, l_test_index, lPte, te_bert_embeddings, l_test_target, tinit, epoch, logfile, criterion, 'te') - - -# ---------------------------------------------------------------------------------------------------------------------- -if __name__ == '__main__': - - parser = argparse.ArgumentParser(description='Neural text classification with Word-Class Embeddings') - parser.add_argument('dataset', type=str, metavar='datasetpath', help=f'path to the pickled dataset') - parser.add_argument('--batch-size', type=int, default=50, metavar='int', help='input batch size (default: 100)') - parser.add_argument('--batch-size-test', type=int, default=250, metavar='int', help='batch size for testing (default: 250)') - parser.add_argument('--nepochs', type=int, default=200, metavar='int', help='number of epochs (default: 200)') - parser.add_argument('--patience', type=int, default=10, metavar='int', help='patience for early-stop (default: 10)') - parser.add_argument('--plotmode', action='store_true', default=False, help='in plot mode executes a long run in order ' - 'to generate enough data to produce trend plots (test-each should be >0. 
This mode is ' - 'used to produce plots, and does not perform an evaluation on the test set.') - parser.add_argument('--hidden', type=int, default=512, metavar='int', help='hidden lstm size (default: 512)') - parser.add_argument('--lr', type=float, default=1e-3, metavar='float', help='learning rate (default: 1e-3)') - parser.add_argument('--weight_decay', type=float, default=0, metavar='float', help='weight decay (default: 0)') - parser.add_argument('--sup-drop', type=float, default=0.5, metavar='[0.0, 1.0]', help='dropout probability for the supervised matrix (default: 0.5)') - parser.add_argument('--seed', type=int, default=1, metavar='int', help='random seed (default: 1)') - parser.add_argument('--svm-max-docs', type=int, default=1000, metavar='int', help='maximum number of documents by ' - 'language used to train the calibrated SVMs (only used if --posteriors is active)') - parser.add_argument('--log-interval', type=int, default=10, metavar='int', help='how many batches to wait before printing training status') - parser.add_argument('--log-file', type=str, default='../log/log.csv', metavar='str', help='path to the log csv file') - parser.add_argument('--test-each', type=int, default=0, metavar='int', help='how many epochs to wait before invoking test (default: 0, only at the end)') - parser.add_argument('--checkpoint-dir', type=str, default='../checkpoint', metavar='str', help='path to the directory containing checkpoints') - parser.add_argument('--net', type=str, default='rnn', metavar='str', help=f'net, one in {allowed_nets}') - parser.add_argument('--pretrained', action='store_true', default=False, help='use MUSE pretrained embeddings') - parser.add_argument('--supervised', action='store_true', default=False, help='use supervised embeddings') - parser.add_argument('--posteriors', action='store_true', default=False, help='concatenate posterior probabilities to doc embeddings') - parser.add_argument('--learnable', type=int, default=0, metavar='int', help='dimension of the learnable embeddings (default 0)') - parser.add_argument('--val-epochs', type=int, default=1, metavar='int', help='number of training epochs to perform on the ' - 'validation set once training is over (default 1)') - parser.add_argument('--we-path', type=str, default='../embeddings', metavar='str', - help=f'path to MUSE pretrained embeddings') - parser.add_argument('--max-label-space', type=int, default=300, metavar='int', help='larger dimension allowed for the ' - 'feature-label embedding (if larger, then PCA with this number of components is applied ' - '(default 300)') - parser.add_argument('--force', action='store_true', default=False, help='do not check if this experiment has already been run') - parser.add_argument('--tunable', action='store_true', default=False, - help='pretrained embeddings are tunable from the beginning (default False, i.e., static)') - parser.add_argument('--mbert', action='store_true', default=False, - help='use mBert embeddings') - - opt = parser.parse_args() - - assert torch.cuda.is_available(), 'CUDA not available' - assert not opt.plotmode or opt.test_each > 0, 'plot mode implies --test-each>0' - # if opt.pickle_dir: opt.pickle_path = join(opt.pickle_dir, f'{opt.dataset}.pickle') - torch.manual_seed(opt.seed) - - main() diff --git a/src/experiment_scripts/main_embeddings_cls.py b/src/experiment_scripts/main_embeddings_cls.py deleted file mode 100644 index 08552d3..0000000 --- a/src/experiment_scripts/main_embeddings_cls.py +++ /dev/null @@ -1,127 +0,0 @@ -import os -from 
dataset_builder import MultilingualDataset -from util.evaluation import * -from optparse import OptionParser -from util.file import exists -from util.results import PolylingualClassificationResults -from util.util import get_learner, get_params - -parser = OptionParser() - -parser.add_option("-d", "--dataset", dest="dataset", - help="Path to the multilingual dataset processed and stored in .pickle format", - default="/home/moreo/CLESA/rcv2/rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle") - -parser.add_option("-o", "--output", dest="output", - help="Result file", type=str, default='./results/results.csv') - -parser.add_option("-e", "--mode-embed", dest="mode_embed", - help="Set the embedding to be used [none, unsupervised, supervised, both]", type=str, default='none') - -parser.add_option("-w", "--we-path", dest="we_path", - help="Path to the polylingual word embeddings", default='/home/andreapdr/CLESA/') - -parser.add_option('-t', "--we-type", dest="we_type", help="Aligned embeddings to use [FastText, MUSE]", type=str, - default='MUSE') - -parser.add_option("-s", "--set_c", dest="set_c",type=float, - help="Set the C parameter", default=1) - -parser.add_option("-c", "--optimc", dest="optimc", action='store_true', - help="Optimize hyperparameters", default=False) - -parser.add_option("-j", "--n_jobs", dest="n_jobs",type=int, - help="Number of parallel jobs (default is -1, all)", default=-1) - -parser.add_option("-p", "--pca", dest="max_labels_S", type=int, - help="If smaller than number of target classes, PCA will be applied to supervised matrix. " - "If set to 0 it will automatically search for the best number of components. " - "If set to -1 it will apply PCA to the vstacked supervised matrix (PCA dim set to 50 atm)", - default=300) - -parser.add_option("-u", "--upca", dest="max_labels_U", type=int, - help="If smaller than Unsupervised Dimension, PCA will be applied to unsupervised matrix." - " If set to 0 it will automatically search for the best number of components", default=300) - -parser.add_option("-l", dest="lang", type=str) - -if __name__ == '__main__': - (op, args) = parser.parse_args() - - assert exists(op.dataset), 'Unable to find file '+str(op.dataset) - assert not (op.set_c != 1. 
and op.optimc), 'Parameter C cannot be defined along with optim_c option' - - dataset_file = os.path.basename(op.dataset) - - results = PolylingualClassificationResults('./results/PLE_results.csv') - - data = MultilingualDataset.load(op.dataset) - data.show_dimensions() - - # data.set_view(languages=['en','it', 'pt', 'sv'], categories=list(range(10))) - # data.set_view(languages=[op.lang]) - # data.set_view(categories=list(range(10))) - lXtr, lytr = data.training() - lXte, lyte = data.test() - - if op.set_c != -1: - meta_parameters = None - else: - meta_parameters = [{'C': [1e3, 1e2, 1e1, 1, 1e-1]}] - - # Embeddings and WCE config - _available_mode = ['none', 'unsupervised', 'supervised', 'both'] - _available_type = ['MUSE', 'FastText'] - assert op.mode_embed in _available_mode, f'{op.mode_embed} not in {_available_mode}' - assert op.we_type in _available_type, f'{op.we_type} not in {_available_type}' - - if op.mode_embed == 'none': - config = {'unsupervised': False, - 'supervised': False, - 'we_type': None} - _config_id = 'None' - elif op.mode_embed == 'unsupervised': - config = {'unsupervised': True, - 'supervised': False, - 'we_type': op.we_type} - _config_id = 'M' - elif op.mode_embed == 'supervised': - config = {'unsupervised': False, - 'supervised': True, - 'we_type': None} - _config_id = 'F' - elif op.mode_embed == 'both': - config = {'unsupervised': True, - 'supervised': True, - 'we_type': op.we_type} - _config_id = 'M+F' - - config['reduction'] = 'PCA' - config['max_label_space'] = op.max_labels_S - config['dim_reduction_unsupervised'] = op.max_labels_U - # config['post_pca'] = op.post_pca - # config['plot_covariance_matrices'] = True - - result_id = dataset_file + 'MLE_andrea' + _config_id + ('_optimC' if op.optimc else '') - - ple = PolylingualEmbeddingsClassifier(wordembeddings_path='/home/andreapdr/CLESA/', - config = config, - learner=get_learner(calibrate=False), - c_parameters=get_params(dense=False), - n_jobs=op.n_jobs) - - print('# Fitting ...') - ple.fit(lXtr, lytr) - - print('# Evaluating ...') - ple_eval = evaluate_method(ple, lXte, lyte) - - metrics = [] - for lang in lXte.keys(): - macrof1, microf1, macrok, microk = ple_eval[lang] - metrics.append([macrof1, microf1, macrok, microk]) - print('Lang %s: macro-F1=%.3f micro-F1=%.3f' % (lang, macrof1, microf1)) - results.add_row('MLE', 'svm', _config_id, config['we_type'], - 'no','no', op.optimc, op.dataset.split('/')[-1], ple.time, - lang, macrof1, microf1, macrok, microk, '') - print('Averages: MF1, mF1, MK, mK', np.mean(np.array(metrics), axis=0)) diff --git a/src/experiment_scripts/main_majorityvoting_cls.py b/src/experiment_scripts/main_majorityvoting_cls.py deleted file mode 100644 index ee5efe5..0000000 --- a/src/experiment_scripts/main_majorityvoting_cls.py +++ /dev/null @@ -1,155 +0,0 @@ -import os -from dataset_builder import MultilingualDataset -# from learning.learners import * -# from learning.learners import FunnellingMultimodal -from learning.transformers import PosteriorProbabilitiesEmbedder, TfidfVectorizerMultilingual, WordClassEmbedder, MuseEmbedder, FeatureSet2Posteriors, Voting -from util.evaluation import * -from optparse import OptionParser -from util.file import exists -from util.results import PolylingualClassificationResults -from sklearn.svm import SVC - -parser = OptionParser() - -# parser.add_option("-d", "--dataset", dest="dataset", -# help="Path to the multilingual dataset processed and stored in .pickle format", -# 
default="../rcv2/rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle") - -parser.add_option("-o", "--output", dest="output", - help="Result file", type=str, default='./results/results.csv') - -parser.add_option("-P", "--probs", dest="posteriors", action='store_true', - help="Add posterior probabilities to the document embedding representation", default=False) - -parser.add_option("-S", "--supervised", dest="supervised", action='store_true', - help="Add supervised (Word-Class Embeddings) to the document embedding representation", default=False) - -parser.add_option("-U", "--pretrained", dest="pretrained", action='store_true', - help="Add pretrained MUSE embeddings to the document embedding representation", default=False) - -parser.add_option("-w", "--we-path", dest="we_path", - help="Path to the MUSE polylingual word embeddings", default='../embeddings') - -parser.add_option("-s", "--set_c", dest="set_c",type=float, - help="Set the C parameter", default=1) - -parser.add_option("-c", "--optimc", dest="optimc", action='store_true', - help="Optimize hyperparameters", default=False) - -parser.add_option("-j", "--n_jobs", dest="n_jobs",type=int, - help="Number of parallel jobs (default is -1, all)", default=-1) - -parser.add_option("-p", "--pca", dest="max_labels_S", type=int, - help="If smaller than number of target classes, PCA will be applied to supervised matrix. ", - default=300) - -parser.add_option("-r", "--remove-pc", dest="sif", action='store_true', - help="Remove common component when computing dot product of word embedding matrices", default=False) - -# parser.add_option("-u", "--upca", dest="max_labels_U", type=int, -# help="If smaller than Unsupervised Dimension, PCA will be applied to unsupervised matrix." -# " If set to 0 it will automatically search for the best number of components", default=300) - -# parser.add_option("-a", dest="post_pca", -# help="If set to True, will apply PCA to the z-space (posterior probabilities stacked along with " -# "embedding space", default=False) - - -def get_learner(calibrate=False, kernel='linear'): - return SVC(kernel=kernel, probability=calibrate, cache_size=1000, C=op.set_c, random_state=1, gamma='auto') - - -def get_params(dense=False): - if not op.optimc: - return None - c_range = [1e4, 1e3, 1e2, 1e1, 1, 1e-1] - kernel = 'rbf' if dense else 'linear' - return [{'kernel': [kernel], 'C': c_range, 'gamma':['auto']}] - -####################################################################################################################### - - -if __name__ == '__main__': - (op, args) = parser.parse_args() - - assert len(args)==1, 'required argument "datapath" missing (path to the pickled dataset)' - dataset = args[0] - - assert exists(dataset), 'Unable to find file '+str(dataset) - assert not (op.set_c != 1. 
and op.optimc), 'Parameter C cannot be defined along with optim_c option' - assert op.posteriors or op.supervised or op.pretrained, 'empty set of document embeddings is not allowed' - - dataset_file = os.path.basename(dataset) - - results = PolylingualClassificationResults(op.output) - - data = MultilingualDataset.load(dataset) - data.show_dimensions() - - lXtr, lytr = data.training() - lXte, lyte = data.test() - - meta_parameters = None if op.set_c != -1 else [{'C': [1, 1e3, 1e2, 1e1, 1e-1]}] - - # result_id = f'{dataset_file}_Prob{op.posteriors}_WCE{op.supervised}(PCA{op.max_labels_S})_MUSE{op.pretrained}{"_optimC" if op.optimc else ""}' - result_id = f'{dataset_file}_ProbPost={op.posteriors}_WCE={op.supervised}(PCA={op.max_labels_S})_' \ - f'MUSE={op.pretrained}_weight={"todo"}_l2={"todo"}_zscore={"todo"}{"_optimC" if op.optimc else ""}' - print(f'{result_id}') - - # text preprocessing - tfidfvectorizer = TfidfVectorizerMultilingual(sublinear_tf=True, use_idf=True) - - lXtr = tfidfvectorizer.fit_transform(lXtr, lytr) - lXte = tfidfvectorizer.transform(lXte) - lV = tfidfvectorizer.vocabulary() - - classifiers = [] - if op.posteriors: - classifiers.append(PosteriorProbabilitiesEmbedder(first_tier_learner=get_learner(calibrate=True), first_tier_parameters=None)) - if op.supervised: - classifiers.append(FeatureSet2Posteriors(WordClassEmbedder(max_label_space=op.max_labels_S))) - if op.pretrained: - classifiers.append(FeatureSet2Posteriors(MuseEmbedder(op.we_path, lV=lV))) - - classifier = Voting(*classifiers) - - print('# Fitting ...') - classifier.fit(lXtr, lytr) - - print('\n# Evaluating ...') - l_eval = evaluate_method(classifier, lXte, lyte) - - # renaming arguments to be printed on log - _id = '' - _id_conf = [op.posteriors, op.supervised, op.pretrained] - _id_name = ['+P', '+W', '+M'] - for i, conf in enumerate(_id_conf): - if conf: - _id += _id_name[i] - _id = _id.lstrip('+') - _dataset_path = dataset.split('/')[-1].split('_') - dataset_id = _dataset_path[0] + _dataset_path[-1] - - metrics = [] - for lang in lXte.keys(): - macrof1, microf1, macrok, microk = l_eval[lang] - metrics.append([macrof1, microf1, macrok, microk]) - print(f'Lang {lang}: macro-F1={macrof1:.3f} micro-F1={microf1:.3f}') - results.add_row(method='Voting', - learner='svm', - optimp=op.optimc, - sif=op.sif, - zscore='todo', - l2='todo', - wescaler='todo', - pca=op.max_labels_S, - id=_id, - dataset=dataset_id, - time='todo', - lang=lang, - macrof1=macrof1, - microf1=microf1, - macrok=macrok, - microk=microk, - notes='') - print('Averages: MF1, mF1, MK, mK', np.mean(np.array(metrics), axis=0)) diff --git a/src/experiment_scripts/main_mbert.py b/src/experiment_scripts/main_mbert.py deleted file mode 100644 index aa44407..0000000 --- a/src/experiment_scripts/main_mbert.py +++ /dev/null @@ -1,390 +0,0 @@ -from dataset_builder import MultilingualDataset -from transformers import BertTokenizer, BertForSequenceClassification, AdamW -from torch.utils.data import Dataset, DataLoader -import numpy as np -import torch -from util.common import predict -from time import time -from util.csv_log import CSVLog -from util.evaluation import evaluate -from util.early_stop import EarlyStopping -from torch.optim.lr_scheduler import StepLR -from sklearn.model_selection import train_test_split -from copy import deepcopy -import argparse -# from torch.utils.tensorboard import SummaryWriter - - -def check_sentences(sentences): - tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased') - for sentence in sentences: - 
converted = [tokenizer._convert_id_to_token(token) for token in sentence.numpy() if token != 0] - print(converted) - return - - -def get_model(n_out): - print('# Initializing model ...') - model = BertForSequenceClassification.from_pretrained('bert-base-multilingual-cased', num_labels=n_out) - return model - - -def set_method_name(): - return 'mBERT' - - -def init_optimizer(model, lr): - # return AdamW(model.parameters(), lr=lr, weight_decay=opt.weight_decay) - no_decay = ['bias', 'LayerNorm.weight'] - optimizer_grouped_parameters = [ - {'params': [p for n, p in model.named_parameters() - if not any(nd in n for nd in no_decay)], - 'weight_decay': opt.weight_decay}, - {'params': [p for n, p in model.named_parameters() - if any(nd in n for nd in no_decay)], - 'weight_decay': opt.weight_decay} - ] - optimizer = AdamW(optimizer_grouped_parameters, lr=lr) - return optimizer - - -def init_logfile(method_name, opt): - logfile = CSVLog(opt.log_file, ['dataset', 'method', 'epoch', 'measure', 'value', 'run', 'timelapse']) - logfile.set_default('dataset', opt.dataset) - logfile.set_default('run', opt.seed) - logfile.set_default('method', method_name) - assert opt.force or not logfile.already_calculated(), f'results for dataset {opt.dataset} method {method_name} ' \ - f'and run {opt.seed} already calculated' - return logfile - - -def get_lr(optimizer): - for param_group in optimizer.param_groups: - return param_group['lr'] - - -def get_dataset_name(datapath): - possible_splits = [str(i) for i in range(10)] - splitted = datapath.split('_') - id_split = splitted[-1].split('.')[0][-1] - if id_split in possible_splits: - dataset_name = splitted[0].split('/')[-1] - return f'{dataset_name}_run{id_split}' - elif splitted[-2].split('.')[0] == 'full': - dataset_name = splitted[0].split('/')[-1] - return f'{dataset_name}_fullrun' - - -def load_datasets(datapath): - data = MultilingualDataset.load(datapath) - # data.set_view(languages=['it']) #, categories=[0, 1, 2, 3, 4]) # Testing with less langs - data.show_dimensions() - - l_devel_raw, l_devel_target = data.training(target_as_csr=False) - l_test_raw, l_test_target = data.test(target_as_csr=False) - - return l_devel_raw, l_devel_target, l_test_raw, l_test_target - - -def do_tokenization(l_dataset, max_len=512, verbose=True): - if verbose: - print('# Starting Tokenization ...') - tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased') - langs = l_dataset.keys() - l_tokenized = {} - for lang in langs: - l_tokenized[lang] = tokenizer(l_dataset[lang], - truncation=True, - max_length=max_len, - padding='max_length') - return l_tokenized - - -class TrainingDataset(Dataset): - """ - data: dict of lang specific tokenized data - labels: dict of lang specific targets - """ - - def __init__(self, data, labels): - self.langs = data.keys() - self.lang_ids = {lang: identifier for identifier, lang in enumerate(self.langs)} - - for i, lang in enumerate(self.langs): - _data = data[lang]['input_ids'] - _data = np.array(_data) - _labels = labels[lang] - _lang_value = np.full(len(_data), self.lang_ids[lang]) - - if i == 0: - self.data = _data - self.labels = _labels - self.lang_index = _lang_value - else: - self.data = np.vstack((self.data, _data)) - self.labels = np.vstack((self.labels, _labels)) - self.lang_index = np.concatenate((self.lang_index, _lang_value)) - - def __len__(self): - return len(self.data) - - def __getitem__(self, idx): - x = self.data[idx] - y = self.labels[idx] - lang = self.lang_index[idx] - - return x, torch.tensor(y, 
dtype=torch.float), lang - - def get_lang_ids(self): - return self.lang_ids - - def get_nclasses(self): - if hasattr(self, 'labels'): - return len(self.labels[0]) - else: - print('Method called before init!') - - -def freeze_encoder(model): - for param in model.base_model.parameters(): - param.requires_grad = False - return model - - -def check_param_grad_status(model): - print('#' * 50) - print('Model paramater status:') - for name, child in model.named_children(): - trainable = False - for param in child.parameters(): - if param.requires_grad: - trainable = True - if not trainable: - print(f'{name} is frozen') - else: - print(f'{name} is not frozen') - print('#' * 50) - - -def train(model, train_dataloader, epoch, criterion, optim, method_name, tinit, logfile, writer): - _dataset_path = opt.dataset.split('/')[-1].split('_') - dataset_id = _dataset_path[0] + _dataset_path[-1] - - loss_history = [] - model.train() - - for idx, (batch, target, lang_idx) in enumerate(train_dataloader): - optim.zero_grad() - out = model(batch.cuda()) - logits = out[0] - loss = criterion(logits, target.cuda()) - loss.backward() - # clip_gradient(model) - optim.step() - loss_history.append(loss.item()) - - if writer is not None: - _n_step = (epoch - 1) * (len(train_dataloader)) + idx - writer.add_scalar('Loss_step/Train', loss, _n_step) - - # Check tokenized sentences consistency - # check_sentences(batch.cpu()) - - if idx % opt.log_interval == 0: - interval_loss = np.mean(loss_history[-opt.log_interval:]) - print( - f'{dataset_id} {method_name} Epoch: {epoch}, Step: {idx}, lr={get_lr(optim):.5f}, Training Loss: {interval_loss:.6f}') - - mean_loss = np.mean(interval_loss) - logfile.add_row(epoch=epoch, measure='tr_loss', value=mean_loss, timelapse=time() - tinit) - return mean_loss - - -def test(model, test_dataloader, lang_ids, tinit, epoch, logfile, criterion, measure_prefix, writer): - print('# Validating model ...') - loss_history = [] - model.eval() - langs = lang_ids.keys() - id_2_lang = {v: k for k, v in lang_ids.items()} - predictions = {l: [] for l in langs} - yte_stacked = {l: [] for l in langs} - - for batch, target, lang_idx in test_dataloader: - out = model(batch.cuda()) - logits = out[0] - loss = criterion(logits, target.cuda()).item() - prediction = predict(logits) - loss_history.append(loss) - - # Assigning prediction to dict in predictions and yte_stacked according to lang_idx - for i, pred in enumerate(prediction): - lang_pred = id_2_lang[lang_idx.numpy()[i]] - predictions[lang_pred].append(pred) - yte_stacked[lang_pred].append(target[i].detach().cpu().numpy()) - - ly = {l: np.vstack(yte_stacked[l]) for l in langs} - ly_ = {l: np.vstack(predictions[l]) for l in langs} - l_eval = evaluate(ly, ly_) - metrics = [] - for lang in langs: - macrof1, microf1, macrok, microk = l_eval[lang] - metrics.append([macrof1, microf1, macrok, microk]) - if measure_prefix == 'te': - print(f'Lang {lang}: macro-F1={macrof1:.3f} micro-F1={microf1:.3f}') - Mf1, mF1, MK, mk = np.mean(np.array(metrics), axis=0) - print(f'[{measure_prefix}] Averages: MF1, mF1, MK, mK [{Mf1:.5f}, {mF1:.5f}, {MK:.5f}, {mk:.5f}]') - if writer is not None: - writer.add_scalars('Eval Metrics', {'Mf1': Mf1, 'mF1': mF1, 'MK': MK, 'mk':mk}, epoch) - - mean_loss = np.mean(loss_history) - logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-macro-F1', value=Mf1, timelapse=time() - tinit) - logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-micro-F1', value=mF1, timelapse=time() - tinit) - logfile.add_row(epoch=epoch, 
measure=f'{measure_prefix}-macro-K', value=MK, timelapse=time() - tinit) - logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-micro-K', value=mk, timelapse=time() - tinit) - logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-loss', value=mean_loss, timelapse=time() - tinit) - - return Mf1 - - -def get_tr_val_split(l_tokenized_tr, l_devel_target, val_prop, max_val, seed): - l_split_va = deepcopy(l_tokenized_tr) - l_split_val_target = {l: [] for l in l_tokenized_tr.keys()} - l_split_tr = deepcopy(l_tokenized_tr) - l_split_tr_target = {l: [] for l in l_tokenized_tr.keys()} - - for lang in l_tokenized_tr.keys(): - val_size = int(min(len(l_tokenized_tr[lang]['input_ids']) * val_prop, max_val)) - l_split_tr[lang]['input_ids'], l_split_va[lang]['input_ids'], l_split_tr_target[lang], l_split_val_target[ - lang] = \ - train_test_split(l_tokenized_tr[lang]['input_ids'], l_devel_target[lang], test_size=val_size, - random_state=seed, shuffle=True) - - return l_split_tr, l_split_tr_target, l_split_va, l_split_val_target - - -def main(): - print('Running main ...') - - DATAPATH = opt.dataset - MAX_LEN = 512 - method_name = set_method_name() - logfile = init_logfile(method_name, opt) - - l_devel_raw, l_devel_target, l_test_raw, l_test_target = load_datasets(DATAPATH) - l_tokenized_tr = do_tokenization(l_devel_raw, max_len=MAX_LEN) - - l_split_tr, l_split_tr_target, l_split_va, l_split_val_target = get_tr_val_split(l_tokenized_tr, l_devel_target, - val_prop=0.2, max_val=2000, - seed=opt.seed) - - l_tokenized_te = do_tokenization(l_test_raw, max_len=MAX_LEN) - - tr_dataset = TrainingDataset(l_split_tr, l_split_tr_target) - va_dataset = TrainingDataset(l_split_va, l_split_val_target) - te_dataset = TrainingDataset(l_tokenized_te, l_test_target) - - tr_dataloader = DataLoader(tr_dataset, batch_size=4, shuffle=True) - va_dataloader = DataLoader(va_dataset, batch_size=2, shuffle=True) - te_dataloader = DataLoader(te_dataset, batch_size=2, shuffle=False) - - - # Initializing model - nC = tr_dataset.get_nclasses() - model = get_model(nC) - model = model.cuda() - criterion = torch.nn.BCEWithLogitsLoss().cuda() - optim = init_optimizer(model, lr=opt.lr) - lr_scheduler = StepLR(optim, step_size=25, gamma=0.1) - early_stop = EarlyStopping(model, optimizer=optim, patience=opt.patience, - checkpoint=f'/home/andreapdr/funneling_pdr/hug_checkpoint/{method_name}-{get_dataset_name(opt.dataset)}', - is_bert=True) - - # Freezing encoder - # model = freeze_encoder(model) - check_param_grad_status(model) - - # Tensorboard logger - # writer = SummaryWriter('../log/tensorboard_logs/') - - # Training loop - tinit = time() - lang_ids = va_dataset.lang_ids - for epoch in range(1, opt.nepochs + 1): - print('# Start Training ...') - train(model, tr_dataloader, epoch, criterion, optim, method_name, tinit, logfile, writer=None) - lr_scheduler.step() # reduces the learning rate - - # Validation - macrof1 = test(model, va_dataloader, lang_ids, tinit, epoch, logfile, criterion, 'va', writer=None) - early_stop(macrof1, epoch) - if opt.test_each > 0: - if (opt.plotmode and (epoch == 1 or epoch % opt.test_each == 0)) or ( - not opt.plotmode and epoch % opt.test_each == 0 and epoch < opt.nepochs): - test(model, te_dataloader, lang_ids, tinit, epoch, logfile, criterion, 'te', writer=None) - - if early_stop.STOP: - print('[early-stop] STOP') - if not opt.plotmode: - break - - if not opt.plotmode: - print('-' * 80) - print('Training over. 
Performing final evaluation') - - model = early_stop.restore_checkpoint() - model = model.cuda() - - if opt.val_epochs > 0: - print(f'running last {opt.val_epochs} training epochs on the validation set') - for val_epoch in range(1, opt.val_epochs + 1): - train(model, va_dataloader, epoch + val_epoch, criterion, optim, method_name, tinit, logfile, writer=None) - - # final test - print('Training complete: testing') - test(model, te_dataloader, lang_ids, tinit, epoch, logfile, criterion, 'te', writer=None) - - # writer.flush() - # writer.close() - exit('Code Executed!') - - -if __name__ == '__main__': - parser = argparse.ArgumentParser(description='Neural text classification with Word-Class Embeddings - mBert model') - - parser.add_argument('--dataset', type=str, - default='/home/moreo/CLESA/rcv2/rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle', - metavar='datasetpath', help=f'path to the pickled dataset') - parser.add_argument('--nepochs', type=int, default=200, metavar='int', - help='number of epochs (default: 200)') - parser.add_argument('--lr', type=float, default=2e-5, metavar='float', - help='learning rate (default: 2e-5)') - parser.add_argument('--weight_decay', type=float, default=0, metavar='float', - help='weight decay (default: 0)') - parser.add_argument('--patience', type=int, default=10, metavar='int', - help='patience for early-stop (default: 10)') - parser.add_argument('--log-interval', type=int, default=20, metavar='int', - help='how many batches to wait before printing training status') - parser.add_argument('--log-file', type=str, default='../log/log_mBert.csv', metavar='str', - help='path to the log csv file') - parser.add_argument('--seed', type=int, default=1, metavar='int', help='random seed (default: 1)') - parser.add_argument('--force', action='store_true', default=False, - help='do not check if this experiment has already been run') - parser.add_argument('--checkpoint-dir', type=str, default='../checkpoint', metavar='str', - help='path to the directory containing checkpoints') - parser.add_argument('--plotmode', action='store_true', default=False, - help='in plot mode executes a long run in order ' - 'to generate enough data to produce trend plots (test-each should be >0. This mode is ' - 'used to produce plots, and does not perform an evaluation on the test set.') - parser.add_argument('--test-each', type=int, default=0, metavar='int', - help='how many epochs to wait before invoking test (default: 0, only at the end)') - parser.add_argument('--val-epochs', type=int, default=1, metavar='int', - help='number of training epochs to perform on the validation set once training is over (default 1)') - opt = parser.parse_args() - - # Testing different parameters ... 
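# NOTE: the hard-coded assignments below supersede both the argparse defaults declared above and
# any values passed on the command line (e.g. by the run_mbert_*.sh scripts) for weight_decay, lr and patience.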
- opt.weight_decay = 0.01 - opt.lr = 1e-5 - opt.patience = 5 - - main() - # TODO: refactor .cuda() -> .to(device) in order to check if the process is faster on CPU given the bigger batch size diff --git a/src/experiment_scripts/main_mbert_extractor.py b/src/experiment_scripts/main_mbert_extractor.py deleted file mode 100644 index 16f09d3..0000000 --- a/src/experiment_scripts/main_mbert_extractor.py +++ /dev/null @@ -1,110 +0,0 @@ -from experiment_scripts.main_mbert import * -import pickle - - -class ExtractorDataset(Dataset): - """ - data: dict of lang specific tokenized data - labels: dict of lang specific targets - """ - - def __init__(self, data): - self.langs = data.keys() - self.lang_ids = {lang: identifier for identifier, lang in enumerate(self.langs)} - - for i, lang in enumerate(self.langs): - _data = data[lang]['input_ids'] - _data = np.array(_data) - _lang_value = np.full(len(_data), self.lang_ids[lang]) - - if i == 0: - self.data = _data - self.lang_index = _lang_value - else: - self.data = np.vstack((self.data, _data)) - self.lang_index = np.concatenate((self.lang_index, _lang_value)) - - def __len__(self): - return len(self.data) - - def __getitem__(self, idx): - x = self.data[idx] - lang = self.lang_index[idx] - - return x, lang - - def get_lang_ids(self): - return self.lang_ids - - -def feature_extractor(data, lang_ids, model_path='/home/andreapdr/funneling_pdr/hug_checkpoint/mBERT-jrc_run0/'): - print('# Feature Extractor Mode...') - from transformers import BertConfig - config = BertConfig.from_pretrained('bert-base-multilingual-cased', output_hidden_states=True, num_labels=300) - model = BertForSequenceClassification.from_pretrained(model_path, - config=config).cuda() - - """ - Hidden State = Tuple of torch.FloatTensor (one for the output of the embeddings + one for - the output of each layer) of shape (batch_size, sequence_length, hidden_size) - """ - all_batch_embeddings = {} - id2lang = {v:k for k,v in lang_ids.items()} - with torch.no_grad(): - for batch, target, lang_idx in data: - out = model(batch.cuda()) - last_hidden_state = out[1][-1] - batch_embeddings = last_hidden_state[:, 0, :] - for i, l_idx in enumerate(lang_idx.numpy()): - if id2lang[l_idx] not in all_batch_embeddings.keys(): - all_batch_embeddings[id2lang[l_idx]] = batch_embeddings[i].detach().cpu().numpy() - else: - all_batch_embeddings[id2lang[l_idx]] = np.vstack((all_batch_embeddings[id2lang[l_idx]], - batch_embeddings[i].detach().cpu().numpy())) - - return all_batch_embeddings, id2lang - - -def main(): - print('Running main ...') - print(f'Model path: {opt.modelpath}\nDataset path: {opt.dataset}') - DATAPATH = opt.dataset - MAX_LEN = 512 - - l_devel_raw, l_devel_target, l_test_raw, l_test_target = load_datasets(DATAPATH) - l_tokenized_tr = do_tokenization(l_devel_raw, max_len=MAX_LEN) - l_tokenized_te = do_tokenization(l_test_raw, max_len=MAX_LEN) - - tr_dataset = TrainingDataset(l_tokenized_tr, l_devel_target) - tr_lang_ids = tr_dataset.lang_ids - - te_dataset = TrainingDataset(l_tokenized_te, l_test_target) - te_lang_ids = te_dataset.lang_ids - - tr_dataloader = DataLoader(tr_dataset, batch_size=64, shuffle=False) # Shuffle False to extract doc embeddings - te_dataloader = DataLoader(te_dataset, batch_size=64, shuffle=False) # Shuffle False to extract doc - - tr_all_batch_embeddings, id2lang_tr = feature_extractor(tr_dataloader, tr_lang_ids, opt.modelpath) # Extracting doc embed for devel - with open(f'{opt.modelpath}/TR_embed_{get_dataset_name(opt.dataset)}.pkl', 'wb') as outfile: - 
pickle.dump((tr_all_batch_embeddings, id2lang_tr), outfile) - - te_all_batch_embeddings, id2lang_te = feature_extractor(te_dataloader, te_lang_ids, opt.modelpath) # Extracting doc embed for test - with open(f'{opt.modelpath}/TE_embed_{get_dataset_name(opt.dataset)}.pkl', 'wb') as outfile: - pickle.dump((te_all_batch_embeddings, id2lang_te), outfile) - - exit('Extraction completed!') - - -if __name__ == '__main__': - parser = argparse.ArgumentParser(description='mBert model document embedding extractor') - - parser.add_argument('--dataset', type=str, - default='/home/moreo/CLESA/rcv2/rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle', - metavar='datasetpath', help=f'path to the pickled dataset') - parser.add_argument('--seed', type=int, default=1, metavar='int', help='random seed (default: 1)') - parser.add_argument('--modelpath', type=str, default='/home/andreapdr/funneling_pdr/hug_checkpoint/mBERT-jrc_run0', - metavar='modelpath', help=f'path to pre-trained mBert model') - opt = parser.parse_args() - - main() - diff --git a/src/experiment_scripts/main_qualitative_analysis.py b/src/experiment_scripts/main_qualitative_analysis.py deleted file mode 100644 index aead994..0000000 --- a/src/experiment_scripts/main_qualitative_analysis.py +++ /dev/null @@ -1,49 +0,0 @@ -import os -from dataset_builder import MultilingualDataset -from optparse import OptionParser -from util.file import exists -import numpy as np -from sklearn.feature_extraction.text import CountVectorizer - -parser = OptionParser(usage="usage: %prog datapath [options]") - -(op, args) = parser.parse_args() -assert len(args)==1, 'required argument "datapath" missing (path to the pickled dataset)' -dataset = args[0] -assert exists(dataset), 'Unable to find file '+str(dataset) - -dataset_file = os.path.basename(dataset) - -data = MultilingualDataset.load(dataset) -data.set_view(languages=['it']) -data.show_dimensions() -lXtr, lytr = data.training() -lXte, lyte = data.test() - -vect_lXtr = dict() -vectorizer = CountVectorizer() -vect_lXtr['it'] = vectorizer.fit_transform(lXtr['it']) -# print(type(vect_lXtr['it'])) - -corr = vect_lXtr['it'].T.dot(lytr['it']) -# print(corr.shape) -sum_correlated_class = corr.sum(axis=0) -print(len(sum_correlated_class)) -print(sum_correlated_class.max()) - - -w2idx = vectorizer.vocabulary_ -idx2w = {v:k for k,v in w2idx.items()} - -word_tot_corr = corr.sum(axis=1) -print(word_tot_corr.shape) -dict_word_tot_corr = {v:k for k,v in enumerate(word_tot_corr)} - -sorted_word_tot_corr = np.sort(word_tot_corr) -sorted_word_tot_corr = sorted_word_tot_corr[len(sorted_word_tot_corr)-200:] - -top_idx = [dict_word_tot_corr[k] for k in sorted_word_tot_corr] -print([idx2w[idx] for idx in top_idx]) -print([elem for elem in top_idx]) -print(corr[8709]) -print('Finished...') \ No newline at end of file diff --git a/src/experiment_scripts/run_combinations_jrc.sh b/src/experiment_scripts/run_combinations_jrc.sh deleted file mode 100644 index a4aabde..0000000 --- a/src/experiment_scripts/run_combinations_jrc.sh +++ /dev/null @@ -1,34 +0,0 @@ -#!/usr/bin/env bash - -dataset=/home/moreo/CLESA/jrc_acquis/jrc_doclist_1958-2005vs2006_all_top300_noparallel_processed_run0.pickle -logfile=./results/final_combinations_jrc.csv -#A.2: ensembling feature sets (combinations of posteriors, wce, muse): -# - exploring different ways of putting different feature sets together: concatenation, FeatureSetToPosteriors, averaging, voting, etc... 
-# (no one seems to improve over standard funnelling [the improved version after A.1] with posteriors probabilities...) - -# aggregation=concatenation -#python main_gFun.py $dataset -o $logfile -P -U -r -z --l2 -#python main_gFun.py $dataset -o $logfile -P -S -r -z --l2 -#python main_gFun.py $dataset -o $logfile -U -S -r -z --l2 -#python main_gFun.py $dataset -o $logfile -P -U -S -r -z --l2 -# - -##FeatureSetToPosteriors (aggregation mean) -python main_multimodal_cls.py $dataset -o $logfile -P -U -r -a -z --l2 --allprob -python main_multimodal_cls.py $dataset -o $logfile -P -S -r -a -z --l2 --allprob -python main_multimodal_cls.py $dataset -o $logfile -U -S -r -a -z --l2 --allprob -python main_multimodal_cls.py $dataset -o $logfile -P -U -S -r -a -z --l2 --allprob - -##FeatureSetToPosteriors -#python main_gFun.py $dataset -o $logfile -P -U -r -z --l2 --allprob -#python main_gFun.py $dataset -o $logfile -P -S -r -z --l2 --allprob -#python main_gFun.py $dataset -o $logfile -U -S -r -z --l2 --allprob -#python main_gFun.py $dataset -o $logfile -P -U -S -r -z --l2 --allprob - -#MajorityVoting -#python main_majorityvoting_cls.py $dataset -o $logfile -P -U -r -#python main_majorityvoting_cls.py $dataset -o $logfile -P -S -r -#python main_majorityvoting_cls.py $dataset -o $logfile -U -S -r -#python main_majorityvoting_cls.py $dataset -o $logfile -P -U -S -r - - diff --git a/src/experiment_scripts/run_combinations_rcv.sh b/src/experiment_scripts/run_combinations_rcv.sh deleted file mode 100644 index 4e1acfb..0000000 --- a/src/experiment_scripts/run_combinations_rcv.sh +++ /dev/null @@ -1,31 +0,0 @@ -#!/usr/bin/env bash - -dataset=/home/moreo/CLESA/rcv2/rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle -logfile=./results/final_combinations_rcv.csv -#A.2: ensembling feature sets (combinations of posteriors, wce, muse): -# - exploring different ways of putting different feature sets together: concatenation, FeatureSetToPosteriors, averaging, voting, etc... -# (no one seems to improve over standard funnelling [the improved version after A.1] with posteriors probabilities...) 
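# Flag summary for main_multimodal_cls.py (meanings inferred from the option parsers elsewhere in this repo):
#   -P / -S / -U   select the posterior-probability, WCE (supervised) and MUSE (pretrained) views
#   -r             remove the common principal component (SIF) from the embedding-based views
#   -a             aggregate the per-view outputs by averaging (the 'aggregation mean' variant)
#   -z / --l2      z-score standardization / L2 normalization of the document representations
#   --allprob      recast every view into posterior probabilities (FeatureSetToPosteriors) before combining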
- -# aggregation=concatenation -#python main_gFun.py $dataset -o $logfile -P -U -r -z --l2 -#python main_gFun.py $dataset -o $logfile -P -S -r -z --l2 -#python main_gFun.py $dataset -o $logfile -U -S -r -z --l2 -#python main_gFun.py $dataset -o $logfile -P -U -S -r -z --l2 -# -##FeatureSetToPosteriors (aggregation mean) -python main_multimodal_cls.py $dataset -o $logfile -P -U -r -a -z --l2 --allprob -python main_multimodal_cls.py $dataset -o $logfile -P -S -r -a -z --l2 --allprob -python main_multimodal_cls.py $dataset -o $logfile -U -S -r -a -z --l2 --allprob -python main_multimodal_cls.py $dataset -o $logfile -P -U -S -r -a -z --l2 --allprob - -##FeatureSetToPosteriors -#python main_gFun.py $dataset -o $logfile -P -U -r -z --l2 --allprob -#python main_gFun.py $dataset -o $logfile -P -S -r -z --l2 --allprob -#python main_gFun.py $dataset -o $logfile -U -S -r -z --l2 --allprob -#python main_gFun.py $dataset -o $logfile -P -U -S -r -z --l2 --allprob - -#MajorityVoting -#python main_majorityvoting_cls.py $dataset -o $logfile -P -U -r -#python main_majorityvoting_cls.py $dataset -o $logfile -P -S -r -#python main_majorityvoting_cls.py $dataset -o $logfile -U -S -r -#python main_majorityvoting_cls.py $dataset -o $logfile -P -U -S -r \ No newline at end of file diff --git a/src/experiment_scripts/run_dl_jrc.sh b/src/experiment_scripts/run_dl_jrc.sh deleted file mode 100644 index 1d28e83..0000000 --- a/src/experiment_scripts/run_dl_jrc.sh +++ /dev/null @@ -1,31 +0,0 @@ -#!/usr/bin/env bash - -logfile=../log/log_pre_jrc.csv -dataset=/home/moreo/CLESA/jrc_acquis/jrc_doclist_1958-2005vs2006_all_top300_noparallel_processed_run0.pickle -python main_deep_learning.py $dataset --log-file $logfile --pretrained --plotmode --test-each 20 -python main_deep_learning.py $dataset --log-file $logfile --pretrained --tunable --plotmode --test-each 20 -python main_deep_learning.py $dataset --log-file $logfile --pretrained --hidden 128 --plotmode --test-each 20 -python main_deep_learning.py $dataset --log-file $logfile --pretrained --hidden 128 --tunable --plotmode --test-each 20 -python main_deep_learning.py $dataset --log-file $logfile --pretrained --hidden 256 --plotmode --test-each 20 -python main_deep_learning.py $dataset --log-file $logfile --pretrained --hidden 256 --tunable --plotmode --test-each 20 - -python main_deep_learning.py $dataset --log-file $logfile --supervised --plotmode --test-each 20 -python main_deep_learning.py $dataset --log-file $logfile --supervised --tunable --plotmode --test-each 20 -python main_deep_learning.py $dataset --log-file $logfile --supervised --hidden 128 --plotmode --test-each 20 -python main_deep_learning.py $dataset --log-file $logfile --supervised --hidden 128 --tunable --plotmode --test-each 20 -python main_deep_learning.py $dataset --log-file $logfile --supervised --hidden 256 --plotmode --test-each 20 -python main_deep_learning.py $dataset --log-file $logfile --supervised --hidden 256 --tunable --plotmode --test-each 20 - -python main_deep_learning.py $dataset --log-file $logfile --pretrained --supervised --plotmode --test-each 20 -python main_deep_learning.py $dataset --log-file $logfile --pretrained --supervised --tunable --plotmode --test-each 20 -python main_deep_learning.py $dataset --log-file $logfile --pretrained --supervised --hidden 128 --plotmode --test-each 20 -python main_deep_learning.py $dataset --log-file $logfile --pretrained --supervised --hidden 128 --tunable --plotmode --test-each 20 -python main_deep_learning.py $dataset --log-file $logfile 
--pretrained --supervised --hidden 256 --plotmode --test-each 20 -python main_deep_learning.py $dataset --log-file $logfile --pretrained --supervised --hidden 256 --tunable --plotmode --test-each 20 - -python main_deep_learning.py $dataset --log-file $logfile --pretrained --supervised --posteriors --plotmode --test-each 20 -python main_deep_learning.py $dataset --log-file $logfile --pretrained --supervised --posteriors --tunable --plotmode --test-each 20 -python main_deep_learning.py $dataset --log-file $logfile --pretrained --supervised --posteriors --hidden 128 --plotmode --test-each 20 -python main_deep_learning.py $dataset --log-file $logfile --pretrained --supervised --posteriors --hidden 128 --tunable --plotmode --test-each 20 -python main_deep_learning.py $dataset --log-file $logfile --pretrained --supervised --posteriors --hidden 256 --plotmode --test-each 20 -python main_deep_learning.py $dataset --log-file $logfile --pretrained --supervised --posteriors --hidden 256 --tunable --plotmode --test-each 20 \ No newline at end of file diff --git a/src/experiment_scripts/run_dl_rcv.sh b/src/experiment_scripts/run_dl_rcv.sh deleted file mode 100644 index 4782887..0000000 --- a/src/experiment_scripts/run_dl_rcv.sh +++ /dev/null @@ -1,30 +0,0 @@ -#!/usr/bin/env bash - -dataset=/home/moreo/CLESA/rcv2/rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle -python main_deep_learning.py $dataset --pretrained --plotmode --test-each 20 -python main_deep_learning.py $dataset --pretrained --tunable --plotmode --test-each 20 -python main_deep_learning.py $dataset --pretrained --hidden 128 --plotmode --test-each 20 -python main_deep_learning.py $dataset --pretrained --hidden 128 --tunable --plotmode --test-each 20 -python main_deep_learning.py $dataset --pretrained --hidden 256 --plotmode --test-each 20 -python main_deep_learning.py $dataset --pretrained --hidden 256 --tunable --plotmode --test-each 20 - -python main_deep_learning.py $dataset --supervised --plotmode --test-each 20 -python main_deep_learning.py $dataset --supervised --tunable --plotmode --test-each 20 -python main_deep_learning.py $dataset --supervised --hidden 128 --plotmode --test-each 20 -python main_deep_learning.py $dataset --supervised --hidden 128 --tunable --plotmode --test-each 20 -python main_deep_learning.py $dataset --supervised --hidden 256 --plotmode --test-each 20 -python main_deep_learning.py $dataset --supervised --hidden 256 --tunable --plotmode --test-each 20 - -python main_deep_learning.py $dataset --pretrained --supervised --plotmode --test-each 20 -python main_deep_learning.py $dataset --pretrained --supervised --tunable --plotmode --test-each 20 -python main_deep_learning.py $dataset --pretrained --supervised --hidden 128 --plotmode --test-each 20 -python main_deep_learning.py $dataset --pretrained --supervised --hidden 128 --tunable --plotmode --test-each 20 -python main_deep_learning.py $dataset --pretrained --supervised --hidden 256 --plotmode --test-each 20 -python main_deep_learning.py $dataset --pretrained --supervised --hidden 256 --tunable --plotmode --test-each 20 - -python main_deep_learning.py $dataset --pretrained --supervised --posteriors --plotmode --test-each 20 -python main_deep_learning.py $dataset --pretrained --supervised --posteriors --tunable --plotmode --test-each 20 -python main_deep_learning.py $dataset --pretrained --supervised --posteriors --hidden 128 --plotmode --test-each 20 -python main_deep_learning.py $dataset --pretrained --supervised --posteriors --hidden 128 --tunable 
--plotmode --test-each 20 -python main_deep_learning.py $dataset --pretrained --supervised --posteriors --hidden 256 --plotmode --test-each 20 -python main_deep_learning.py $dataset --pretrained --supervised --posteriors --hidden 256 --tunable --plotmode --test-each 20 \ No newline at end of file diff --git a/src/experiment_scripts/run_fulljrc_dl.sh b/src/experiment_scripts/run_fulljrc_dl.sh deleted file mode 100644 index 4d5eeaa..0000000 --- a/src/experiment_scripts/run_fulljrc_dl.sh +++ /dev/null @@ -1,16 +0,0 @@ -dataset=/home/moreo/CLESA/jrc_acquis/jrc_doclist_1958-2005vs2006_all_top300_full_processed.pickle -seeds='5' #2 3 4 5 6 7 8 9 10' -for seed in $seeds -do - #python main_deep_learning.py $dataset --log-file ../log/jrc_fullrun_wce.csv --supervised --seed $seed - #python main_deep_learning.py $dataset --log-file ../log/jrc_fullrun_wce_trainable.csv --supervised --tunable --seed $seed - python main_deep_learning.py $dataset --log-file ../log/jrc_fullrun_post_wce_muse_static.csv --posteriors --supervised --pretrained --seed $seed --force - - #python main_deep_learning.py $dataset --log-file ../log/jrc_fullrun_muse.csv --pretrained --seed $seed - #python main_deep_learning.py $dataset --log-file ../log/jrc_fullrun_muse_trainable.csv --pretrained --tunable --seed $seed - - #python main_deep_learning.py $dataset --log-file ../log/jrc_fullrun_wce_muse.csv --supervised --pretrained --seed $seed - #python main_deep_learning.py $dataset --log-file ../log/jrc_fullrun_wce_muse_trainable40000.csv --supervised --pretrained --tunable --seed $seed - #python main_deep_learning.py $dataset --log-file ../log/jrc_fullrun_post_wce_muse_trainable.csv --posteriors --supervised --pretrained --tunable --seed $seed --force - -done \ No newline at end of file diff --git a/src/experiment_scripts/run_fullrcv_dl.sh b/src/experiment_scripts/run_fullrcv_dl.sh deleted file mode 100644 index 5894aef..0000000 --- a/src/experiment_scripts/run_fullrcv_dl.sh +++ /dev/null @@ -1,20 +0,0 @@ -dataset=/home/moreo/CLESA/rcv2/rcv1-2_doclist_full_processed.pickle -seeds='1 ' #2 3 4 5' # 6 7 8 9 10' -for seed in $seeds -do - #python main_deep_learning.py $dataset --log-file ../log/rcv_fullrun_wce.csv --supervised --seed $seed - #python main_deep_learning.py $dataset --log-file ../log/rcv_fullrun_wce_trainable.csv --supervised --tunable --seed $seed - python main_deep_learning.py $dataset --log-file ../log/rcv_fullrun_post_wce_muse_static_plotmode.csv --posteriors --supervised --pretrained --seed $seed --plotmode --test-each 200 - - - - #python main_deep_learning.py $dataset --log-file ../log/rcv_fullrun_muse.csv --pretrained --seed $seed - #python main_deep_learning.py $dataset --log-file ../log/rcv_fullrun_muse_trainable.csv --pretrained --tunable --seed $seed - - #python main_deep_learning.py $dataset --log-file ../log/rcv_fullrun_wce_muse.csv --supervised --pretrained --seed $seed - #python main_deep_learning.py $dataset --log-file ../log/rcv_fullrun_wce_muse_trainable.csv --supervised --pretrained --tunable --seed $seed - -# python main_deep_learning.py $dataset --log-file ../log/rcv_fullrun_post_wce_muse_static.csv --posteriors --supervised --pretrained --seed $seed -# python main_deep_learning.py $dataset --log-file ../log/rcv_fullrun_post_wce_muse_trainable_plotmode.csv --posteriors --supervised --pretrained --tunable --seed $seed --plotmode --test-each 200 - #python main_deep_learning.py $dataset --log-file ../log/rcv_fullrun_post_wce_muse_trainable.csv --posteriors --supervised --pretrained --tunable --seed $seed 
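    # Only the 'post_wce_muse_static_plotmode' configuration above is currently active; the remaining
    # configurations are kept commented out for reference.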
-done \ No newline at end of file diff --git a/src/experiment_scripts/run_fun_bert_jrc.sh b/src/experiment_scripts/run_fun_bert_jrc.sh deleted file mode 100644 index fc2e2c3..0000000 --- a/src/experiment_scripts/run_fun_bert_jrc.sh +++ /dev/null @@ -1,16 +0,0 @@ -#!/usr/bin/env bash - -#dataset_path=/home/moreo/CLESA/jrc_acquis/jrc_doclist_1958-2005vs2006_all_top300_noparallel_processed_run -#logfile=../log/log_FunBert_jrc.csv -# -#runs='0 1 2 3 4' -#for run in $runs -#do -# dataset=$dataset_path$run.pickle -# python main_deep_learning.py $dataset --supervised --pretrained --posteriors --mbert --log-file $logfile #--tunable -#done - -dataset=/home/moreo/CLESA/jrc_acquis/jrc_doclist_1958-2005vs2006_all_top300_full_processed.pickle -logfile=../log/log_FunBert_fulljrc_static.csv - -python main_deep_learning.py $dataset --supervised --pretrained --posteriors --mbert --log-file $logfile \ No newline at end of file diff --git a/src/experiment_scripts/run_fun_bert_rcv.sh b/src/experiment_scripts/run_fun_bert_rcv.sh deleted file mode 100644 index e27fe54..0000000 --- a/src/experiment_scripts/run_fun_bert_rcv.sh +++ /dev/null @@ -1,16 +0,0 @@ -#!/usr/bin/env bash - -#dataset_path=/home/moreo/CLESA/rcv2/rcv1-2_doclist_trByLang1000_teByLang1000_processed_run -#logfile=../log/log_FunBert_rcv_static.csv -# -#runs='0 1 2 3 4' -#for run in $runs -#do -# dataset=$dataset_path$run.pickle -# python main_deep_learning.py $dataset --supervised --pretrained --posteriors --mbert --log-file $logfile -#done - -dataset=/home/moreo/CLESA/rcv2/rcv1-2_doclist_full_processed.pickle -logfile=../log/log_FunBert_fullrcv_static.csv - -python main_deep_learning.py $dataset --supervised --pretrained --posteriors --mbert --log-file $logfile \ No newline at end of file diff --git a/src/experiment_scripts/run_mbert_jrc.sh b/src/experiment_scripts/run_mbert_jrc.sh deleted file mode 100644 index 08733a4..0000000 --- a/src/experiment_scripts/run_mbert_jrc.sh +++ /dev/null @@ -1,15 +0,0 @@ -#!/usr/bin/env bash - -#dataset_path=/home/moreo/CLESA/jrc_acquis/jrc_doclist_1958-2005vs2006_all_top300_noparallel_processed_run -#logfile=../log/log_mBert_jrc_NEW.csv -# -#runs='0 1 2 3 4' -#for run in $runs -#do -# dataset=$dataset_path$run.pickle -# python main_mbert.py --dataset $dataset --log-file $logfile --nepochs=50 -#done - -logfile=../log/log_mBert_fulljrc.csv -dataset=/home/moreo/CLESA/jrc_acquis/jrc_doclist_1958-2005vs2006_all_top300_full_processed.pickle -python main_mbert.py --dataset $dataset --log-file $logfile --nepochs=50 \ No newline at end of file diff --git a/src/experiment_scripts/run_mbert_rcv.sh b/src/experiment_scripts/run_mbert_rcv.sh deleted file mode 100644 index 66ffba1..0000000 --- a/src/experiment_scripts/run_mbert_rcv.sh +++ /dev/null @@ -1,15 +0,0 @@ -#!/usr/bin/env bash - -#dataset_path=/home/moreo/CLESA/rcv2/rcv1-2_doclist_trByLang1000_teByLang1000_processed_run -#logfile=../log/log_mBert_rcv_NEW.csv -# -#runs='0 1 2 3 4' -#for run in $runs -#do -# dataset=$dataset_path$run.pickle -# python main_mbert.py --dataset $dataset --log-file $logfile --nepochs=50 -#done - -logfile=../log/log_mBert_fullrcv.csv -dataset=/home/moreo/CLESA/rcv2/rcv1-2_doclist_full_processed.pickle -python main_mbert.py --dataset $dataset --log-file $logfile --nepochs=30 --patience 3 \ No newline at end of file diff --git a/src/experiment_scripts/run_traditional_jrc.sh b/src/experiment_scripts/run_traditional_jrc.sh deleted file mode 100644 index 460c9e8..0000000 --- a/src/experiment_scripts/run_traditional_jrc.sh +++ /dev/null @@ -1,45 
+0,0 @@ -#!/usr/bin/env bash - -dataset=/home/moreo/CLESA/jrc_acquis/jrc_doclist_1958-2005vs2006_all_top300_noparallel_processed_run0.pickle - -######################################## POSTERIORS - # Posteriors -python main_multimodal_cls.py $dataset -P # + zscore -python main_multimodal_cls.py $dataset -P -z # +l2norm -python main_multimodal_cls.py $dataset -P -z --l2 # +feature weight - - -######################################### WCE - #WCE supervised -python main_multimodal_cls.py $dataset -S # + zscore -python main_multimodal_cls.py $dataset -S -z # +l2norm -python main_multimodal_cls.py $dataset -S -z --l2 # +feature weight -python main_multimodal_cls.py $dataset -S -z -r --l2 # + SIF - PCA - -python main_multimodal_cls.py $dataset -S -z -p 250 --l2 # +feature weight + pca -python main_multimodal_cls.py $dataset -S -z -r -p 250 --l2 # + SIF - -python main_multimodal_cls.py $dataset -S -z --l2 --feat-weight ig # -feature weight -python main_multimodal_cls.py $dataset -S -z -r --l2 --feat-weight ig -python main_multimodal_cls.py $dataset -S -z -p 250 --l2 --feat-weight ig # + pca -python main_multimodal_cls.py $dataset -S -z -r -p 250 --l2 --feat-weight ig - - -python main_multimodal_cls.py $dataset -S -z --l2 --feat-weight pmi -python main_multimodal_cls.py $dataset -S -z -r --l2 --feat-weight pmi -python main_multimodal_cls.py $dataset -S -z -p 250 --l2 --feat-weight pmi -python main_multimodal_cls.py $dataset -S -z -r -p 250 --l2 --feat-weight pmi - -################################# MUSE - - # MUSE unsupervised -python main_multimodal_cls.py $dataset -U # + zscore -python main_multimodal_cls.py $dataset -U -z # +l2norm -python main_multimodal_cls.py $dataset -U -z --l2 # +feature weight -python main_multimodal_cls.py $dataset -U -z -r --l2 # + SIF - PCA - -python main_multimodal_cls.py $dataset -U -z --l2 --feat-weight ig # -feature weight + pca -python main_multimodal_cls.py $dataset -U -z -r --l2 --feat-weight ig - -python main_multimodal_cls.py $dataset -U -z --l2 --feat-weight pmi -python main_multimodal_cls.py $dataset -U -z -r --l2 --feat-weight pmi diff --git a/src/experiment_scripts/run_traditional_rcv.sh b/src/experiment_scripts/run_traditional_rcv.sh deleted file mode 100644 index 0dcfa2c..0000000 --- a/src/experiment_scripts/run_traditional_rcv.sh +++ /dev/null @@ -1,45 +0,0 @@ -#!/usr/bin/env bash - -dataset=/home/moreo/CLESA/rcv2/rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle - -######################################## POSTERIORS - # Posteriors -python main_multimodal_cls.py $dataset -P # + zscore -python main_multimodal_cls.py $dataset -P -z # +l2norm -python main_multimodal_cls.py $dataset -P -z --l2 # +feature weight - - -######################################### WCE - #WCE supervised -python main_multimodal_cls.py $dataset -S # + zscore -python main_multimodal_cls.py $dataset -S -z # +l2norm -python main_multimodal_cls.py $dataset -S -z --l2 # +feature weight -python main_multimodal_cls.py $dataset -S -z -r --l2 # + SIF - PCA - -python main_multimodal_cls.py $dataset -S -z -p 50 --l2 # +feature weight + pca -python main_multimodal_cls.py $dataset -S -z -r -p 50 --l2 # + SIF - -python main_multimodal_cls.py $dataset -S -z --l2 --feat-weight ig # -feature weight -python main_multimodal_cls.py $dataset -S -z -r --l2 --feat-weight ig -python main_multimodal_cls.py $dataset -S -z -p 50 --l2 --feat-weight ig # + pca -python main_multimodal_cls.py $dataset -S -z -r -p 50 --l2 --feat-weight ig - - -python main_multimodal_cls.py $dataset -S -z --l2 
--feat-weight pmi -python main_multimodal_cls.py $dataset -S -z -r --l2 --feat-weight pmi -python main_multimodal_cls.py $dataset -S -z -p 50 --l2 --feat-weight pmi -python main_multimodal_cls.py $dataset -S -z -r -p 50 --l2 --feat-weight pmi - -################################# MUSE - - # MUSE unsupervised -python main_multimodal_cls.py $dataset -U # + zscore -python main_multimodal_cls.py $dataset -U -z # +l2norm -python main_multimodal_cls.py $dataset -U -z --l2 # +feature weight -python main_multimodal_cls.py $dataset -U -z -r --l2 # + SIF - PCA - -python main_multimodal_cls.py $dataset -U -z --l2 --feat-weight ig # -feature weight + pca -python main_multimodal_cls.py $dataset -U -z -r --l2 --feat-weight ig - -python main_multimodal_cls.py $dataset -U -z --l2 --feat-weight pmi -python main_multimodal_cls.py $dataset -U -z -r --l2 --feat-weight pmi diff --git a/src/experiment_scripts/time_comparison.sh b/src/experiment_scripts/time_comparison.sh deleted file mode 100644 index 60e1c25..0000000 --- a/src/experiment_scripts/time_comparison.sh +++ /dev/null @@ -1,6 +0,0 @@ -dataset=/home/moreo/CLESA/rcv2/rcv1-2_doclist_full_processed.pickle -seeds='1 2 3 4 5 6 7 8 9 10' -for seed in $seeds -do - python main_deep_learning.py $dataset --log-file ../log/time_GRU.csv --supervised --nepochs 50 --seed $seed - done \ No newline at end of file diff --git a/src/learning/learners.py b/src/learning/learners.py deleted file mode 100644 index 708eaad..0000000 --- a/src/learning/learners.py +++ /dev/null @@ -1,171 +0,0 @@ -import numpy as np -import time -from scipy.sparse import issparse -from sklearn.multiclass import OneVsRestClassifier -from sklearn.model_selection import GridSearchCV -from joblib import Parallel, delayed - - -def _sort_if_sparse(X): - if issparse(X) and not X.has_sorted_indices: - X.sort_indices() - - -def _joblib_transform_multiling(transformer, lX, n_jobs=-1): - if n_jobs == 1: - return {lang:transformer(lX[lang]) for lang in lX.keys()} - else: - langs = list(lX.keys()) - transformations = Parallel(n_jobs=n_jobs)(delayed(transformer)(lX[lang]) for lang in langs) - return {lang: transformations[i] for i, lang in enumerate(langs)} - - -class TrivialRejector: - def fit(self, X, y): - self.cats = y.shape[1] - return self - - def decision_function(self, X): return np.zeros((X.shape[0],self.cats)) - - def predict(self, X): return np.zeros((X.shape[0],self.cats)) - - def predict_proba(self, X): return np.zeros((X.shape[0],self.cats)) - - def best_params(self): return {} - - -class NaivePolylingualClassifier: - """ - Is a mere set of independet MonolingualClassifiers - """ - def __init__(self, base_learner, parameters=None, n_jobs=-1): - self.base_learner = base_learner - self.parameters = parameters - self.model = None - self.n_jobs = n_jobs - - def fit(self, lX, ly): - """ - trains the independent monolingual classifiers - :param lX: a dictionary {language_label: X csr-matrix} - :param ly: a dictionary {language_label: y np.array} - :return: self - """ - tinit = time.time() - assert set(lX.keys()) == set(ly.keys()), 'inconsistent language mappings in fit' - langs = list(lX.keys()) - for lang in langs: - _sort_if_sparse(lX[lang]) - - models = Parallel(n_jobs=self.n_jobs)\ - (delayed(MonolingualClassifier(self.base_learner, parameters=self.parameters).fit)((lX[lang]),ly[lang]) for lang in langs) - - self.model = {lang: models[i] for i, lang in enumerate(langs)} - self.empty_categories = {lang:self.model[lang].empty_categories for lang in langs} - self.time = time.time() - tinit - return 
self - - def decision_function(self, lX): - """ - :param lX: a dictionary {language_label: X csr-matrix} - :return: a dictionary of classification scores for each class - """ - assert self.model is not None, 'predict called before fit' - assert set(lX.keys()).issubset(set(self.model.keys())), 'unknown languages requested in decision function' - langs=list(lX.keys()) - scores = Parallel(n_jobs=self.n_jobs)(delayed(self.model[lang].decision_function)(lX[lang]) for lang in langs) - return {lang:scores[i] for i,lang in enumerate(langs)} - - def predict_proba(self, lX): - """ - :param lX: a dictionary {language_label: X csr-matrix} - :return: a dictionary of probabilities that each document belongs to each class - """ - assert self.model is not None, 'predict called before fit' - assert set(lX.keys()).issubset(set(self.model.keys())), 'unknown languages requested in decision function' - langs=list(lX.keys()) - scores = Parallel(n_jobs=self.n_jobs, max_nbytes=None)(delayed(self.model[lang].predict_proba)(lX[lang]) for lang in langs) - return {lang:scores[i] for i,lang in enumerate(langs)} - - def predict(self, lX): - """ - :param lX: a dictionary {language_label: X csr-matrix} - :return: a dictionary of predictions - """ - assert self.model is not None, 'predict called before fit' - assert set(lX.keys()).issubset(set(self.model.keys())), 'unknown languages requested in predict' - if self.n_jobs == 1: - return {lang:self.model[lang].transform(lX[lang]) for lang in lX.keys()} - else: - langs = list(lX.keys()) - scores = Parallel(n_jobs=self.n_jobs)(delayed(self.model[lang].predict)(lX[lang]) for lang in langs) - return {lang: scores[i] for i, lang in enumerate(langs)} - - def best_params(self): - return {l:model.best_params() for l,model in self.model.items()} - - -class MonolingualClassifier: - - def __init__(self, base_learner, parameters=None, n_jobs=-1): - self.learner = base_learner - self.parameters = parameters - self.model = None - self.n_jobs = n_jobs - self.best_params_ = None - - def fit(self, X, y): - if X.shape[0] == 0: - print('Warning: X has 0 elements, a trivial rejector will be created') - self.model = TrivialRejector().fit(X,y) - self.empty_categories = np.arange(y.shape[1]) - return self - - tinit = time.time() - _sort_if_sparse(X) - self.empty_categories = np.argwhere(np.sum(y, axis=0)==0).flatten() - - # multi-class format - if len(y.shape) == 2: - if self.parameters is not None: - self.parameters = [{'estimator__' + key: params[key] for key in params.keys()} - for params in self.parameters] - self.model = OneVsRestClassifier(self.learner, n_jobs=self.n_jobs) - else: - self.model = self.learner - raise NotImplementedError('not working as a base-classifier for funneling if there are gaps in ' - 'the labels across languages') - - # parameter optimization? 
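        # Note that when the learner has been wrapped in a OneVsRestClassifier above, the parameter grid has
        # already been remapped to scikit-learn's nested-parameter convention, e.g. [{'kernel': ['rbf'], 'C': [1, 10]}]
        # becomes [{'estimator__kernel': ['rbf'], 'estimator__C': [1, 10]}], so that GridSearchCV tunes the wrapped base learner.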
- if self.parameters: - print('debug: optimizing parameters:', self.parameters) - self.model = GridSearchCV(self.model, param_grid=self.parameters, refit=True, cv=5, n_jobs=self.n_jobs, - error_score=0, verbose=10) - - # print(f'fitting: {self.model} on matrices of shape X={X.shape} Y={y.shape}') - print(f'fitting: Mono-lingual Classifier on matrices of shape X={X.shape} Y={y.shape}') - self.model.fit(X, y) - if isinstance(self.model, GridSearchCV): - self.best_params_ = self.model.best_params_ - print('best parameters: ', self.best_params_) - self.time = time.time()-tinit - return self - - def decision_function(self, X): - assert self.model is not None, 'predict called before fit' - _sort_if_sparse(X) - return self.model.decision_function(X) - - def predict_proba(self, X): - assert self.model is not None, 'predict called before fit' - assert hasattr(self.model, 'predict_proba'), 'the probability predictions are not enabled in this model' - _sort_if_sparse(X) - return self.model.predict_proba(X) - - def predict(self, X): - assert self.model is not None, 'predict called before fit' - _sort_if_sparse(X) - return self.model.predict(X) - - def best_params(self): - return self.best_params_ \ No newline at end of file diff --git a/src/learning/transformers.py b/src/learning/transformers.py deleted file mode 100644 index 5a76740..0000000 --- a/src/learning/transformers.py +++ /dev/null @@ -1,863 +0,0 @@ -from torch.optim.lr_scheduler import StepLR -from torch.utils.data import DataLoader -from data.tsr_function__ import get_tsr_matrix, get_supervised_matrix, pointwise_mutual_information, information_gain -from embeddings.embeddings import FastTextMUSE -from embeddings.supervised import supervised_embeddings_tfidf, zscores -from learning.learners import NaivePolylingualClassifier, MonolingualClassifier, _joblib_transform_multiling -from sklearn.decomposition import PCA -from scipy.sparse import hstack -from util_transformers.StandardizeTransformer import StandardizeTransformer -from util.SIF_embed import remove_pc -from sklearn.preprocessing import normalize -from scipy.sparse import csr_matrix -from models.mBert import * -from models.lstm_class import * -from util.csv_log import CSVLog -from util.file import get_file_name, create_if_not_exist, exists -from util.early_stop import EarlyStopping -from util.common import * -import pickle -import time - - -# ------------------------------------------------------------------ -# Data Processing -# ------------------------------------------------------------------ - - -class FeatureWeight: - - def __init__(self, weight='tfidf', agg='mean'): - assert weight in ['tfidf', 'pmi', 'ig'] or callable( - weight), 'weight should either be "tfidf" or a callable function' - assert agg in ['mean', 'max'], 'aggregation function should either be "mean" or "max"' - self.weight = weight - self.agg = agg - self.fitted = False - if weight == 'pmi': - self.weight = pointwise_mutual_information - elif weight == 'ig': - self.weight = information_gain - - def fit(self, lX, ly): - if not self.fitted: - if self.weight == 'tfidf': - self.lF = {l: np.ones(X.shape[1]) for l, X in lX.items()} - else: - self.lF = {} - for l in lX.keys(): - X, y = lX[l], ly[l] - - print(f'getting supervised cell-matrix lang {l}') - tsr_matrix = get_tsr_matrix(get_supervised_matrix(X, y), tsr_score_funtion=self.weight) - if self.agg == 'max': - F = tsr_matrix.max(axis=0) - elif self.agg == 'mean': - F = tsr_matrix.mean(axis=0) - self.lF[l] = F - self.fitted = True - return self - - def transform(self, 
lX): - return {lang: csr_matrix.multiply(lX[lang], self.lF[lang]) for lang in lX.keys()} - - def fit_transform(self, lX, ly): - return self.fit(lX, ly).transform(lX) - -# ------------------------------------------------------------------ -# View Generators (aka first-tier learners) -# ------------------------------------------------------------------ - - -class PosteriorProbabilitiesEmbedder: - - def __init__(self, first_tier_learner, first_tier_parameters=None, l2=True, n_jobs=-1, is_training=True, storing_path='../dumps/'): - self.fist_tier_learner = first_tier_learner - self.fist_tier_parameters = first_tier_parameters - self.l2 = l2 - self.n_jobs = n_jobs - self.doc_projector = NaivePolylingualClassifier( - self.fist_tier_learner, self.fist_tier_parameters, n_jobs=n_jobs - ) - self.requires_tfidf = True - self.storing_path = storing_path - self.is_training = is_training - - def fit(self, lX, lY, lV=None, called_by_viewgen=False): - # if exists(self.storing_path + '/tr') or exists(self.storing_path + '/te'): - # print(f'NB: Avoid fitting {self.storing_path.split("/")[2]} since we have already pre-computed results') - # return self - if not called_by_viewgen: - # Avoid printing if method is called by another View Gen (e.g., GRU ViewGen) - print('### Posterior Probabilities View Generator (X)') - print('fitting the projectors... {}'.format(lX.keys())) - self.doc_projector.fit(lX, lY) - return self - - def transform(self, lX): - # if dir exist, load and return already computed results - # _endpoint = 'tr' if self.is_training else 'te' - # _actual_path = self.storing_path + '/' + _endpoint - # if exists(_actual_path): - # print('NB: loading pre-computed results!') - # with open(_actual_path + '/X.pickle', 'rb') as infile: - # self.is_training = False - # return pickle.load(infile) - - lZ = self.predict_proba(lX) - lZ = _normalize(lZ, self.l2) - # create dir and dump computed results - # create_if_not_exist(_actual_path) - # with open(_actual_path + '/X.pickle', 'wb') as outfile: - # pickle.dump(lZ, outfile) - self.is_training = False - return lZ - - def fit_transform(self, lX, ly=None, lV=None): - return self.fit(lX, ly).transform(lX) - - def best_params(self): - return self.doc_projector.best_params() - - def predict(self, lX, ly=None): - return self.doc_projector.predict(lX) - - def predict_proba(self, lX, ly=None): - print(f'generating posterior probabilities for {sum([X.shape[0] for X in lX.values()])} documents') - lZ = self.doc_projector.predict_proba(lX) - return lZ - - -class MuseEmbedder: - - def __init__(self, path, lV=None, l2=True, n_jobs=-1, featureweight=FeatureWeight(), sif=False): - self.path = path - self.lV = lV - self.l2 = l2 - self.n_jobs = n_jobs - self.featureweight = featureweight - self.sif = sif - self.requires_tfidf = True - - def fit(self, lX, ly, lV=None): - assert lV is not None or self.lV is not None, 'lV not specified' - print('### MUSE View Generator (M)') - print(f'Loading fastText pretrained vectors for languages {list(lX.keys())}...') - self.langs = sorted(lX.keys()) - self.MUSE = load_muse_embeddings(self.path, self.langs, self.n_jobs) - lWordList = {l: self._get_wordlist_from_word2index(lV[l]) for l in self.langs} - self.MUSE = {l: Muse.extract(lWordList[l]).numpy() for l, Muse in self.MUSE.items()} - self.featureweight.fit(lX, ly) - return self - - def transform(self, lX): - MUSE = self.MUSE - lX = self.featureweight.transform(lX) - XdotMUSE = Parallel(n_jobs=self.n_jobs)( - delayed(XdotM)(lX[lang], MUSE[lang], self.sif) for lang in self.langs) - lMuse 
= {l: XdotMUSE[i] for i, l in enumerate(self.langs)} - lMuse = _normalize(lMuse, self.l2) - return lMuse - - def fit_transform(self, lX, ly, lV): - return self.fit(lX, ly, lV).transform(lX) - - def _get_wordlist_from_word2index(self, word2index): - return list(zip(*sorted(word2index.items(), key=lambda x: x[1])))[0] - - def _get_output_dim(self): - return self.MUSE['da'].shape[1] - - -class WordClassEmbedder: - - def __init__(self, l2=True, n_jobs=-1, max_label_space=300, featureweight=FeatureWeight(), sif=False): - self.n_jobs = n_jobs - self.l2 = l2 - self.max_label_space = max_label_space - self.featureweight = featureweight - self.sif = sif - self.requires_tfidf = True - - def fit(self, lX, ly, lV=None): - print('### WCE View Generator (M)') - print('Computing supervised embeddings...') - self.langs = sorted(lX.keys()) - WCE = Parallel(n_jobs=self.n_jobs)( - delayed(word_class_embedding_matrix)(lX[lang], ly[lang], self.max_label_space) for lang in self.langs - ) - self.lWCE = {l: WCE[i] for i, l in enumerate(self.langs)} - self.featureweight.fit(lX, ly) - return self - - def transform(self, lX): - lWCE = self.lWCE - lX = self.featureweight.transform(lX) - XdotWCE = Parallel(n_jobs=self.n_jobs)( - delayed(XdotM)(lX[lang], lWCE[lang], self.sif) for lang in self.langs - ) - lwce = {l: XdotWCE[i] for i, l in enumerate(self.langs)} - lwce = _normalize(lwce, self.l2) - return lwce - - def fit_transform(self, lX, ly, lV=None): - return self.fit(lX, ly).transform(lX) - - def _get_output_dim(self): - return 73 # TODO ! - - -class MBertEmbedder: - - def __init__(self, doc_embed_path=None, patience=10, checkpoint_dir='../hug_checkpoint/', path_to_model=None, - nC=None, avoid_loading=False): - self.doc_embed_path = doc_embed_path - self.patience = patience - self.checkpoint_dir = checkpoint_dir - self.fitted = False - self.requires_tfidf = False - self.avoid_loading = avoid_loading - if path_to_model is None: - self.model = None - else: - config = BertConfig.from_pretrained('bert-base-multilingual-cased', output_hidden_states=True, - num_labels=nC) - if self.avoid_loading: - self.model = None - else: - self.model = BertForSequenceClassification.from_pretrained(path_to_model, config=config).cuda() # TODO: setting model to None in order to avoid loading it onto gpu if we have already pre-computed results! 
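
Both the MUSE and the WCE view generators above reduce a document to a feature-weighted combination of its word vectors: the TF-IDF document-term matrix is multiplied by the word-embedding matrix (XdotM) and the result is optionally L2-normalized (and, with sif=True, has its first principal component removed). A minimal sketch with toy shapes and illustrative names, not the project's own helpers:

# Sketch only: project TF-IDF documents into an embedding space by multiplying the
# document-term matrix with a word-embedding matrix, then L2-normalize the rows.
import numpy as np
from scipy.sparse import csr_matrix
from sklearn.preprocessing import normalize

def docs_to_embeddings(X_tfidf, word_embeddings, l2=True):
    # X_tfidf: (n_docs, vocab) sparse TF-IDF matrix; word_embeddings: (vocab, dim)
    E = X_tfidf.dot(word_embeddings)   # weighted sum of each document's word vectors
    return normalize(E) if l2 else E

X = csr_matrix(np.random.rand(3, 5))   # 3 toy documents over a 5-term vocabulary
M = np.random.rand(5, 4)               # 4-dimensional toy word vectors
print(docs_to_embeddings(X, M).shape)  # (3, 4)
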
- self.fitted = True - - def fit(self, lX, ly, lV=None, seed=0, nepochs=200, lr=1e-5, val_epochs=1): - print('### mBERT View Generator (B)') - if self.fitted is True: - print('Bert model already fitted!') - return self - - print('Fine-tune mBert on the given dataset.') - l_tokenized_tr = do_tokenization(lX, max_len=512) - l_split_tr, l_split_tr_target, l_split_va, l_split_val_target = get_tr_val_split(l_tokenized_tr, ly, - val_prop=0.2, max_val=2000, - seed=seed) - - tr_dataset = TrainingDataset(l_split_tr, l_split_tr_target) - va_dataset = TrainingDataset(l_split_va, l_split_val_target) - tr_dataloader = DataLoader(tr_dataset, batch_size=64, shuffle=True) - va_dataloader = DataLoader(va_dataset, batch_size=64, shuffle=True) - - nC = tr_dataset.get_nclasses() - model = get_model(nC) - model = model.cuda() - criterion = torch.nn.BCEWithLogitsLoss().cuda() - optim = init_optimizer(model, lr=lr, weight_decay=0.01) - lr_scheduler = StepLR(optim, step_size=25, gamma=0.1) - early_stop = EarlyStopping(model, optimizer=optim, patience=self.patience, - checkpoint=self.checkpoint_dir, - is_bert=True) - - # Training loop - logfile = '../log/log_mBert_extractor.csv' - method_name = 'mBert_feature_extractor' - - tinit = time() - lang_ids = va_dataset.lang_ids - for epoch in range(1, nepochs + 1): - print('# Start Training ...') - train(model, tr_dataloader, epoch, criterion, optim, method_name, tinit, logfile) - lr_scheduler.step() # reduces the learning rate # TODO arg epoch? - - # Validation - macrof1 = test(model, va_dataloader, lang_ids, tinit, epoch, logfile, criterion, 'va') - early_stop(macrof1, epoch) - - if early_stop.STOP: - print('[early-stop] STOP') - break - - model = early_stop.restore_checkpoint() - self.model = model.cuda() - - if val_epochs > 0: - print(f'running last {val_epochs} training epochs on the validation set') - for val_epoch in range(1, val_epochs + 1): - train(self.model, va_dataloader, epoch + val_epoch, criterion, optim, method_name, tinit, logfile) - - self.fitted = True - return self - - def transform(self, lX): - assert self.fitted is True, 'Calling transform without any initialized model! - call init first or on init' \ - 'pass the "path_to_model" arg.' 
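
The fit routine above follows the same multilabel fine-tuning recipe used elsewhere in the module: BCEWithLogitsLoss over the logits, AdamW with weight decay, a StepLR schedule, early stopping on validation macro-F1, restoring the best checkpoint, and a few extra epochs on the validation split. A compressed, self-contained sketch of that loop on a stand-in linear model (toy data; the metric computation and epoch counts are placeholders):

# Sketch only: multilabel training loop with LR decay and patience-based early stopping.
import torch
from torch.optim.lr_scheduler import StepLR

n_feats, n_classes = 16, 5
model = torch.nn.Linear(n_feats, n_classes)            # stand-in for the real encoder
criterion = torch.nn.BCEWithLogitsLoss()
optim = torch.optim.AdamW(model.parameters(), lr=1e-5, weight_decay=0.01)
scheduler = StepLR(optim, step_size=25, gamma=0.1)

X = torch.randn(8, n_feats)
y = (torch.rand(8, n_classes) > 0.5).float()           # multilabel targets

best_f1, patience_left = -1.0, 10
for epoch in range(1, 4):                               # the real loop runs up to nepochs
    optim.zero_grad()
    loss = criterion(model(X), y)
    loss.backward()
    optim.step()
    scheduler.step()
    val_f1 = 0.0                                        # placeholder: macro-F1 on held-out data
    if val_f1 > best_f1:
        best_f1, patience_left = val_f1, 10             # a checkpoint would be saved here
    else:
        patience_left -= 1
        if patience_left == 0:                          # early stop; best checkpoint restored
            break
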
- print('Obtaining document embeddings from pretrained mBert ') - l_tokenized_X = do_tokenization(lX, max_len=512, verbose=True) - feat_dataset = ExtractorDataset(l_tokenized_X) - feat_lang_ids = feat_dataset.lang_ids - dataloader = DataLoader(feat_dataset, batch_size=64) - all_batch_embeddings, id2lang = feature_extractor(dataloader, feat_lang_ids, self.model) - return all_batch_embeddings - - def fit_transform(self, lX, ly, lV=None): - return self.fit(lX, ly).transform(lX) - - -class RecurrentEmbedder: - - def __init__(self, pretrained, supervised, multilingual_dataset, options, concat=False, lr=1e-3, - we_path='../embeddings', hidden_size=512, sup_drop=0.5, posteriors=False, patience=10, - test_each=0, checkpoint_dir='../checkpoint', model_path=None, n_jobs=-1): - self.pretrained = pretrained - self.supervised = supervised - self.concat = concat - self.requires_tfidf = False - self.multilingual_dataset = multilingual_dataset - self.model = None - self.we_path = we_path - self.langs = multilingual_dataset.langs() - self.hidden_size = hidden_size - self.sup_drop = sup_drop - self.posteriors = posteriors - self.patience = patience - self.checkpoint_dir = checkpoint_dir - self.test_each = test_each - self.options = options - self.seed = options.seed - self.model_path = model_path - self.n_jobs = n_jobs - self.is_trained = False - - ## INIT MODEL for training - self.lXtr, self.lytr = self.multilingual_dataset.training(target_as_csr=True) - self.lXte, self.lyte = self.multilingual_dataset.test(target_as_csr=True) - self.nC = self.lyte[self.langs[0]].shape[1] - lpretrained, self.lpretrained_vocabulary = self._load_pretrained_embeddings(self.we_path, self.langs) - self.multilingual_index = MultilingualIndex() - self.multilingual_index.index(self.lXtr, self.lytr, self.lXte, self.lpretrained_vocabulary) - self.multilingual_index.train_val_split(val_prop=0.2, max_val=2000, seed=self.seed) - self.multilingual_index.embedding_matrices(lpretrained, self.supervised) - - if model_path is not None: - self.is_trained = True - self.model = torch.load(model_path) - else: - self.model = self._init_Net() - - self.optim = init_optimizer(self.model, lr=lr) - self.criterion = torch.nn.BCEWithLogitsLoss().cuda() - self.lr_scheduler = StepLR(self.optim, step_size=25, gamma=0.5) - self.early_stop = EarlyStopping(self.model, optimizer=self.optim, patience=self.patience, - checkpoint=f'{self.checkpoint_dir}/gru_viewgen_-{get_file_name(self.options.dataset)}') - - def fit(self, lX, ly, lV=None, batch_size=128, nepochs=200, val_epochs=1): - print('### Gated Recurrent Unit View Generator (G)') - if self.model is None: - print('TODO: Init model!') - if not self.is_trained: - # Batchify input - self.multilingual_index.train_val_split(val_prop=0.2, max_val=2000, seed=self.seed) - l_train_index, l_train_target = self.multilingual_index.l_train() - l_val_index, l_val_target = self.multilingual_index.l_val() - l_test_index = self.multilingual_index.l_test_index() - batcher_train = BatchGRU(batch_size, batches_per_epoch=batch_size, languages=self.langs, - lpad=self.multilingual_index.l_pad()) - batcher_eval = BatchGRU(batch_size, batches_per_epoch=batch_size, languages=self.langs, - lpad=self.multilingual_index.l_pad()) - - # Train loop - print('Start training') - method_name = 'gru_view_generator' - logfile = init_logfile_nn(method_name, self.options) - tinit = time.time() - for epoch in range(1, nepochs + 1): - train_gru(model=self.model, batcher=batcher_train, ltrain_index=l_train_index, lytr=l_train_target, - tinit=tinit, 
logfile=logfile, criterion=self.criterion, optim=self.optim, - epoch=epoch, method_name=method_name, opt=self.options, ltrain_posteriors=None, - ltrain_bert=None) - self.lr_scheduler.step() - - # validation step - macrof1 = test_gru(self.model, batcher_eval, l_val_index, None, None, l_val_target, tinit, epoch, - logfile, self.criterion, 'va') - - self.early_stop(macrof1, epoch) - if self.test_each > 0: - test_gru(self.model, batcher_eval, l_test_index, None, None, self.lyte, tinit, epoch, - logfile, self.criterion, 'te') - - if self.early_stop.STOP: - print('[early-stop] STOP') - print('Restoring best model...') - break - - self.model = self.early_stop.restore_checkpoint() - print(f'running last {val_epochs} training epochs on the validation set') - for val_epoch in range(1, val_epochs+1): - batcher_train.init_offset() - train_gru(model=self.model, batcher=batcher_train, ltrain_index=l_train_index, lytr=l_train_target, - tinit=tinit, logfile=logfile, criterion=self.criterion, optim=self.optim, - epoch=epoch, method_name=method_name, opt=self.options, ltrain_posteriors=None, - ltrain_bert=None) - self.is_trained = True - - return self - - def transform(self, lX, batch_size=64): - lX = self.multilingual_index.get_indexed(lX, self.lpretrained_vocabulary) - lX = self._get_doc_embeddings(lX, batch_size) - return lX - - def fit_transform(self, lX, ly, lV=None): - return self.fit(lX, ly).transform(lX) - - def _get_doc_embeddings(self, lX, batch_size): - assert self.is_trained, 'Model is not trained, cannot call transform before fitting the model!' - print('Generating document embeddings via GRU') - _lX = {} - - l_devel_target = self.multilingual_index.l_devel_target() - - # show_gpu('RNN init at extraction') - for idx, (batch, post, target, lang) in enumerate(batchify(lX, None, l_devel_target, - batch_size, self.multilingual_index.l_pad())): - if lang not in _lX.keys(): - _lX[lang] = self.model.get_embeddings(batch, lang) - else: - _lX[lang] = np.concatenate((_lX[lang], self.model.get_embeddings(batch, lang)), axis=0) - # show_gpu('RNN after batch pred at extraction') - return _lX - - # loads the MUSE embeddings if requested, or returns empty dictionaries otherwise - def _load_pretrained_embeddings(self, we_path, langs): - lpretrained = lpretrained_vocabulary = self._none_dict(langs) - lpretrained = load_muse_embeddings(we_path, langs, n_jobs=self.n_jobs) - lpretrained_vocabulary = {l: lpretrained[l].vocabulary() for l in langs} - return lpretrained, lpretrained_vocabulary - - def _none_dict(self, langs): - return {l:None for l in langs} - - # instantiates the net, initializes the model parameters, and sets embeddings trainable if requested - def _init_Net(self, xavier_uniform=True): - model = RNNMultilingualClassifier( - output_size=self.nC, - hidden_size=self.hidden_size, - lvocab_size=self.multilingual_index.l_vocabsize(), - learnable_length=0, - lpretrained=self.multilingual_index.l_embeddings(), - drop_embedding_range=self.multilingual_index.sup_range, - drop_embedding_prop=self.sup_drop, - post_probabilities=self.posteriors - ) - return model.cuda() - - -class DocEmbedderList: - - def __init__(self, *embedder_list, aggregation='concat'): - assert aggregation in {'concat', 'mean'}, 'unknown aggregation mode, valid are "concat" and "mean"' - if len(embedder_list) == 0: - embedder_list = [] - self.embedders = embedder_list - self.aggregation = aggregation - print(f'Aggregation mode: {self.aggregation}') - - def fit(self, lX, ly, lV=None, tfidf=None): - for transformer in self.embedders: - _lX 
= lX - if transformer.requires_tfidf: - _lX = tfidf - transformer.fit(_lX, ly, lV) - return self - - def transform(self, lX, tfidf=None): - if self.aggregation == 'concat': - return self.transform_concat(lX, tfidf) - elif self.aggregation == 'mean': - return self.transform_mean(lX, tfidf) - - def transform_concat(self, lX, tfidf): - if len(self.embedders) == 1: - if self.embedders[0].requires_tfidf: - lX = tfidf - return self.embedders[0].transform(lX) - - some_sparse = False - langs = sorted(lX.keys()) - - lZparts = {l: [] for l in langs} - for transformer in self.embedders: - _lX = lX - if transformer.requires_tfidf: - _lX = tfidf - lZ = transformer.transform(_lX) - for l in langs: - Z = lZ[l] - some_sparse = some_sparse or issparse(Z) - lZparts[l].append(Z) - - hstacker = hstack if some_sparse else np.hstack - return {l: hstacker(lZparts[l]) for l in langs} - - def transform_mean(self, lX, tfidf): - if len(self.embedders) == 1: - if self.embedders[0].requires_tfidf: - lX = tfidf - return self.embedders[0].transform(lX) - - langs = sorted(lX.keys()) - lZparts = {l: None for l in langs} - - for transformer in self.embedders: - _lX = lX - if transformer.requires_tfidf: - _lX = tfidf - lZ = transformer.transform(_lX) - for l in langs: - Z = lZ[l] - if lZparts[l] is None: - lZparts[l] = Z - else: - lZparts[l] += Z - - n_transformers = len(self.embedders) - - return {l: lZparts[l] / n_transformers for l in langs} - - def fit_transform(self, lX, ly, lV=None, tfidf=None): - return self.fit(lX, ly, lV, tfidf).transform(lX, tfidf) - - def best_params(self): - return {'todo'} - - def append(self, embedder): - self.embedders.append(embedder) - - -class FeatureSet2Posteriors: - def __init__(self, transformer, method_id, requires_tfidf=False, l2=True, n_jobs=-1, storing_path='../dumps/'): - self.transformer = transformer - self.l2 = l2 - self.n_jobs = n_jobs - self.prob_classifier = MetaClassifier( - SVC(kernel='rbf', gamma='auto', probability=True, cache_size=1000, random_state=1), n_jobs=n_jobs) - self.requires_tfidf = requires_tfidf - - self.storing_path = storing_path - self.is_training = True - self.method_id = method_id - - def fit(self, lX, ly, lV=None): - if exists(self.storing_path + '/tr') or exists(self.storing_path + '/te'): - print(f'NB: Avoid fitting {self.storing_path.split("/")[2]} since we have already pre-computed results') - return self - - if lV is None and hasattr(self.transformer, 'lV'): - lV = self.transformer.lV - lZ = self.transformer.fit_transform(lX, ly, lV) - self.prob_classifier.fit(lZ, ly) - return self - - def transform(self, lX): - # if dir exist, load and return already computed results - # _endpoint = 'tr' if self.is_training else 'te' - # _actual_path = self.storing_path + '/' + _endpoint - # if exists(_actual_path): - # print('NB: loading pre-computed results!') - # with open(_actual_path + '/' + self.method_id + '.pickle', 'rb') as infile: - # self.is_training = False - # return pickle.load(infile) - - lP = self.predict_proba(lX) - lP = _normalize(lP, self.l2) - # create dir and dump computed results - # create_if_not_exist(_actual_path) - # with open(_actual_path + '/' + self.method_id + '.pickle', 'wb') as outfile: - # pickle.dump(lP, outfile) - self.is_training = False - return lP - - def fit_transform(self, lX, ly, lV): - return self.fit(lX, ly, lV).transform(lX) - - def predict(self, lX, ly=None): - lZ = self.transformer.transform(lX) - return self.prob_classifier.predict(lZ) - - def predict_proba(self, lX, ly=None): - lZ = self.transformer.transform(lX) - 
return self.prob_classifier.predict_proba(lZ) - - -# ------------------------------------------------------------------ -# Meta-Classifier (aka second-tier learner) -# ------------------------------------------------------------------ -class MetaClassifier: - - def __init__(self, meta_learner, meta_parameters=None, n_jobs=-1, standardize_range=None): - self.n_jobs = n_jobs - self.model = MonolingualClassifier(base_learner=meta_learner, parameters=meta_parameters, n_jobs=n_jobs) - self.standardize_range = standardize_range - - def fit(self, lZ, ly): - tinit = time.time() - Z, y = self.stack(lZ, ly) - - self.standardizer = StandardizeTransformer(range=self.standardize_range) - Z = self.standardizer.fit_transform(Z) - - print('fitting the Z-space of shape={}'.format(Z.shape)) - self.model.fit(Z, y) - self.time = time.time() - tinit - - def stack(self, lZ, ly=None): - langs = list(lZ.keys()) - Z = np.vstack([lZ[lang] for lang in langs]) # Z is the language independent space - if ly is not None: - y = np.vstack([ly[lang] for lang in langs]) - return Z, y - else: - return Z - - def predict(self, lZ, ly=None): - lZ = _joblib_transform_multiling(self.standardizer.transform, lZ, n_jobs=self.n_jobs) - return _joblib_transform_multiling(self.model.predict, lZ, n_jobs=self.n_jobs) - - def predict_proba(self, lZ, ly=None): - lZ = _joblib_transform_multiling(self.standardizer.transform, lZ, n_jobs=self.n_jobs) - return _joblib_transform_multiling(self.model.predict_proba, lZ, n_jobs=self.n_jobs) - - def best_params(self): - return self.model.best_params() - - -# ------------------------------------------------------------------ -# Ensembling (aka Funnelling) -# ------------------------------------------------------------------ -class Funnelling: - def __init__(self, - vectorizer: TfidfVectorizerMultilingual, - first_tier: DocEmbedderList, - meta: MetaClassifier): - self.vectorizer = vectorizer - self.first_tier = first_tier - self.meta = meta - self.n_jobs = meta.n_jobs - - def fit(self, lX, ly, target_lang=None): - if target_lang is not None: - LX = lX.copy() - LX.update(target_lang) - self.vectorizer.fit(LX) - tfidf_lX = self.vectorizer.transform(lX) - else: - tfidf_lX = self.vectorizer.fit_transform(lX, ly) - lV = self.vectorizer.vocabulary() - print('## Fitting first-tier learners!') - lZ = self.first_tier.fit_transform(lX, ly, lV, tfidf=tfidf_lX) - print('## Fitting meta-learner!') - self.meta.fit(lZ, ly) - - def predict(self, lX, ly=None): - tfidf_lX = self.vectorizer.transform(lX) - lZ = self.first_tier.transform(lX, tfidf=tfidf_lX) - ly_ = self.meta.predict(lZ) - return ly_ - - def best_params(self): - return {'1st-tier': self.first_tier.best_params(), - 'meta': self.meta.best_params()} - - -class Voting: - def __init__(self, *prob_classifiers): - assert all([hasattr(p, 'predict_proba') for p in prob_classifiers]), 'not all classifiers are probabilistic' - self.prob_classifiers = prob_classifiers - - def fit(self, lX, ly, lV=None): - for classifier in self.prob_classifiers: - classifier.fit(lX, ly, lV) - - def predict(self, lX, ly=None): - lP = {l: [] for l in lX.keys()} - for classifier in self.prob_classifiers: - lPi = classifier.predict_proba(lX) - for l in lX.keys(): - lP[l].append(lPi[l]) - - lP = {l: np.stack(Plist).mean(axis=0) for l, Plist in lP.items()} - ly = {l: P > 0.5 for l, P in lP.items()} - - return ly - - -# ------------------------------------------------------------------------------ -# HELPERS -# ------------------------------------------------------------------------------ - 
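
The MetaClassifier and Funnelling classes above form the core of the architecture: every first-tier view generator maps the documents of each language into a shared space, the per-language matrices are vertically stacked into one language-independent training set, and a single meta classifier is fitted on it. A self-contained toy sketch of this two-tier scheme (synthetic single-label data; the project additionally standardizes the stacked space and trains multilabel one-vs-rest learners):

# Sketch only: per-language first-tier classifiers produce posterior probabilities,
# which are stacked across languages and used to train one shared meta classifier.
import numpy as np
from sklearn.svm import SVC

rng = np.random.default_rng(0)
langs = ['en', 'it']
lX = {l: rng.normal(size=(40, 20)) for l in langs}     # toy per-language feature matrices
ly = {l: rng.integers(0, 2, size=40) for l in langs}   # toy binary labels

# first tier: one probabilistic classifier per language
first_tier = {l: SVC(kernel='linear', probability=True, random_state=1).fit(lX[l], ly[l])
              for l in langs}
lZ = {l: first_tier[l].predict_proba(lX[l]) for l in langs}

# meta classifier: fitted once on the stacked, language-independent posterior space
Z = np.vstack([lZ[l] for l in langs])
y = np.concatenate([ly[l] for l in langs])
meta = SVC(kernel='rbf', gamma='auto').fit(Z, y)

# at prediction time each language goes through its own first-tier projector first
print(meta.predict(first_tier['it'].predict_proba(lX['it']))[:5])
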
-def load_muse_embeddings(we_path, langs, n_jobs=-1): - MUSE = Parallel(n_jobs=n_jobs)( - delayed(FastTextMUSE)(we_path, lang) for lang in langs - ) - return {l: MUSE[i] for i, l in enumerate(langs)} - - -def word_class_embedding_matrix(X, Y, max_label_space=300): - WCE = supervised_embeddings_tfidf(X, Y) - WCE = zscores(WCE, axis=0) - - nC = Y.shape[1] - if nC > max_label_space: - print(f'supervised matrix has more dimensions ({nC}) than the allowed limit {max_label_space}. ' - f'Applying PCA(n_components={max_label_space})') - pca = PCA(n_components=max_label_space) - WCE = pca.fit(WCE).transform(WCE) - - return WCE - - -def XdotM(X, M, sif): - E = X.dot(M) - if sif: - # print("removing pc...") - E = remove_pc(E, npc=1) - return E - - -def _normalize(lX, l2=True): - return {l: normalize(X) for l, X in lX.items()} if l2 else lX - - -class BatchGRU: - def __init__(self, batchsize, batches_per_epoch, languages, lpad, max_pad_length=500): - self.batchsize = batchsize - self.batches_per_epoch = batches_per_epoch - self.languages = languages - self.lpad = lpad - self.max_pad_length = max_pad_length - self.init_offset() - - def init_offset(self): - self.offset = {lang: 0 for lang in self.languages} - - def batchify(self, l_index, l_post, l_bert, llabels, extractor=False): - langs = self.languages - l_num_samples = {l: len(l_index[l]) for l in langs} - - max_samples = max(l_num_samples.values()) - n_batches = max_samples // self.batchsize + 1 * (max_samples % self.batchsize > 0) - if self.batches_per_epoch != -1 and self.batches_per_epoch < n_batches: - n_batches = self.batches_per_epoch - - for b in range(n_batches): - for lang in langs: - index, labels = l_index[lang], llabels[lang] - offset = self.offset[lang] - if offset >= l_num_samples[lang]: - offset = 0 - limit = offset+self.batchsize - - batch_slice = slice(offset, limit) - batch = index[batch_slice] - batch_labels = labels[batch_slice].toarray() - - post = None - bert_emb = None - - batch = pad(batch, pad_index=self.lpad[lang], max_pad_length=self.max_pad_length) - batch = torch.LongTensor(batch).cuda() - target = torch.FloatTensor(batch_labels).cuda() - - self.offset[lang] = limit - - yield batch, post, bert_emb, target, lang - - -def pad(index_list, pad_index, max_pad_length=None): - pad_length = np.max([len(index) for index in index_list]) - if max_pad_length is not None: - pad_length = min(pad_length, max_pad_length) - for i,indexes in enumerate(index_list): - index_list[i] = [pad_index]*(pad_length-len(indexes)) + indexes[:pad_length] - return index_list - - -def train_gru(model, batcher, ltrain_index, lytr, tinit, logfile, criterion, optim, epoch, method_name, opt, - ltrain_posteriors=None, ltrain_bert=None, log_interval=10): - _dataset_path = opt.dataset.split('/')[-1].split('_') - dataset_id = _dataset_path[0] + _dataset_path[-1] - - # show_gpu('RNN init pre-training') - loss_history = [] - model.train() - for idx, (batch, post, bert_emb, target, lang) in enumerate(batcher.batchify(ltrain_index, ltrain_posteriors, ltrain_bert, lytr)): - optim.zero_grad() - loss = criterion(model(batch, post, bert_emb, lang), target) - loss.backward() - clip_gradient(model) - optim.step() - loss_history.append(loss.item()) - # show_gpu('RNN after batch prediction') - - if idx % log_interval == 0: - interval_loss = np.mean(loss_history[-log_interval:]) - print(f'{dataset_id} {method_name} Epoch: {epoch}, Step: {idx}, lr={get_lr(optim):.5f}, ' - f'Training Loss: {interval_loss:.6f}') - - mean_loss = np.mean(interval_loss) - 
logfile.add_row(epoch=epoch, measure='tr_loss', value=mean_loss, timelapse=time.time() - tinit) - return mean_loss - - -def test_gru(model, batcher, ltest_index, ltest_posteriors, lte_bert, lyte, tinit, epoch, logfile, criterion, measure_prefix): - loss_history = [] - model.eval() - langs = sorted(ltest_index.keys()) - predictions = {l: [] for l in langs} - yte_stacked = {l: [] for l in langs} - batcher.init_offset() - for batch, post, bert_emb, target, lang in tqdm(batcher.batchify(ltest_index, ltest_posteriors, lte_bert, lyte), - desc='evaluation: '): - logits = model(batch, post, bert_emb, lang) - loss = criterion(logits, target).item() - prediction = predict(logits) - predictions[lang].append(prediction) - yte_stacked[lang].append(target.detach().cpu().numpy()) - loss_history.append(loss) - - ly = {l:np.vstack(yte_stacked[l]) for l in langs} - ly_ = {l:np.vstack(predictions[l]) for l in langs} - l_eval = evaluate(ly, ly_) - metrics = [] - for lang in langs: - macrof1, microf1, macrok, microk = l_eval[lang] - metrics.append([macrof1, microf1, macrok, microk]) - if measure_prefix == 'te': - print(f'Lang {lang}: macro-F1={macrof1:.3f} micro-F1={microf1:.3f}') - Mf1, mF1, MK, mk = np.mean(np.array(metrics), axis=0) - print(f'[{measure_prefix}] Averages: MF1, mF1, MK, mK [{Mf1:.5f}, {mF1:.5f}, {MK:.5f}, {mk:.5f}]') - - mean_loss = np.mean(loss_history) - logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-macro-F1', value=Mf1, timelapse=time.time() - tinit) - logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-micro-F1', value=mF1, timelapse=time.time() - tinit) - logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-macro-K', value=MK, timelapse=time.time() - tinit) - logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-micro-K', value=mk, timelapse=time.time() - tinit) - logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-loss', value=mean_loss, timelapse=time.time() - tinit) - - return Mf1 - - -def clip_gradient(model, clip_value=1e-1): - params = list(filter(lambda p: p.grad is not None, model.parameters())) - for p in params: - p.grad.data.clamp_(-clip_value, clip_value) - - -def init_logfile_nn(method_name, opt): - import os - logfile = CSVLog(opt.logfile_gru, ['dataset', 'method', 'epoch', 'measure', 'value', 'run', 'timelapse']) - logfile.set_default('dataset', opt.dataset) - logfile.set_default('run', opt.seed) - logfile.set_default('method', get_method_name(os.path.basename(opt.dataset), opt.posteriors, opt.supervised, opt.pretrained, opt.mbert, - opt.gruViewGenerator, opt.gruMUSE, opt.gruWCE, opt.agg, opt.allprob)) - assert opt.force or not logfile.already_calculated(), f'results for dataset {opt.dataset} method {method_name} ' \ - f'and run {opt.seed} already calculated' - return logfile diff --git a/src/main_gFun.py b/src/main_gFun.py deleted file mode 100644 index 8694087..0000000 --- a/src/main_gFun.py +++ /dev/null @@ -1,166 +0,0 @@ -import os -from dataset_builder import MultilingualDataset -from learning.transformers import * -from util.evaluation import * -from util.file import exists -from util.results import PolylingualClassificationResults -from util.common import * -from util.parser_options import * - -if __name__ == '__main__': - (op, args) = parser.parse_args() - dataset = op.dataset - assert exists(dataset), 'Unable to find file '+str(dataset) - assert not (op.set_c != 1. 
and op.optimc), 'Parameter C cannot be defined along with optim_c option' - assert op.posteriors or op.supervised or op.pretrained or op.mbert or op.gruViewGenerator, \ - 'empty set of document embeddings is not allowed' - if op.gruViewGenerator: - assert op.gruWCE or op.gruMUSE, 'Initializing Gated Recurrent embedding layer without ' \ - 'explicit initialization of GRU View Generator' - - l2 = op.l2 - dataset_file = os.path.basename(dataset) - results = PolylingualClassificationResults('../log/' + op.output) - allprob = 'Prob' if op.allprob else '' - - method_name, dataset_name = get_method_name(dataset, op.posteriors, op.supervised, op.pretrained, op.mbert, - op.gruViewGenerator, op.gruMUSE, op.gruWCE, op.agg, op.allprob) - - print(f'Method: gFun{method_name}\nDataset: {dataset_name}') - print('-'*50) - - n_jobs = -1 # TODO SETTING n_JOBS - - standardize_range = slice(0, 0) - if op.zscore: - standardize_range = None - - # load dataset - data = MultilingualDataset.load(dataset) - # data.set_view(languages=['it']) # TODO: DEBUG SETTING - data.show_dimensions() - lXtr, lytr = data.training() - lXte, lyte = data.test() - - # text preprocessing - tfidfvectorizer = TfidfVectorizerMultilingual(sublinear_tf=True, use_idf=True) - - # feature weighting (for word embeddings average) - feat_weighting = FeatureWeight(op.feat_weight, agg='mean') - - # document embedding modules aka View Generators - doc_embedder = DocEmbedderList(aggregation='mean' if op.agg else 'concat') - - # init View Generators - if op.posteriors: - """ - View Generator (-X): cast document representations encoded via TFIDF into posterior probabilities by means - of a set of SVM. - """ - # Check if we already have VG outputs from previous runs - VG_name = 'X' - storing_path = f'../dumps/{VG_name}/{dataset_name.split(".")[0]}' - exist = exists(storing_path) - doc_embedder.append(PosteriorProbabilitiesEmbedder(first_tier_learner=get_learner(calibrate=True, - kernel='linear', - C=op.set_c), - l2=l2, storing_path=storing_path, n_jobs=n_jobs)) - - if op.supervised: - """ - View Generator (-W): generates document representation via Word-Class-Embeddings. - Document embeddings are obtained via weighted sum of document's constituent embeddings. - """ - VG_name = 'W' - storing_path = f'../dumps/{VG_name}/{dataset_name.split(".")[0]}' - exist = exists(storing_path) - wce = WordClassEmbedder(max_label_space=op.max_labels_S, l2=l2, featureweight=feat_weighting, - sif=op.sif, n_jobs=n_jobs) - if op.allprob: - wce = FeatureSet2Posteriors(wce, method_id=VG_name, requires_tfidf=True, l2=l2, storing_path=storing_path, - n_jobs=n_jobs) - doc_embedder.append(wce) - - if op.pretrained: - """ - View Generator (-M): generates document representation via MUSE embeddings (Fasttext multilingual word - embeddings). Document embeddings are obtained via weighted sum of document's constituent embeddings. - """ - VG_name = 'M' - storing_path = f'../dumps/{VG_name}/{dataset_name.split(".")[0]}' - exist = exists(storing_path) - muse = MuseEmbedder(op.we_path, l2=l2, featureweight=feat_weighting, sif=op.sif, n_jobs=n_jobs) - if op.allprob: - muse = FeatureSet2Posteriors(muse, method_id=VG_name, requires_tfidf=True, l2=l2, storing_path=storing_path, - n_jobs=n_jobs) - doc_embedder.append(muse) - - if op.gruViewGenerator: - """ - View Generator (-G): generates document embedding by means of a Gated Recurrent Units. The model can be - initialized with different (multilingual/aligned) word representations (e.g., MUSE, WCE, ecc.,). - Output dimension is (n_docs, 512). 
If --allprob output will be casted to posterior prob space via SVM. - """ - VG_name = 'G' - VG_name += '_muse' if op.gruMUSE else '' - VG_name += '_wce' if op.gruWCE else '' - storing_path = 'Nope' # f'../dumps/{VG_name}/{dataset_name.split(".")[0]}' - rnn_embedder = RecurrentEmbedder(pretrained=op.gruMUSE, supervised=op.gruWCE, multilingual_dataset=data, - options=op, model_path=None, n_jobs=n_jobs) - if op.allprob: - rnn_embedder = FeatureSet2Posteriors(rnn_embedder, method_id=VG_name, requires_tfidf=False, - storing_path=storing_path, n_jobs=n_jobs) - doc_embedder.append(rnn_embedder) - - if op.mbert: - """ - View generator (-B): generates document embedding via mBERT model. - """ - VG_name = 'B' - storing_path = f'../dumps/{VG_name}/{dataset_name.split(".")[0]}' - avoid_loading = False if op.avoid_loading else True # TODO research setting (set to false mBert will be loaded into gpu to get doc emebds (aka, only the first time for each run)) - - mbert = MBertEmbedder(path_to_model=op.bert_path, nC=data.num_categories(), avoid_loading=avoid_loading) - if op.allprob: - mbert = FeatureSet2Posteriors(mbert, method_id=VG_name, l2=l2, storing_path=storing_path) - doc_embedder.append(mbert) - - # metaclassifier - meta_parameters = None if op.set_c != -1 else [{'C': [1, 1e3, 1e2, 1e1, 1e-1]}] - meta = MetaClassifier(meta_learner=get_learner(calibrate=False, kernel='rbf', C=op.set_c), - meta_parameters=get_params(op.optimc), standardize_range=standardize_range, n_jobs=n_jobs) - - # ensembling the modules - classifier = Funnelling(vectorizer=tfidfvectorizer, first_tier=doc_embedder, meta=meta) - - print('\n# Fitting Funnelling Architecture...') - tinit = time.time() - classifier.fit(lXtr, lytr) - time = time.time()-tinit - - print('\n# Evaluating ...') - l_eval = evaluate_method(classifier, lXte, lyte) - - metrics = [] - for lang in lXte.keys(): - macrof1, microf1, macrok, microk = l_eval[lang] - metrics.append([macrof1, microf1, macrok, microk]) - print(f'Lang {lang}: macro-F1={macrof1:.3f} micro-F1={microf1:.3f}') - results.add_row(method='MultiModal', - learner='SVM', - optimp=op.optimc, - sif=op.sif, - zscore=op.zscore, - l2=op.l2, - wescaler=op.feat_weight, - pca=op.max_labels_S, - id=method_name, - dataset=dataset_name, - time=time, - lang=lang, - macrof1=macrof1, - microf1=microf1, - macrok=macrok, - microk=microk, - notes='') - print('Averages: MF1, mF1, MK, mK', np.round(np.mean(np.array(metrics), axis=0), 3)) diff --git a/src/models/cnn_class_bu.py b/src/models/cnn_class_bu.py deleted file mode 100644 index a47d5fc..0000000 --- a/src/models/cnn_class_bu.py +++ /dev/null @@ -1,42 +0,0 @@ -import torch.nn as nn -from torch.nn import functional as F -import torch - -class CNN_pdr(nn.Module): - - def __init__(self, output_size, out_channels, compositional_dim, vocab_size, emb_dim, embeddings=None, drop_embedding_range=None, - drop_embedding_prop=0, drop_prob=0.5): - super(CNN_pdr, self).__init__() - self.vocab_size = vocab_size - self.emb_dim = emb_dim - self.embeddings = torch.FloatTensor(embeddings) - self.embedding_layer = nn.Embedding(vocab_size, emb_dim, _weight=self.embeddings) - self.kernel_heights = kernel_heights=[3,5,7] - self.stride = 1 - self.padding = 0 - self.drop_embedding_range = drop_embedding_range - self.drop_embedding_prop = drop_embedding_prop - assert 0 <= drop_embedding_prop <= 1, 'drop_embedding_prop: wrong range' - self.nC = 73 - - self.conv1 = nn.Conv2d(1, compositional_dim, (self.kernel_heights[0], self.emb_dim), self.stride, self.padding) - self.dropout = 
nn.Dropout(drop_prob) - self.label = nn.Linear(len(kernel_heights) * out_channels, output_size) - self.fC = nn.Linear(compositional_dim + self.nC, self.nC) - - - def forward(self, x, svm_output): - x = torch.LongTensor(x) - svm_output = torch.FloatTensor(svm_output) - x = self.embedding_layer(x) - x = self.conv1(x.unsqueeze(1)) - x = F.relu(x.squeeze(3)) - x = F.max_pool1d(x, x.size()[2]).squeeze(2) - x = torch.cat((x, svm_output), 1) - x = F.sigmoid(self.fC(x)) - return x #.detach().numpy() - - # logits = self.label(x) - # return logits - - diff --git a/src/models/helpers.py b/src/models/helpers.py deleted file mode 100755 index 93e5805..0000000 --- a/src/models/helpers.py +++ /dev/null @@ -1,47 +0,0 @@ -import torch -import torch.nn as nn -from torch.nn import functional as F - - - -def init_embeddings(pretrained, vocab_size, learnable_length, device='cuda'): - pretrained_embeddings = None - pretrained_length = 0 - if pretrained is not None: - pretrained_length = pretrained.shape[1] - assert pretrained.shape[0] == vocab_size, 'pre-trained matrix does not match with the vocabulary size' - pretrained_embeddings = nn.Embedding(vocab_size, pretrained_length) - pretrained_embeddings.weight = nn.Parameter(pretrained, requires_grad=False) - # pretrained_embeddings.to(device) - - learnable_embeddings = None - if learnable_length > 0: - learnable_embeddings = nn.Embedding(vocab_size, learnable_length) - # learnable_embeddings.to(device) - - embedding_length = learnable_length + pretrained_length - assert embedding_length > 0, '0-size embeddings' - - return pretrained_embeddings, learnable_embeddings, embedding_length - - -def embed(model, input, lang): - input_list = [] - if model.lpretrained_embeddings[lang]: - input_list.append(model.lpretrained_embeddings[lang](input)) - if model.llearnable_embeddings[lang]: - input_list.append(model.llearnable_embeddings[lang](input)) - return torch.cat(tensors=input_list, dim=2) - - -def embedding_dropout(input, drop_range, p_drop=0.5, training=True): - if p_drop > 0 and training and drop_range is not None: - p = p_drop - drop_from, drop_to = drop_range - m = drop_to - drop_from #length of the supervised embedding - l = input.shape[2] #total embedding length - corr = (1 - p) - input[:, :, drop_from:drop_to] = corr * F.dropout(input[:, :, drop_from:drop_to], p=p) - input /= (1 - (p * m / l)) - - return input diff --git a/src/models/lstm_class.py b/src/models/lstm_class.py deleted file mode 100755 index 98424f1..0000000 --- a/src/models/lstm_class.py +++ /dev/null @@ -1,114 +0,0 @@ -#taken from https://github.com/prakashpandey9/Text-Classification-Pytorch/blob/master/models/LSTM.py -import torch -import torch.nn as nn -from torch.autograd import Variable -from models.helpers import * - - -class RNNMultilingualClassifier(nn.Module): - - def __init__(self, output_size, hidden_size, lvocab_size, learnable_length, lpretrained=None, - drop_embedding_range=None, drop_embedding_prop=0, post_probabilities=True, only_post=False, - bert_embeddings=False): - - super(RNNMultilingualClassifier, self).__init__() - self.output_size = output_size - self.hidden_size = hidden_size - self.drop_embedding_range = drop_embedding_range - self.drop_embedding_prop = drop_embedding_prop - self.post_probabilities = post_probabilities - self.bert_embeddings = bert_embeddings - assert 0 <= drop_embedding_prop <= 1, 'drop_embedding_prop: wrong range' - - self.lpretrained_embeddings = nn.ModuleDict() - self.llearnable_embeddings = nn.ModuleDict() - self.embedding_length = None - self.langs 
= sorted(lvocab_size.keys()) - self.only_post = only_post - - self.n_layers = 1 - self.n_directions = 1 - - self.dropout = nn.Dropout(0.6) - - lstm_out = 256 - ff1 = 512 - ff2 = 256 - - lpretrained_embeddings = {} - llearnable_embeddings = {} - if only_post==False: - for l in self.langs: - pretrained = lpretrained[l] if lpretrained else None - pretrained_embeddings, learnable_embeddings, embedding_length = init_embeddings( - pretrained, lvocab_size[l], learnable_length - ) - lpretrained_embeddings[l] = pretrained_embeddings - llearnable_embeddings[l] = learnable_embeddings - self.embedding_length = embedding_length - - # self.lstm = nn.LSTM(self.embedding_length, hidden_size, dropout=0.2 if self.n_layers>1 else 0, num_layers=self.n_layers, bidirectional=(self.n_directions==2)) - self.rnn = nn.GRU(self.embedding_length, hidden_size) - self.linear0 = nn.Linear(hidden_size * self.n_directions, lstm_out) - self.lpretrained_embeddings.update(lpretrained_embeddings) - self.llearnable_embeddings.update(llearnable_embeddings) - - self.linear1 = nn.Linear(lstm_out, ff1) - self.linear2 = nn.Linear(ff1, ff2) - - if only_post: - self.label = nn.Linear(output_size, output_size) - elif post_probabilities and not bert_embeddings: - self.label = nn.Linear(ff2 + output_size, output_size) - elif bert_embeddings and not post_probabilities: - self.label = nn.Linear(ff2 + 768, output_size) - elif post_probabilities and bert_embeddings: - self.label = nn.Linear(ff2 + output_size + 768, output_size) - else: - self.label = nn.Linear(ff2, output_size) - - def forward(self, input, post, bert_embed, lang): - if self.only_post: - doc_embedding = post - else: - doc_embedding = self.transform(input, lang) - if self.post_probabilities: - doc_embedding = torch.cat([doc_embedding, post], dim=1) - if self.bert_embeddings: - doc_embedding = torch.cat([doc_embedding, bert_embed], dim=1) - - logits = self.label(doc_embedding) - return logits - - def transform(self, input, lang): - batch_size = input.shape[0] - input = embed(self, input, lang) - input = embedding_dropout(input, drop_range=self.drop_embedding_range, p_drop=self.drop_embedding_prop, - training=self.training) - input = input.permute(1, 0, 2) - h_0 = Variable(torch.zeros(self.n_layers*self.n_directions, batch_size, self.hidden_size).cuda()) - # c_0 = Variable(torch.zeros(self.n_layers*self.n_directions, batch_size, self.hidden_size).cuda()) - # output, (_, _) = self.lstm(input, (h_0, c_0)) - output, _ = self.rnn(input, h_0) - output = output[-1, :, :] - output = F.relu(self.linear0(output)) - output = self.dropout(F.relu(self.linear1(output))) - output = self.dropout(F.relu(self.linear2(output))) - return output - - def finetune_pretrained(self): - for l in self.langs: - self.lpretrained_embeddings[l].requires_grad = True - self.lpretrained_embeddings[l].weight.requires_grad = True - - def get_embeddings(self, input, lang): - batch_size = input.shape[0] - input = embed(self, input, lang) - input = embedding_dropout(input, drop_range=self.drop_embedding_range, p_drop=self.drop_embedding_prop, - training=self.training) - input = input.permute(1, 0, 2) - h_0 = Variable(torch.zeros(self.n_layers * self.n_directions, batch_size, self.hidden_size).cuda()) - output, _ = self.rnn(input, h_0) - output = output[-1, :, :] - return output.cpu().detach().numpy() - diff --git a/src/models/mBert.py b/src/models/mBert.py deleted file mode 100644 index 56695a6..0000000 --- a/src/models/mBert.py +++ /dev/null @@ -1,247 +0,0 @@ -from copy import deepcopy -import torch -from 
torch.utils.data import Dataset -from transformers import BertForSequenceClassification, BertTokenizer, AdamW, BertConfig -from sklearn.model_selection import train_test_split -from util.evaluation import * -from time import time -from util.common import show_gpu - - -def predict(logits, classification_type='multilabel'): - if classification_type == 'multilabel': - prediction = torch.sigmoid(logits) > 0.5 - elif classification_type == 'singlelabel': - prediction = torch.argmax(logits, dim=1).view(-1, 1) - else: - print('unknown classification type') - - return prediction.detach().cpu().numpy() - - -class TrainingDataset(Dataset): - - def __init__(self, data, labels): - self.langs = data.keys() - self.lang_ids = {lang: identifier for identifier, lang in enumerate(self.langs)} - - for i, lang in enumerate(self.langs): - _data = data[lang]['input_ids'] - _data = np.array(_data) - _labels = labels[lang] - _lang_value = np.full(len(_data), self.lang_ids[lang]) - - if i == 0: - self.data = _data - self.labels = _labels - self.lang_index = _lang_value - else: - self.data = np.vstack((self.data, _data)) - self.labels = np.vstack((self.labels, _labels)) - self.lang_index = np.concatenate((self.lang_index, _lang_value)) - - def __len__(self): - return len(self.data) - - def __getitem__(self, idx): - x = self.data[idx] - y = self.labels[idx] - lang = self.lang_index[idx] - - return x, torch.tensor(y, dtype=torch.float), lang - - def get_lang_ids(self): - return self.lang_ids - - def get_nclasses(self): - if hasattr(self, 'labels'): - return len(self.labels[0]) - else: - print('Method called before init!') - - -class ExtractorDataset(Dataset): - """ - data: dict of lang specific tokenized data - labels: dict of lang specific targets - """ - - def __init__(self, data): - self.langs = data.keys() - self.lang_ids = {lang: identifier for identifier, lang in enumerate(self.langs)} - - for i, lang in enumerate(self.langs): - _data = data[lang]['input_ids'] - _data = np.array(_data) - _lang_value = np.full(len(_data), self.lang_ids[lang]) - - if i == 0: - self.data = _data - self.lang_index = _lang_value - else: - self.data = np.vstack((self.data, _data)) - self.lang_index = np.concatenate((self.lang_index, _lang_value)) - - def __len__(self): - return len(self.data) - - def __getitem__(self, idx): - x = self.data[idx] - lang = self.lang_index[idx] - - return x, lang - - def get_lang_ids(self): - return self.lang_ids - - -def get_model(n_out): - print('# Initializing model ...') - model = BertForSequenceClassification.from_pretrained('bert-base-multilingual-cased', num_labels=n_out) - return model - - -def init_optimizer(model, lr, weight_decay=0): - no_decay = ['bias', 'LayerNorm.weight'] - optimizer_grouped_parameters = [ - {'params': [p for n, p in model.named_parameters() - if not any(nd in n for nd in no_decay)], - 'weight_decay': weight_decay}, - {'params': [p for n, p in model.named_parameters() - if any(nd in n for nd in no_decay)], - 'weight_decay': weight_decay} - ] - optimizer = AdamW(optimizer_grouped_parameters, lr=lr) - return optimizer - - -def get_lr(optimizer): - for param_group in optimizer.param_groups: - return param_group['lr'] - - -def get_tr_val_split(l_tokenized_tr, l_devel_target, val_prop, max_val, seed): - l_split_va = deepcopy(l_tokenized_tr) - l_split_val_target = {l: [] for l in l_tokenized_tr.keys()} - l_split_tr = deepcopy(l_tokenized_tr) - l_split_tr_target = {l: [] for l in l_tokenized_tr.keys()} - - for lang in l_tokenized_tr.keys(): - val_size = 
int(min(len(l_tokenized_tr[lang]['input_ids']) * val_prop, max_val)) - l_split_tr[lang]['input_ids'], l_split_va[lang]['input_ids'], l_split_tr_target[lang], l_split_val_target[ - lang] = \ - train_test_split(l_tokenized_tr[lang]['input_ids'], l_devel_target[lang], test_size=val_size, - random_state=seed, shuffle=True) - - return l_split_tr, l_split_tr_target, l_split_va, l_split_val_target - - -def do_tokenization(l_dataset, max_len=512, verbose=True): - if verbose: - print('# Starting Tokenization ...') - tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased') - langs = l_dataset.keys() - l_tokenized = {} - for lang in langs: - l_tokenized[lang] = tokenizer(l_dataset[lang], - truncation=True, - max_length=max_len, - padding='max_length') - return l_tokenized - - -def train(model, train_dataloader, epoch, criterion, optim, method_name, tinit, logfile, log_interval=10): - # _dataset_path = opt.dataset.split('/')[-1].split('_') - # dataset_id = _dataset_path[0] + _dataset_path[-1] - dataset_id = 'TODO fix this!' # TODO - - loss_history = [] - model.train() - - for idx, (batch, target, lang_idx) in enumerate(train_dataloader): - optim.zero_grad() - out = model(batch.cuda()) - logits = out[0] - loss = criterion(logits, target.cuda()) - loss.backward() - # clip_gradient(model) - optim.step() - loss_history.append(loss.item()) - - if idx % log_interval == 0: - interval_loss = np.mean(loss_history[log_interval:]) - print( - f'{dataset_id} {method_name} Epoch: {epoch}, Step: {idx}, lr={get_lr(optim):.5f}, Training Loss: {interval_loss:.6f}') - - mean_loss = np.mean(interval_loss) - logfile.add_row(epoch=epoch, measure='tr_loss', value=mean_loss, timelapse=time() - tinit) - return mean_loss - - -def test(model, test_dataloader, lang_ids, tinit, epoch, logfile, criterion, measure_prefix): - print('# Validating model ...') - loss_history = [] - model.eval() - langs = lang_ids.keys() - id_2_lang = {v: k for k, v in lang_ids.items()} - predictions = {l: [] for l in langs} - yte_stacked = {l: [] for l in langs} - - for batch, target, lang_idx in test_dataloader: - out = model(batch.cuda()) - logits = out[0] - loss = criterion(logits, target.cuda()).item() - prediction = predict(logits) - loss_history.append(loss) - - # Assigning prediction to dict in predictions and yte_stacked according to lang_idx - for i, pred in enumerate(prediction): - lang_pred = id_2_lang[lang_idx.numpy()[i]] - predictions[lang_pred].append(pred) - yte_stacked[lang_pred].append(target[i].detach().cpu().numpy()) - - ly = {l: np.vstack(yte_stacked[l]) for l in langs} - ly_ = {l: np.vstack(predictions[l]) for l in langs} - l_eval = evaluate(ly, ly_) - metrics = [] - for lang in langs: - macrof1, microf1, macrok, microk = l_eval[lang] - metrics.append([macrof1, microf1, macrok, microk]) - if measure_prefix == 'te': - print(f'Lang {lang}: macro-F1={macrof1:.3f} micro-F1={microf1:.3f}') - Mf1, mF1, MK, mk = np.mean(np.array(metrics), axis=0) - print(f'[{measure_prefix}] Averages: MF1, mF1, MK, mK [{Mf1:.5f}, {mF1:.5f}, {MK:.5f}, {mk:.5f}]') - - mean_loss = np.mean(loss_history) - logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-macro-F1', value=Mf1, timelapse=time() - tinit) - logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-micro-F1', value=mF1, timelapse=time() - tinit) - logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-macro-K', value=MK, timelapse=time() - tinit) - logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-micro-K', value=mk, timelapse=time() - tinit) - 
logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-loss', value=mean_loss, timelapse=time() - tinit) - - return Mf1 - - -def feature_extractor(data, lang_ids, model): - print('# Feature Extractor Mode...') - """ - Hidden State = Tuple of torch.FloatTensor (one for the output of the embeddings + one for - the output of each layer) of shape (batch_size, sequence_length, hidden_size) - """ - # show_gpu('Before Training') - all_batch_embeddings = {} - id2lang = {v: k for k, v in lang_ids.items()} - with torch.no_grad(): - for batch, lang_idx in data: - out = model(batch.cuda()) - # show_gpu('After Batch Prediction') - last_hidden_state = out[1][-1] - batch_embeddings = last_hidden_state[:, 0, :] - for i, l_idx in enumerate(lang_idx.numpy()): - if id2lang[l_idx] not in all_batch_embeddings.keys(): - all_batch_embeddings[id2lang[l_idx]] = batch_embeddings[i].detach().cpu().numpy() - else: - all_batch_embeddings[id2lang[l_idx]] = np.vstack((all_batch_embeddings[id2lang[l_idx]], - batch_embeddings[i].detach().cpu().numpy())) - # show_gpu('After Full Prediction') - return all_batch_embeddings, id2lang diff --git a/src/results/results_manager.py b/src/results/results_manager.py deleted file mode 100644 index 1fe57dd..0000000 --- a/src/results/results_manager.py +++ /dev/null @@ -1,11 +0,0 @@ -import pandas as pd -import numpy as np - -# df = pd.read_csv("/home/andreapdr/funneling_pdr/src/results/final_results.csv", delimiter='\t') -df = pd.read_csv("10run_rcv_final_results.csv", delimiter='\t') -pivot = pd.pivot_table(df, values=['macrof1', 'microf1', 'macrok', 'microk'], index=['method', 'id', 'optimp', 'zscore', 'l2', 'wescaler', 'pca', 'sif'], aggfunc=[np.mean, np.std]) -with pd.option_context('display.max_rows', None): - print(pivot.round(3)) -print('Finished ...') - - diff --git a/src/util/SIF_embed.py b/src/util/SIF_embed.py deleted file mode 100644 index cfe096e..0000000 --- a/src/util/SIF_embed.py +++ /dev/null @@ -1,56 +0,0 @@ -import numpy as np -from sklearn.decomposition import TruncatedSVD - -def get_weighted_average(We, x, w): - """ - Compute the weighted average vectors - :param We: We[i,:] is the vector for word i - :param x: x[i, :] are the indices of the words in sentence i - :param w: w[i, :] are the weights for the words in sentence i - :return: emb[i, :] are the weighted average vector for sentence i - """ - n_samples = x.shape[0] - emb = np.zeros((n_samples, We.shape[1])) - for i in range(n_samples): - emb[i,:] = w[i,:].dot(We[x[i,:],:]) / np.count_nonzero(w[i,:]) - return emb - -def compute_pc(X,npc=1): - """ - Compute the principal components. 
- :param X: X[i,:] is a data point - :param npc: number of principal components to remove - :return: component_[i,:] is the i-th pc - """ - svd = TruncatedSVD(n_components=npc, n_iter=7, random_state=0) - svd.fit(X) - return svd.components_ - -def remove_pc(X, npc=1): - """ - Remove the projection on the principal components - :param X: X[i,:] is a data point - :param npc: number of principal components to remove - :return: XX[i, :] is the data point after removing its projection - """ - pc = compute_pc(X, npc) - if npc==1: - XX = X - X.dot(pc.transpose()) * pc - else: - XX = X - X.dot(pc.transpose()).dot(pc) - return XX - - -def SIF_embedding(We, x, w, params): - """ - Compute the scores between pairs of sentences using weighted average + removing the projection on the first principal component - :param We: We[i,:] is the vector for word i - :param x: x[i, :] are the indices of the words in the i-th sentence - :param w: w[i, :] are the weights for the words in the i-th sentence - :param params.rmpc: if >0, remove the projections of the sentence embeddings to their first principal component - :return: emb, emb[i, :] is the embedding for sentence i - """ - emb = get_weighted_average(We, x, w) - if params.rmpc > 0: - emb = remove_pc(emb, params.rmpc) - return emb \ No newline at end of file diff --git a/src/util/common.py b/src/util/common.py deleted file mode 100755 index 48a0525..0000000 --- a/src/util/common.py +++ /dev/null @@ -1,542 +0,0 @@ -import subprocess -import warnings -from sklearn.feature_extraction.text import TfidfVectorizer -from sklearn.svm import SVC -from sklearn.model_selection import train_test_split -from embeddings.supervised import get_supervised_embeddings -import numpy as np -from tqdm import tqdm -import torch -warnings.filterwarnings("ignore", category=DeprecationWarning) - - -def index(data, vocab, known_words, analyzer, unk_index, out_of_vocabulary): - """ - Index (i.e., replaces word strings with numerical indexes) a list of string documents - :param data: list of string documents - :param vocab: a fixed mapping [str]->[int] of words to indexes - :param known_words: a set of known words (e.g., words that, despite not being included in the vocab, can be retained - because they are anyway contained in a pre-trained embedding set that we know in advance) - :param analyzer: the preprocessor in charge of transforming the document string into a chain of string words - :param unk_index: the index of the 'unknown token', i.e., a symbol that characterizes all words that we cannot keep - :param out_of_vocabulary: an incremental mapping [str]->[int] of words to indexes that will index all those words that - are not in the original vocab but that are in the known_words - :return: - """ - indexes=[] - vocabsize = len(vocab) - unk_count = 0 - knw_count = 0 - out_count = 0 - pbar = tqdm(data, desc=f'indexing documents') - for text in pbar: - words = analyzer(text) - index = [] - for word in words: - if word in vocab: - idx = vocab[word] - else: - if word in known_words: - if word not in out_of_vocabulary: - out_of_vocabulary[word] = vocabsize+len(out_of_vocabulary) - idx = out_of_vocabulary[word] - out_count += 1 - else: - idx = unk_index - unk_count += 1 - index.append(idx) - indexes.append(index) - knw_count += len(index) - pbar.set_description(f'[unk = {unk_count}/{knw_count}={(100.*unk_count/knw_count):.2f}%]' - f'[out = {out_count}/{knw_count}={(100.*out_count/knw_count):.2f}%]') - return indexes - - -def define_pad_length(index_list): - lengths = [len(index) for index 
in index_list] - return int(np.mean(lengths)+np.std(lengths)) - - -def pad(index_list, pad_index, max_pad_length=None): - pad_length = np.max([len(index) for index in index_list]) - if max_pad_length is not None: - pad_length = min(pad_length, max_pad_length) - for i,indexes in enumerate(index_list): - index_list[i] = [pad_index]*(pad_length-len(indexes)) + indexes[:pad_length] - return index_list - - -class Index: - def __init__(self, devel_raw, devel_target, test_raw, lang): - self.lang = lang - self.devel_raw = devel_raw - self.devel_target = devel_target - self.test_raw = test_raw - - def index(self, pretrained_vocabulary, analyzer, vocabulary): - self.word2index = dict(vocabulary) # word2idx - known_words = set(self.word2index.keys()) - if pretrained_vocabulary is not None: - known_words.update(pretrained_vocabulary) - - self.word2index['UNKTOKEN'] = len(self.word2index) - self.word2index['PADTOKEN'] = len(self.word2index) - self.unk_index = self.word2index['UNKTOKEN'] - self.pad_index = self.word2index['PADTOKEN'] - - # index documents and keep track of test terms outside the development vocabulary that are in Muse (if available) - self.out_of_vocabulary = dict() - self.devel_index = index(self.devel_raw, self.word2index, known_words, analyzer, self.unk_index, self.out_of_vocabulary) - self.test_index = index(self.test_raw, self.word2index, known_words, analyzer, self.unk_index, self.out_of_vocabulary) - - self.vocabsize = len(self.word2index) + len(self.out_of_vocabulary) - - print(f'[indexing complete for lang {self.lang}] vocabulary-size={self.vocabsize}') - - def train_val_split(self, val_prop, max_val, seed): - devel = self.devel_index - target = self.devel_target - devel_raw = self.devel_raw - - val_size = int(min(len(devel) * val_prop, max_val)) - - self.train_index, self.val_index, self.train_target, self.val_target, self.train_raw, self.val_raw = \ - train_test_split( - devel, target, devel_raw, test_size=val_size, random_state=seed, shuffle=True - ) - - print(f'split lang {self.lang}: train={len(self.train_index)} val={len(self.val_index)} test={len(self.test_index)}') - - def get_word_list(self): - def extract_word_list(word2index): - return [w for w,i in sorted(word2index.items(), key=lambda x: x[1])] - - word_list = extract_word_list(self.word2index) - word_list += extract_word_list(self.out_of_vocabulary) - return word_list - - def compose_embedding_matrix(self, pretrained, supervised, Xtr=None, Ytr=None): - print(f'[generating embedding matrix for lang {self.lang}]') - - self.wce_range = None - embedding_parts = [] - - if pretrained is not None: - print('\t[pretrained-matrix]') - word_list = self.get_word_list() - muse_embeddings = pretrained.extract(word_list) - embedding_parts.append(muse_embeddings) - del pretrained - - if supervised: - print('\t[supervised-matrix]') - F = get_supervised_embeddings(Xtr, Ytr, reduction=None, method='dotn') - num_missing_rows = self.vocabsize - F.shape[0] - F = np.vstack((F, np.zeros(shape=(num_missing_rows, F.shape[1])))) - F = torch.from_numpy(F).float() - - offset = 0 - if embedding_parts: - offset = embedding_parts[0].shape[1] - self.wce_range = [offset, offset + F.shape[1]] - - embedding_parts.append(F) - - make_dumps = False - if make_dumps: - print(f'Dumping Embedding Matrices ...') - import pickle - with open(f'../dumps/dump_{self.lang}_rcv.pkl', 'wb') as outfile: - pickle.dump((self.lang, embedding_parts, self.word2index), outfile) - with open(f'../dumps/corpus_{self.lang}_rcv.pkl', 'wb') as outfile2: - 
pickle.dump((self.lang, self.devel_raw, self.devel_target), outfile2) - - self.embedding_matrix = torch.cat(embedding_parts, dim=1) - - print(f'[embedding matrix for lang {self.lang} has shape {self.embedding_matrix.shape}]') - - -def none_dict(langs): - return {l:None for l in langs} - - -class MultilingualIndex: - def __init__(self): #, add_language_trace=False): - self.l_index = {} - self.l_vectorizer = TfidfVectorizerMultilingual(sublinear_tf=True, use_idf=True) - # self.l_vectorizer = TfidfVectorizerMultilingual(sublinear_tf=True, use_idf=True, max_features=25000) - # self.add_language_trace=add_language_trace} - - def index(self, l_devel_raw, l_devel_target, l_test_raw, l_pretrained_vocabulary): - self.langs = sorted(l_devel_raw.keys()) - - #build the vocabularies - self.l_vectorizer.fit(l_devel_raw) - l_vocabulary = self.l_vectorizer.vocabulary() - l_analyzer = self.l_vectorizer.get_analyzer() - - for l in self.langs: - self.l_index[l] = Index(l_devel_raw[l], l_devel_target[l], l_test_raw[l], l) - self.l_index[l].index(l_pretrained_vocabulary[l], l_analyzer[l], l_vocabulary[l]) - - def get_indexed(self, l_texts, pretrained_vocabulary=None): - assert len(self.l_index) != 0, 'Cannot index data without first index call to multilingual index!' - l_indexed = {} - for l, texts in l_texts.items(): - if l in self.langs: - word2index = self.l_index[l].word2index - known_words = set(word2index.keys()) - if pretrained_vocabulary[l] is not None: - known_words.update(pretrained_vocabulary[l]) - l_indexed[l] = index(texts, - vocab=word2index, - known_words=known_words, - analyzer=self.l_vectorizer.get_analyzer(l), - unk_index=word2index['UNKTOKEN'], - out_of_vocabulary=dict()) - return l_indexed - - def train_val_split(self, val_prop=0.2, max_val=2000, seed=42): - for l,index in self.l_index.items(): - index.train_val_split(val_prop, max_val, seed=seed) - - def embedding_matrices(self, lpretrained, supervised): - lXtr = self.get_lXtr() if supervised else none_dict(self.langs) - lYtr = self.l_train_target() if supervised else none_dict(self.langs) - for l,index in self.l_index.items(): - index.compose_embedding_matrix(lpretrained[l], supervised, lXtr[l], lYtr[l]) - self.sup_range = index.wce_range - - - def bert_embeddings(self, bert_path, max_len=512, batch_size=64, stored_embeddings=False): - show_gpu('GPU memory before initializing mBert model:') - # TODO: load dumped embeddings? 
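
embedding_matrices and compose_embedding_matrix above build, for every language, one embedding matrix by concatenating the pretrained (MUSE) block and the supervised (WCE) block column-wise, recording the column range of the supervised block so that embedding dropout can later be restricted to those dimensions. A small sketch with hypothetical sizes:

# Sketch only: column-wise concatenation of pretrained and supervised embedding blocks.
import torch

vocab_size, muse_dim, n_classes = 1000, 300, 73
muse_part = torch.randn(vocab_size, muse_dim)   # pretrained word vectors
wce_part = torch.randn(vocab_size, n_classes)   # word-class embeddings, one column per class

wce_range = [muse_dim, muse_dim + n_classes]    # half-open column interval of the WCE block
embedding_matrix = torch.cat([muse_part, wce_part], dim=1)
print(embedding_matrix.shape, wce_range)        # torch.Size([1000, 373]) [300, 373]
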
- from experiment_scripts.main_mbert_extractor import do_tokenization, ExtractorDataset, DataLoader - from transformers import BertConfig, BertForSequenceClassification - - print('[mBERT] generating mBERT doc embeddings') - lXtr_raw = self.get_raw_lXtr() - lXva_raw = self.get_raw_lXva() - lXte_raw = self.get_raw_lXte() - - print('# Tokenizing datasets') - l_tokenized_tr = do_tokenization(lXtr_raw, max_len=max_len, verbose=False) - tr_dataset = ExtractorDataset(l_tokenized_tr) - tr_lang_ids = tr_dataset.lang_ids - tr_dataloader = DataLoader(tr_dataset, batch_size=batch_size, shuffle=False) - - l_tokenized_va = do_tokenization(lXva_raw, max_len=max_len, verbose=False) - va_dataset = ExtractorDataset(l_tokenized_va) - va_lang_ids = va_dataset.lang_ids - va_dataloader = DataLoader(va_dataset, batch_size=batch_size, shuffle=False) - - l_tokenized_te = do_tokenization(lXte_raw, max_len=max_len, verbose=False) - te_dataset = ExtractorDataset(l_tokenized_te) - te_lang_ids = te_dataset.lang_ids - te_dataloader = DataLoader(te_dataset, batch_size=batch_size, shuffle=False) - - num_labels = self.l_index[self.langs[0]].val_target.shape[1] - config = BertConfig.from_pretrained('bert-base-multilingual-cased', output_hidden_states=True, - num_labels=num_labels) - model = BertForSequenceClassification.from_pretrained(bert_path, - config=config).cuda() - print('# Extracting document embeddings') - tr_bert_embeddings, id2lang_tr = self.do_bert_embeddings(model, tr_dataloader, tr_lang_ids, verbose=False) - va_bert_embeddings, id2lang_va = self.do_bert_embeddings(model, va_dataloader, va_lang_ids, verbose=False) - te_bert_embeddings, id2lang_te = self.do_bert_embeddings(model, te_dataloader, te_lang_ids, verbose=False) - - show_gpu('GPU memory before after mBert model:') - # Freeing GPU's memory - import gc - del model, tr_dataloader, va_dataloader, te_dataloader - gc.collect() - torch.cuda.empty_cache() - show_gpu('GPU memory after clearing cache:') - return tr_bert_embeddings, va_bert_embeddings, te_bert_embeddings - - - @staticmethod - def do_bert_embeddings(model, data, lang_ids, verbose=True): - if verbose: - print('# Feature Extractor Mode...') - all_batch_embeddings = {} - id2lang = {v: k for k, v in lang_ids.items()} - with torch.no_grad(): - for batch, lang_idx in data: - out = model(batch.cuda()) - last_hidden_state = out[1][-1] - batch_embeddings = last_hidden_state[:, 0, :] - for i, l_idx in enumerate(lang_idx.numpy()): - if id2lang[l_idx] not in all_batch_embeddings.keys(): - all_batch_embeddings[id2lang[l_idx]] = batch_embeddings[i].detach().cpu().numpy() - else: - all_batch_embeddings[id2lang[l_idx]] = np.vstack((all_batch_embeddings[id2lang[l_idx]], - batch_embeddings[i].detach().cpu().numpy())) - - return all_batch_embeddings, id2lang - - def get_raw_lXtr(self): - lXtr_raw = {k:[] for k in self.langs} - lYtr_raw = {k: [] for k in self.langs} - for lang in self.langs: - lXtr_raw[lang] = self.l_index[lang].train_raw - lYtr_raw[lang] = self.l_index[lang].train_raw - return lXtr_raw - - def get_raw_lXva(self): - lXva_raw = {k: [] for k in self.langs} - for lang in self.langs: - lXva_raw[lang] = self.l_index[lang].val_raw - - return lXva_raw - - def get_raw_lXte(self): - lXte_raw = {k: [] for k in self.langs} - for lang in self.langs: - lXte_raw[lang] = self.l_index[lang].test_raw - - return lXte_raw - - def get_lXtr(self): - if not hasattr(self, 'lXtr'): - self.lXtr = self.l_vectorizer.transform({l: index.train_raw for l, index in self.l_index.items()}) - return self.lXtr - - def get_lXva(self): 
- if not hasattr(self, 'lXva'): - self.lXva = self.l_vectorizer.transform({l: index.val_raw for l, index in self.l_index.items()}) - return self.lXva - - def get_lXte(self): - if not hasattr(self, 'lXte'): - self.lXte = self.l_vectorizer.transform({l: index.test_raw for l, index in self.l_index.items()}) - return self.lXte - - def l_vocabsize(self): - return {l:index.vocabsize for l,index in self.l_index.items()} - - def l_embeddings(self): - return {l:index.embedding_matrix for l,index in self.l_index.items()} - - def l_pad(self): - return {l: index.pad_index for l, index in self.l_index.items()} - - def l_train_index(self): - return {l: index.train_index for l, index in self.l_index.items()} - - def l_train_target(self): - return {l: index.train_target for l, index in self.l_index.items()} - - def l_val_index(self): - return {l: index.val_index for l, index in self.l_index.items()} - - def l_val_target(self): - return {l: index.val_target for l, index in self.l_index.items()} - - def l_test_index(self): - return {l: index.test_index for l, index in self.l_index.items()} - - def l_devel_index(self): - return {l: index.devel_index for l, index in self.l_index.items()} - - def l_devel_target(self): - return {l: index.devel_target for l, index in self.l_index.items()} - - def l_train(self): - return self.l_train_index(), self.l_train_target() - - def l_val(self): - return self.l_val_index(), self.l_val_target() - - -class Batch: - def __init__(self, batchsize, batches_per_epoch, languages, lpad, max_pad_length=500): - self.batchsize = batchsize - self.batches_per_epoch = batches_per_epoch - self.languages = languages - self.lpad=lpad - self.max_pad_length=max_pad_length - self.init_offset() - - def init_offset(self): - self.offset = {lang: 0 for lang in self.languages} - - def batchify(self, l_index, l_post, l_bert, llabels): - langs = self.languages - l_num_samples = {l:len(l_index[l]) for l in langs} - - max_samples = max(l_num_samples.values()) - n_batches = max_samples // self.batchsize + 1 * (max_samples % self.batchsize > 0) - if self.batches_per_epoch != -1 and self.batches_per_epoch < n_batches: - n_batches = self.batches_per_epoch - - for b in range(n_batches): - for lang in langs: - index, labels = l_index[lang], llabels[lang] - offset = self.offset[lang] - if offset >= l_num_samples[lang]: - offset = 0 - limit = offset+self.batchsize - - batch_slice = slice(offset, limit) - batch = index[batch_slice] - batch_labels = labels[batch_slice].toarray() - - post = None - if l_post is not None: - post = torch.FloatTensor(l_post[lang][batch_slice]).cuda() - - bert_emb = None - if l_bert is not None: - bert_emb = torch.FloatTensor(l_bert[lang][batch_slice]).cuda() - - batch = pad(batch, pad_index=self.lpad[lang], max_pad_length=self.max_pad_length) - - batch = torch.LongTensor(batch).cuda() - target = torch.FloatTensor(batch_labels).cuda() - - self.offset[lang] = limit - - yield batch, post, bert_emb, target, lang - - -def batchify(l_index, l_post, llabels, batchsize, lpad, max_pad_length=500): - langs = sorted(l_index.keys()) - nsamples = max([len(l_index[l]) for l in langs]) - nbatches = nsamples // batchsize + 1*(nsamples%batchsize>0) - for b in range(nbatches): - for lang in langs: - index, labels = l_index[lang], llabels[lang] - - if b * batchsize >= len(index): - continue - batch = index[b*batchsize:(b+1)*batchsize] - batch_labels = labels[b*batchsize:(b+1)*batchsize].toarray() - post = None - if l_post is not None: - post = 
torch.FloatTensor(l_post[lang][b*batchsize:(b+1)*batchsize]).cuda() - batch = pad(batch, pad_index=lpad[lang], max_pad_length=max_pad_length) - batch = torch.LongTensor(batch) - target = torch.FloatTensor(batch_labels) - yield batch.cuda(), post, target.cuda(), lang - - -def batchify_unlabelled(index_list, batchsize, pad_index, max_pad_length=500): - nsamples = len(index_list) - nbatches = nsamples // batchsize + 1*(nsamples%batchsize>0) - for b in range(nbatches): - batch = index_list[b*batchsize:(b+1)*batchsize] - batch = pad(batch, pad_index=pad_index, max_pad_length=max_pad_length) - batch = torch.LongTensor(batch) - yield batch.cuda() - - -def clip_gradient(model, clip_value=1e-1): - params = list(filter(lambda p: p.grad is not None, model.parameters())) - for p in params: - p.grad.data.clamp_(-clip_value, clip_value) - - -def predict(logits, classification_type='multilabel'): - if classification_type == 'multilabel': - prediction = torch.sigmoid(logits) > 0.5 - elif classification_type == 'singlelabel': - prediction = torch.argmax(logits, dim=1).view(-1, 1) - else: - print('unknown classification type') - - return prediction.detach().cpu().numpy() - - -def count_parameters(model): - return sum(p.numel() for p in model.parameters() if p.requires_grad) - - -def show_gpu(msg): - """ - ref: https://discuss.pytorch.org/t/access-gpu-memory-usage-in-pytorch/3192/4 - """ - - def query(field): - return (subprocess.check_output( - ['nvidia-smi', f'--query-gpu={field}', - '--format=csv,nounits,noheader'], - encoding='utf-8')) - - def to_int(result): - return int(result.strip().split('\n')[0]) - - used = to_int(query('memory.used')) - total = to_int(query('memory.total')) - pct = used / total - print('\n' + msg, f'{100 * pct:2.1f}% ({used} out of {total})') - - -class TfidfVectorizerMultilingual: - - def __init__(self, **kwargs): - self.kwargs = kwargs - - def fit(self, lX, ly=None): - self.langs = sorted(lX.keys()) - self.vectorizer = {l: TfidfVectorizer(**self.kwargs).fit(lX[l]) for l in self.langs} - # self.vectorizer = {l: TfidfVectorizer(**self.kwargs).fit(lX[l]) for l in lX.keys()} - return self - - def transform(self, lX): - return {l: self.vectorizer[l].transform(lX[l]) for l in self.langs} - # return {l: self.vectorizer[l].transform(lX[l]) for l in lX.keys()} - - def fit_transform(self, lX, ly=None): - return self.fit(lX, ly).transform(lX) - - def vocabulary(self, l=None): - if l is None: - return {l: self.vectorizer[l].vocabulary_ for l in self.langs} - else: - return self.vectorizer[l].vocabulary_ - - def get_analyzer(self, l=None): - if l is None: - return {l: self.vectorizer[l].build_analyzer() for l in self.langs} - else: - return self.vectorizer[l].build_analyzer() - - -def get_learner(calibrate=False, kernel='linear', C=1): - return SVC(kernel=kernel, probability=calibrate, cache_size=1000, C=C, random_state=1, gamma='auto', verbose=False) - - -def get_params(optimc=False): - if not optimc: - return None - c_range = [1e4, 1e3, 1e2, 1e1, 1, 1e-1] - kernel = 'rbf' - return [{'kernel': [kernel], 'C': c_range, 'gamma':['auto']}] - - -def get_method_name(dataset, posteriors, supervised, pretrained, mbert, gru, - gruMUSE, gruWCE, agg, allprob): - _id = '-' - _id_conf = [posteriors, supervised, pretrained, mbert, gru] - _id_name = ['X', 'W', 'M', 'B', 'G'] - for i, conf in enumerate(_id_conf): - if conf: - _id += _id_name[i] - _id = _id if not gruMUSE else _id + '_muse' - _id = _id if not gruWCE else _id + '_wce' - _id = _id if not agg else _id + '_mean' - _id = _id if not allprob else 
_id + '_allprob' - - _dataset_path = dataset.split('/')[-1].split('_') - dataset_id = _dataset_path[0] + _dataset_path[-1] - return _id, dataset_id - - -def get_zscl_setting(langs): - settings = [] - for elem in langs: - for tar in langs: - if elem != tar: - settings.append((elem, tar)) - return settings \ No newline at end of file diff --git a/src/util/csv_log.py b/src/util/csv_log.py deleted file mode 100755 index 8c11e36..0000000 --- a/src/util/csv_log.py +++ /dev/null @@ -1,60 +0,0 @@ -import os -import pandas as pd -pd.set_option('display.max_rows', 500) -pd.set_option('display.max_columns', 500) -pd.set_option('display.width', 1000) - - -class CSVLog: - - def __init__(self, file, columns=None, autoflush=True, verbose=False, overwrite=False): - self.file = file - self.autoflush = autoflush - self.verbose = verbose - if os.path.exists(file) and not overwrite: - self.tell('Loading existing file from {}'.format(file)) - self.df = pd.read_csv(file, sep='\t') - self.columns = sorted(self.df.columns.values.tolist()) - else: - self.tell('File {} does not exist or overwrite=True. Creating new frame.'.format(file)) - assert columns is not None, 'columns cannot be None' - self.columns = sorted(columns) - dir = os.path.dirname(self.file) - if dir and not os.path.exists(dir): os.makedirs(dir) - self.df = pd.DataFrame(columns=self.columns) - self.defaults={} - - def already_calculated(self, **kwargs): - df = self.df - if df.shape[0]==0: - return False - if len(kwargs)==0: - kwargs = self.defaults - for key,val in kwargs.items(): - df = df.loc[df[key]==val] - if df.shape[0]==0: return False - return True - - def set_default(self, param, value): - self.defaults[param]=value - - def add_row(self, **kwargs): - for key in self.defaults.keys(): - if key not in kwargs: - kwargs[key]=self.defaults[key] - colums = sorted(list(kwargs.keys())) - values = [kwargs[col_i] for col_i in colums] - s = pd.Series(values, index=self.columns) - self.df = self.df.append(s, ignore_index=True) - if self.autoflush: self.flush() - # self.tell(s.to_string()) - self.tell(kwargs) - - def flush(self): - self.df.to_csv(self.file, index=False, sep='\t') - - def tell(self, msg): - if self.verbose: print(msg) - - - diff --git a/src/util/decompositions.py b/src/util/decompositions.py deleted file mode 100644 index 9d14a0c..0000000 --- a/src/util/decompositions.py +++ /dev/null @@ -1,50 +0,0 @@ -from sklearn.decomposition import PCA -import numpy as np -import matplotlib.pyplot as plt - - -def run_pca(dim, X): - """ - :param dim: number of pca components to keep - :param X: dictionary str(lang): matrix - :return: dict lang: reduced matrix - """ - r = dict() - pca = PCA(n_components=dim) - for lang in X.keys(): - r[lang] = pca.fit_transform(X[lang]) - return r - - -def get_optimal_dim(X, embed_type): - """ - :param X: dict str(lang) : csr_matrix of embeddings unsupervised or supervised - :param embed_type: (str) embedding matrix type: S or U (WCE supervised or U unsupervised MUSE/FASTTEXT) - :return: - """ - _idx = [] - - plt.figure(figsize=(15, 10)) - if embed_type == 'U': - plt.title(f'Unsupervised Embeddings {"TODO"} Explained Variance') - else: - plt.title(f'WCE Explained Variance') - plt.xlabel('Number of Components') - plt.ylabel('Variance (%)') - - for lang in X.keys(): - pca = PCA(n_components=X[lang].shape[1]) - pca.fit(X[lang]) - _r = pca.explained_variance_ratio_ - _r = np.cumsum(_r) - plt.plot(_r, label=lang) - for i in range(len(_r) - 1, 1, -1): - delta = _r[i] - _r[i - 1] - if delta > 0: - _idx.append(i) - break - 
best_n = max(_idx) - plt.axvline(best_n, color='r', label='optimal N') - plt.legend() - plt.show() - return best_n diff --git a/src/util/early_stop.py b/src/util/early_stop.py deleted file mode 100755 index 7d72cde..0000000 --- a/src/util/early_stop.py +++ /dev/null @@ -1,71 +0,0 @@ -#adapted from https://github.com/Bjarten/early-stopping-pytorch/blob/master/pytorchtools.py -import torch -from transformers import BertForSequenceClassification -from time import time -from util.file import create_if_not_exist -import warnings - -class EarlyStopping: - - def __init__(self, model, optimizer, patience=20, verbose=True, checkpoint='./checkpoint.pt', is_bert=False): - # set patience to 0 or -1 to avoid stopping, but still keeping track of the best value and model parameters - self.patience_limit = patience - self.patience = patience - self.verbose = verbose - self.best_score = None - self.best_epoch = None - self.stop_time = None - self.checkpoint = checkpoint - self.model = model - self.optimizer = optimizer - self.STOP = False - self.is_bert = is_bert - - def __call__(self, watch_score, epoch): - - if self.STOP: - return - - if self.best_score is None or watch_score >= self.best_score: - self.best_score = watch_score - self.best_epoch = epoch - self.stop_time = time() - if self.checkpoint: - self.print(f'[early-stop] improved, saving model in {self.checkpoint}') - if self.is_bert: - print(f'Serializing Huggingface model...') - create_if_not_exist(self.checkpoint) - self.model.save_pretrained(self.checkpoint) - else: - with warnings.catch_warnings(): - warnings.simplefilter("ignore") - torch.save(self.model, self.checkpoint) - # with open(self.checkpoint) - # torch.save({'state_dict': self.model.state_dict(), - # 'optimizer_state_dict': self.optimizer.state_dict()}, self.checkpoint) - else: - self.print(f'[early-stop] improved') - self.patience = self.patience_limit - else: - self.patience -= 1 - if self.patience == 0: - self.STOP = True - self.print(f'[early-stop] patience exhausted') - else: - if self.patience>0: # if negative, then early-stop is ignored - self.print(f'[early-stop] patience={self.patience}') - - def reinit_counter(self): - self.STOP = False - self.patience=self.patience_limit - - def restore_checkpoint(self): - print(f'restoring best model from epoch {self.best_epoch}...') - if self.is_bert: - return BertForSequenceClassification.from_pretrained(self.checkpoint) - else: - return torch.load(self.checkpoint) - - def print(self, msg): - if self.verbose: - print(msg) diff --git a/src/util/evaluation.py b/src/util/evaluation.py deleted file mode 100644 index 41a2813..0000000 --- a/src/util/evaluation.py +++ /dev/null @@ -1,102 +0,0 @@ -# from sklearn.externals.joblib import Parallel, delayed -from joblib import Parallel, delayed -from util.metrics import * -from sklearn.metrics import f1_score -import numpy as np -import time - - -def evaluation_metrics(y, y_): - if len(y.shape)==len(y_.shape)==1 and len(np.unique(y))>2: #single-label - raise NotImplementedError()#return f1_score(y,y_,average='macro'), f1_score(y,y_,average='micro') - else: #the metrics I implemented assume multiclass multilabel classification as binary classifiers - return macroF1(y, y_), microF1(y, y_), macroK(y, y_), microK(y, y_) - - -def soft_evaluation_metrics(y, y_): - if len(y.shape)==len(y_.shape)==1 and len(np.unique(y))>2: #single-label - raise NotImplementedError()#return f1_score(y,y_,average='macro'), f1_score(y,y_,average='micro') - else: #the metrics I implemented assume multiclass multilabel 
classification as binary classifiers - return smoothmacroF1(y, y_), smoothmicroF1(y, y_), smoothmacroK(y, y_), smoothmicroK(y, y_) - - -def evaluate(ly_true, ly_pred, metrics=evaluation_metrics, n_jobs=-1): - print('evaluation (n_jobs={})'.format(n_jobs)) - if n_jobs == 1: - return {lang: metrics(ly_true[lang], ly_pred[lang]) for lang in ly_true.keys()} - else: - langs = list(ly_true.keys()) - evals = Parallel(n_jobs=n_jobs)(delayed(metrics)(ly_true[lang], ly_pred[lang]) for lang in langs) - return {lang: evals[i] for i, lang in enumerate(langs)} - - -def average_results(l_eval, show=True): - metrics = [] - for lang in l_eval.keys(): - macrof1, microf1, macrok, microk = l_eval[lang] - metrics.append([macrof1, microf1, macrok, microk]) - if show: - print('Lang %s: macro-F1=%.3f micro-F1=%.3f' % (lang, macrof1, microf1)) - - ave = np.mean(np.array(metrics), axis=0) - if show: - print('Averages: MF1, mF1, MK, mK', ave) - return ave - - -def evaluate_method(polylingual_method, lX, ly, predictor=None, soft=False, return_time=False): - tinit = time.time() - print('prediction for test') - assert set(lX.keys()) == set(ly.keys()), 'inconsistent dictionaries in evaluate' - n_jobs = polylingual_method.n_jobs if hasattr(polylingual_method, 'n_jobs') else -1 - - if predictor is None: - predictor = polylingual_method.predict - - metrics = evaluation_metrics - if soft is True: - metrics = soft_evaluation_metrics - ly_ = predictor(lX, ly) - - eval_ = evaluate(ly, ly_, metrics=metrics, n_jobs=n_jobs) - if return_time: - return eval_, time.time()-tinit - else: - return eval_ - - -def evaluate_single_lang(polylingual_method, X, y, lang, predictor=None, soft=False): - print('prediction for test in a single language') - if predictor is None: - predictor = polylingual_method.predict - - metrics = evaluation_metrics - if soft is True: - metrics = soft_evaluation_metrics - - ly_ = predictor({lang:X}) - return metrics(y, ly_[lang]) - - -def get_binary_counters(polylingual_method, lX, ly, predictor=None): - print('prediction for test') - assert set(lX.keys()) == set(ly.keys()), 'inconsistent dictionaries in evaluate' - n_jobs = polylingual_method.n_jobs - if predictor is None: - predictor = polylingual_method.predict - ly_ = predictor(lX) - print('evaluation (n_jobs={})'.format(n_jobs)) - if n_jobs == 1: - return {lang: binary_counters(ly[lang], ly_[lang]) for lang in ly.keys()} - else: - langs = list(ly.keys()) - evals = Parallel(n_jobs=n_jobs)(delayed(binary_counters)(ly[lang], ly_[lang]) for lang in langs) - return {lang: evals[i] for i, lang in enumerate(langs)} - - -def binary_counters(y, y_): - y = np.reshape(y, (-1)) - assert y.shape==y_.shape and len(y.shape)==1, 'error, binary vector expected' - counters = hard_single_metric_statistics(y, y_) - return counters.tp, counters.tn, counters.fp, counters.fn - diff --git a/src/util/file.py b/src/util/file.py deleted file mode 100644 index a3d0a3a..0000000 --- a/src/util/file.py +++ /dev/null @@ -1,44 +0,0 @@ -from os import listdir, makedirs -from os.path import isdir, isfile, join, exists, dirname -#from sklearn.externals.six.moves import urllib -import urllib -from pathlib import Path - - -def download_file(url, archive_filename): - def progress(blocknum, bs, size): - total_sz_mb = '%.2f MB' % (size / 1e6) - current_sz_mb = '%.2f MB' % ((blocknum * bs) / 1e6) - print('\rdownloaded %s / %s' % (current_sz_mb, total_sz_mb), end='') - print("Downloading %s" % url) - urllib.request.urlretrieve(url, filename=archive_filename, reporthook=progress) - print("") - -def 
download_file_if_not_exists(url, archive_path): - if exists(archive_path): return - makedirs_if_not_exist(dirname(archive_path)) - download_file(url,archive_path) - - -def ls(dir, typecheck): - el = [f for f in listdir(dir) if typecheck(join(dir, f))] - el.sort() - return el - -def list_dirs(dir): - return ls(dir, typecheck=isdir) - -def list_files(dir): - return ls(dir, typecheck=isfile) - -def makedirs_if_not_exist(path): - if not exists(path): makedirs(path) - -def create_if_not_exist(path): - if not exists(path): makedirs(path) - -def get_parent_name(path): - return Path(path).parent - -def get_file_name(path): - return Path(path).name diff --git a/src/util/metrics.py b/src/util/metrics.py deleted file mode 100644 index ca688b7..0000000 --- a/src/util/metrics.py +++ /dev/null @@ -1,255 +0,0 @@ -import numpy as np -from scipy.sparse import lil_matrix, issparse -from sklearn.metrics import f1_score, accuracy_score - - - -""" -Scikit learn provides a full set of evaluation metrics, but they treat special cases differently. -I.e., when the number of true positives, false positives, and false negatives ammount to 0, all -affected metrices (precision, recall, and thus f1) output 0 in Scikit learn. -We adhere to the common practice of outputting 1 in this case since the classifier has correctly -classified all examples as negatives. -""" - -class ContTable: - def __init__(self, tp=0, tn=0, fp=0, fn=0): - self.tp=tp - self.tn=tn - self.fp=fp - self.fn=fn - - def get_d(self): return self.tp + self.tn + self.fp + self.fn - - def get_c(self): return self.tp + self.fn - - def get_not_c(self): return self.tn + self.fp - - def get_f(self): return self.tp + self.fp - - def get_not_f(self): return self.tn + self.fn - - def p_c(self): return (1.0*self.get_c())/self.get_d() - - def p_not_c(self): return 1.0-self.p_c() - - def p_f(self): return (1.0*self.get_f())/self.get_d() - - def p_not_f(self): return 1.0-self.p_f() - - def p_tp(self): return (1.0*self.tp) / self.get_d() - - def p_tn(self): return (1.0*self.tn) / self.get_d() - - def p_fp(self): return (1.0*self.fp) / self.get_d() - - def p_fn(self): return (1.0*self.fn) / self.get_d() - - def tpr(self): - c = 1.0*self.get_c() - return self.tp / c if c > 0.0 else 0.0 - - def fpr(self): - _c = 1.0*self.get_not_c() - return self.fp / _c if _c > 0.0 else 0.0 - - def __add__(self, other): - return ContTable(tp=self.tp + other.tp, tn=self.tn + other.tn, fp=self.fp + other.fp, fn=self.fn + other.fn) - -def accuracy(cell): - return (cell.tp + cell.tn)*1.0 / (cell.tp + cell.fp + cell.fn + cell.tn) - -def f1(cell): - num = 2.0 * cell.tp - den = 2.0 * cell.tp + cell.fp + cell.fn - if den>0: return num / den - #we define f1 to be 1 if den==0 since the classifier has correctly classified all instances as negative - return 1.0 - -def K(cell): - specificity, recall = 0., 0. - - AN = cell.tn + cell.fp - if AN != 0: - specificity = cell.tn*1. / AN - - AP = cell.tp + cell.fn - if AP != 0: - recall = cell.tp*1. / AP - - if AP == 0: - return 2. * specificity - 1. - elif AN == 0: - return 2. * recall - 1. - else: - return specificity + recall - 1. - -#computes the (hard) counters tp, fp, fn, and tn fron a true and predicted vectors of hard decisions -#true_labels and predicted_labels are two vectors of shape (number_documents,) -def hard_single_metric_statistics(true_labels, predicted_labels): - assert len(true_labels)==len(predicted_labels), "Format not consistent between true and predicted labels." 
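As a concrete illustration of the convention stated in the module comment above (F1 defined as 1 when tp, fp, and fn are all 0, whereas scikit-learn returns 0 in that case), here is a minimal self-contained sketch; the toy label matrices are assumptions for illustration only:

import numpy as np

def f1_from_counts(tp, fp, fn):
    den = 2 * tp + fp + fn
    # convention adopted above: perfect score when the class never occurs and is never predicted
    return (2 * tp / den) if den > 0 else 1.0

y_true = np.array([[1, 0], [0, 0], [1, 0]])   # second class has no positive examples
y_pred = np.array([[1, 0], [0, 0], [0, 0]])   # and receives no positive predictions

per_class = []
for c in range(y_true.shape[1]):
    tp = int(((y_true[:, c] == 1) & (y_pred[:, c] == 1)).sum())
    fp = int(((y_true[:, c] == 0) & (y_pred[:, c] == 1)).sum())
    fn = int(((y_true[:, c] == 1) & (y_pred[:, c] == 0)).sum())
    per_class.append(f1_from_counts(tp, fp, fn))

print(per_class)           # [0.666..., 1.0]; sklearn's macro f1_score would give the second class 0
print(np.mean(per_class))  # macro-F1 = 0.833... under the convention above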
- nd = len(true_labels) - tp = np.sum(predicted_labels[true_labels==1]) - fp = np.sum(predicted_labels[true_labels == 0]) - fn = np.sum(true_labels[predicted_labels == 0]) - tn = nd - (tp+fp+fn) - return ContTable(tp=tp, tn=tn, fp=fp, fn=fn) - -#computes the (soft) contingency table where tp, fp, fn, and tn are the cumulative masses for the posterioir -# probabilitiesfron with respect to the true binary labels -#true_labels and posterior_probabilities are two vectors of shape (number_documents,) -def soft_single_metric_statistics(true_labels, posterior_probabilities): - assert len(true_labels)==len(posterior_probabilities), "Format not consistent between true and predicted labels." - tp = np.sum(posterior_probabilities[true_labels == 1]) - fn = np.sum(1. - posterior_probabilities[true_labels == 1]) - fp = np.sum(posterior_probabilities[true_labels == 0]) - tn = np.sum(1. - posterior_probabilities[true_labels == 0]) - return ContTable(tp=tp, tn=tn, fp=fp, fn=fn) - -#if the classifier is single class, then the prediction is a vector of shape=(nD,) which causes issues when compared -#to the true labels (of shape=(nD,1)). This method increases the dimensions of the predictions. -def __check_consistency_and_adapt(true_labels, predictions): - if predictions.ndim == 1: - return __check_consistency_and_adapt(true_labels, np.expand_dims(predictions, axis=1)) - if true_labels.ndim == 1: - return __check_consistency_and_adapt(np.expand_dims(true_labels, axis=1),predictions) - if true_labels.shape != predictions.shape: - raise ValueError("True and predicted label matrices shapes are inconsistent %s %s." - % (true_labels.shape, predictions.shape)) - _,nC = true_labels.shape - return true_labels, predictions, nC - -def macro_average(true_labels, predicted_labels, metric, metric_statistics=hard_single_metric_statistics): - true_labels, predicted_labels, nC = __check_consistency_and_adapt(true_labels, predicted_labels) - return np.mean([metric(metric_statistics(true_labels[:, c], predicted_labels[:, c])) for c in range(nC)]) - -def micro_average(true_labels, predicted_labels, metric, metric_statistics=hard_single_metric_statistics): - true_labels, predicted_labels, nC = __check_consistency_and_adapt(true_labels, predicted_labels) - - accum = ContTable() - for c in range(nC): - other = metric_statistics(true_labels[:, c], predicted_labels[:, c]) - accum = accum + other - - return metric(accum) - -#true_labels and predicted_labels are two matrices in sklearn.preprocessing.MultiLabelBinarizer format -def macroF1(true_labels, predicted_labels): - return macro_average(true_labels,predicted_labels, f1) - -#true_labels and predicted_labels are two matrices in sklearn.preprocessing.MultiLabelBinarizer format -def microF1(true_labels, predicted_labels): - return micro_average(true_labels, predicted_labels, f1) - -#true_labels and predicted_labels are two matrices in sklearn.preprocessing.MultiLabelBinarizer format -def macroK(true_labels, predicted_labels): - return macro_average(true_labels,predicted_labels, K) - -#true_labels and predicted_labels are two matrices in sklearn.preprocessing.MultiLabelBinarizer format -def microK(true_labels, predicted_labels): - return micro_average(true_labels, predicted_labels, K) - -#true_labels is a matrix in sklearn.preprocessing.MultiLabelBinarizer format and posterior_probabilities is a matrix -#of the same shape containing real values in [0,1] -def smoothmacroF1(true_labels, posterior_probabilities): - return macro_average(true_labels,posterior_probabilities, f1, 
metric_statistics=soft_single_metric_statistics) - -#true_labels is a matrix in sklearn.preprocessing.MultiLabelBinarizer format and posterior_probabilities is a matrix -#of the same shape containing real values in [0,1] -def smoothmicroF1(true_labels, posterior_probabilities): - return micro_average(true_labels, posterior_probabilities, f1, metric_statistics=soft_single_metric_statistics) - -#true_labels is a matrix in sklearn.preprocessing.MultiLabelBinarizer format and posterior_probabilities is a matrix -#of the same shape containing real values in [0,1] -def smoothmacroK(true_labels, posterior_probabilities): - return macro_average(true_labels,posterior_probabilities, K, metric_statistics=soft_single_metric_statistics) - -#true_labels is a matrix in sklearn.preprocessing.MultiLabelBinarizer format and posterior_probabilities is a matrix -#of the same shape containing real values in [0,1] -def smoothmicroK(true_labels, posterior_probabilities): - return micro_average(true_labels, posterior_probabilities, K, metric_statistics=soft_single_metric_statistics) - - - - -""" -Scikit learn provides a full set of evaluation metrics, but they treat special cases differently. -I.e., when the number of true positives, false positives, and false negatives ammount to 0, all -affected metrices (precision, recall, and thus f1) output 0 in Scikit learn. -We adhere to the common practice of outputting 1 in this case since the classifier has correctly -classified all examples as negatives. -""" - -def evaluation(y_true, y_pred, classification_type): - - if classification_type == 'multilabel': - eval_function = multilabel_eval - elif classification_type == 'singlelabel': - eval_function = singlelabel_eval - - Mf1, mf1, accuracy = eval_function(y_true, y_pred) - - return Mf1, mf1, accuracy - - -def multilabel_eval(y, y_): - - tp = y.multiply(y_) - - fn = lil_matrix(y.shape) - true_ones = y==1 - fn[true_ones]=1-tp[true_ones] - - fp = lil_matrix(y.shape) - pred_ones = y_==1 - if pred_ones.nnz>0: - fp[pred_ones]=1-tp[pred_ones] - - #macro-f1 - tp_macro = np.asarray(tp.sum(axis=0), dtype=int).flatten() - fn_macro = np.asarray(fn.sum(axis=0), dtype=int).flatten() - fp_macro = np.asarray(fp.sum(axis=0), dtype=int).flatten() - - pos_pred = tp_macro+fp_macro - pos_true = tp_macro+fn_macro - prec=np.zeros(shape=tp_macro.shape,dtype=float) - rec=np.zeros(shape=tp_macro.shape,dtype=float) - np.divide(tp_macro, pos_pred, out=prec, where=pos_pred>0) - np.divide(tp_macro, pos_true, out=rec, where=pos_true>0) - den=prec+rec - - macrof1=np.zeros(shape=tp_macro.shape,dtype=float) - np.divide(np.multiply(prec,rec),den,out=macrof1,where=den>0) - macrof1 *=2 - - macrof1[(pos_pred==0)*(pos_true==0)]=1 - macrof1 = np.mean(macrof1) - - #micro-f1 - tp_micro = tp_macro.sum() - fn_micro = fn_macro.sum() - fp_micro = fp_macro.sum() - pos_pred = tp_micro + fp_micro - pos_true = tp_micro + fn_micro - prec = (tp_micro / pos_pred) if pos_pred>0 else 0 - rec = (tp_micro / pos_true) if pos_true>0 else 0 - den = prec+rec - microf1 = 2*prec*rec/den if den>0 else 0 - if pos_pred==pos_true==0: - microf1=1 - - #accuracy - ndecisions = np.multiply(*y.shape) - tn = ndecisions - (tp_micro+fn_micro+fp_micro) - acc = (tp_micro+tn)/ndecisions - - return macrof1,microf1,acc - - -def singlelabel_eval(y, y_): - if issparse(y_): y_ = y_.toarray().flatten() - macrof1 = f1_score(y, y_, average='macro') - microf1 = f1_score(y, y_, average='micro') - acc = accuracy_score(y, y_) - return macrof1,microf1,acc - diff --git a/src/util/parser_options.py 
b/src/util/parser_options.py deleted file mode 100644 index 14d827c..0000000 --- a/src/util/parser_options.py +++ /dev/null @@ -1,94 +0,0 @@ -from optparse import OptionParser - -parser = OptionParser(usage="usage: %prog datapath [options]") - -parser.add_option("-d", dest='dataset', type=str, metavar='datasetpath', help=f'path to the pickled dataset') - -parser.add_option("-o", "--output", dest="output", - help="Result file", type=str, default='../log/multiModal_log.csv') - -parser.add_option("-X", "--posteriors", dest="posteriors", action='store_true', - help="Add posterior probabilities to the document embedding representation", default=False) - -parser.add_option("-W", "--supervised", dest="supervised", action='store_true', - help="Add supervised (Word-Class Embeddings) to the document embedding representation", default=False) - -parser.add_option("-M", "--pretrained", dest="pretrained", action='store_true', - help="Add pretrained MUSE embeddings to the document embedding representation", default=False) - -parser.add_option("-B", "--mbert", dest="mbert", action='store_true', - help="Add multilingual Bert (mBert) document embedding representation", default=False) - -parser.add_option('-G', dest='gruViewGenerator', action='store_true', - help="Add document embedding generated via recurrent net (GRU)", default=False) - -parser.add_option("--l2", dest="l2", action='store_true', - help="Activates l2 normalization as a post-processing for the document embedding views", - default=True) - -parser.add_option("--allprob", dest="allprob", action='store_true', - help="All views are generated as posterior probabilities. This affects the supervised and pretrained" - "embeddings, for which a calibrated classifier is generated, which generates the posteriors", - default=True) - -parser.add_option("--feat-weight", dest="feat_weight", - help="Term weighting function to weight the averaged embeddings", type=str, default='tfidf') - -parser.add_option("-w", "--we-path", dest="we_path", - help="Path to the MUSE polylingual word embeddings", default='../embeddings') - -parser.add_option("-s", "--set_c", dest="set_c", type=float, - help="Set the C parameter", default=1) - -parser.add_option("-c", "--optimc", dest="optimc", action='store_true', - help="Optimize hyperparameters", default=False) - -parser.add_option("-j", "--n_jobs", dest="n_jobs", type=int, - help="Number of parallel jobs (default is -1, all)", default=-1) - -parser.add_option("-p", "--pca", dest="max_labels_S", type=int, - help="If smaller than number of target classes, PCA will be applied to supervised matrix. 
", - default=300) - -parser.add_option("-r", "--remove-pc", dest="sif", action='store_true', - help="Remove common component when computing dot product of word embedding matrices", default=True) - -parser.add_option("-z", "--zscore", dest="zscore", action='store_true', - help="Z-score normalize matrices (WCE and MUSE)", default=True) - -parser.add_option("-a", "--agg", dest="agg", action='store_true', - help="Set aggregation function of the common Z-space to average (Default: concatenation)", - default=True) - -parser.add_option("-l", dest="avoid_loading", action="store_true", - help="TODO", default=False) - -# ------------------------------------------------------------------------------------ - -parser.add_option('--hidden', type=int, default=512, metavar='int', - help='hidden lstm size (default: 512)') - -parser.add_option('--sup-drop', type=float, default=0.5, metavar='[0.0, 1.0]', - help='dropout probability for the supervised matrix (default: 0.5)') - -parser.add_option('--tunable', action='store_true', default=False, - help='pretrained embeddings are tunable from the beginning (default False, i.e., static)') - -parser.add_option('--logfile_gru', dest='logfile_gru', default='../log/log_gru_viewgenerator.csv') - -parser.add_option('--seed', type=int, default=1, metavar='int', help='random seed (default: 1)') - -parser.add_option('--force', action='store_true', default=False, - help='do not check if this experiment has already been run') - -parser.add_option('--gruMuse', dest='gruMUSE', action='store_true', default=False, - help='Deploy MUSE embedding as embedding layer of the GRU View Generator') - -parser.add_option('--gruWce', dest='gruWCE', action='store_true', default=False, - help='Deploy WCE embedding as embedding layer of the GRU View Generator') - -parser.add_option('--gru-path', dest='gru_path', default=None, - help='Set the path to a pretrained GRU model (aka, -G view generator)') - -parser.add_option('--bert-path', dest='bert_path', default=None, - help='Set the path to a pretrained mBERT model (aka, -B view generator)') diff --git a/src/util/results.py b/src/util/results.py deleted file mode 100644 index 6526303..0000000 --- a/src/util/results.py +++ /dev/null @@ -1,92 +0,0 @@ -import os -import pandas as pd -import numpy as np - -class PolylingualClassificationResults: - def __init__(self, file, autoflush=True, verbose=False): - self.file = file - self.columns = ['method', - 'learner', - 'optimp', - 'sif', - 'zscore', - 'l2', - 'wescaler', - 'pca', - 'id', - 'dataset', - 'time', - 'lang', - 'macrof1', - 'microf1', - 'macrok', - 'microk', - 'notes'] - self.autoflush = autoflush - self.verbose = verbose - if os.path.exists(file): - self.tell('Loading existing file from {}'.format(file)) - self.df = pd.read_csv(file, sep='\t') - else: - self.tell('File {} does not exist. 
Creating new frame.'.format(file)) - dir = os.path.dirname(self.file) - if dir and not os.path.exists(dir): os.makedirs(dir) - self.df = pd.DataFrame(columns=self.columns) - - def already_calculated(self, id): - return (self.df['id'] == id).any() - - def add_row(self, method, learner, optimp, sif, zscore, l2, wescaler, pca, id, dataset, time, lang, macrof1, microf1, macrok=np.nan, microk=np.nan, notes=''): - s = pd.Series([method, learner, optimp,sif, zscore, l2, wescaler, pca, id, dataset, time, lang, macrof1, microf1, macrok, microk, notes], index=self.columns) - self.df = self.df.append(s, ignore_index=True) - if self.autoflush: self.flush() - self.tell(s.to_string()) - - def flush(self): - self.df.to_csv(self.file, index=False, sep='\t') - - def tell(self, msg): - if self.verbose: print(msg) - - -class ZSCLResults: - def __init__(self, file, autoflush=True, verbose=False): - self.file = file - self.columns = ['method', - 'optimp', - 'source', - 'target', - 'id', - 'dataset', - 'time', - 'lang', - 'macrof1', - 'microf1', - 'macrok', - 'microk', - 'notes'] - self.autoflush = autoflush - self.verbose = verbose - if os.path.exists(file): - self.tell('Loading existing file from {}'.format(file)) - self.df = pd.read_csv(file, sep='\t') - else: - self.tell('File {} does not exist. Creating new frame.'.format(file)) - dir = os.path.dirname(self.file) - if dir and not os.path.exists(dir): os.makedirs(dir) - self.df = pd.DataFrame(columns=self.columns) - - def already_calculated(self, id): - return (self.df['id'] == id).any() - - def add_row(self, method, optimp, id, source, target, dataset, time, lang, macrof1, microf1, macrok=np.nan, microk=np.nan, notes=''): - s = pd.Series([method, optimp, id, source, target, dataset, time, lang, macrof1, microf1, macrok, microk, notes], index=self.columns) - self.df = self.df.append(s, ignore_index=True) - if self.autoflush: self.flush() - self.tell(s.to_string()) - - def flush(self): - self.df.to_csv(self.file, index=False, sep='\t') - - def tell(self, msg): - if self.verbose: print(msg) diff --git a/src/util/util.py b/src/util/util.py deleted file mode 100644 index 823c82d..0000000 --- a/src/util/util.py +++ /dev/null @@ -1,29 +0,0 @@ -from sklearn.svm import SVC -from tqdm import tqdm -import re -import sys - - -def mask_numbers(data, number_mask='numbermask'): - mask = re.compile(r'\b[0-9][0-9.,-]*\b') - masked = [] - for text in tqdm(data, desc='masking numbers'): - masked.append(mask.sub(number_mask, text)) - return masked - - -def fill_missing_classes(lXtr, lytr): - pass - - -def get_learner(calibrate=False, kernel='linear'): - return SVC(kernel=kernel, probability=calibrate, cache_size=1000, C=op.set_c, random_state=1, class_weight='balanced', gamma='auto') - - -def get_params(dense=False): - if not op.optimc: - return None - c_range = [1e4, 1e3, 1e2, 1e1, 1, 1e-1] - kernel = 'rbf' if dense else 'linear' - return [{'kernel': [kernel], 'C': c_range, 'gamma':['auto']}] - diff --git a/src/util_transformers/StandardizeTransformer.py b/src/util_transformers/StandardizeTransformer.py deleted file mode 100644 index e1a10cf..0000000 --- a/src/util_transformers/StandardizeTransformer.py +++ /dev/null @@ -1,32 +0,0 @@ -import numpy as np - - -class StandardizeTransformer: - - def __init__(self, axis=0, range=None): - assert range is None or isinstance(range, slice), 'wrong format for range, should either be None or a slice' - self.axis = axis - self.yetfit = False - self.range = range - - def fit(self, X): - print('fitting Standardizer...') - std=np.std(X, 
axis=self.axis, ddof=1) - self.std = np.clip(std, 1e-5, None) - self.mean = np.mean(X, axis=self.axis) - if self.range is not None: - ones = np.ones_like(self.std) - zeros = np.zeros_like(self.mean) - ones[self.range] = self.std[self.range] - zeros[self.range] = self.mean[self.range] - self.std = ones - self.mean = zeros - self.yetfit=True - return self - - def transform(self, X): - if not self.yetfit: 'transform called before fit' - return (X - self.mean) / self.std - - def fit_transform(self, X): - return self.fit(X).transform(X) diff --git a/src/util_transformers/__init__.py b/src/util_transformers/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/src/util_transformers/clesa.py b/src/util_transformers/clesa.py deleted file mode 100644 index da17393..0000000 --- a/src/util_transformers/clesa.py +++ /dev/null @@ -1,110 +0,0 @@ -import numpy as np -import sklearn -# from sklearn.externals.joblib import Parallel, delayed -from joblib import Parallel, delayed - -class ESA(object): - """ - Implementation of Explicit Sematic Analysis (ESA) in its mono-lingual version, as a transformer - """ - supported_similarity = ['dot', 'cosine'] - - def __init__(self, similarity='dot', centered=False, post=None): - """ - :param similarity: the similarity measure between documents to be used - :param centered: set to True to subtract the expected similarity due to randomness (experimental) - :param post: any valid sklearn normalization method to be applied to the resulting doc embeddings, or None (default) - """ - assert similarity in self.supported_similarity, ("Similarity method %s is not supported" % similarity) - self.similarity = similarity - self.centered = centered - self.post_processing = post - self.W = None - - def fit(self, W): - """ - :param W: doc-by-term already processed matrix of wikipedia documents - :return: self - """ - self.W = W - return self - - def transform(self, X): - """ - :param X: doc-by-term matrix that is to be transformed into the ESA space. 
- :return: the matrix X transformed into the ESA space in numpy format - """ - assert self.W is not None, 'transform method called before fit' - - W = self.W - assert X.shape[1] == W.shape[1], ('the feature spaces for X=%s and W=%s do not agree' % (str(X.shape), str(W.shape))) - - if self.similarity in ['dot', 'cosine']: - if self.similarity == 'cosine': - X = sklearn.preprocessing.normalize(X, norm='l2', axis=1, copy=True) - W = sklearn.preprocessing.normalize(W, norm='l2', axis=1, copy=True) - - esa = (X.dot(W.T)).toarray() - if self.centered: - pX = (X > 0).sum(1) / float(X.shape[1]) - pW = (W > 0).sum(1) / float(W.shape[1]) - pXpW = np.sqrt(pX.dot(pW.transpose())) - esa = esa - pXpW - - if self.post_processing: - esa = sklearn.preprocessing.normalize(esa, norm=self.post_processing, axis=1, copy=True) - - return esa - - def fit_transform(self, W, X, Y=None): - self.fit(W) - return self.transform(X, Y) - - def dimensionality(self): - return self.W.shape[0] - - - -class CLESA(ESA): - """ - Implementation of Cross-Lingual Explicit Sematic Analysis (ESA) as a transformer - """ - - def __init__(self, similarity='dot', centered=False, post=False, n_jobs=-1): - super(CLESA, self).__init__(similarity, centered, post) - self.lESA = None - self.langs = None - self.n_jobs = n_jobs - - def fit(self, lW): - """ - :param lW: a dictionary of {language: doc-by-term wiki matrix} - :return: self - """ - assert len(np.unique([W.shape[0] for W in lW.values()])) == 1, "inconsistent dimensions across languages" - - self.dimensions = list(lW.values())[0].shape[0] - self.langs = list(lW.keys()) - self.lESA = {lang:ESA(self.similarity, self.centered, self.post_processing).fit(lW[lang]) for lang in self.langs} - return self - - def transform(self, lX): - """ - :param lX: dictionary of {language : doc-by-term matrix} that is to be transformed into the CL-ESA space - :return: a dictionary {language : doc-by-dim matrix} containing the matrix-transformed versions - """ - assert self.lESA is not None, 'transform method called before fit' - assert set(lX.keys()).issubset(set(self.langs)), 'languages in lX are not scope' - langs = list(lX.keys()) - trans = Parallel(n_jobs=self.n_jobs)(delayed(self.lESA[lang].transform)(lX[lang]) for lang in langs) - return {lang:trans[i] for i,lang in enumerate(langs)} - - def fit_transform(self, lW, lX): - return self.fit(lW).transform(lX) - - def languages(self): - return list(self.lESA.keys()) - - - - diff --git a/src/util_transformers/dci.py b/src/util_transformers/dci.py deleted file mode 100644 index 6e84ed9..0000000 --- a/src/util_transformers/dci.py +++ /dev/null @@ -1,154 +0,0 @@ -import numpy as np -from sklearn.preprocessing import normalize -from scipy.sparse import csr_matrix, issparse -from scipy.spatial.distance import cosine -import operator -import functools -import math, sys -# from sklearn.externals.joblib import Parallel, delayed -from joblib import Parallel, delayed - - -class DistributionalCorrespondenceIndexing: - - prob_dcf = ['linear', 'pmi'] - vect_dcf = ['cosine'] - valid_dcf = prob_dcf + vect_dcf - valid_post = ['normal', 'l2', None] - - def __init__(self, dcf='cosine', post='normal', n_jobs=-1): - """ - :param dcf: a distributional correspondence function name (e.g., 'cosine') or a callable f(u,v) which measures - the distribucional correspondence between vectors u and v - :param post: post-processing function to apply to document embeddings. 
Default is to standardize it into a - normal distribution; other functions allowed are 'l2' or None - """ - if post not in self.valid_post: - raise ValueError("unknown post processing function; valid ones are [%s]" % ', '.join(self.valid_post)) - - if isinstance(dcf, str): - if dcf not in self.valid_dcf: - raise ValueError("unknown dcf; use any in [%s]" % ', '.join(self.valid_dcf)) - self.dcf = getattr(DistributionalCorrespondenceIndexing, dcf) - elif hasattr(dcf, '__call__'): - self.dcf = dcf - else: - raise ValueError('param dcf should either be a valid dcf name in [%s] or a callable comparing two vectors') - #self.dcf = lambda u,v:dcf(u,v) - self.post = post - self.domains = None - self.dFP = None - self.n_jobs = n_jobs - - def fit(self, dU, dP): - """ - :param dU: a dictionary of {domain:dsm_matrix}, where dsm is a document-by-term matrix representing the - distributional semantic model for a specific domain - :param dP: a dictionary {domain:pivot_matrix} where domain is a string representing each domain, - and pivot_matrix has shape (d,p) with d the dimensionality of the distributional space, and p the - number of pivots - :return: self - """ - self.domains = list(dP.keys()) - assert len(np.unique([P.shape[1] for P in dP.values()]))==1, "inconsistent number of pivots across domains" - assert set(dU.keys())==set(self.domains), "inconsistent domains in dU and dP" - assert not [1 for d in self.domains if dU[d].shape[0]!=dP[d].shape[0]], \ - "inconsistent dimensions between distributional and pivot spaces" - self.dimensions = list(dP.values())[0].shape[1] - # embed the feature space from each domain using the pivots of that domain - #self.dFP = {d:self.dcf_dist(dU[d].transpose(), dP[d].transpose()) for d in self.domains} - transformations = Parallel(n_jobs=self.n_jobs)(delayed(self.dcf_dist)(dU[d].transpose(),dP[d].transpose()) for d in self.domains) - self.dFP = {d: transformations[i] for i, d in enumerate(self.domains)} - - def _dom_transform(self, X, FP): - _X = X.dot(FP) - if self.post == 'l2': - _X = normalize(_X, norm='l2', axis=1) - elif self.post == 'normal': - std = np.clip(np.std(_X, axis=0), 1e-5, None) - _X = (_X - np.mean(_X, axis=0)) / std - return _X - - # dX is a dictionary of {domain:dsm}, where dsm (distributional semantic model) is, e.g., a document-by-term csr_matrix - def transform(self, dX): - assert self.dFP is not None, 'transform method called before fit' - assert set(dX.keys()).issubset(self.domains), 'domains in dX are not scope' - domains = list(dX.keys()) - transformations = Parallel(n_jobs=self.n_jobs)(delayed(self._dom_transform)(dX[d], self.dFP[d]) for d in domains) - return {d: transformations[i] for i, d in enumerate(domains)} - - def fit_transform(self, dU, dP, dX): - return self.fit(dU, dP).transform(dX) - - def _prevalence(self, v): - if issparse(v): - return float(v.nnz) / functools.reduce(operator.mul, v.shape, 1) #this works for arrays of any rank - elif isinstance(v, np.ndarray): - return float(v[v>0].size) / v.size - - def linear(self, u, v, D): - tp, fp, fn, tn = self._get_4cellcounters(u, v, D) - den1=tp+fn - den2=tn+fp - tpr = (tp*1./den1) if den1!=0 else 0. - tnr = (tn*1./den2) if den2!=0 else 0. - return tpr + tnr - 1 - - def pmi(self, u, v, D): - tp, fp, fn, tn = self._get_4cellcounters(u, v, D) - - Pxy = tp * 1. / D - Pxny = fp * 1. / D - Pnxy = fn * 1. 
/ D - Px = Pxy + Pxny - Py = Pxy + Pnxy - - if (Px == 0 or Py == 0 or Pxy == 0): - return 0.0 - - score = math.log2(Pxy / (Px * Py)) - if np.isnan(score) or np.isinf(score): - print('NAN') - sys.exit() - return score - - def cosine(self, u, v): - pu = self._prevalence(u) - pv = self._prevalence(v) - return cosine(u, v) - np.sqrt(pu * pv) - - def _get_4cellcounters(self, u, v, D): - """ - :param u: a set of indexes with a non-zero value - :param v: a set of indexes with a non-zero value - :param D: the number of events (i.e., all posible indexes) - :return: the 4-cell contingency values tp, fp, fn, tn) - """ - common=u.intersection(v) - tp = len(common) - fp = len(u) - len(common) - fn = len(v) - len(common) - tn = D - (tp + fp + fn) - return tp, fp, fn, tn - - def dcf_dist(self, U, V): - nU,D = U.shape - nV = V.shape[0] - if issparse(U): U = U.toarray() - if issparse(V): V = V.toarray() - - dists = np.zeros((nU, nV)) - if self.dcf.__name__ in self.prob_dcf: - def hits_index(v): - return set(np.argwhere(v>0).reshape(-1).tolist()) - Vhits = {i:hits_index(V[i]) for i in range(nV)} - for i in range(nU): - Ui_hits = hits_index(U[i]) - for j in range(nV): - dists[i, j] = self.dcf(self, Ui_hits, Vhits[j], D) - else: - for i in range(nU): - for j in range(nV): - dists[i, j] = self.dcf(self, U[i], V[j]) - return dists - diff --git a/src/util_transformers/riboc.py b/src/util_transformers/riboc.py deleted file mode 100644 index 7dfbc42..0000000 --- a/src/util_transformers/riboc.py +++ /dev/null @@ -1,53 +0,0 @@ -import math -import numpy as np -from scipy.sparse import csr_matrix, issparse - -class RandomIndexingBoC(object): - - def __init__(self, latent_dimensions, non_zeros=2): - self.latent_dimensions = latent_dimensions - self.k = non_zeros - self.ri_dict = None - - def fit_transform(self, X): - return self.fit(X).transform(X) - - def fit(self, X): - nF = X.shape[1] - nL = self.latent_dimensions - format = 'csr' if issparse(X) else 'np' - self.ri_dict = _create_random_index_dictionary(shape=(nF, nL), k=self.k, normalized=True, format=format) - return self - - def transform(self, X): - assert X.shape[1] == self.ri_dict.shape[0], 'feature space is inconsistent with the RI dictionary' - if self.ri_dict is None: - raise ValueError("Error: transform method called before fit.") - P = X.dot(self.ri_dict) - if issparse(P): - P.sort_indices() - return P - - -def _create_random_index_dictionary(shape, k, normalized=False, format='csr', positive=False): - assert format in ['csr', 'np'], 'Format should be in "[csr, np]"' - nF, latent_dimensions = shape - print("Creating the random index dictionary for |V|={} with {} dimensions".format(nF,latent_dimensions)) - val = 1.0 if not normalized else 1.0/math.sqrt(k) - #ri_dict = csr_matrix((nF, latent_dimensions)) if format == 'csr' else np.zeros((nF, latent_dimensions)) - ri_dict = np.zeros((nF, latent_dimensions)) - - #TODO: optimize - for t in range(nF): - dims = np.zeros(k, dtype=np.int32) - dims[0] = t % latent_dimensions #the first dimension is choosen in a round-robin manner (prevents gaps) - dims[1:] = np.random.choice(latent_dimensions, size=k-1, replace=False) - values = (np.random.randint(0,2, size=k)*2.0-1.0) * val if not positive else np.array([+val]*k) - ri_dict[t,dims]=values - print("\rprogress [%.2f%% complete]" % (t * 100.0 / nF), end='') - print('\nDone') - - if format=='csr': - ri_dict = csr_matrix(ri_dict) - return ri_dict - From 3c0ec9e269d6cac397ff323c3ef9828a49bbd40b Mon Sep 17 00:00:00 2001 From: andrea Date: Tue, 19 Jan 2021 10:36:07 +0100 
Subject: [PATCH 07/55] refactor --- test.py | 1 + 1 file changed, 1 insertion(+) create mode 100644 test.py diff --git a/test.py b/test.py new file mode 100644 index 0000000..3fbc4f8 --- /dev/null +++ b/test.py @@ -0,0 +1 @@ +# preparing refactor From bfcd97d1c683f63c89e02b3409d9148402bec6aa Mon Sep 17 00:00:00 2001 From: andrea Date: Tue, 19 Jan 2021 10:45:30 +0100 Subject: [PATCH 08/55] refactor --- refactor/data/__init__.py | 0 refactor/data/datamodule.py | 178 ++++++ refactor/data/dataset_builder.py | 710 +++++++++++++++++++++++ refactor/data/languages.py | 42 ++ refactor/data/reader/__init__.py | 0 refactor/data/reader/jrcacquis_reader.py | 321 ++++++++++ refactor/data/reader/rcv_reader.py | 225 +++++++ refactor/data/reader/wikipedia_tools.py | 304 ++++++++++ refactor/data/text_preprocessor.py | 33 ++ refactor/data/tsr_function__.py | 270 +++++++++ refactor/debug_notebook.ipynb | 36 ++ refactor/devel_ideas.py | 95 +++ refactor/main.py | 47 ++ refactor/models/helpers.py | 47 ++ refactor/models/learners.py | 185 ++++++ refactor/models/lstm_class.py | 114 ++++ refactor/models/pl_bert.py | 64 ++ refactor/models/pl_gru.py | 312 ++++++++++ refactor/run.sh | 6 + refactor/util/SIF_embed.py | 56 ++ refactor/util/common.py | 322 ++++++++++ refactor/util/embeddings_manager.py | 102 ++++ refactor/util/evaluation.py | 19 + refactor/util/file.py | 44 ++ refactor/util/metrics.py | 152 +++++ refactor/view_generators.py | 258 ++++++++ test.py | 1 - 27 files changed, 3942 insertions(+), 1 deletion(-) create mode 100644 refactor/data/__init__.py create mode 100644 refactor/data/datamodule.py create mode 100644 refactor/data/dataset_builder.py create mode 100644 refactor/data/languages.py create mode 100644 refactor/data/reader/__init__.py create mode 100644 refactor/data/reader/jrcacquis_reader.py create mode 100644 refactor/data/reader/rcv_reader.py create mode 100644 refactor/data/reader/wikipedia_tools.py create mode 100644 refactor/data/text_preprocessor.py create mode 100755 refactor/data/tsr_function__.py create mode 100644 refactor/debug_notebook.ipynb create mode 100644 refactor/devel_ideas.py create mode 100644 refactor/main.py create mode 100755 refactor/models/helpers.py create mode 100644 refactor/models/learners.py create mode 100755 refactor/models/lstm_class.py create mode 100644 refactor/models/pl_bert.py create mode 100644 refactor/models/pl_gru.py create mode 100644 refactor/run.sh create mode 100644 refactor/util/SIF_embed.py create mode 100644 refactor/util/common.py create mode 100644 refactor/util/embeddings_manager.py create mode 100644 refactor/util/evaluation.py create mode 100644 refactor/util/file.py create mode 100644 refactor/util/metrics.py create mode 100644 refactor/view_generators.py delete mode 100644 test.py diff --git a/refactor/data/__init__.py b/refactor/data/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/refactor/data/datamodule.py b/refactor/data/datamodule.py new file mode 100644 index 0000000..bbb7cc1 --- /dev/null +++ b/refactor/data/datamodule.py @@ -0,0 +1,178 @@ +import torch +from torch.utils.data import Dataset, DataLoader +import numpy as np +import pytorch_lightning as pl +from transformers import BertTokenizer + +N_WORKERS = 8 + + +class RecurrentDataset(Dataset): + def __init__(self, lX, ly, lPad_index): + """ + :param lX: dict {lang_id : np.ndarray} + :param ly: + """ + self.lX = [] + self.ly = [] + self.lOffset = {} + self.lPad_index = lPad_index + + for lang, data in lX.items(): + offset = [len(self.lX)] + self.lX.extend(data) 
+ offset.append(len(self.lX)) + self.lOffset[lang] = offset + + for lang, target in ly.items(): + self.ly.extend(target) + + def __len__(self): + return len(self.lX) + + def __getitem__(self, index): + X = self.lX[index] + y = self.ly[index] + return X, y, index, self._get_lang(index) + + def _get_lang(self, index): + for lang, l_range in self.lOffset.items(): + if index in range(l_range[0], l_range[1]): + return lang + + def collate_fn(self, data): + """ + Takes care of padding the batch and also check consistency of batch languages. Groups into dict {lang : lang_batch} + items sampled from the Dataset class. + :param data: + :return: + """ + lX_batch = {} + ly_batch = {} + current_lang = data[0][-1] + for d in data: + if d[-1] == current_lang: + if current_lang not in lX_batch.keys(): + lX_batch[current_lang] = [] + ly_batch[current_lang] = [] + lX_batch[current_lang].append(d[0]) + ly_batch[current_lang].append(d[1]) + else: + current_lang = d[-1] + lX_batch[current_lang] = [] + ly_batch[current_lang] = [] + lX_batch[current_lang].append(d[0]) + ly_batch[current_lang].append(d[1]) + + for lang in lX_batch.keys(): + # TODO: double check padding function (too many left pad tokens?) + lX_batch[lang] = self.pad(lX_batch[lang], pad_index=self.lPad_index[lang], max_pad_length=70) + # max_pad_length=self.define_pad_length(lX_batch[lang])) + lX_batch[lang] = torch.LongTensor(lX_batch[lang]) + ly_batch[lang] = torch.FloatTensor(ly_batch[lang]) + + return lX_batch, ly_batch + + @staticmethod + def define_pad_length(index_list): + lengths = [len(index) for index in index_list] + return int(np.mean(lengths) + np.std(lengths)) + + @staticmethod + def pad(index_list, pad_index, max_pad_length=None): + pad_length = np.max([len(index) for index in index_list]) + if max_pad_length is not None: + pad_length = min(pad_length, max_pad_length) + for i, indexes in enumerate(index_list): + index_list[i] = [pad_index] * (pad_length - len(indexes)) + indexes[:pad_length] + return index_list + + +class GfunDataModule(pl.LightningDataModule): + def __init__(self, multilingualIndex, batchsize=64): + """ + Pytorch-lightning DataModule: https://pytorch-lightning.readthedocs.io/en/latest/datamodules.html + :param multilingualIndex: + :param batchsize: + """ + self.multilingualIndex = multilingualIndex + self.batchsize = batchsize + super().__init__() + + def prepare_data(self, *args, **kwargs): + pass + + def setup(self, stage=None): + # Assign train/val datasets for use in dataloaders + if stage == 'fit' or stage is None: + l_train_index, l_train_target = self.multilingualIndex.l_train() + self.training_dataset = RecurrentDataset(l_train_index, l_train_target, + lPad_index=self.multilingualIndex.l_pad()) + l_val_index, l_val_target = self.multilingualIndex.l_val() + self.val_dataset = RecurrentDataset(l_val_index, l_val_target, + lPad_index=self.multilingualIndex.l_pad()) + # Assign test dataset for use in dataloader(s) + if stage == 'test' or stage is None: + l_test_index, l_test_target = self.multilingualIndex.l_val() + self.test_dataset = RecurrentDataset(l_test_index, l_test_target, + lPad_index=self.multilingualIndex.l_pad()) + + def train_dataloader(self): + return DataLoader(self.training_dataset, batch_size=self.batchsize, num_workers=N_WORKERS, + collate_fn=self.training_dataset.collate_fn) + + def val_dataloader(self): + return DataLoader(self.val_dataset, batch_size=self.batchsize, num_workers=N_WORKERS, + collate_fn=self.val_dataset.collate_fn) + + def test_dataloader(self): + return 
DataLoader(self.test_dataset, batch_size=self.batchsize, num_workers=N_WORKERS, + collate_fn=self.test_dataset.collate_fn) + + +class BertDataModule(GfunDataModule): + def __init__(self, multilingualIndex, batchsize=64, max_len=512): + super().__init__(multilingualIndex, batchsize) + self.max_len = max_len + + def setup(self, stage=None): + # Assign train/val datasets for use in dataloaders + if stage == 'fit' or stage is None: + l_train_raw, l_train_target = self.multilingualIndex.l_train_raw() + l_train_index = self.tokenize(l_train_raw, max_len=self.max_len) + self.training_dataset = RecurrentDataset(l_train_index, l_train_target, + lPad_index=self.multilingualIndex.l_pad()) + l_val_raw, l_val_target = self.multilingualIndex.l_val_raw() + l_val_index = self.tokenize(l_val_raw, max_len=self.max_len) + self.val_dataset = RecurrentDataset(l_val_index, l_val_target, + lPad_index=self.multilingualIndex.l_pad()) + # Assign test dataset for use in dataloader(s) + # TODO + if stage == 'test' or stage is None: + l_val_raw, l_val_target = self.multilingualIndex.l_test_raw() + l_val_index = self.tokenize(l_val_raw) + self.test_dataset = RecurrentDataset(l_val_index, l_val_target, + lPad_index=self.multilingualIndex.l_pad()) + + @staticmethod + def tokenize(l_raw, max_len): + # TODO: check BertTokenizerFast https://huggingface.co/transformers/model_doc/bert.html#berttokenizerfast + tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased') + l_tokenized = {} + for lang in l_raw.keys(): + output_tokenizer = tokenizer(l_raw[lang], truncation=True, max_length=max_len, padding='max_length') + l_tokenized[lang] = output_tokenizer['input_ids'] + return l_tokenized + + def train_dataloader(self): + """ + NB: Setting n_workers to > 0 will cause "OSError: [Errno 24] Too many open files" + :return: + """ + return DataLoader(self.training_dataset, batch_size=self.batchsize) + + def val_dataloader(self): + return DataLoader(self.val_dataset, batch_size=self.batchsize) + + def test_dataloader(self): + return DataLoader(self.test_dataset, batch_size=self.batchsize) diff --git a/refactor/data/dataset_builder.py b/refactor/data/dataset_builder.py new file mode 100644 index 0000000..b9650c7 --- /dev/null +++ b/refactor/data/dataset_builder.py @@ -0,0 +1,710 @@ +from os.path import join, exists +from nltk.corpus import stopwords +from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer +from sklearn.preprocessing import MultiLabelBinarizer +from data.reader.jrcacquis_reader import * +from data.languages import lang_set, NLTK_LANGMAP, RCV2_LANGS_WITH_NLTK_STEMMING +from data.reader.rcv_reader import fetch_RCV1, fetch_RCV2, fetch_topic_hierarchy +from data.text_preprocessor import NLTKStemTokenizer, preprocess_documents +import pickle +import numpy as np +from sklearn.model_selection import train_test_split +from scipy.sparse import issparse +import itertools +from tqdm import tqdm +import re +from scipy.sparse import csr_matrix + + +class MultilingualDataset: + """ + A multilingual dataset is a dictionary of training and test documents indexed by language code. + Train and test sets are represented as tuples of the type (X,Y,ids), where X is a matrix representation of the + documents (e.g., a document-by-term sparse csr_matrix), Y is a document-by-label binary np.array indicating the + labels of each document, and ids is a list of document-identifiers from the original collection. 
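+
+    Minimal usage sketch (the pickle filename is illustrative; see the builders below for how splits are created):
+        data = MultilingualDataset.load('rcv1-2_doclist_full_processed.pickle')
+        lXtr, lYtr = data.training(target_as_csr=True)   # dicts indexed by language code
+        lXte, lYte = data.test(target_as_csr=True)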
+ """ + + def __init__(self): + self.dataset_name = "" + self.multiling_dataset = {} + + def add(self, lang, Xtr, Ytr, Xte, Yte, tr_ids=None, te_ids=None): + self.multiling_dataset[lang] = ((Xtr, Ytr, tr_ids), (Xte, Yte, te_ids)) + + def save(self, file): + self.sort_indexes() + pickle.dump(self, open(file, 'wb'), pickle.HIGHEST_PROTOCOL) + return self + + def __getitem__(self, item): + if item in self.langs(): + return self.multiling_dataset[item] + return None + + @classmethod + def load(cls, file): + data = pickle.load(open(file, 'rb')) + data.sort_indexes() + return data + + @classmethod + def load_ids(cls, file): + data = pickle.load(open(file, 'rb')) + tr_ids = {lang:tr_ids for (lang,((_,_,tr_ids), (_,_,_))) in data.multiling_dataset.items()} + te_ids = {lang: te_ids for (lang, ((_, _, _), (_, _, te_ids))) in data.multiling_dataset.items()} + return tr_ids, te_ids + + def sort_indexes(self): + for (lang, ((Xtr,_,_),(Xte,_,_))) in self.multiling_dataset.items(): + if issparse(Xtr): Xtr.sort_indices() + if issparse(Xte): Xte.sort_indices() + + def set_view(self, categories=None, languages=None): + if categories is not None: + if isinstance(categories, int): + categories = np.array([categories]) + elif isinstance(categories, list): + categories = np.array(categories) + self.categories_view = categories + if languages is not None: + self.languages_view = languages + + def training(self, mask_numbers=False, target_as_csr=False): + return self.lXtr(mask_numbers), self.lYtr(as_csr=target_as_csr) + + def test(self, mask_numbers=False, target_as_csr=False): + return self.lXte(mask_numbers), self.lYte(as_csr=target_as_csr) + + def lXtr(self, mask_numbers=False): + proc = lambda x:_mask_numbers(x) if mask_numbers else x + # return {lang: Xtr for (lang, ((Xtr, _, _), _)) in self.multiling_dataset.items() if lang in self.langs()} + return {lang:proc(Xtr) for (lang, ((Xtr,_,_),_)) in self.multiling_dataset.items() if lang in self.langs()} + + def lXte(self, mask_numbers=False): + proc = lambda x: _mask_numbers(x) if mask_numbers else x + # return {lang: Xte for (lang, (_, (Xte, _, _))) in self.multiling_dataset.items() if lang in self.langs()} + return {lang:proc(Xte) for (lang, (_,(Xte,_,_))) in self.multiling_dataset.items() if lang in self.langs()} + + def lYtr(self, as_csr=False): + lY = {lang:self.cat_view(Ytr) for (lang, ((_,Ytr,_),_)) in self.multiling_dataset.items() if lang in self.langs()} + if as_csr: + lY = {l:csr_matrix(Y) for l,Y in lY.items()} + return lY + + def lYte(self, as_csr=False): + lY = {lang:self.cat_view(Yte) for (lang, (_,(_,Yte,_))) in self.multiling_dataset.items() if lang in self.langs()} + if as_csr: + lY = {l:csr_matrix(Y) for l,Y in lY.items()} + return lY + + def cat_view(self, Y): + if hasattr(self, 'categories_view'): + return Y[:,self.categories_view] + else: + return Y + + def langs(self): + if hasattr(self, 'languages_view'): + langs = self.languages_view + else: + langs = sorted(self.multiling_dataset.keys()) + return langs + + def num_categories(self): + return self.lYtr()[self.langs()[0]].shape[1] + + def show_dimensions(self): + def shape(X): + return X.shape if hasattr(X, 'shape') else len(X) + for (lang, ((Xtr, Ytr, IDtr), (Xte, Yte, IDte))) in self.multiling_dataset.items(): + if lang not in self.langs(): continue + print("Lang {}, Xtr={}, ytr={}, Xte={}, yte={}".format(lang, shape(Xtr), self.cat_view(Ytr).shape, shape(Xte), self.cat_view(Yte).shape)) + + def show_category_prevalences(self): + nC = self.num_categories() + accum_tr = np.zeros(nC, 
dtype=np.int) + accum_te = np.zeros(nC, dtype=np.int) + in_langs = np.zeros(nC, dtype=np.int) # count languages with at least one positive example (per category) + for (lang, ((Xtr, Ytr, IDtr), (Xte, Yte, IDte))) in self.multiling_dataset.items(): + if lang not in self.langs(): continue + prev_train = np.sum(self.cat_view(Ytr), axis=0) + prev_test = np.sum(self.cat_view(Yte), axis=0) + accum_tr += prev_train + accum_te += prev_test + in_langs += (prev_train>0)*1 + print(lang+'-train', prev_train) + print(lang+'-test', prev_test) + print('all-train', accum_tr) + print('all-test', accum_te) + + return accum_tr, accum_te, in_langs + + def set_labels(self, labels): + self.labels = labels + +def _mask_numbers(data): + mask_moredigit = re.compile(r'\s[\+-]?\d{5,}([\.,]\d*)*\b') + mask_4digit = re.compile(r'\s[\+-]?\d{4}([\.,]\d*)*\b') + mask_3digit = re.compile(r'\s[\+-]?\d{3}([\.,]\d*)*\b') + mask_2digit = re.compile(r'\s[\+-]?\d{2}([\.,]\d*)*\b') + mask_1digit = re.compile(r'\s[\+-]?\d{1}([\.,]\d*)*\b') + masked = [] + for text in tqdm(data, desc='masking numbers'): + text = ' ' + text + text = mask_moredigit.sub(' MoreDigitMask', text) + text = mask_4digit.sub(' FourDigitMask', text) + text = mask_3digit.sub(' ThreeDigitMask', text) + text = mask_2digit.sub(' TwoDigitMask', text) + text = mask_1digit.sub(' OneDigitMask', text) + masked.append(text.replace('.','').replace(',','').strip()) + return masked + + + + +# ---------------------------------------------------------------------------------------------------------------------- +# Helpers +# ---------------------------------------------------------------------------------------------------------------------- +def get_active_labels(doclist): + cat_list = set() + for d in doclist: + cat_list.update(d.categories) + return list(cat_list) + +def filter_by_categories(doclist, keep_categories): + catset = frozenset(keep_categories) + for d in doclist: + d.categories = list(set(d.categories).intersection(catset)) + +def __years_to_str(years): + if isinstance(years, list): + if len(years) > 1: + return str(years[0])+'-'+str(years[-1]) + return str(years[0]) + return str(years) + + +# ---------------------------------------------------------------------------------------------------------------------- +# Matrix builders +# ---------------------------------------------------------------------------------------------------------------------- +def build_independent_matrices(dataset_name, langs, training_docs, test_docs, label_names, wiki_docs=[], preprocess=True): + """ + Builds the document-by-term weighted matrices for each language. Representations are independent of each other, + i.e., each language-specific matrix lies in a dedicate feature space. + :param dataset_name: the name of the dataset (str) + :param langs: list of languages (str) + :param training_docs: map {lang:doc-list} where each doc is a tuple (text, categories, id) + :param test_docs: map {lang:doc-list} where each doc is a tuple (text, categories, id) + :param label_names: list of names of labels (str) + :param wiki_docs: doc-list (optional), if specified, project all wiki docs in the feature spaces built for the languages + :param preprocess: whether or not to apply language-specific text preprocessing (stopword removal and stemming) + :return: a MultilingualDataset. 
If wiki_docs has been specified, a dictionary lW is also returned, which indexes + by language the processed wikipedia documents in their respective language-specific feature spaces + """ + + mlb = MultiLabelBinarizer() + mlb.fit([label_names]) + + lW = {} + + multilingual_dataset = MultilingualDataset() + multilingual_dataset.dataset_name = dataset_name + multilingual_dataset.set_labels(mlb.classes_) + for lang in langs: + print("\nprocessing %d training, %d test, %d wiki for language <%s>" % + (len(training_docs[lang]), len(test_docs[lang]), len(wiki_docs[lang]) if wiki_docs else 0, lang)) + + tr_data, tr_labels, IDtr = zip(*training_docs[lang]) + te_data, te_labels, IDte = zip(*test_docs[lang]) + + if preprocess: + tfidf = TfidfVectorizer(strip_accents='unicode', min_df=3, sublinear_tf=True, + tokenizer=NLTKStemTokenizer(lang, verbose=True), + stop_words=stopwords.words(NLTK_LANGMAP[lang])) + else: + tfidf = TfidfVectorizer(strip_accents='unicode', min_df=3, sublinear_tf=True) + + Xtr = tfidf.fit_transform(tr_data) + Xte = tfidf.transform(te_data) + if wiki_docs: + lW[lang] = tfidf.transform(wiki_docs[lang]) + + Ytr = mlb.transform(tr_labels) + Yte = mlb.transform(te_labels) + + multilingual_dataset.add(lang, Xtr, Ytr, Xte, Yte, IDtr, IDte) + + multilingual_dataset.show_dimensions() + multilingual_dataset.show_category_prevalences() + + if wiki_docs: + return multilingual_dataset, lW + else: + return multilingual_dataset + + +# creates a MultilingualDataset where matrices shares a single yuxtaposed feature space +def build_juxtaposed_matrices(dataset_name, langs, training_docs, test_docs, label_names, preprocess=True): + """ + Builds the document-by-term weighted matrices for each language. Representations are not independent of each other, + since all of them lie on the same yuxtaposed feature space. + :param dataset_name: the name of the dataset (str) + :param langs: list of languages (str) + :param training_docs: map {lang:doc-list} where each doc is a tuple (text, categories, id) + :param test_docs: map {lang:doc-list} where each doc is a tuple (text, categories, id) + :param label_names: list of names of labels (str) + :param preprocess: whether or not to apply language-specific text preprocessing (stopword removal and stemming) + :return: a MultilingualDataset. 
If wiki_docs has been specified, a dictionary lW is also returned, which indexes + by language the processed wikipedia documents in their respective language-specific feature spaces + """ + + multiling_dataset = MultilingualDataset() + multiling_dataset.dataset_name = dataset_name + + mlb = MultiLabelBinarizer() + mlb.fit([label_names]) + + multiling_dataset.set_labels(mlb.classes_) + + tr_data_stack = [] + for lang in langs: + print("\nprocessing %d training and %d test for language <%s>" % (len(training_docs[lang]), len(test_docs[lang]), lang)) + tr_data, tr_labels, tr_ID = zip(*training_docs[lang]) + te_data, te_labels, te_ID = zip(*test_docs[lang]) + if preprocess: + tr_data = preprocess_documents(tr_data, lang) + te_data = preprocess_documents(te_data, lang) + tr_data_stack.extend(tr_data) + multiling_dataset.add(lang, tr_data, tr_labels, te_data, te_labels, tr_ID, te_ID) + + tfidf = TfidfVectorizer(strip_accents='unicode', min_df=3, sublinear_tf=True) + tfidf.fit(tr_data_stack) + + for lang in langs: + print("\nweighting documents for language <%s>" % (lang)) + (tr_data, tr_labels, tr_ID), (te_data, te_labels, te_ID) = multiling_dataset[lang] + Xtr = tfidf.transform(tr_data) + Xte = tfidf.transform(te_data) + Ytr = mlb.transform(tr_labels) + Yte = mlb.transform(te_labels) + multiling_dataset.add(lang,Xtr,Ytr,Xte,Yte,tr_ID,te_ID) + + multiling_dataset.show_dimensions() + return multiling_dataset + + +# ---------------------------------------------------------------------------------------------------------------------- +# Methods to recover the original documents from the MultilingualDataset's ids +# ---------------------------------------------------------------------------------------------------------------------- +""" +This method has been added a posteriori, to create document embeddings using the polylingual embeddings of the recent +article 'Word Translation without Parallel Data'; basically, it takes one of the splits and retrieves the RCV documents +from the doc ids and then pickles an object (tr_docs, te_docs, label_names) in the outpath +""" +def retrieve_rcv_documents_from_dataset(datasetpath, rcv1_data_home, rcv2_data_home, outpath): + + tr_ids, te_ids = MultilingualDataset.load_ids(datasetpath) + assert tr_ids.keys() == te_ids.keys(), 'inconsistent keys tr vs te' + langs = list(tr_ids.keys()) + + print('fetching the datasets') + rcv1_documents, labels_rcv1 = fetch_RCV1(rcv1_data_home, split='train') + rcv2_documents, labels_rcv2 = fetch_RCV2(rcv2_data_home, [l for l in langs if l != 'en']) + + filter_by_categories(rcv1_documents, labels_rcv2) + filter_by_categories(rcv2_documents, labels_rcv1) + + label_names = get_active_labels(rcv1_documents + rcv2_documents) + print('Active labels in RCV1/2 {}'.format(len(label_names))) + + print('rcv1: {} train, {} test, {} categories'.format(len(rcv1_documents), 0, len(label_names))) + print('rcv2: {} documents'.format(len(rcv2_documents)), Counter([doc.lang for doc in rcv2_documents])) + + all_docs = rcv1_documents + rcv2_documents + mlb = MultiLabelBinarizer() + mlb.fit([label_names]) + + dataset = MultilingualDataset() + for lang in langs: + analyzer = CountVectorizer(strip_accents='unicode', min_df=3, + stop_words=stopwords.words(NLTK_LANGMAP[lang])).build_analyzer() + + Xtr,Ytr,IDtr = zip(*[(d.text,d.categories,d.id) for d in all_docs if d.lang == lang and d.id in tr_ids[lang]]) + Xte,Yte,IDte = zip(*[(d.text,d.categories,d.id) for d in all_docs if d.lang == lang and d.id in te_ids[lang]]) + Xtr = [' '.join(analyzer(d)) for d 
in Xtr] + Xte = [' '.join(analyzer(d)) for d in Xte] + Ytr = mlb.transform(Ytr) + Yte = mlb.transform(Yte) + dataset.add(lang, Xtr, Ytr, Xte, Yte, IDtr, IDte) + + dataset.save(outpath) + +""" +Same thing but for JRC-Acquis +""" +def retrieve_jrc_documents_from_dataset(datasetpath, jrc_data_home, train_years, test_years, cat_policy, most_common_cat, outpath): + + tr_ids, te_ids = MultilingualDataset.load_ids(datasetpath) + assert tr_ids.keys() == te_ids.keys(), 'inconsistent keys tr vs te' + langs = list(tr_ids.keys()) + + print('fetching the datasets') + + cat_list = inspect_eurovoc(jrc_data_home, select=cat_policy) + training_docs, label_names = fetch_jrcacquis(langs=langs, data_path=jrc_data_home, years=train_years, + cat_filter=cat_list, cat_threshold=1, parallel=None, + most_frequent=most_common_cat) + test_docs, _ = fetch_jrcacquis(langs=langs, data_path=jrc_data_home, years=test_years, cat_filter=label_names, + parallel='force') + + def filter_by_id(doclist, ids): + ids_set = frozenset(itertools.chain.from_iterable(ids.values())) + return [x for x in doclist if (x.parallel_id+'__'+x.id) in ids_set] + + training_docs = filter_by_id(training_docs, tr_ids) + test_docs = filter_by_id(test_docs, te_ids) + + print('jrc: {} train, {} test, {} categories'.format(len(training_docs), len(test_docs), len(label_names))) + + mlb = MultiLabelBinarizer() + mlb.fit([label_names]) + + dataset = MultilingualDataset() + for lang in langs: + analyzer = CountVectorizer(strip_accents='unicode', min_df=3, + stop_words=stopwords.words(NLTK_LANGMAP[lang])).build_analyzer() + + Xtr,Ytr,IDtr = zip(*[(d.text,d.categories,d.parallel_id+'__'+d.id) for d in training_docs if d.lang == lang]) + Xte,Yte,IDte = zip(*[(d.text,d.categories,d.parallel_id+'__'+d.id) for d in test_docs if d.lang == lang]) + Xtr = [' '.join(analyzer(d)) for d in Xtr] + Xte = [' '.join(analyzer(d)) for d in Xte] + Ytr = mlb.transform(Ytr) + Yte = mlb.transform(Yte) + dataset.add(lang, Xtr, Ytr, Xte, Yte, IDtr, IDte) + + dataset.save(outpath) + +# ---------------------------------------------------------------------------------------------------------------------- +# Dataset Generators +# ---------------------------------------------------------------------------------------------------------------------- +def prepare_jrc_datasets(jrc_data_home, wiki_data_home, langs, train_years, test_years, cat_policy, most_common_cat=-1, max_wiki=5000, run=0): + from data.reader.wikipedia_tools import fetch_wikipedia_multilingual, random_wiki_sample + + + """ + Prepare all datasets for JRC-Acquis. The datasets include the "feature-independent" version, the + "feature-yuxtaposed" version, the monolingual version for the UpperBound, and the processed wikipedia matrices. + In all cases, training documents are strictly non-parallel, and test documents are strictly parallel + :param jrc_data_home: path to the raw JRC-Acquis documents (it will be downloaded if not found), and the path where + all splits will be generated + :param wiki_data_home: path to the wikipedia dump (see data/readers/wikipedia_tools.py) + :param langs: the list of languages to consider (as defined in data/languages.py) + :param train_years: a list of ints containing the years to be considered as training documents + :param test_years: a list of ints containing the years to be considered as test documents + :param cat_policy: a string indicating which category selection policy to apply. 
Valid policies are, e.g., "all" + (select all categories), "broadest" (select only the broadest concepts in the taxonomy), or "leaves" (select the + leaves concepts in the taxonomy). See inspect_eurovoc from data/reader/jrcacquis_reader.py for more details + :param most_common_cat: the maximum number of most common categories to consider, or -1 to keep them all + :param max_wiki: the maximum number of wikipedia documents to consider (default 5000) + :param run: a numeric label naming the random split (useful to keep track of different runs) + :return: None + """ + + name = 'JRCacquis' + run = '_run' + str(run) + config_name = 'jrc_nltk_' + __years_to_str(train_years) + \ + 'vs' + __years_to_str(test_years) + \ + '_' + cat_policy + \ + ('_top' + str(most_common_cat) if most_common_cat!=-1 else '') + \ + '_noparallel_processed' + + indep_path = join(jrc_data_home, config_name + run + '.pickle') + upper_path = join(jrc_data_home, config_name + run + '_upper.pickle') + yuxta_path = join(jrc_data_home, config_name + run + '_yuxtaposed.pickle') + wiki_path = join(jrc_data_home, config_name + run + '.wiki.pickle') + wiki_docs_path = join(jrc_data_home, config_name + '.wiki.raw.pickle') + + cat_list = inspect_eurovoc(jrc_data_home, select=cat_policy) + training_docs, label_names = fetch_jrcacquis(langs=langs, data_path=jrc_data_home, years=train_years, + cat_filter=cat_list, cat_threshold=1, parallel=None, + most_frequent=most_common_cat) + test_docs, _ = fetch_jrcacquis(langs=langs, data_path=jrc_data_home, years=test_years, cat_filter=label_names, + parallel='force') + + print('Generating feature-independent dataset...') + training_docs_no_parallel = random_sampling_avoiding_parallel(training_docs) + + def _group_by_lang(doc_list, langs): + return {lang: [(d.text, d.categories, d.parallel_id + '__' + d.id) for d in doc_list if d.lang == lang] + for lang in langs} + + training_docs = _group_by_lang(training_docs, langs) + training_docs_no_parallel = _group_by_lang(training_docs_no_parallel, langs) + test_docs = _group_by_lang(test_docs, langs) + if not exists(indep_path): + wiki_docs=None + if max_wiki>0: + if not exists(wiki_docs_path): + wiki_docs = fetch_wikipedia_multilingual(wiki_data_home, langs, min_words=50, deletions=False) + wiki_docs = random_wiki_sample(wiki_docs, max_wiki) + pickle.dump(wiki_docs, open(wiki_docs_path, 'wb'), pickle.HIGHEST_PROTOCOL) + else: + wiki_docs = pickle.load(open(wiki_docs_path, 'rb')) + wiki_docs = random_wiki_sample(wiki_docs, max_wiki) + + if wiki_docs: + lang_data, wiki_docs = build_independent_matrices(name, langs, training_docs_no_parallel, test_docs, label_names, wiki_docs) + pickle.dump(wiki_docs, open(wiki_path, 'wb'), pickle.HIGHEST_PROTOCOL) + else: + lang_data = build_independent_matrices(name, langs, training_docs_no_parallel, test_docs, label_names) + + lang_data.save(indep_path) + + print('Generating upper-bound (English-only) dataset...') + if not exists(upper_path): + training_docs_eng_only = {'en':training_docs['en']} + test_docs_eng_only = {'en':test_docs['en']} + build_independent_matrices(name, ['en'], training_docs_eng_only, test_docs_eng_only, label_names).save(upper_path) + + print('Generating yuxtaposed dataset...') + if not exists(yuxta_path): + build_juxtaposed_matrices(name, langs, training_docs_no_parallel, test_docs, label_names).save(yuxta_path) + + +def prepare_rcv_datasets(outpath, rcv1_data_home, rcv2_data_home, wiki_data_home, langs, + train_for_lang=1000, test_for_lang=1000, max_wiki=5000, preprocess=True, run=0): + from 
data.reader.wikipedia_tools import fetch_wikipedia_multilingual, random_wiki_sample + """ + Prepare all datasets for RCV1/RCV2. The datasets include the "feature-independent" version, the + "feature-yuxtaposed" version, the monolingual version for the UpperBound, and the processed wikipedia matrices. + + :param outpath: path where all splits will be dumped + :param rcv1_data_home: path to the RCV1-v2 dataset (English only) + :param rcv2_data_home: path to the RCV2 dataset (all languages other than English) + :param wiki_data_home: path to the wikipedia dump (see data/readers/wikipedia_tools.py) + :param langs: the list of languages to consider (as defined in data/languages.py) + :param train_for_lang: maximum number of training documents per language + :param test_for_lang: maximum number of test documents per language + :param max_wiki: the maximum number of wikipedia documents to consider (default 5000) + :param preprocess: whether or not to apply language-specific preprocessing (stopwords removal and stemming) + :param run: a numeric label naming the random split (useful to keep track of different runs) + :return: None + """ + + assert 'en' in langs, 'English is not in requested languages, but is needed for some datasets' + assert len(langs)>1, 'the multilingual dataset cannot be built with only one dataset' + assert not preprocess or set(langs).issubset(set(RCV2_LANGS_WITH_NLTK_STEMMING+['en'])), \ + "languages not in RCV1-v2/RCV2 scope or not in valid for NLTK's processing" + + name = 'RCV1/2' + run = '_run' + str(run) + config_name = 'rcv1-2_nltk_trByLang'+str(train_for_lang)+'_teByLang'+str(test_for_lang)+\ + ('_processed' if preprocess else '_raw') + + indep_path = join(outpath, config_name + run + '.pickle') + upper_path = join(outpath, config_name + run +'_upper.pickle') + yuxta_path = join(outpath, config_name + run +'_yuxtaposed.pickle') + wiki_path = join(outpath, config_name + run + '.wiki.pickle') + wiki_docs_path = join(outpath, config_name + '.wiki.raw.pickle') + + print('fetching the datasets') + rcv1_documents, labels_rcv1 = fetch_RCV1(rcv1_data_home, split='train') + rcv2_documents, labels_rcv2 = fetch_RCV2(rcv2_data_home, [l for l in langs if l!='en']) + filter_by_categories(rcv1_documents, labels_rcv2) + filter_by_categories(rcv2_documents, labels_rcv1) + + label_names = get_active_labels(rcv1_documents+rcv2_documents) + print('Active labels in RCV1/2 {}'.format(len(label_names))) + + print('rcv1: {} train, {} test, {} categories'.format(len(rcv1_documents), 0, len(label_names))) + print('rcv2: {} documents'.format(len(rcv2_documents)), Counter([doc.lang for doc in rcv2_documents])) + + lang_docs = {lang: [d for d in rcv1_documents + rcv2_documents if d.lang == lang] for lang in langs} + + # for the upper bound there are no parallel versions, so for the English case, we take as many documents as there + # would be in the multilingual case -- then we will extract from them only train_for_lang for the other cases + print('Generating upper-bound (English-only) dataset...') + train, test = train_test_split(lang_docs['en'], train_size=train_for_lang*len(langs), test_size=test_for_lang, shuffle=True) + train_lang_doc_map = {'en':[(d.text, d.categories, d.id) for d in train]} + test_lang_doc_map = {'en':[(d.text, d.categories, d.id) for d in test]} + build_independent_matrices(name, ['en'], train_lang_doc_map, test_lang_doc_map, label_names).save(upper_path) + + train_lang_doc_map['en'] = train_lang_doc_map['en'][:train_for_lang] + for lang in langs: + if lang=='en': 
continue # already split + test_take = min(test_for_lang, len(lang_docs[lang])-train_for_lang) + train, test = train_test_split(lang_docs[lang], train_size=train_for_lang, test_size=test_take, shuffle=True) + train_lang_doc_map[lang] = [(d.text, d.categories, d.id) for d in train] + test_lang_doc_map[lang] = [(d.text, d.categories, d.id) for d in test] + + print('Generating feature-independent dataset...') + wiki_docs=None + if max_wiki>0: + if not exists(wiki_docs_path): + wiki_docs = fetch_wikipedia_multilingual(wiki_data_home, langs, min_words=50, deletions=False) + wiki_docs = random_wiki_sample(wiki_docs, max_wiki) + pickle.dump(wiki_docs, open(wiki_docs_path, 'wb'), pickle.HIGHEST_PROTOCOL) + else: + wiki_docs = pickle.load(open(wiki_docs_path, 'rb')) + wiki_docs = random_wiki_sample(wiki_docs, max_wiki) + + if wiki_docs: + lang_data, wiki_docs_matrix = build_independent_matrices(name, langs, train_lang_doc_map, test_lang_doc_map, label_names, wiki_docs, preprocess) + pickle.dump(wiki_docs_matrix, open(wiki_path, 'wb'), pickle.HIGHEST_PROTOCOL) + else: + lang_data = build_independent_matrices(name, langs, train_lang_doc_map, test_lang_doc_map, label_names, wiki_docs, preprocess) + + lang_data.save(indep_path) + + print('Generating yuxtaposed dataset...') + build_juxtaposed_matrices(name, langs, train_lang_doc_map, test_lang_doc_map, label_names, preprocess).save(yuxta_path) + + +# ---------------------------------------------------------------------------------------------------------------------- +# Methods to generate full RCV and JRC datasets +# ---------------------------------------------------------------------------------------------------------------------- +def full_rcv_(rcv1_data_home, rcv2_data_home, outpath, langs): + + + print('fetching the datasets') + rcv1_train_documents, labels_rcv1 = fetch_RCV1(rcv1_data_home, split='train') + rcv1_test_documents, labels_rcv1_test = fetch_RCV1(rcv1_data_home, split='test') + rcv2_documents, labels_rcv2 = fetch_RCV2(rcv2_data_home, [l for l in langs if l != 'en']) + + filter_by_categories(rcv1_train_documents, labels_rcv2) + filter_by_categories(rcv1_test_documents, labels_rcv2) + filter_by_categories(rcv2_documents, labels_rcv1) + + label_names = get_active_labels(rcv1_train_documents + rcv2_documents) + print('Active labels in RCV1/2 {}'.format(len(label_names))) + + print('rcv1: {} train, {} test, {} categories'.format(len(rcv1_train_documents), len(rcv1_test_documents), len(label_names))) + print('rcv2: {} documents'.format(len(rcv2_documents)), Counter([doc.lang for doc in rcv2_documents])) + + mlb = MultiLabelBinarizer() + mlb.fit([label_names]) + + all_docs = rcv1_train_documents + rcv1_test_documents + rcv2_documents + lang_docs = {lang: [d for d in all_docs if d.lang == lang] for lang in langs} + + def get_ids(doclist): + return frozenset([d.id for d in doclist]) + + tr_ids = {'en': get_ids(rcv1_train_documents)} + te_ids = {'en': get_ids(rcv1_test_documents)} + for lang in langs: + if lang == 'en': continue + tr_ids[lang], te_ids[lang] = train_test_split([d.id for d in lang_docs[lang]], test_size=.3) + + dataset = MultilingualDataset() + dataset.dataset_name = 'RCV1/2-full' + for lang in langs: + print(f'processing {lang} with {len(tr_ids[lang])} training documents and {len(te_ids[lang])} documents') + analyzer = CountVectorizer( + strip_accents='unicode', min_df=3, stop_words=stopwords.words(NLTK_LANGMAP[lang]) + ).build_analyzer() + + Xtr,Ytr,IDtr = zip(*[(d.text,d.categories,d.id) for d in lang_docs[lang] if d.id in 
tr_ids[lang]])
+        Xte,Yte,IDte = zip(*[(d.text,d.categories,d.id) for d in lang_docs[lang] if d.id in te_ids[lang]])
+        Xtr = [' '.join(analyzer(d)) for d in Xtr]
+        Xte = [' '.join(analyzer(d)) for d in Xte]
+        Ytr = mlb.transform(Ytr)
+        Yte = mlb.transform(Yte)
+        dataset.add(lang, _mask_numbers(Xtr), Ytr, _mask_numbers(Xte), Yte, IDtr, IDte)
+
+    dataset.save(outpath)
+
+
+def full_jrc_(jrc_data_home, langs, train_years, test_years, outpath, cat_policy='all', most_common_cat=300):
+
+    print('fetching the datasets')
+    cat_list = inspect_eurovoc(jrc_data_home, select=cat_policy)
+    training_docs, label_names = fetch_jrcacquis(
+        langs=langs, data_path=jrc_data_home, years=train_years, cat_filter=cat_list, cat_threshold=1, parallel=None, most_frequent=most_common_cat
+    )
+    test_docs, _ = fetch_jrcacquis(
+        langs=langs, data_path=jrc_data_home, years=test_years, cat_filter=label_names, parallel='force'
+    )
+
+    def _group_by_lang(doc_list, langs):
+        return {lang: [d for d in doc_list if d.lang == lang] for lang in langs}
+
+    training_docs = _group_by_lang(training_docs, langs)
+    test_docs = _group_by_lang(test_docs, langs)
+
+    mlb = MultiLabelBinarizer()
+    mlb.fit([label_names])
+
+    dataset = MultilingualDataset()
+    dataset.dataset_name = 'JRC-Acquis-full'
+    for lang in langs:
+        analyzer = CountVectorizer(
+            strip_accents='unicode', min_df=3, stop_words=stopwords.words(NLTK_LANGMAP[lang])
+        ).build_analyzer()
+
+        Xtr, Ytr, IDtr = zip(*[(d.text, d.categories, d.parallel_id + '__' + d.id) for d in training_docs[lang] if d.lang == lang])
+        Xte, Yte, IDte = zip(*[(d.text, d.categories, d.parallel_id + '__' + d.id) for d in test_docs[lang] if d.lang == lang])
+        Xtr = [' '.join(analyzer(d)) for d in Xtr]
+        Xte = [' '.join(analyzer(d)) for d in Xte]
+        Ytr = mlb.transform(Ytr)
+        Yte = mlb.transform(Yte)
+        dataset.add(lang, _mask_numbers(Xtr), Ytr, _mask_numbers(Xte), Yte, IDtr, IDte)
+
+    dataset.save(outpath)
+
+
+#-----------------------------------------------------------------------------------------------------------------------
+# MAIN BUILDER
+#-----------------------------------------------------------------------------------------------------------------------
+
+if __name__=='__main__':
+    import sys
+    RCV1_PATH = '../Datasets/RCV1-v2/unprocessed_corpus'
+    RCV2_PATH = '../Datasets/RCV2'
+    JRC_DATAPATH = "../Datasets/JRC_Acquis_v3"
+    full_rcv_(RCV1_PATH, RCV2_PATH, outpath='../rcv2/rcv1-2_doclist_full_processed.pickle', langs=RCV2_LANGS_WITH_NLTK_STEMMING + ['en'])
+    # full_jrc_(JRC_DATAPATH, lang_set['JRC_NLTK'], train_years=list(range(1958, 2006)), test_years=[2006], outpath='../jrc_acquis/jrc_doclist_1958-2005vs2006_all_top300_full_processed.pickle', cat_policy='all', most_common_cat=300)
+    sys.exit(0)
+
+    # datasetpath = '../jrc_acquis/jrc_doclist_1958-2005vs2006_all_top300_full_processed.pickle'  # '../rcv2/rcv1-2_doclist_full_processed.pickle'
+    # data = MultilingualDataset.load(datasetpath)
+    # data.dataset_name='JRC-Acquis-full'  # 'RCV1/2-full'
+    # for lang in RCV2_LANGS_WITH_NLTK_STEMMING + ['en']:
+    #     (Xtr, ytr, idtr), (Xte, yte, idte) = data.multiling_dataset[lang]
+    #     data.multiling_dataset[lang] = ((_mask_numbers(Xtr), ytr, idtr), (_mask_numbers(Xte), yte, idte))
+    # data.save('../jrc_acquis/jrc_doclist_1958-2005vs2006_all_top300_full_processed.pickle')  # '../rcv2/rcv1-2_doclist_full_processed_2.pickle')
+    # sys.exit(0)
+
+    assert len(sys.argv) == 5, "wrong number of arguments; required: " \
+                               "<JRC_DATAPATH> <RCV1_PATH> <RCV2_PATH> <WIKI_DATAPATH>"
+
+    JRC_DATAPATH = sys.argv[1]  # "../Datasets/JRC_Acquis_v3"
+    RCV1_PATH = sys.argv[2]
#'../Datasets/RCV1-v2/unprocessed_corpus' + RCV2_PATH = sys.argv[3] #'../Datasets/RCV2' + WIKI_DATAPATH = sys.argv[4] #"../Datasets/Wikipedia/multilingual_docs_JRC_NLTK" + + langs = lang_set['JRC_NLTK'] + max_wiki = 5000 + + for run in range(0,10): + print('Building JRC-Acquis datasets run', run) + prepare_jrc_datasets(JRC_DATAPATH, WIKI_DATAPATH, langs, + train_years=list(range(1958, 2006)), test_years=[2006], max_wiki=max_wiki, + cat_policy='all', most_common_cat=300, run=run) + + print('Building RCV1-v2/2 datasets run', run) + prepare_rcv_datasets(RCV2_PATH, RCV1_PATH, RCV2_PATH, WIKI_DATAPATH, RCV2_LANGS_WITH_NLTK_STEMMING + ['en'], + train_for_lang=1000, test_for_lang=1000, max_wiki=max_wiki, run=run) + + # uncomment this code if you want to retrieve the original documents to generate the data splits for PLE + # (make sure you have not modified the above parameters, or adapt the following paths accordingly...) + # datasetpath = join(RCV2_PATH,'rcv1-2_nltk_trByLang1000_teByLang1000_processed_run{}.pickle'.format(run)) + # outpath = datasetpath.replace('_nltk_','_doclist_') + # retrieve_rcv_documents_from_dataset(datasetpath, RCV1_PATH, RCV2_PATH, outpath) + + # datasetpath = join(JRC_DATAPATH, 'jrc_nltk_1958-2005vs2006_all_top300_noparallel_processed_run{}.pickle'.format(run)) + # outpath = datasetpath.replace('_nltk_', '_doclist_') + # retrieve_jrc_documents_from_dataset(datasetpath, JRC_DATAPATH, train_years=list(range(1958, 2006)), test_years=[2006], cat_policy='all', most_common_cat=300, outpath=outpath) + + + diff --git a/refactor/data/languages.py b/refactor/data/languages.py new file mode 100644 index 0000000..2d03d8e --- /dev/null +++ b/refactor/data/languages.py @@ -0,0 +1,42 @@ +""" +bg = Bulgarian +cs = Czech +da = Danish +de = German +el = Greek +en = English +es = Spanish +et = Estonian +fi = Finnish +fr = French +hu = Hungarian +it = Italian +lt = Lithuanian +lv = Latvian +nl = Dutch +mt = Maltese +pl = Polish +pt = Portuguese +ro = Romanian +sk = Slovak +sl = Slovene +sv = Swedish +""" + +NLTK_LANGMAP = {'da': 'danish', 'nl': 'dutch', 'en': 'english', 'fi': 'finnish', 'fr': 'french', 'de': 'german', + 'hu': 'hungarian', 'it': 'italian', 'pt': 'portuguese', 'ro': 'romanian', 'es': 'spanish', 'sv': 'swedish'} + + +#top 10 languages in wikipedia order by the number of articles +#LANGS_10_MOST_WIKI = ['en','fr','sv','de','es','it','pt','nl','pl','ro'] + +#all languages in JRC-acquis v3 +JRC_LANGS = ['bg','cs','da','de','el','en','es','et','fi','fr','hu','it','lt','lv','mt','nl','pl','pt','ro','sk','sl','sv'] +JRC_LANGS_WITH_NLTK_STEMMING = ['da', 'nl', 'en', 'fi', 'fr', 'de', 'hu', 'it', 'pt', 'es', 'sv'] # 'romanian deleted for incompatibility issues' + +RCV2_LANGS = ['ru', 'de', 'fr', 'sv', 'no', 'da', 'pt', 'it', 'es', 'jp', 'htw', 'nl'] +RCV2_LANGS_WITH_NLTK_STEMMING = ['de', 'fr', 'sv', 'da', 'pt', 'it', 'es', 'nl'] + +lang_set = {'JRC_NLTK':JRC_LANGS_WITH_NLTK_STEMMING, 'JRC':JRC_LANGS, + 'RCV2_NLTK':RCV2_LANGS_WITH_NLTK_STEMMING, 'RCV2':RCV2_LANGS} + diff --git a/refactor/data/reader/__init__.py b/refactor/data/reader/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/refactor/data/reader/jrcacquis_reader.py b/refactor/data/reader/jrcacquis_reader.py new file mode 100644 index 0000000..c0441ed --- /dev/null +++ b/refactor/data/reader/jrcacquis_reader.py @@ -0,0 +1,321 @@ +from __future__ import print_function +import os, sys +from os.path import join +import tarfile +import xml.etree.ElementTree as ET +from sklearn.datasets import get_data_home 
+import pickle +from util.file import download_file, list_dirs, list_files +import rdflib +from rdflib.namespace import RDF, SKOS +from rdflib import URIRef +import zipfile +from data.languages import JRC_LANGS +from collections import Counter +from random import shuffle +from data.languages import lang_set + +""" +JRC Acquis' Nomenclature: +bg = Bulgarian +cs = Czech +da = Danish +de = German +el = Greek +en = English +es = Spanish +et = Estonian +fi = Finnish +fr = French +hu = Hungarian +it = Italian +lt = Lithuanian +lv = Latvian +nl = Dutch +mt = Maltese +pl = Polish +pt = Portuguese +ro = Romanian +sk = Slovak +sl = Slovene +sv = Swedish +""" + +class JRCAcquis_Document: + def __init__(self, id, name, lang, year, head, body, categories): + self.id = id + self.parallel_id = name + self.lang = lang + self.year = year + self.text = body if not head else head + "\n" + body + self.categories = categories + +# this is a workaround... for some reason, acutes are codified in a non-standard manner in titles +# however, it seems that the title is often appearing as the first paragraph in the text/body (with +# standard codification), so it might be preferable not to read the header after all (as here by default) +def _proc_acute(text): + for ch in ['a','e','i','o','u']: + text = text.replace('%'+ch+'acute%',ch) + return text + +def parse_document(file, year, head=False): + root = ET.parse(file).getroot() + + doc_name = root.attrib['n'] # e.g., '22006A0211(01)' + doc_lang = root.attrib['lang'] # e.g., 'es' + doc_id = root.attrib['id'] # e.g., 'jrc22006A0211_01-es' + doc_categories = [cat.text for cat in root.findall('.//teiHeader/profileDesc/textClass/classCode[@scheme="eurovoc"]')] + doc_head = _proc_acute(root.find('.//text/body/head').text) if head else '' + doc_body = '\n'.join([p.text for p in root.findall('.//text/body/div[@type="body"]/p')]) + + def raise_if_empty(field, from_file): + if isinstance(field, str): + if not field.strip(): + raise ValueError("Empty field in file %s" % from_file) + + raise_if_empty(doc_name, file) + raise_if_empty(doc_lang, file) + raise_if_empty(doc_id, file) + if head: raise_if_empty(doc_head, file) + raise_if_empty(doc_body, file) + + return JRCAcquis_Document(id=doc_id, name=doc_name, lang=doc_lang, year=year, head=doc_head, body=doc_body, categories=doc_categories) + +# removes documents without a counterpart in all other languages +def _force_parallel(doclist, langs): + n_langs = len(langs) + par_id_count = Counter([d.parallel_id for d in doclist]) + parallel_doc_ids = set([id for id,count in par_id_count.items() if count==n_langs]) + return [doc for doc in doclist if doc.parallel_id in parallel_doc_ids] + +def random_sampling_avoiding_parallel(doclist): + random_order = list(range(len(doclist))) + shuffle(random_order) + sampled_request = [] + parallel_ids = set() + for ind in random_order: + pid = doclist[ind].parallel_id + if pid not in parallel_ids: + sampled_request.append(doclist[ind]) + parallel_ids.add(pid) + print('random_sampling_no_parallel:: from {} documents to {} documents'.format(len(doclist), len(sampled_request))) + return sampled_request + + +#filters out documents which do not contain any category in the cat_filter list, and filter all labels not in cat_filter +def _filter_by_category(doclist, cat_filter): + if not isinstance(cat_filter, frozenset): + cat_filter = frozenset(cat_filter) + filtered = [] + for doc in doclist: + doc.categories = list(cat_filter & set(doc.categories)) + if doc.categories: + doc.categories.sort() + 
filtered.append(doc) + print("filtered %d documents out without categories in the filter list" % (len(doclist) - len(filtered))) + return filtered + +#filters out categories with less than cat_threshold documents (and filters documents containing those categories) +def _filter_by_frequency(doclist, cat_threshold): + cat_count = Counter() + for d in doclist: + cat_count.update(d.categories) + + freq_categories = [cat for cat,count in cat_count.items() if count>cat_threshold] + freq_categories.sort() + return _filter_by_category(doclist, freq_categories), freq_categories + +#select top most_frequent categories (and filters documents containing those categories) +def _most_common(doclist, most_frequent): + cat_count = Counter() + for d in doclist: + cat_count.update(d.categories) + + freq_categories = [cat for cat,count in cat_count.most_common(most_frequent)] + freq_categories.sort() + return _filter_by_category(doclist, freq_categories), freq_categories + +def _get_categories(request): + final_cats = set() + for d in request: + final_cats.update(d.categories) + return list(final_cats) + +def fetch_jrcacquis(langs=None, data_path=None, years=None, ignore_unclassified=True, cat_filter=None, cat_threshold=0, + parallel=None, most_frequent=-1, DOWNLOAD_URL_BASE ='http://optima.jrc.it/Acquis/JRC-Acquis.3.0/corpus/'): + + assert parallel in [None, 'force', 'avoid'], 'parallel mode not supported' + if not langs: + langs = JRC_LANGS + else: + if isinstance(langs, str): langs = [langs] + for l in langs: + if l not in JRC_LANGS: + raise ValueError('Language %s is not among the valid languages in JRC-Acquis v3' % l) + + if not data_path: + data_path = get_data_home() + + if not os.path.exists(data_path): + os.mkdir(data_path) + + request = [] + total_read = 0 + for l in langs: + file_name = 'jrc-'+l+'.tgz' + archive_path = join(data_path, file_name) + + if not os.path.exists(archive_path): + print("downloading language-specific dataset (once and for all) into %s" % data_path) + DOWNLOAD_URL = join(DOWNLOAD_URL_BASE, file_name) + download_file(DOWNLOAD_URL, archive_path) + print("untarring dataset...") + tarfile.open(archive_path, 'r:gz').extractall(data_path) + + documents_dir = join(data_path, l) + + print("Reading documents...") + read = 0 + for dir in list_dirs(documents_dir): + year = int(dir) + if years==None or year in years: + year_dir = join(documents_dir,dir) + pickle_name = join(data_path, 'jrc_' + l + '_' + dir + '.pickle') + if os.path.exists(pickle_name): + print("loading from file %s" % pickle_name) + l_y_documents = pickle.load(open(pickle_name, "rb")) + read += len(l_y_documents) + else: + l_y_documents = [] + all_documents = list_files(year_dir) + empty = 0 + for i,doc_file in enumerate(all_documents): + try: + jrc_doc = parse_document(join(year_dir, doc_file), year) + except ValueError: + jrc_doc = None + + if jrc_doc and (not ignore_unclassified or jrc_doc.categories): + l_y_documents.append(jrc_doc) + else: empty += 1 + if len(all_documents)>50 and ((i+1) % (len(all_documents)/50) == 0): + print('\r\tfrom %s: completed %d%%' % (year_dir, int((i+1)*100.0/len(all_documents))), end='') + read+=1 + print('\r\tfrom %s: completed 100%% read %d documents (discarded %d without categories or empty fields)\n' % (year_dir, i+1, empty), end='') + print("\t\t(Pickling object for future runs in %s)" % pickle_name) + pickle.dump(l_y_documents, open(pickle_name, 'wb'), pickle.HIGHEST_PROTOCOL) + request += l_y_documents + print("Read %d documents for language %s\n" % (read, l)) + total_read += 
read + print("Read %d documents in total" % (total_read)) + + if parallel=='force': + request = _force_parallel(request, langs) + elif parallel == 'avoid': + request = random_sampling_avoiding_parallel(request) + + final_cats = _get_categories(request) + + if cat_filter: + request = _filter_by_category(request, cat_filter) + final_cats = _get_categories(request) + if cat_threshold > 0: + request, final_cats = _filter_by_frequency(request, cat_threshold) + if most_frequent != -1 and len(final_cats) > most_frequent: + request, final_cats = _most_common(request, most_frequent) + + return request, final_cats + +def print_cat_analysis(request): + cat_count = Counter() + for d in request: + cat_count.update(d.categories) + print("Number of active categories: {}".format(len(cat_count))) + print(cat_count.most_common()) + +# inspects the Eurovoc thesaurus in order to select a subset of categories +# currently, only 'broadest' policy (i.e., take all categories with no parent category), and 'all' is implemented +def inspect_eurovoc(data_path, eurovoc_skos_core_concepts_filename='eurovoc_in_skos_core_concepts.rdf', + eurovoc_url="http://publications.europa.eu/mdr/resource/thesaurus/eurovoc-20160630-0/skos/eurovoc_in_skos_core_concepts.zip", + select="broadest"): + + fullpath_pickle = join(data_path, select+'_concepts.pickle') + if os.path.exists(fullpath_pickle): + print("Pickled object found in %s. Loading it." % fullpath_pickle) + return pickle.load(open(fullpath_pickle,'rb')) + + fullpath = join(data_path, eurovoc_skos_core_concepts_filename) + if not os.path.exists(fullpath): + print("Path %s does not exist. Trying to download the skos EuroVoc file from %s" % (data_path, eurovoc_url)) + download_file(eurovoc_url, fullpath) + print("Unzipping file...") + zipped = zipfile.ZipFile(data_path + '.zip', 'r') + zipped.extract("eurovoc_in_skos_core_concepts.rdf", data_path) + zipped.close() + + print("Parsing %s" %fullpath) + g = rdflib.Graph() + g.parse(location=fullpath, format="application/rdf+xml") + + if select == "all": + print("Selecting all concepts") + all_concepts = list(g.subjects(RDF.type, SKOS.Concept)) + all_concepts = [c.toPython().split('/')[-1] for c in all_concepts] + all_concepts.sort() + selected_concepts = all_concepts + elif select=="broadest": + print("Selecting broadest concepts (those without any other broader concept linked to it)") + all_concepts = set(g.subjects(RDF.type, SKOS.Concept)) + narrower_concepts = set(g.subjects(SKOS.broader, None)) + broadest_concepts = [c.toPython().split('/')[-1] for c in (all_concepts - narrower_concepts)] + broadest_concepts.sort() + selected_concepts = broadest_concepts + elif select=="leaves": + print("Selecting leaves concepts (those not linked as broader of any other concept)") + all_concepts = set(g.subjects(RDF.type, SKOS.Concept)) + broad_concepts = set(g.objects(None, SKOS.broader)) + leave_concepts = [c.toPython().split('/')[-1] for c in (all_concepts - broad_concepts)] + leave_concepts.sort() + selected_concepts = leave_concepts + else: + raise ValueError("Selection policy %s is not currently supported" % select) + + print("%d %s concepts found" % (len(selected_concepts), leave_concepts)) + print("Pickling concept list for faster further requests in %s" % fullpath_pickle) + pickle.dump(selected_concepts, open(fullpath_pickle, 'wb'), pickle.HIGHEST_PROTOCOL) + + return selected_concepts + +if __name__ == '__main__': + + def single_label_fragment(doclist): + single = [d for d in doclist if len(d.categories) < 2] + final_categories = 
set([d.categories[0] if d.categories else [] for d in single]) + print('{} single-label documents ({} categories) from the original {} documents'.format(len(single), + len(final_categories), + len(doclist))) + return single, list(final_categories) + + train_years = list(range(1986, 2006)) + test_years = [2006] + cat_policy = 'leaves' + most_common_cat = 300 + # JRC_DATAPATH = "/media/moreo/1TB Volume/Datasets/JRC_Acquis_v3" + JRC_DATAPATH = "/storage/andrea/FUNNELING/data/JRC_Acquis_v3" + langs = lang_set['JRC_NLTK'] + cat_list = inspect_eurovoc(JRC_DATAPATH, select=cat_policy) + sys.exit() + + training_docs, label_names = fetch_jrcacquis(langs=langs, data_path=JRC_DATAPATH, years=train_years,cat_filter=cat_list, cat_threshold=1, parallel=None,most_frequent=most_common_cat) + test_docs, label_namestest = fetch_jrcacquis(langs=langs, data_path=JRC_DATAPATH, years=test_years, cat_filter=label_names,parallel='force') + + print('JRC-train: {} documents, {} labels'.format(len(training_docs), len(label_names))) + print('JRC-test: {} documents, {} labels'.format(len(test_docs), len(label_namestest))) + + training_docs, label_names = single_label_fragment(training_docs) + test_docs, label_namestest = single_label_fragment(test_docs) + + print('JRC-train: {} documents, {} labels'.format(len(training_docs), len(label_names))) + print('JRC-test: {} documents, {} labels'.format(len(test_docs), len(label_namestest))) + + diff --git a/refactor/data/reader/rcv_reader.py b/refactor/data/reader/rcv_reader.py new file mode 100644 index 0000000..cd4b416 --- /dev/null +++ b/refactor/data/reader/rcv_reader.py @@ -0,0 +1,225 @@ +from zipfile import ZipFile +import xml.etree.ElementTree as ET +from data.languages import RCV2_LANGS_WITH_NLTK_STEMMING, RCV2_LANGS +from util.file import list_files +from sklearn.datasets import get_data_home +import gzip +from os.path import join, exists +from util.file import download_file_if_not_exists +import re +from collections import Counter +import numpy as np +import sys + +""" +RCV2's Nomenclature: +ru = Russian +da = Danish +de = German +es = Spanish +lat = Spanish Latin-American (actually is also 'es' in the collection) +fr = French +it = Italian +nl = Dutch +pt = Portuguese +sv = Swedish +ja = Japanese +htw = Chinese +no = Norwegian +""" + +RCV1_TOPICHIER_URL = "http://www.ai.mit.edu/projects/jmlr/papers/volume5/lewis04a/a02-orig-topics-hierarchy/rcv1.topics.hier.orig" +RCV1PROC_BASE_URL= 'http://www.ai.mit.edu/projects/jmlr/papers/volume5/lewis04a/a12-token-files' +RCV1_BASE_URL = "http://www.daviddlewis.com/resources/testcollections/rcv1/" +RCV2_BASE_URL = "http://trec.nist.gov/data/reuters/reuters.html" + +rcv1_test_data_gz = ['lyrl2004_tokens_test_pt0.dat.gz', + 'lyrl2004_tokens_test_pt1.dat.gz', + 'lyrl2004_tokens_test_pt2.dat.gz', + 'lyrl2004_tokens_test_pt3.dat.gz'] + +rcv1_train_data_gz = ['lyrl2004_tokens_train.dat.gz'] + +rcv1_doc_cats_data_gz = 'rcv1-v2.topics.qrels.gz' + +RCV2_LANG_DIR = {'ru':'REUTE000', + 'de':'REUTE00A', + 'fr':'REUTE00B', + 'sv':'REUTE001', + 'no':'REUTE002', + 'da':'REUTE003', + 'pt':'REUTE004', + 'it':'REUTE005', + 'es':'REUTE006', + 'lat':'REUTE007', + 'jp':'REUTE008', + 'htw':'REUTE009', + 'nl':'REUTERS_'} + + +class RCV_Document: + + def __init__(self, id, text, categories, date='', lang=None): + self.id = id + self.date = date + self.lang = lang + self.text = text + self.categories = categories + + +class ExpectedLanguageException(Exception): pass +class IDRangeException(Exception): pass + + +nwords = [] + +def 
parse_document(xml_content, assert_lang=None, valid_id_range=None): + root = ET.fromstring(xml_content) + if assert_lang: + if assert_lang not in root.attrib.values(): + if assert_lang != 'jp' or 'ja' not in root.attrib.values(): # some documents are attributed to 'ja', others to 'jp' + raise ExpectedLanguageException('error: document of a different language') + + doc_id = root.attrib['itemid'] + if valid_id_range is not None: + if not valid_id_range[0] <= int(doc_id) <= valid_id_range[1]: + raise IDRangeException + + doc_categories = [cat.attrib['code'] for cat in + root.findall('.//metadata/codes[@class="bip:topics:1.0"]/code')] + + doc_date = root.attrib['date'] + doc_title = root.find('.//title').text + doc_headline = root.find('.//headline').text + doc_body = '\n'.join([p.text for p in root.findall('.//text/p')]) + + if not doc_body: + raise ValueError('Empty document') + + if doc_title is None: doc_title = '' + if doc_headline is None or doc_headline in doc_title: doc_headline = '' + text = '\n'.join([doc_title, doc_headline, doc_body]).strip() + + text_length = len(text.split()) + global nwords + nwords.append(text_length) + + return RCV_Document(id=doc_id, text=text, categories=doc_categories, date=doc_date, lang=assert_lang) + + +def fetch_RCV1(data_path, split='all'): + + assert split in ['train', 'test', 'all'], 'split should be "train", "test", or "all"' + + request = [] + labels = set() + read_documents = 0 + lang = 'en' + + training_documents = 23149 + test_documents = 781265 + + if split == 'all': + split_range = (2286, 810596) + expected = training_documents+test_documents + elif split == 'train': + split_range = (2286, 26150) + expected = training_documents + else: + split_range = (26151, 810596) + expected = test_documents + + global nwords + nwords=[] + for part in list_files(data_path): + if not re.match('\d+\.zip', part): continue + target_file = join(data_path, part) + assert exists(target_file), \ + "You don't seem to have the file "+part+" in " + data_path + ", and the RCV1 corpus can not be downloaded"+\ + " w/o a formal permission. Please, refer to " + RCV1_BASE_URL + " for more information." + zipfile = ZipFile(target_file) + for xmlfile in zipfile.namelist(): + xmlcontent = zipfile.open(xmlfile).read() + try: + doc = parse_document(xmlcontent, assert_lang=lang, valid_id_range=split_range) + labels.update(doc.categories) + request.append(doc) + read_documents += 1 + except ValueError: + print('\n\tskipping document {} with inconsistent language label: expected language {}'.format(part+'/'+xmlfile, lang)) + except (IDRangeException, ExpectedLanguageException) as e: + pass + print('\r[{}] read {} documents'.format(part, len(request)), end='') + if read_documents == expected: break + if read_documents == expected: break + print() + print('ave:{} std {} min {} max {}'.format(np.mean(nwords), np.std(nwords), np.min(nwords), np.max(nwords))) + return request, list(labels) + + +def fetch_RCV2(data_path, languages=None): + + if not languages: + languages = list(RCV2_LANG_DIR.keys()) + else: + assert set(languages).issubset(set(RCV2_LANG_DIR.keys())), 'languages not in scope' + + request = [] + labels = set() + global nwords + nwords=[] + for lang in languages: + path = join(data_path, RCV2_LANG_DIR[lang]) + lang_docs_read = 0 + for part in list_files(path): + target_file = join(path, part) + assert exists(target_file), \ + "You don't seem to have the file "+part+" in " + path + ", and the RCV2 corpus can not be downloaded"+\ + " w/o a formal permission. 
Please, refer to " + RCV2_BASE_URL + " for more information." + zipfile = ZipFile(target_file) + for xmlfile in zipfile.namelist(): + xmlcontent = zipfile.open(xmlfile).read() + try: + doc = parse_document(xmlcontent, assert_lang=lang) + labels.update(doc.categories) + request.append(doc) + lang_docs_read += 1 + except ValueError: + print('\n\tskipping document {} with inconsistent language label: expected language {}'.format(RCV2_LANG_DIR[lang]+'/'+part+'/'+xmlfile, lang)) + except (IDRangeException, ExpectedLanguageException) as e: + pass + print('\r[{}] read {} documents, {} for language {}'.format(RCV2_LANG_DIR[lang]+'/'+part, len(request), lang_docs_read, lang), end='') + print() + print('ave:{} std {} min {} max {}'.format(np.mean(nwords), np.std(nwords), np.min(nwords), np.max(nwords))) + return request, list(labels) + + +def fetch_topic_hierarchy(path, topics='all'): + assert topics in ['all', 'leaves'] + + download_file_if_not_exists(RCV1_TOPICHIER_URL, path) + hierarchy = {} + for line in open(path, 'rt'): + parts = line.strip().split() + parent,child = parts[1],parts[3] + if parent not in hierarchy: + hierarchy[parent]=[] + hierarchy[parent].append(child) + + del hierarchy['None'] + del hierarchy['Root'] + print(hierarchy) + + if topics=='all': + topics = set(hierarchy.keys()) + for parent in hierarchy.keys(): + topics.update(hierarchy[parent]) + return list(topics) + elif topics=='leaves': + parents = set(hierarchy.keys()) + childs = set() + for parent in hierarchy.keys(): + childs.update(hierarchy[parent]) + return list(childs.difference(parents)) + + diff --git a/refactor/data/reader/wikipedia_tools.py b/refactor/data/reader/wikipedia_tools.py new file mode 100644 index 0000000..83e11e3 --- /dev/null +++ b/refactor/data/reader/wikipedia_tools.py @@ -0,0 +1,304 @@ +from __future__ import print_function +# import ijson +# from ijson.common import ObjectBuilder +import os, sys +from os.path import join +from bz2 import BZ2File +import pickle +from util.file import list_dirs, list_files, makedirs_if_not_exist +from itertools import islice +import re +from xml.sax.saxutils import escape +import numpy as np + +policies = ["IN_ALL_LANGS", "IN_ANY_LANG"] + +""" +This file contains a set of tools for processing the Wikipedia multilingual documents. +In what follows, it is assumed that you have already downloaded a Wikipedia dump (https://dumps.wikimedia.org/) +and have processed each document to clean their texts with one of the tools: + - https://github.com/aesuli/wikipediatools (Python 2) + - https://github.com/aesuli/wikipedia-extractor (Python 3) +It is also assumed you have dowloaded the all-entities json file (e.g., https://dumps.wikimedia.org/wikidatawiki/entities/latest-all.json.bz2) + +This tools help you in: + - Processes the huge json file as a stream, and create a multilingual map of corresponding titles for each language. + Set the policy = "IN_ALL_LANGS" will extract only titles which appear in all (AND) languages, whereas "IN_ANY_LANG" + extracts all titles appearing in at least one (OR) language (warning: this will creates a huge dictionary). + Note: This version is quite slow. Although it is run once for all, you might be prefer to take a look at "Wikidata in BigQuery". + - Processes the huge json file as a stream a creates a simplified file which occupies much less and is far faster to be processed. + - Use the multilingual map to extract, from the clean text versions, individual xml documents containing all + language-specific versions from the document. 
+ - Fetch the multilingual documents to create, for each of the specified languages, a list containing all documents, + in a way that the i-th element from any list refers to the same element in the respective language. +""" + +def _doc_generator(text_path, langs): + dotspace = re.compile(r'\.(?!\s)') + for l,lang in enumerate(langs): + print("Processing language <%s> (%d/%d)" % (lang, l, len(langs))) + lang_dir = join(text_path, lang) + split_dirs = list_dirs(lang_dir) + for sd,split_dir in enumerate(split_dirs): + print("\tprocessing split_dir <%s> (%d/%d)" % (split_dir, sd, len(split_dirs))) + split_files = list_files(join(lang_dir, split_dir)) + for sf,split_file in enumerate(split_files): + print("\t\tprocessing split_file <%s> (%d/%d)" % (split_file, sf, len(split_files))) + with BZ2File(join(lang_dir, split_dir, split_file), 'r', buffering=1024*1024) as fi: + while True: + doc_lines = list(islice(fi, 3)) + if doc_lines: + # some sentences are not followed by a space after the dot + doc_lines[1] = dotspace.sub('. ', doc_lines[1]) + # [workaround] I found   html symbol was not treated, and unescaping it now might not help... + doc_lines[1] = escape(doc_lines[1].replace(" ", " ")) + yield doc_lines, lang + else: break + +def _extract_title(doc_lines): + m = re.search('title="(.+?)"', doc_lines[0]) + if m: return m.group(1).decode('utf-8') + else: raise ValueError("Error in xml format: document head is %s" % doc_lines[0]) + +def _create_doc(target_file, id, doc, lang): + doc[0] = doc[0][:-2] + (' lang="%s">\n'%lang) + with open(target_file, 'w') as fo: + fo.write('\n'%id) + [fo.write(line) for line in doc] + fo.write('') + +def _append_doc(target_file, doc, lang): + doc[0] = doc[0][:-2] + (' lang="%s">\n' % lang) + with open(target_file, 'r', buffering=1024*1024) as fi: + lines = fi.readlines() + if doc[0] in lines[1::3]: + return + lines[-1:-1]=doc + with open(target_file, 'w', buffering=1024*1024) as fo: + [fo.write(line) for line in lines] + +def extract_multilingual_documents(inv_dict, langs, text_path, out_path): + if not os.path.exists(out_path): + os.makedirs(out_path) + for lang in langs: + if lang not in inv_dict: + raise ValueError("Lang %s is not in the dictionary" % lang) + + docs_created = len(list_files(out_path)) + print("%d multilingual documents found." % docs_created) + for doc,lang in _doc_generator(text_path, langs): + title = _extract_title(doc) + + if title in inv_dict[lang]: + #pass + ids = inv_dict[lang][title] + for id in ids: + target_file = join(out_path, id) + ".xml" + if os.path.exists(target_file): + _append_doc(target_file, doc, lang) + else: + _create_doc(target_file, id, doc, lang) + docs_created+=1 + else: + if not re.match('[A-Za-z]+', title): + print("Title <%s> for lang <%s> not in dictionary" % (title, lang)) + + + +def extract_multilingual_titles_from_simplefile(data_dir, filename, langs, policy="IN_ALL_LANGS", return_both=True): + simplified_file = join(data_dir,filename) + + if policy not in policies: + raise ValueError("Policy %s not supported." % policy) + print("extracting multilingual titles with policy %s (%s)" % (policy,' '.join(langs))) + + lang_prefix = list(langs) + lang_prefix.sort() + pickle_prefix = "extraction_" + "_".join(lang_prefix) + "." + policy + pickle_dict = join(data_dir, pickle_prefix+".multi_dict.pickle") + pickle_invdict = join(data_dir, pickle_prefix+".multi_invdict.pickle") + if os.path.exists(pickle_invdict): + if return_both and os.path.exists(pickle_dict): + print("Pickled files found in %s. 
Loading both (direct and inverse dictionaries)." % data_dir) + return pickle.load(open(pickle_dict, 'rb')), pickle.load(open(pickle_invdict, 'rb')) + elif return_both==False: + print("Pickled file found in %s. Loading inverse dictionary only." % pickle_invdict) + return pickle.load(open(pickle_invdict, 'rb')) + + multiling_titles = {} + inv_dict = {lang:{} for lang in langs} + + def process_entry(line): + parts = line.strip().split('\t') + id = parts[0] + if id in multiling_titles: + raise ValueError("id <%s> already indexed" % id) + + titles = dict(((lang_title[:lang_title.find(':')],lang_title[lang_title.find(':')+1:].decode('utf-8')) for lang_title in parts[1:])) + for lang in titles.keys(): + if lang not in langs: + del titles[lang] + + if (policy == "IN_ALL_LANGS" and len(titles) == len(langs))\ + or (policy == "IN_ANY_LANG" and len(titles) > 0): + multiling_titles[id] = titles + for lang, title in titles.items(): + if title in inv_dict[lang]: + inv_dict[lang][title].append(id) + inv_dict[lang][title] = [id] + + with BZ2File(simplified_file, 'r', buffering=1024*1024*16) as fi: + completed = 0 + try: + for line in fi: + process_entry(line) + completed += 1 + if completed % 10 == 0: + print("\rCompleted %d\ttitles %d" % (completed,len(multiling_titles)), end="") + print("\rCompleted %d\t\ttitles %d" % (completed, len(multiling_titles)), end="\n") + except EOFError: + print("\nUnexpected file ending... saving anyway") + + print("Pickling dictionaries in %s" % data_dir) + pickle.dump(multiling_titles, open(pickle_dict,'wb'), pickle.HIGHEST_PROTOCOL) + pickle.dump(inv_dict, open(pickle_invdict, 'wb'), pickle.HIGHEST_PROTOCOL) + print("Done") + + return (multiling_titles, inv_dict) if return_both else inv_dict + + +# in https://dumps.wikimedia.org/wikidatawiki/entities/latest-all.json.bz2 +def simplify_json_file(data_dir, langs, policy="IN_ALL_LANGS", json_file = "latest-all.json.bz2"): + latest_all_json_file = join(data_dir,json_file) + + if policy not in policies: + raise ValueError("Policy %s not supported." % policy) + + print("extracting multilingual titles with policy %s (%s)" % (policy,' '.join(langs))) + + lang_prefix = list(langs) + lang_prefix.sort() + simple_titles_path = join(data_dir, "extraction_" + "_".join(lang_prefix) + "." 
+ policy) + + def process_entry(last, fo): + global written + id = last["id"] + titles = None + if policy == "IN_ALL_LANGS" and langs.issubset(last["labels"].keys()): + titles = {lang: last["labels"][lang]["value"] for lang in langs} + elif policy == "IN_ANY_LANG": + titles = {lang: last["labels"][lang]["value"] for lang in langs if lang in last["labels"]} + + if titles: + fo.write((id+'\t'+'\t'.join([lang+':'+titles[lang] for lang in titles.keys()])+'\n').encode('utf-8')) + return True + else: + return False + + written = 0 + with BZ2File(latest_all_json_file, 'r', buffering=1024*1024*16) as fi, \ + BZ2File(join(data_dir,simple_titles_path+".simple.bz2"),'w') as fo: + builder = ObjectBuilder() + completed = 0 + for event, value in ijson.basic_parse(fi, buf_size=1024*1024*16): + builder.event(event, value) + if len(builder.value)>1: + if process_entry(builder.value.pop(0), fo): written += 1 + completed += 1 + print("\rCompleted %d\ttitles %d" % (completed,written), end="") + print("") + + #process the last entry + process_entry(builder.value.pop(0)) + + return simple_titles_path + +""" +Reads all multi-lingual documents in a folder (see wikipedia_tools.py to generate them) and generates, for each of the +specified languages, a list contanining all its documents, so that the i-th element of any list refers to the language- +specific version of the same document. Documents are forced to contain version in all specified languages and to contain +a minimum number of words; otherwise it is discarded. +""" +class MinWordsNotReached(Exception): pass +class WrongDocumentFormat(Exception): pass + +def _load_multilang_doc(path, langs, min_words=100): + import xml.etree.ElementTree as ET + from xml.etree.ElementTree import Element, ParseError + try: + root = ET.parse(path).getroot() + doc = {} + for lang in langs: + doc_body = root.find('.//doc[@lang="' + lang + '"]') + if isinstance(doc_body, Element): + n_words = len(doc_body.text.split(' ')) + if n_words >= min_words: + doc[lang] = doc_body.text + else: + raise MinWordsNotReached + else: + raise WrongDocumentFormat + except ParseError: + raise WrongDocumentFormat + return doc + +#returns the multilingual documents mapped by language, and a counter with the number of documents readed +def fetch_wikipedia_multilingual(wiki_multi_path, langs, min_words=100, deletions=False, max_documents=-1, pickle_name=None): + if pickle_name and os.path.exists(pickle_name): + print("unpickling %s" % pickle_name) + return pickle.load(open(pickle_name, 'rb')) + + multi_docs = list_files(wiki_multi_path) + mling_documents = {l:[] for l in langs} + valid_documents = 0 + minwords_exception = 0 + wrongdoc_exception = 0 + for d,multi_doc in enumerate(multi_docs): + print("\rProcessed %d/%d documents, valid %d/%d, few_words=%d, few_langs=%d" % + (d, len(multi_docs), valid_documents, len(multi_docs), minwords_exception, wrongdoc_exception),end="") + doc_path = join(wiki_multi_path, multi_doc) + try: + m_doc = _load_multilang_doc(doc_path, langs, min_words) + valid_documents += 1 + for l in langs: + mling_documents[l].append(m_doc[l]) + except MinWordsNotReached: + minwords_exception += 1 + if deletions: os.remove(doc_path) + except WrongDocumentFormat: + wrongdoc_exception += 1 + if deletions: os.remove(doc_path) + if max_documents>0 and valid_documents>=max_documents: + break + + if pickle_name: + print("Pickling wikipedia documents object in %s" % pickle_name) + pickle.dump(mling_documents, open(pickle_name, 'wb'), pickle.HIGHEST_PROTOCOL) + + return mling_documents + 
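+# Example usage (an illustrative sketch only: the directory, languages and pickle
+# name below are placeholders, not values used elsewhere in this patch):
+#
+#   l_wiki = fetch_wikipedia_multilingual('multilingual_docs/', langs=['en', 'it'],
+#                                         min_words=100, pickle_name='wiki_en_it.pickle')
+#   # l_wiki['en'][i] and l_wiki['it'][i] are the two language versions of the same
+#   # article, so downstream code can rely on index-aligned document lists.
+#   l_wiki = random_wiki_sample(l_wiki, max_documents=5000)
+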
+def random_wiki_sample(l_wiki, max_documents): + if max_documents == 0: return None + langs = list(l_wiki.keys()) + assert len(np.unique([len(l_wiki[l]) for l in langs])) == 1, 'documents across languages do not seem to be aligned' + ndocs_per_lang = len(l_wiki[langs[0]]) + if ndocs_per_lang > max_documents: + sel = set(np.random.choice(list(range(ndocs_per_lang)), max_documents, replace=False)) + for lang in langs: + l_wiki[lang] = [d for i, d in enumerate(l_wiki[lang]) if i in sel] + return l_wiki + + +if __name__ == "__main__": + + wikipedia_home = "../Datasets/Wikipedia" + + from data.languages import JRC_LANGS_WITH_NLTK_STEMMING as langs + langs = frozenset(langs) + + simple_titles_path = simplify_json_file(wikipedia_home, langs, policy="IN_ALL_LANGS", json_file="latest-all.json.bz2") + _, inv_dict = extract_multilingual_titles_from_simplefile(wikipedia_home, simple_titles_path, langs, policy='IN_ALL_LANGS') + extract_multilingual_documents(inv_dict, langs, join(wikipedia_home,'text'), + out_path=join(wikipedia_home, 'multilingual_docs_JRC_NLTK')) + + diff --git a/refactor/data/text_preprocessor.py b/refactor/data/text_preprocessor.py new file mode 100644 index 0000000..1a6e3ae --- /dev/null +++ b/refactor/data/text_preprocessor.py @@ -0,0 +1,33 @@ +from nltk.corpus import stopwords +from data.languages import NLTK_LANGMAP +from nltk import word_tokenize +from nltk.stem import SnowballStemmer + + +def preprocess_documents(documents, lang): + tokens = NLTKStemTokenizer(lang, verbose=True) + sw = stopwords.words(NLTK_LANGMAP[lang]) + return [' '.join([w for w in tokens(doc) if w not in sw]) for doc in documents] + + +class NLTKStemTokenizer(object): + + def __init__(self, lang, verbose=False): + if lang not in NLTK_LANGMAP: + raise ValueError('Language %s is not supported in NLTK' % lang) + self.verbose=verbose + self.called = 0 + self.wnl = SnowballStemmer(NLTK_LANGMAP[lang]) + self.cache = {} + + def __call__(self, doc): + self.called += 1 + if self.verbose: + print("\r\t\t[documents processed %d]" % (self.called), end="") + tokens = word_tokenize(doc) + stems = [] + for t in tokens: + if t not in self.cache: + self.cache[t] = self.wnl.stem(t) + stems.append(self.cache[t]) + return stems \ No newline at end of file diff --git a/refactor/data/tsr_function__.py b/refactor/data/tsr_function__.py new file mode 100755 index 0000000..0af8690 --- /dev/null +++ b/refactor/data/tsr_function__.py @@ -0,0 +1,270 @@ +import math +import numpy as np +from scipy.stats import t +from joblib import Parallel, delayed +from scipy.sparse import csr_matrix, csc_matrix + + +def get_probs(tpr, fpr, pc): + # tpr = p(t|c) = p(tp)/p(c) = p(tp)/(p(tp)+p(fn)) + # fpr = p(t|_c) = p(fp)/p(_c) = p(fp)/(p(fp)+p(tn)) + pnc = 1.0 - pc + tp = tpr * pc + fn = pc - tp + fp = fpr * pnc + tn = pnc - fp + return ContTable(tp=tp, fn=fn, fp=fp, tn=tn) + + +def apply_tsr(tpr, fpr, pc, tsr): + cell = get_probs(tpr, fpr, pc) + return tsr(cell) + + +def positive_information_gain(cell): + if cell.tpr() < cell.fpr(): + return 0.0 + else: + return information_gain(cell) + + +def posneg_information_gain(cell): + ig = information_gain(cell) + if cell.tpr() < cell.fpr(): + return -ig + else: + return ig + + +def __ig_factor(p_tc, p_t, p_c): + den = p_t * p_c + if den != 0.0 and p_tc != 0: + return p_tc * math.log(p_tc / den, 2) + else: + return 0.0 + + +def information_gain(cell): + return __ig_factor(cell.p_tp(), cell.p_f(), cell.p_c()) + \ + __ig_factor(cell.p_fp(), cell.p_f(), cell.p_not_c()) +\ + __ig_factor(cell.p_fn(), 
cell.p_not_f(), cell.p_c()) + \ + __ig_factor(cell.p_tn(), cell.p_not_f(), cell.p_not_c()) + + +def information_gain_mod(cell): + return (__ig_factor(cell.p_tp(), cell.p_f(), cell.p_c()) + __ig_factor(cell.p_tn(), cell.p_not_f(), cell.p_not_c())) \ + - (__ig_factor(cell.p_fp(), cell.p_f(), cell.p_not_c()) + __ig_factor(cell.p_fn(), cell.p_not_f(), cell.p_c())) + + +def pointwise_mutual_information(cell): + return __ig_factor(cell.p_tp(), cell.p_f(), cell.p_c()) + + +def gain_ratio(cell): + pc = cell.p_c() + pnc = 1.0 - pc + norm = pc * math.log(pc, 2) + pnc * math.log(pnc, 2) + return information_gain(cell) / (-norm) + + +def chi_square(cell): + den = cell.p_f() * cell.p_not_f() * cell.p_c() * cell.p_not_c() + if den==0.0: return 0.0 + num = gss(cell)**2 + return num / den + + +def relevance_frequency(cell): + a = cell.tp + c = cell.fp + if c == 0: c = 1 + return math.log(2.0 + (a * 1.0 / c), 2) + + +def idf(cell): + if cell.p_f()>0: + return math.log(1.0 / cell.p_f()) + return 0.0 + + +def gss(cell): + return cell.p_tp()*cell.p_tn() - cell.p_fp()*cell.p_fn() + + +def conf_interval(xt, n): + if n>30: + z2 = 3.84145882069 # norm.ppf(0.5+0.95/2.0)**2 + else: + z2 = t.ppf(0.5 + 0.95 / 2.0, df=max(n-1,1)) ** 2 + p = (xt + 0.5 * z2) / (n + z2) + amplitude = 0.5 * z2 * math.sqrt((p * (1.0 - p)) / (n + z2)) + return p, amplitude + +def strength(minPosRelFreq, minPos, maxNeg): + if minPos > maxNeg: + return math.log(2.0 * minPosRelFreq, 2.0) + else: + return 0.0 + + +#set cancel_features=True to allow some features to be weighted as 0 (as in the original article) +#however, for some extremely imbalanced dataset caused all documents to be 0 +def conf_weight(cell, cancel_features=False): + c = cell.get_c() + not_c = cell.get_not_c() + tp = cell.tp + fp = cell.fp + + pos_p, pos_amp = conf_interval(tp, c) + neg_p, neg_amp = conf_interval(fp, not_c) + + min_pos = pos_p-pos_amp + max_neg = neg_p+neg_amp + den = (min_pos + max_neg) + minpos_relfreq = min_pos / (den if den != 0 else 1) + + str_tplus = strength(minpos_relfreq, min_pos, max_neg); + + if str_tplus == 0 and not cancel_features: + return 1e-20 + + return str_tplus; + + +class ContTable: + + def __init__(self, tp=0, tn=0, fp=0, fn=0): + self.tp=tp + self.tn=tn + self.fp=fp + self.fn=fn + + def get_d(self): return self.tp + self.tn + self.fp + self.fn + + def get_c(self): return self.tp + self.fn + + def get_not_c(self): return self.tn + self.fp + + def get_f(self): return self.tp + self.fp + + def get_not_f(self): return self.tn + self.fn + + def p_c(self): return (1.0*self.get_c())/self.get_d() + + def p_not_c(self): return 1.0-self.p_c() + + def p_f(self): return (1.0*self.get_f())/self.get_d() + + def p_not_f(self): return 1.0-self.p_f() + + def p_tp(self): return (1.0*self.tp) / self.get_d() + + def p_tn(self): return (1.0*self.tn) / self.get_d() + + def p_fp(self): return (1.0*self.fp) / self.get_d() + + def p_fn(self): return (1.0*self.fn) / self.get_d() + + def tpr(self): + c = 1.0*self.get_c() + return self.tp / c if c > 0.0 else 0.0 + + def fpr(self): + _c = 1.0*self.get_not_c() + return self.fp / _c if _c > 0.0 else 0.0 + + +def round_robin_selection(X, Y, k, tsr_function=positive_information_gain): + print(f'[selectiong {k} terms]') + nC = Y.shape[1] + FC = get_tsr_matrix(get_supervised_matrix(X, Y), tsr_function).T + best_features_idx = np.argsort(-FC, axis=0).flatten() + tsr_values = FC.flatten() + selected_indexes_set = set() + selected_indexes = list() + selected_value = list() + from_category = list() + round_robin = 
iter(best_features_idx) + values_iter = iter(tsr_values) + round=0 + while len(selected_indexes) < k: + term_idx = next(round_robin) + term_val = next(values_iter) + if term_idx not in selected_indexes_set: + selected_indexes_set.add(term_idx) + selected_indexes.append(term_idx) + selected_value.append(term_val) + from_category.append(round) + round = (round + 1) % nC + return np.asarray(selected_indexes, dtype=int), np.asarray(selected_value, dtype=float), np.asarray(from_category) + + +def feature_label_contingency_table(positive_document_indexes, feature_document_indexes, nD): + tp_ = len(positive_document_indexes & feature_document_indexes) + fp_ = len(feature_document_indexes - positive_document_indexes) + fn_ = len(positive_document_indexes - feature_document_indexes) + tn_ = nD - (tp_ + fp_ + fn_) + return ContTable(tp=tp_, tn=tn_, fp=fp_, fn=fn_) + + +def category_tables(feature_sets, category_sets, c, nD, nF): + return [feature_label_contingency_table(category_sets[c], feature_sets[f], nD) for f in range(nF)] + + +""" +Computes the nC x nF supervised matrix M where Mcf is the 4-cell contingency table for feature f and class c. +Efficiency O(nF x nC x log(S)) where S is the sparse factor +""" +def get_supervised_matrix(coocurrence_matrix, label_matrix, n_jobs=-1): + nD, nF = coocurrence_matrix.shape + nD2, nC = label_matrix.shape + + if nD != nD2: + raise ValueError('Number of rows in coocurrence matrix shape %s and label matrix shape %s is not consistent' % + (coocurrence_matrix.shape,label_matrix.shape)) + + def nonzero_set(matrix, col): + return set(matrix[:, col].nonzero()[0]) + + if isinstance(coocurrence_matrix, csr_matrix): + coocurrence_matrix = csc_matrix(coocurrence_matrix) + feature_sets = [nonzero_set(coocurrence_matrix, f) for f in range(nF)] + category_sets = [nonzero_set(label_matrix, c) for c in range(nC)] + cell_matrix = Parallel(n_jobs=n_jobs, backend="threading")(delayed(category_tables)(feature_sets, category_sets, c, nD, nF) for c in range(nC)) + return np.array(cell_matrix) + +# obtains the matrix T where Tcf=tsr(f,c) is the tsr score for category c and feature f +def get_tsr_matrix(cell_matrix, tsr_score_funtion): + nC,nF = cell_matrix.shape + tsr_matrix = [[tsr_score_funtion(cell_matrix[c,f]) for f in range(nF)] for c in range(nC)] + return np.array(tsr_matrix) + + +""" The Fisher-score [1] is not computed on the 4-cell contingency table, but can +take as input any real-valued feature column (e.g., tf-idf weights). +feat is the feature vector, and c is a binary classification vector. +This implementation covers only the binary case, while the formula is defined for multiclass +single-label scenarios, for which the version [2] might be preferred. +[1] R.O. Duda, P.E. Hart, and D.G. Stork. Pattern classification. Wiley-interscience, 2012. +[2] Gu, Q., Li, Z., & Han, J. (2012). Generalized fisher score for feature selection. arXiv preprint arXiv:1202.3725. 
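+As implemented below, the binary score reduces to
+    (n_pos*(mu_pos - mu)^2 + n_neg*(mu_neg - mu)^2) / (n_pos*std_pos^2 + n_neg*std_neg^2)
+where means and standard deviations are taken over the positive and the negative
+documents respectively; the numerator alone is returned when the denominator is zero.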
+""" +def fisher_score_binary(feat, c): + neg = np.ones_like(c) - c + + npos = np.sum(c) + nneg = np.sum(neg) + + mupos = np.mean(feat[c == 1]) + muneg = np.mean(feat[neg == 1]) + mu = np.mean(feat) + + stdpos = np.std(feat[c == 1]) + stdneg = np.std(feat[neg == 1]) + + num = npos * ((mupos - mu) ** 2) + nneg * ((muneg - mu) ** 2) + den = npos * (stdpos ** 2) + nneg * (stdneg ** 2) + + if den>0: + return num / den + else: + return num diff --git a/refactor/debug_notebook.ipynb b/refactor/debug_notebook.ipynb new file mode 100644 index 0000000..f574694 --- /dev/null +++ b/refactor/debug_notebook.ipynb @@ -0,0 +1,36 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 2 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython2", + "version": "2.7.6" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} \ No newline at end of file diff --git a/refactor/devel_ideas.py b/refactor/devel_ideas.py new file mode 100644 index 0000000..bf5690a --- /dev/null +++ b/refactor/devel_ideas.py @@ -0,0 +1,95 @@ +class CustomMetrics(Metric): + def __init__( + self, + num_classes: int, + beta: float = 1.0, + threshold: float = 0.5, + average: str = "micro", + multilabel: bool = False, + compute_on_step: bool = True, + dist_sync_on_step: bool = False, + process_group: Optional[Any] = None, + ): + super().__init__( + compute_on_step=compute_on_step, dist_sync_on_step=dist_sync_on_step, process_group=process_group, + ) + + self.num_classes = num_classes + self.beta = beta + self.threshold = threshold + self.average = average + self.multilabel = multilabel + + allowed_average = ("micro", "macro", "weighted", None) + if self.average not in allowed_average: + raise ValueError('Argument `average` expected to be one of the following:' + f' {allowed_average} but got {self.average}') + + self.add_state("true_positives", default=torch.zeros(num_classes), dist_reduce_fx="sum") + self.add_state("predicted_positives", default=torch.zeros(num_classes), dist_reduce_fx="sum") + self.add_state("actual_positives", default=torch.zeros(num_classes), dist_reduce_fx="sum") + + def update(self, preds: torch.Tensor, target: torch.Tensor): + """ + Update state with predictions and targets. + + Args: + preds: Predictions from model + target: Ground truth values + """ + true_positives, predicted_positives, actual_positives = _fbeta_update( + preds, target, self.num_classes, self.threshold, self.multilabel + ) + + self.true_positives += true_positives + self.predicted_positives += predicted_positives + self.actual_positives += actual_positives + + def compute(self): + """ + Computes metrics over state. 
+ """ + return _fbeta_compute(self.true_positives, self.predicted_positives, + self.actual_positives, self.beta, self.average) + + +def _fbeta_update( + preds: torch.Tensor, + target: torch.Tensor, + num_classes: int, + threshold: float = 0.5, + multilabel: bool = False +) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + preds, target = _input_format_classification_one_hot( + num_classes, preds, target, threshold, multilabel + ) + true_positives = torch.sum(preds * target, dim=1) + predicted_positives = torch.sum(preds, dim=1) + actual_positives = torch.sum(target, dim=1) + return true_positives, predicted_positives, actual_positives + + +def _fbeta_compute( + true_positives: torch.Tensor, + predicted_positives: torch.Tensor, + actual_positives: torch.Tensor, + beta: float = 1.0, + average: str = "micro" +) -> torch.Tensor: + if average == "micro": + precision = true_positives.sum().float() / predicted_positives.sum() + recall = true_positives.sum().float() / actual_positives.sum() + else: + precision = true_positives.float() / predicted_positives + recall = true_positives.float() / actual_positives + + num = (1 + beta ** 2) * precision * recall + denom = beta ** 2 * precision + recall + new_num = 2 * true_positives + new_fp = predicted_positives - true_positives + new_fn = actual_positives - true_positives + new_den = 2 * true_positives + new_fp + new_fn + if new_den.sum() == 0: + # whats is the correct return type ? TODO + return 1. + return class_reduce(num, denom, weights=actual_positives, class_reduction=average) diff --git a/refactor/main.py b/refactor/main.py new file mode 100644 index 0000000..76c5e54 --- /dev/null +++ b/refactor/main.py @@ -0,0 +1,47 @@ +from argparse import ArgumentParser +from util.embeddings_manager import MuseLoader +from view_generators import RecurrentGen, BertGen +from data.dataset_builder import MultilingualDataset +from util.common import MultilingualIndex + + +def main(args): + N_JOBS = 8 + print('Running...') + + # _DATASET = '/homenfs/a.pedrotti1/datasets/CLESA/rcv2/rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle' + # EMBEDDINGS_PATH = '/homenfs/a.pedrotti1/embeddings/MUSE' + + _DATASET = '/home/moreo/CLESA/rcv2/rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle' + EMBEDDINGS_PATH = '/home/andreapdr/funneling_pdr/embeddings' + data = MultilingualDataset.load(_DATASET) + # data.set_view(languages=['it']) + lX, ly = data.training() + lXte, lyte = data.test() + + # Init multilingualIndex - mandatory when deploying Neural View Generators... 
+ multilingualIndex = MultilingualIndex() + # lMuse = MuseLoader(langs=sorted(lX.keys()), cache=) + lMuse = MuseLoader(langs=sorted(lX.keys()), cache=EMBEDDINGS_PATH) + multilingualIndex.index(lX, ly, lXte, l_pretrained_vocabulary=lMuse.vocabulary()) + + # gFun = VanillaFunGen(base_learner=get_learner(calibrate=True), n_jobs=N_JOBS) + # gFun = MuseGen(muse_dir='/home/andreapdr/funneling_pdr/embeddings', n_jobs=N_JOBS) + # gFun = WordClassGen(n_jobs=N_JOBS) + gFun = RecurrentGen(multilingualIndex, pretrained_embeddings=lMuse, wce=False, gpus=args.gpus, n_jobs=N_JOBS, + stored_path='/home/andreapdr/gfun_refactor/tb_logs/gfun_rnn_dev/version_19/checkpoints/epoch=0-step=14.ckpt') + # gFun = BertGen(multilingualIndex, gpus=args.gpus, batch_size=128, n_jobs=N_JOBS) + + gFun.fit(lX, ly) + + # print('Projecting...') + # y_ = gFun.transform(lX) + + exit('Executed!') + + +if __name__ == '__main__': + parser = ArgumentParser() + parser.add_argument('--gpus', default=None) + args = parser.parse_args() + main(args) diff --git a/refactor/models/helpers.py b/refactor/models/helpers.py new file mode 100755 index 0000000..93e5805 --- /dev/null +++ b/refactor/models/helpers.py @@ -0,0 +1,47 @@ +import torch +import torch.nn as nn +from torch.nn import functional as F + + + +def init_embeddings(pretrained, vocab_size, learnable_length, device='cuda'): + pretrained_embeddings = None + pretrained_length = 0 + if pretrained is not None: + pretrained_length = pretrained.shape[1] + assert pretrained.shape[0] == vocab_size, 'pre-trained matrix does not match with the vocabulary size' + pretrained_embeddings = nn.Embedding(vocab_size, pretrained_length) + pretrained_embeddings.weight = nn.Parameter(pretrained, requires_grad=False) + # pretrained_embeddings.to(device) + + learnable_embeddings = None + if learnable_length > 0: + learnable_embeddings = nn.Embedding(vocab_size, learnable_length) + # learnable_embeddings.to(device) + + embedding_length = learnable_length + pretrained_length + assert embedding_length > 0, '0-size embeddings' + + return pretrained_embeddings, learnable_embeddings, embedding_length + + +def embed(model, input, lang): + input_list = [] + if model.lpretrained_embeddings[lang]: + input_list.append(model.lpretrained_embeddings[lang](input)) + if model.llearnable_embeddings[lang]: + input_list.append(model.llearnable_embeddings[lang](input)) + return torch.cat(tensors=input_list, dim=2) + + +def embedding_dropout(input, drop_range, p_drop=0.5, training=True): + if p_drop > 0 and training and drop_range is not None: + p = p_drop + drop_from, drop_to = drop_range + m = drop_to - drop_from #length of the supervised embedding + l = input.shape[2] #total embedding length + corr = (1 - p) + input[:, :, drop_from:drop_to] = corr * F.dropout(input[:, :, drop_from:drop_to], p=p) + input /= (1 - (p * m / l)) + + return input diff --git a/refactor/models/learners.py b/refactor/models/learners.py new file mode 100644 index 0000000..fcd4249 --- /dev/null +++ b/refactor/models/learners.py @@ -0,0 +1,185 @@ +import numpy as np +import time +from scipy.sparse import issparse +from sklearn.multiclass import OneVsRestClassifier +from sklearn.model_selection import GridSearchCV +from sklearn.svm import SVC +from joblib import Parallel, delayed + + +def get_learner(calibrate=False, kernel='linear', C=1): + """ + instantiate scikit Support Vector Classifier + :param calibrate: boolean, whether to return posterior probabilities or not + :param kernel: string,kernel to be applied to the SVC + :param C: int or dict 
{'C': list of integer}, Regularization parameter + :return: Support Vector Classifier + """ + return SVC(kernel=kernel, probability=calibrate, cache_size=1000, C=C, random_state=1, gamma='auto', verbose=False) + + +def _sort_if_sparse(X): + if issparse(X) and not X.has_sorted_indices: + X.sort_indices() + + +def _joblib_transform_multiling(transformer, lX, n_jobs=-1): + if n_jobs == 1: + return {lang: transformer(lX[lang]) for lang in lX.keys()} + else: + langs = list(lX.keys()) + transformations = Parallel(n_jobs=n_jobs)(delayed(transformer)(lX[lang]) for lang in langs) + return {lang: transformations[i] for i, lang in enumerate(langs)} + + +class TrivialRejector: + def fit(self, X, y): + self.cats = y.shape[1] + return self + + def decision_function(self, X): return np.zeros((X.shape[0], self.cats)) + + def predict(self, X): return np.zeros((X.shape[0], self.cats)) + + def predict_proba(self, X): return np.zeros((X.shape[0], self.cats)) + + def best_params(self): return {} + + +class NaivePolylingualClassifier: + """ + Is a mere set of independet MonolingualClassifiers + """ + + def __init__(self, base_learner, parameters=None, n_jobs=-1): + self.base_learner = base_learner + self.parameters = parameters + self.model = None + self.n_jobs = n_jobs + + def fit(self, lX, ly): + """ + trains the independent monolingual classifiers + :param lX: a dictionary {language_label: X csr-matrix} + :param ly: a dictionary {language_label: y np.array} + :return: self + """ + tinit = time.time() + assert set(lX.keys()) == set(ly.keys()), 'inconsistent language mappings in fit' + langs = list(lX.keys()) + for lang in langs: + _sort_if_sparse(lX[lang]) + + models = Parallel(n_jobs=self.n_jobs)\ + (delayed(MonolingualClassifier(self.base_learner, parameters=self.parameters).fit)((lX[lang]), ly[lang]) for + lang in langs) + + self.model = {lang: models[i] for i, lang in enumerate(langs)} + self.empty_categories = {lang: self.model[lang].empty_categories for lang in langs} + self.time = time.time() - tinit + return self + + def decision_function(self, lX): + """ + :param lX: a dictionary {language_label: X csr-matrix} + :return: a dictionary of classification scores for each class + """ + assert self.model is not None, 'predict called before fit' + assert set(lX.keys()).issubset(set(self.model.keys())), 'unknown languages requested in decision function' + langs = list(lX.keys()) + scores = Parallel(n_jobs=self.n_jobs)(delayed(self.model[lang].decision_function)(lX[lang]) for lang in langs) + return {lang: scores[i] for i, lang in enumerate(langs)} + + def predict_proba(self, lX): + """ + :param lX: a dictionary {language_label: X csr-matrix} + :return: a dictionary of probabilities that each document belongs to each class + """ + assert self.model is not None, 'predict called before fit' + assert set(lX.keys()).issubset(set(self.model.keys())), 'unknown languages requested in decision function' + langs = list(lX.keys()) + scores = Parallel(n_jobs=self.n_jobs, max_nbytes=None)( + delayed(self.model[lang].predict_proba)(lX[lang]) for lang in langs) + return {lang: scores[i] for i, lang in enumerate(langs)} + + def predict(self, lX): + """ + :param lX: a dictionary {language_label: X csr-matrix} + :return: a dictionary of predictions + """ + assert self.model is not None, 'predict called before fit' + assert set(lX.keys()).issubset(set(self.model.keys())), 'unknown languages requested in predict' + if self.n_jobs == 1: + return {lang: self.model[lang].transform(lX[lang]) for lang in lX.keys()} + else: + langs = 
list(lX.keys()) + scores = Parallel(n_jobs=self.n_jobs)(delayed(self.model[lang].predict)(lX[lang]) for lang in langs) + return {lang: scores[i] for i, lang in enumerate(langs)} + + def best_params(self): + return {lang: model.best_params() for lang, model in self.model.items()} + + +class MonolingualClassifier: + + def __init__(self, base_learner, parameters=None, n_jobs=-1): + self.learner = base_learner + self.parameters = parameters + self.model = None + self.n_jobs = n_jobs + self.best_params_ = None + + def fit(self, X, y): + if X.shape[0] == 0: + print('Warning: X has 0 elements, a trivial rejector will be created') + self.model = TrivialRejector().fit(X, y) + self.empty_categories = np.arange(y.shape[1]) + return self + + tinit = time.time() + _sort_if_sparse(X) + self.empty_categories = np.argwhere(np.sum(y, axis=0) == 0).flatten() + # multi-class format + if len(y.shape) == 2: + if self.parameters is not None: + self.parameters = [{'estimator__' + key: params[key] for key in params.keys()} + for params in self.parameters] + self.model = OneVsRestClassifier(self.learner, n_jobs=self.n_jobs) + else: + self.model = self.learner + raise NotImplementedError('not working as a base-classifier for funneling if there are gaps in ' + 'the labels across languages') + + # parameter optimization? + if self.parameters: + print('debug: optimizing parameters:', self.parameters) + self.model = GridSearchCV(self.model, param_grid=self.parameters, refit=True, cv=5, n_jobs=self.n_jobs, + error_score=0, verbose=10) + + # print(f'fitting: {self.model} on matrices of shape X={X.shape} Y={y.shape}') + print(f'fitting: Mono-lingual Classifier on matrices of shape X={X.shape} Y={y.shape}') + self.model.fit(X, y) + if isinstance(self.model, GridSearchCV): + self.best_params_ = self.model.best_params_ + print('best parameters: ', self.best_params_) + self.time = time.time() - tinit + return self + + def decision_function(self, X): + assert self.model is not None, 'predict called before fit' + _sort_if_sparse(X) + return self.model.decision_function(X) + + def predict_proba(self, X): + assert self.model is not None, 'predict called before fit' + assert hasattr(self.model, 'predict_proba'), 'the probability predictions are not enabled in this model' + _sort_if_sparse(X) + return self.model.predict_proba(X) + + def predict(self, X): + assert self.model is not None, 'predict called before fit' + _sort_if_sparse(X) + return self.model.predict(X) + + def best_params(self): + return self.best_params_ diff --git a/refactor/models/lstm_class.py b/refactor/models/lstm_class.py new file mode 100755 index 0000000..98424f1 --- /dev/null +++ b/refactor/models/lstm_class.py @@ -0,0 +1,114 @@ +#taken from https://github.com/prakashpandey9/Text-Classification-Pytorch/blob/master/models/LSTM.py +import torch +import torch.nn as nn +from torch.autograd import Variable +from models.helpers import * + + +class RNNMultilingualClassifier(nn.Module): + + def __init__(self, output_size, hidden_size, lvocab_size, learnable_length, lpretrained=None, + drop_embedding_range=None, drop_embedding_prop=0, post_probabilities=True, only_post=False, + bert_embeddings=False): + + super(RNNMultilingualClassifier, self).__init__() + self.output_size = output_size + self.hidden_size = hidden_size + self.drop_embedding_range = drop_embedding_range + self.drop_embedding_prop = drop_embedding_prop + self.post_probabilities = post_probabilities + self.bert_embeddings = bert_embeddings + assert 0 <= drop_embedding_prop <= 1, 'drop_embedding_prop: 
wrong range' + + self.lpretrained_embeddings = nn.ModuleDict() + self.llearnable_embeddings = nn.ModuleDict() + self.embedding_length = None + self.langs = sorted(lvocab_size.keys()) + self.only_post = only_post + + self.n_layers = 1 + self.n_directions = 1 + + self.dropout = nn.Dropout(0.6) + + lstm_out = 256 + ff1 = 512 + ff2 = 256 + + lpretrained_embeddings = {} + llearnable_embeddings = {} + if only_post==False: + for l in self.langs: + pretrained = lpretrained[l] if lpretrained else None + pretrained_embeddings, learnable_embeddings, embedding_length = init_embeddings( + pretrained, lvocab_size[l], learnable_length + ) + lpretrained_embeddings[l] = pretrained_embeddings + llearnable_embeddings[l] = learnable_embeddings + self.embedding_length = embedding_length + + # self.lstm = nn.LSTM(self.embedding_length, hidden_size, dropout=0.2 if self.n_layers>1 else 0, num_layers=self.n_layers, bidirectional=(self.n_directions==2)) + self.rnn = nn.GRU(self.embedding_length, hidden_size) + self.linear0 = nn.Linear(hidden_size * self.n_directions, lstm_out) + self.lpretrained_embeddings.update(lpretrained_embeddings) + self.llearnable_embeddings.update(llearnable_embeddings) + + self.linear1 = nn.Linear(lstm_out, ff1) + self.linear2 = nn.Linear(ff1, ff2) + + if only_post: + self.label = nn.Linear(output_size, output_size) + elif post_probabilities and not bert_embeddings: + self.label = nn.Linear(ff2 + output_size, output_size) + elif bert_embeddings and not post_probabilities: + self.label = nn.Linear(ff2 + 768, output_size) + elif post_probabilities and bert_embeddings: + self.label = nn.Linear(ff2 + output_size + 768, output_size) + else: + self.label = nn.Linear(ff2, output_size) + + def forward(self, input, post, bert_embed, lang): + if self.only_post: + doc_embedding = post + else: + doc_embedding = self.transform(input, lang) + if self.post_probabilities: + doc_embedding = torch.cat([doc_embedding, post], dim=1) + if self.bert_embeddings: + doc_embedding = torch.cat([doc_embedding, bert_embed], dim=1) + + logits = self.label(doc_embedding) + return logits + + def transform(self, input, lang): + batch_size = input.shape[0] + input = embed(self, input, lang) + input = embedding_dropout(input, drop_range=self.drop_embedding_range, p_drop=self.drop_embedding_prop, + training=self.training) + input = input.permute(1, 0, 2) + h_0 = Variable(torch.zeros(self.n_layers*self.n_directions, batch_size, self.hidden_size).cuda()) + # c_0 = Variable(torch.zeros(self.n_layers*self.n_directions, batch_size, self.hidden_size).cuda()) + # output, (_, _) = self.lstm(input, (h_0, c_0)) + output, _ = self.rnn(input, h_0) + output = output[-1, :, :] + output = F.relu(self.linear0(output)) + output = self.dropout(F.relu(self.linear1(output))) + output = self.dropout(F.relu(self.linear2(output))) + return output + + def finetune_pretrained(self): + for l in self.langs: + self.lpretrained_embeddings[l].requires_grad = True + self.lpretrained_embeddings[l].weight.requires_grad = True + + def get_embeddings(self, input, lang): + batch_size = input.shape[0] + input = embed(self, input, lang) + input = embedding_dropout(input, drop_range=self.drop_embedding_range, p_drop=self.drop_embedding_prop, + training=self.training) + input = input.permute(1, 0, 2) + h_0 = Variable(torch.zeros(self.n_layers * self.n_directions, batch_size, self.hidden_size).cuda()) + output, _ = self.rnn(input, h_0) + output = output[-1, :, :] + return output.cpu().detach().numpy() + diff --git a/refactor/models/pl_bert.py 
b/refactor/models/pl_bert.py new file mode 100644 index 0000000..4561004 --- /dev/null +++ b/refactor/models/pl_bert.py @@ -0,0 +1,64 @@ +import torch +import pytorch_lightning as pl +from torch.optim.lr_scheduler import StepLR +from transformers import BertForSequenceClassification, BertTokenizer, AdamW, BertConfig +from pytorch_lightning.metrics import F1, Accuracy, Metric + + +class BertModel(pl.LightningModule): + + def __init__(self, output_size, stored_path): + super().__init__() + self.loss = torch.nn.BCEWithLogitsLoss() + if stored_path: + self.bert = BertForSequenceClassification.from_pretrained(stored_path, + num_labels=output_size, + output_hidden_states=True) + else: + self.bert = BertForSequenceClassification.from_pretrained('bert-base-multilingual-cased', + num_labels=output_size, + output_hidden_states=True) + self.accuracy = Accuracy() + self.save_hyperparameters() + + def forward(self, X): + logits = self.bert(X) + return logits + + def training_step(self, train_batch, batch_idx): + X, y, _, batch_langs = train_batch + X = torch.cat(X).view([X[0].shape[0], len(X)]) + y = y.type(torch.cuda.FloatTensor) + logits, _ = self.forward(X) + loss = self.loss(logits, y) + predictions = torch.sigmoid(logits) > 0.5 + accuracy = self.accuracy(predictions, y) + self.log('train-loss', loss, on_step=True, on_epoch=True, prog_bar=True, logger=True) + self.log('train-accuracy', accuracy, on_step=True, on_epoch=True, prog_bar=False, logger=True) + return loss + + def validation_step(self, val_batch, batch_idx): + X, y, _, batch_langs = val_batch + X = torch.cat(X).view([X[0].shape[0], len(X)]) + y = y.type(torch.cuda.FloatTensor) + logits, _ = self.forward(X) + loss = self.loss(logits, y) + predictions = torch.sigmoid(logits) > 0.5 + accuracy = self.accuracy(predictions, y) + self.log('val-loss', loss, on_step=True, on_epoch=True, prog_bar=True, logger=True) + self.log('val-accuracy', accuracy, on_step=True, on_epoch=True, prog_bar=False, logger=True) + return + + def configure_optimizers(self, lr=3e-5, weight_decay=0.01): + no_decay = ['bias', 'LayerNorm.weight'] + optimizer_grouped_parameters = [ + {'params': [p for n, p in self.bert.named_parameters() + if not any(nd in n for nd in no_decay)], + 'weight_decay': weight_decay}, + {'params': [p for n, p in self.bert.named_parameters() + if any(nd in n for nd in no_decay)], + 'weight_decay': weight_decay} + ] + optimizer = AdamW(optimizer_grouped_parameters, lr=lr) + scheduler = StepLR(optimizer, step_size=25, gamma=0.1) + return [optimizer], [scheduler] diff --git a/refactor/models/pl_gru.py b/refactor/models/pl_gru.py new file mode 100644 index 0000000..7987156 --- /dev/null +++ b/refactor/models/pl_gru.py @@ -0,0 +1,312 @@ +import torch +from torch import nn +from torch.optim import Adam +from transformers import AdamW +import torch.nn.functional as F +from torch.autograd import Variable +import pytorch_lightning as pl +from pytorch_lightning.metrics import F1, Accuracy, Metric +from torch.optim.lr_scheduler import StepLR + +from util.evaluation import evaluate +from typing import Any, Optional, Tuple +from pytorch_lightning.metrics.utils import _input_format_classification_one_hot, class_reduce +import numpy as np + + +def init_embeddings(pretrained, vocab_size, learnable_length): + """ + Compute the embedding matrix + :param pretrained: + :param vocab_size: + :param learnable_length: + :return: + """ + pretrained_embeddings = None + pretrained_length = 0 + if pretrained is not None: + pretrained_length = pretrained.shape[1] + assert 
pretrained.shape[0] == vocab_size, 'pre-trained matrix does not match with the vocabulary size' + pretrained_embeddings = nn.Embedding(vocab_size, pretrained_length) + # requires_grad=False sets the embedding layer as NOT trainable + pretrained_embeddings.weight = nn.Parameter(pretrained, requires_grad=False) + + learnable_embeddings = None + if learnable_length > 0: + learnable_embeddings = nn.Embedding(vocab_size, learnable_length) + + embedding_length = learnable_length + pretrained_length + assert embedding_length > 0, '0-size embeddings' + return pretrained_embeddings, learnable_embeddings, embedding_length + + +class RecurrentModel(pl.LightningModule): + """ + Check out for logging insight https://www.learnopencv.com/tensorboard-with-pytorch-lightning/ + """ + + def __init__(self, lPretrained, langs, output_size, hidden_size, lVocab_size, learnable_length, + drop_embedding_range, drop_embedding_prop, lMuse_debug=None, multilingual_index_debug=None): + super().__init__() + self.langs = langs + self.lVocab_size = lVocab_size + self.learnable_length = learnable_length + self.output_size = output_size + self.hidden_size = hidden_size + self.drop_embedding_range = drop_embedding_range + self.drop_embedding_prop = drop_embedding_prop + self.loss = torch.nn.BCEWithLogitsLoss() + self.microf1 = F1(num_classes=output_size, multilabel=True, average='micro') + self.macrof1 = F1(num_classes=output_size, multilabel=True, average='macro') + self.accuracy = Accuracy() + self.customMetrics = CustomMetrics(num_classes=output_size, multilabel=True, average='micro') + + self.lPretrained_embeddings = nn.ModuleDict() + self.lLearnable_embeddings = nn.ModuleDict() + + self.n_layers = 1 + self.n_directions = 1 + self.dropout = nn.Dropout(0.6) + + # TODO: debug setting + self.lMuse = lMuse_debug + self.multilingual_index_debug = multilingual_index_debug + + lstm_out = 256 + ff1 = 512 + ff2 = 256 + + lpretrained_embeddings = {} + llearnable_embeddings = {} + + for lang in self.langs: + pretrained = lPretrained[lang] if lPretrained else None + pretrained_embeddings, learnable_embeddings, embedding_length = init_embeddings( + pretrained, self.lVocab_size[lang], self.learnable_length) + lpretrained_embeddings[lang] = pretrained_embeddings + llearnable_embeddings[lang] = learnable_embeddings + self.embedding_length = embedding_length + + self.lPretrained_embeddings.update(lpretrained_embeddings) + self.lLearnable_embeddings.update(llearnable_embeddings) + + self.rnn = nn.GRU(self.embedding_length, hidden_size) + self.linear0 = nn.Linear(hidden_size * self.n_directions, lstm_out) + self.linear1 = nn.Linear(lstm_out, ff1) + self.linear2 = nn.Linear(ff1, ff2) + self.label = nn.Linear(ff2, self.output_size) + + lPretrained = None # TODO: setting lPretrained to None, letting it to its original value will bug first + # validation step (i.e., checkpoint will store also its ++ value, I guess, making the saving process too slow) + self.save_hyperparameters() + + def forward(self, lX): + _tmp = [] + for lang in sorted(lX.keys()): + doc_embedding = self.transform(lX[lang], lang) + _tmp.append(doc_embedding) + embed = torch.cat(_tmp, dim=0) + logits = self.label(embed) + return logits + + def transform(self, X, lang): + batch_size = X.shape[0] + X = self.embed(X, lang) + X = self.embedding_dropout(X, drop_range=self.drop_embedding_range, p_drop=self.drop_embedding_prop, + training=self.training) + X = X.permute(1, 0, 2) + h_0 = Variable(torch.zeros(self.n_layers * self.n_directions, batch_size, 
self.hidden_size).to(self.device)) + output, _ = self.rnn(X, h_0) + output = output[-1, :, :] + output = F.relu(self.linear0(output)) + output = self.dropout(F.relu(self.linear1(output))) + output = self.dropout(F.relu(self.linear2(output))) + return output + + def training_step(self, train_batch, batch_idx): + # TODO: double check StepLR scheduler... + lX, ly = train_batch + logits = self.forward(lX) + _ly = [] + for lang in sorted(lX.keys()): + _ly.append(ly[lang]) + ly = torch.cat(_ly, dim=0) + loss = self.loss(logits, ly) + + # Squashing logits through Sigmoid in order to get confidence score + predictions = torch.sigmoid(logits) > 0.5 + + # microf1 = self.microf1(predictions, ly) + # macrof1 = self.macrof1(predictions, ly) + accuracy = self.accuracy(predictions, ly) + # l_pred = {lang: predictions.detach().cpu().numpy()} + # l_labels = {lang: ly.detach().cpu().numpy()} + # l_eval = evaluate(l_labels, l_pred, n_jobs=1) + + self.log('train-loss', loss, on_step=True, on_epoch=True, prog_bar=True, logger=True) + self.log('train-accuracy', accuracy, on_step=True, on_epoch=True, prog_bar=False, logger=True) + return loss + + def validation_step(self, val_batch, batch_idx): + lX, ly = val_batch + logits = self.forward(lX) + _ly = [] + for lang in sorted(lX.keys()): + _ly.append(ly[lang]) + ly = torch.cat(_ly, dim=0) + loss = self.loss(logits, ly) + predictions = torch.sigmoid(logits) > 0.5 + # microf1 = self.microf1(predictions, ly) + # macrof1 = self.macrof1(predictions, ly) + accuracy = self.accuracy(predictions, ly) + + # l_pred = {lang: predictions.detach().cpu().numpy()} + # l_labels = {lang: y.detach().cpu().numpy()} + # l_eval = evaluate(l_labels, l_pred, n_jobs=1) + + self.log('val-loss', loss, on_step=True, on_epoch=True, prog_bar=True, logger=True) + self.log('val-accuracy', accuracy, on_step=True, on_epoch=True, prog_bar=False, logger=True) + return + + def test_step(self, test_batch, batch_idx): + lX, ly = test_batch + logits = self.forward(lX) + _ly = [] + for lang in sorted(lX.keys()): + _ly.append(ly[lang]) + ly = torch.cat(_ly, dim=0) + predictions = torch.sigmoid(logits) > 0.5 + accuracy = self.accuracy(predictions, ly) + custom_metric = self.customMetrics(logits, ly) # TODO + self.log('test-accuracy', accuracy, on_step=False, on_epoch=True, prog_bar=False, logger=True) + self.log('test-custom', custom_metric, on_step=False, on_epoch=True, prog_bar=False, logger=True) + return {'pred': predictions, 'target': ly} + + def test_epoch_end(self, outputs): + # all_pred = torch.vstack([out['pred'] for out in outputs]) # TODO + # all_y = torch.vstack([out['target'] for out in outputs]) # TODO + # r = eval(all_y, all_pred) + # print(r) + # X = torch.cat(X).view([X[0].shape[0], len(X)]) + return + + def embed(self, X, lang): + input_list = [] + if self.lPretrained_embeddings[lang]: + input_list.append(self.lPretrained_embeddings[lang](X)) + if self.lLearnable_embeddings[lang]: + input_list.append(self.lLearnable_embeddings[lang](X)) + return torch.cat(tensors=input_list, dim=2) + + def embedding_dropout(self, X, drop_range, p_drop=0.5, training=True): + if p_drop > 0 and training and drop_range is not None: + p = p_drop + drop_from, drop_to = drop_range + m = drop_to - drop_from # length of the supervised embedding + l = X.shape[2] # total embedding length + corr = (1 - p) + X[:, :, drop_from:drop_to] = corr * F.dropout(X[:, :, drop_from:drop_to], p=p) + X /= (1 - (p * m / l)) + return X + + def configure_optimizers(self): + optimizer = AdamW(self.parameters(), lr=1e-3) + scheduler = 
StepLR(optimizer, step_size=25, gamma=0.5) + return [optimizer], [scheduler] + + +class CustomMetrics(Metric): + def __init__( + self, + num_classes: int, + beta: float = 1.0, + threshold: float = 0.5, + average: str = "micro", + multilabel: bool = False, + compute_on_step: bool = True, + dist_sync_on_step: bool = False, + process_group: Optional[Any] = None, + ): + super().__init__( + compute_on_step=compute_on_step, dist_sync_on_step=dist_sync_on_step, process_group=process_group, + ) + + self.num_classes = num_classes + self.beta = beta + self.threshold = threshold + self.average = average + self.multilabel = multilabel + + allowed_average = ("micro", "macro", "weighted", None) + if self.average not in allowed_average: + raise ValueError('Argument `average` expected to be one of the following:' + f' {allowed_average} but got {self.average}') + + self.add_state("true_positives", default=torch.zeros(num_classes), dist_reduce_fx="sum") + self.add_state("predicted_positives", default=torch.zeros(num_classes), dist_reduce_fx="sum") + self.add_state("actual_positives", default=torch.zeros(num_classes), dist_reduce_fx="sum") + + def update(self, preds: torch.Tensor, target: torch.Tensor): + """ + Update state with predictions and targets. + + Args: + preds: Predictions from model + target: Ground truth values + """ + true_positives, predicted_positives, actual_positives = _fbeta_update( + preds, target, self.num_classes, self.threshold, self.multilabel + ) + + self.true_positives += true_positives + self.predicted_positives += predicted_positives + self.actual_positives += actual_positives + + def compute(self): + """ + Computes metrics over state. + """ + return _fbeta_compute(self.true_positives, self.predicted_positives, + self.actual_positives, self.beta, self.average) + + +def _fbeta_update( + preds: torch.Tensor, + target: torch.Tensor, + num_classes: int, + threshold: float = 0.5, + multilabel: bool = False +) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + preds, target = _input_format_classification_one_hot( + num_classes, preds, target, threshold, multilabel + ) + true_positives = torch.sum(preds * target, dim=1) + predicted_positives = torch.sum(preds, dim=1) + actual_positives = torch.sum(target, dim=1) + return true_positives, predicted_positives, actual_positives + + +def _fbeta_compute( + true_positives: torch.Tensor, + predicted_positives: torch.Tensor, + actual_positives: torch.Tensor, + beta: float = 1.0, + average: str = "micro" +) -> torch.Tensor: + if average == "micro": + precision = true_positives.sum().float() / predicted_positives.sum() + recall = true_positives.sum().float() / actual_positives.sum() + else: + precision = true_positives.float() / predicted_positives + recall = true_positives.float() / actual_positives + + num = (1 + beta ** 2) * precision * recall + denom = beta ** 2 * precision + recall + new_num = 2 * true_positives + new_fp = predicted_positives - true_positives + new_fn = actual_positives - true_positives + new_den = 2 * true_positives + new_fp + new_fn + if new_den.sum() == 0: + # whats is the correct return type ? TODO + return 1. 
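+    # Otherwise defer to Lightning's class_reduce: with average='micro' num and denom
+    # are already pooled scalars, while in the other modes the per-class ratios are
+    # reduced according to `average`, with actual_positives passed as class weights.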
+ return class_reduce(num, denom, weights=actual_positives, class_reduction=average) diff --git a/refactor/run.sh b/refactor/run.sh new file mode 100644 index 0000000..04365f9 --- /dev/null +++ b/refactor/run.sh @@ -0,0 +1,6 @@ +#!/usr/bin/env bash + +for i in {0..10..1} +do + python main.py --gpus 0 +done \ No newline at end of file diff --git a/refactor/util/SIF_embed.py b/refactor/util/SIF_embed.py new file mode 100644 index 0000000..cfe096e --- /dev/null +++ b/refactor/util/SIF_embed.py @@ -0,0 +1,56 @@ +import numpy as np +from sklearn.decomposition import TruncatedSVD + +def get_weighted_average(We, x, w): + """ + Compute the weighted average vectors + :param We: We[i,:] is the vector for word i + :param x: x[i, :] are the indices of the words in sentence i + :param w: w[i, :] are the weights for the words in sentence i + :return: emb[i, :] are the weighted average vector for sentence i + """ + n_samples = x.shape[0] + emb = np.zeros((n_samples, We.shape[1])) + for i in range(n_samples): + emb[i,:] = w[i,:].dot(We[x[i,:],:]) / np.count_nonzero(w[i,:]) + return emb + +def compute_pc(X,npc=1): + """ + Compute the principal components. + :param X: X[i,:] is a data point + :param npc: number of principal components to remove + :return: component_[i,:] is the i-th pc + """ + svd = TruncatedSVD(n_components=npc, n_iter=7, random_state=0) + svd.fit(X) + return svd.components_ + +def remove_pc(X, npc=1): + """ + Remove the projection on the principal components + :param X: X[i,:] is a data point + :param npc: number of principal components to remove + :return: XX[i, :] is the data point after removing its projection + """ + pc = compute_pc(X, npc) + if npc==1: + XX = X - X.dot(pc.transpose()) * pc + else: + XX = X - X.dot(pc.transpose()).dot(pc) + return XX + + +def SIF_embedding(We, x, w, params): + """ + Compute the scores between pairs of sentences using weighted average + removing the projection on the first principal component + :param We: We[i,:] is the vector for word i + :param x: x[i, :] are the indices of the words in the i-th sentence + :param w: w[i, :] are the weights for the words in the i-th sentence + :param params.rmpc: if >0, remove the projections of the sentence embeddings to their first principal component + :return: emb, emb[i, :] is the embedding for sentence i + """ + emb = get_weighted_average(We, x, w) + if params.rmpc > 0: + emb = remove_pc(emb, params.rmpc) + return emb \ No newline at end of file diff --git a/refactor/util/common.py b/refactor/util/common.py new file mode 100644 index 0000000..7792b1c --- /dev/null +++ b/refactor/util/common.py @@ -0,0 +1,322 @@ +import numpy as np +import torch +from tqdm import tqdm +from sklearn.feature_extraction.text import TfidfVectorizer +from sklearn.preprocessing import normalize +from sklearn.model_selection import train_test_split +from util.embeddings_manager import supervised_embeddings_tfidf + + +class TfidfVectorizerMultilingual: + + def __init__(self, **kwargs): + self.kwargs = kwargs + + def fit(self, lX, ly=None): + self.langs = sorted(lX.keys()) + self.vectorizer = {l: TfidfVectorizer(**self.kwargs).fit(lX[l]) for l in self.langs} + return self + + def transform(self, lX): + return {l: self.vectorizer[l].transform(lX[l]) for l in self.langs} + + def fit_transform(self, lX, ly=None): + return self.fit(lX, ly).transform(lX) + + def vocabulary(self, l=None): + if l is None: + return {l: self.vectorizer[l].vocabulary_ for l in self.langs} + else: + return self.vectorizer[l].vocabulary_ + + def get_analyzer(self, 
l=None): + if l is None: + return {l: self.vectorizer[l].build_analyzer() for l in self.langs} + else: + return self.vectorizer[l].build_analyzer() + + +def _normalize(lX, l2=True): + return {lang: normalize(X) for lang, X in lX.items()} if l2 else lX + + +def none_dict(langs): + return {l:None for l in langs} + + +class MultilingualIndex: + def __init__(self): + """ + Class that contains monolingual Indexes + """ + self.l_index = {} + self.l_vectorizer = TfidfVectorizerMultilingual(sublinear_tf=True, use_idf=True) + + def index(self, l_devel_raw, l_devel_target, l_test_raw, l_pretrained_vocabulary=None): + self.langs = sorted(l_devel_raw.keys()) + self.l_vectorizer.fit(l_devel_raw) + l_vocabulary = self.l_vectorizer.vocabulary() + l_analyzer = self.l_vectorizer.get_analyzer() + if l_pretrained_vocabulary is None: + l_pretrained_vocabulary = none_dict(self.langs) + + for lang in self.langs: + # Init monolingual Index + self.l_index[lang] = Index(l_devel_raw[lang], l_devel_target[lang], l_test_raw[lang], lang) + # call to index() function of monolingual Index + self.l_index[lang].index(l_pretrained_vocabulary[lang], l_analyzer[lang], l_vocabulary[lang]) + + def train_val_split(self, val_prop=0.2, max_val=2000, seed=42): + for l,index in self.l_index.items(): + index.train_val_split(val_prop, max_val, seed=seed) + + def embedding_matrices(self, lpretrained, supervised): + """ + Extract from pretrained embeddings words that are found in the training dataset, then for each language + calls the respective monolingual index and build the embedding matrix (if supervised, WCE are concatenated + to the unsupervised vectors). + :param lpretrained: dict {lang : matrix of word-embeddings } + :param supervised: bool, whether to deploy Word-Class Embeddings or not + :return: self + """ + lXtr = self.get_lXtr() if supervised else none_dict(self.langs) + lYtr = self.l_train_target() if supervised else none_dict(self.langs) + lWordList = self.get_wordlist() + lExtracted = lpretrained.extract(lWordList) + for lang, index in self.l_index.items(): + # if supervised concatenate embedding matrices of pretrained unsupervised + # and supervised word-class embeddings + index.compose_embedding_matrix(lExtracted[lang], supervised, lXtr[lang], lYtr[lang]) + self.sup_range = index.wce_range + return self + + def get_wordlist(self): + wordlist = {} + for lang, index in self.l_index.items(): + wordlist[lang] = index.get_word_list() + return wordlist + + def get_raw_lXtr(self): + lXtr_raw = {k:[] for k in self.langs} + lYtr_raw = {k: [] for k in self.langs} + for lang in self.langs: + lXtr_raw[lang] = self.l_index[lang].train_raw + lYtr_raw[lang] = self.l_index[lang].train_raw + return lXtr_raw + + def get_raw_lXva(self): + lXva_raw = {k: [] for k in self.langs} + for lang in self.langs: + lXva_raw[lang] = self.l_index[lang].val_raw + + return lXva_raw + + def get_raw_lXte(self): + lXte_raw = {k: [] for k in self.langs} + for lang in self.langs: + lXte_raw[lang] = self.l_index[lang].test_raw + + return lXte_raw + + def get_lXtr(self): + if not hasattr(self, 'lXtr'): + self.lXtr = self.l_vectorizer.transform({l: index.train_raw for l, index in self.l_index.items()}) + return self.lXtr + + def get_lXva(self): + if not hasattr(self, 'lXva'): + self.lXva = self.l_vectorizer.transform({l: index.val_raw for l, index in self.l_index.items()}) + return self.lXva + + def get_lXte(self): + if not hasattr(self, 'lXte'): + self.lXte = self.l_vectorizer.transform({l: index.test_raw for l, index in self.l_index.items()}) + return 
self.lXte + + def get_target_dim(self): + return self.l_index[self.langs[0]].devel_target.shape[1] + + def l_vocabsize(self): + return {l:index.vocabsize for l,index in self.l_index.items()} + + def l_embeddings(self): + return {l:index.embedding_matrix for l,index in self.l_index.items()} + + def l_pad(self): + return {l: index.pad_index for l, index in self.l_index.items()} + + def l_train_index(self): + return {l: index.train_index for l, index in self.l_index.items()} + + def l_train_raw_index(self): + return {l: index.train_raw for l, index in self.l_index.items()} + + def l_train_target(self): + return {l: index.train_target for l, index in self.l_index.items()} + + def l_val_index(self): + return {l: index.val_index for l, index in self.l_index.items()} + + def l_val_raw_index(self): + return {l: index.val_raw for l, index in self.l_index.items()} + + def l_val_target(self): + return {l: index.val_target for l, index in self.l_index.items()} + + def l_test_index(self): + return {l: index.test_index for l, index in self.l_index.items()} + + def l_test_raw(self): + print('TODO: implement MultilingualIndex method to return RAW test data!') + return NotImplementedError + + def l_devel_index(self): + return {l: index.devel_index for l, index in self.l_index.items()} + + def l_devel_target(self): + return {l: index.devel_target for l, index in self.l_index.items()} + + def l_train(self): + return self.l_train_index(), self.l_train_target() + + def l_val(self): + return self.l_val_index(), self.l_val_target() + + def l_train_raw(self): + return self.l_train_raw_index(), self.l_train_target() + + def l_val_raw(self): + return self.l_val_raw_index(), self.l_val_target() + + def get_l_pad_index(self): + return {l: index.get_pad_index() for l, index in self.l_index.items()} + + +class Index: + def __init__(self, devel_raw, devel_target, test_raw, lang): + """ + Monolingual Index, takes care of tokenizing raw data, converting strings to ids, splitting the data into + training and validation. 
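+
+        A hedged usage sketch for a single language (devel_raw, devel_target, test_raw and the
+        analyzer/vocabulary, normally produced by TfidfVectorizerMultilingual, are assumed given):
+
+            idx = Index(devel_raw, devel_target, test_raw, lang='en')
+            idx.index(pretrained_vocabulary, analyzer, vocabulary)
+            idx.train_val_split(val_prop=0.2, max_val=2000, seed=42)
+            idx.compose_embedding_matrix(pretrained, supervised=False)
+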
+ :param devel_raw: list of strings, list of raw training texts + :param devel_target: + :param test_raw: list of strings, list of raw test texts + :param lang: list, list of languages contained in the dataset + """ + self.lang = lang + self.devel_raw = devel_raw + self.devel_target = devel_target + self.test_raw = test_raw + + def index(self, pretrained_vocabulary, analyzer, vocabulary): + self.word2index = dict(vocabulary) + known_words = set(self.word2index.keys()) + if pretrained_vocabulary is not None: + known_words.update(pretrained_vocabulary) + + self.word2index['UNKTOKEN'] = len(self.word2index) + self.word2index['PADTOKEN'] = len(self.word2index) + self.unk_index = self.word2index['UNKTOKEN'] + self.pad_index = self.word2index['PADTOKEN'] + + # index documents and keep track of test terms outside the development vocabulary that are in Muse (if available) + self.out_of_vocabulary = dict() + self.devel_index = index(self.devel_raw, self.word2index, known_words, analyzer, self.unk_index, self.out_of_vocabulary) + self.test_index = index(self.test_raw, self.word2index, known_words, analyzer, self.unk_index, self.out_of_vocabulary) + + self.vocabsize = len(self.word2index) + len(self.out_of_vocabulary) + + print(f'[indexing complete for lang {self.lang}] vocabulary-size={self.vocabsize}') + + def get_pad_index(self): + return self.pad_index + + def train_val_split(self, val_prop, max_val, seed): + devel = self.devel_index + target = self.devel_target + devel_raw = self.devel_raw + + val_size = int(min(len(devel) * val_prop, max_val)) + + self.train_index, self.val_index, self.train_target, self.val_target, self.train_raw, self.val_raw = \ + train_test_split( + devel, target, devel_raw, test_size=val_size, random_state=seed, shuffle=True) + + print(f'split lang {self.lang}: train={len(self.train_index)} val={len(self.val_index)} test={len(self.test_index)}') + + def get_word_list(self): + def extract_word_list(word2index): + return [w for w, i in sorted(word2index.items(), key=lambda x: x[1])] + + word_list = extract_word_list(self.word2index) + word_list += extract_word_list(self.out_of_vocabulary) + return word_list + + def compose_embedding_matrix(self, pretrained, supervised, Xtr=None, Ytr=None): + print(f'[generating embedding matrix for lang {self.lang}]') + + self.wce_range = None + embedding_parts = [] + + if pretrained is not None: + print('\t[pretrained-matrix]') + embedding_parts.append(pretrained) + del pretrained + + if supervised: + print('\t[supervised-matrix]') + F = supervised_embeddings_tfidf(Xtr, Ytr) + num_missing_rows = self.vocabsize - F.shape[0] + F = np.vstack((F, np.zeros(shape=(num_missing_rows, F.shape[1])))) + F = torch.from_numpy(F).float() + + offset = 0 + if embedding_parts: + offset = embedding_parts[0].shape[1] + self.wce_range = [offset, offset + F.shape[1]] + embedding_parts.append(F) + + self.embedding_matrix = torch.cat(embedding_parts, dim=1) + + print(f'[embedding matrix for lang {self.lang} has shape {self.embedding_matrix.shape}]') + + +def index(data, vocab, known_words, analyzer, unk_index, out_of_vocabulary): + """ + Index (i.e., replaces word strings with numerical indexes) a list of string documents + :param data: list of string documents + :param vocab: a fixed mapping [str]->[int] of words to indexes + :param known_words: a set of known words (e.g., words that, despite not being included in the vocab, can be retained + because they are anyway contained in a pre-trained embedding set that we know in advance) + :param analyzer: the 
preprocessor in charge of transforming the document string into a chain of string words + :param unk_index: the index of the 'unknown token', i.e., a symbol that characterizes all words that we cannot keep + :param out_of_vocabulary: an incremental mapping [str]->[int] of words to indexes that will index all those words that + are not in the original vocab but that are in the known_words + :return: + """ + indexes=[] + vocabsize = len(vocab) + unk_count = 0 + knw_count = 0 + out_count = 0 + pbar = tqdm(data, desc=f'indexing') + for text in pbar: + words = analyzer(text) + index = [] + for word in words: + if word in vocab: + idx = vocab[word] + else: + if word in known_words: + if word not in out_of_vocabulary: + out_of_vocabulary[word] = vocabsize+len(out_of_vocabulary) + idx = out_of_vocabulary[word] + out_count += 1 + else: + idx = unk_index + unk_count += 1 + index.append(idx) + indexes.append(index) + knw_count += len(index) + # pbar.set_description(f'[unk = {unk_count}/{knw_count}={(100.*unk_count/knw_count):.2f}%]' + # f'[out = {out_count}/{knw_count}={(100.*out_count/knw_count):.2f}%]') + return indexes diff --git a/refactor/util/embeddings_manager.py b/refactor/util/embeddings_manager.py new file mode 100644 index 0000000..c0aca54 --- /dev/null +++ b/refactor/util/embeddings_manager.py @@ -0,0 +1,102 @@ +from torchtext.vocab import Vectors +import torch +from abc import ABC, abstractmethod +import numpy as np +from util.SIF_embed import remove_pc + + +class PretrainedEmbeddings(ABC): + + def __init__(self): + super().__init__() + + @abstractmethod + def vocabulary(self): pass + + @abstractmethod + def dim(self): pass + + @classmethod + def reindex(cls, words, word2index): + if isinstance(words, dict): + words = list(zip(*sorted(words.items(), key=lambda x: x[1])))[0] + + source_idx, target_idx = [], [] + for i, word in enumerate(words): + if word not in word2index: + continue + j = word2index[word] + source_idx.append(i) + target_idx.append(j) + source_idx = np.asarray(source_idx) + target_idx = np.asarray(target_idx) + return source_idx, target_idx + + +class MuseLoader: + def __init__(self, langs, cache): + self.langs = langs + self.lEmbed = {} + self.lExtracted = {} + for lang in self.langs: + print(f'Loading vectors for {lang}...') + self.lEmbed[lang] = Vectors(f'wiki.multi.{lang}.vec', cache) + + def dim(self): + return self.lEmbed[list(self.lEmbed.keys())[0]].dim + + def vocabulary(self): + return {lang: set(self.lEmbed[lang].stoi.keys()) for lang in self.langs} + + def extract(self, lVoc): + """ + Reindex pretrained loaded embedding in order to match indexes assigned by scikit vectorizer. 
Such indexes + are consistent with those used by Word Class Embeddings (since we deploy the same vectorizer) + :param lVoc: dict {lang : {word : id}} + :return: torch embedding matrix of extracted embeddings i.e., words in lVoc + """ + for lang, words in lVoc.items(): + print(f'Extracting words for lang {lang}...') + # words = list(zip(*sorted(lVoc[lang].items(), key=lambda x: x[1])))[0] + source_id, target_id = PretrainedEmbeddings.reindex(words, self.lEmbed[lang].stoi) + extraction = torch.zeros((len(words), self.dim())) + extraction[source_id] = self.lEmbed[lang].vectors[target_id] + self.lExtracted[lang] = extraction + return self.lExtracted + + def get_lEmbeddings(self): + return {lang: self.lEmbed[lang].vectors for lang in self.langs} + + +def XdotM(X, M, sif): + E = X.dot(M) + if sif: + E = remove_pc(E, npc=1) + return E + + +def wce_matrix(X, Y): + wce = supervised_embeddings_tfidf(X, Y) + wce = zscores(wce, axis=0) + return wce + + +def supervised_embeddings_tfidf(X, Y): + tfidf_norm = X.sum(axis=0) + tfidf_norm[tfidf_norm == 0] = 1 + F = (X.T).dot(Y) / tfidf_norm.T + return F + + +def zscores(X, axis=0): + """ + scipy.stats.zscores does not avoid division by 0, which can indeed occur + :param X: + :param axis: + :return: + """ + std = np.clip(np.std(X, ddof=1, axis=axis), 1e-5, None) + mean = np.mean(X, axis=axis) + return (X - mean) / std + + diff --git a/refactor/util/evaluation.py b/refactor/util/evaluation.py new file mode 100644 index 0000000..03c1792 --- /dev/null +++ b/refactor/util/evaluation.py @@ -0,0 +1,19 @@ +from joblib import Parallel, delayed +from util.metrics import * +import numpy as np + + +def evaluation_metrics(y, y_): + if len(y.shape) == len(y_.shape) == 1 and len(np.unique(y)) > 2: # single-label + raise NotImplementedError() # return f1_score(y,y_,average='macro'), f1_score(y,y_,average='micro') + else: # the metrics I implemented assume multiclass multilabel classification as binary classifiers + return macroF1(y, y_), microF1(y, y_), macroK(y, y_), microK(y, y_) + + +def evaluate(ly_true, ly_pred, metrics=evaluation_metrics, n_jobs=-1): + if n_jobs == 1: + return {lang: metrics(ly_true[lang], ly_pred[lang]) for lang in ly_true.keys()} + else: + langs = list(ly_true.keys()) + evals = Parallel(n_jobs=n_jobs)(delayed(metrics)(ly_true[lang], ly_pred[lang]) for lang in langs) + return {lang: evals[i] for i, lang in enumerate(langs)} diff --git a/refactor/util/file.py b/refactor/util/file.py new file mode 100644 index 0000000..a3d0a3a --- /dev/null +++ b/refactor/util/file.py @@ -0,0 +1,44 @@ +from os import listdir, makedirs +from os.path import isdir, isfile, join, exists, dirname +#from sklearn.externals.six.moves import urllib +import urllib +from pathlib import Path + + +def download_file(url, archive_filename): + def progress(blocknum, bs, size): + total_sz_mb = '%.2f MB' % (size / 1e6) + current_sz_mb = '%.2f MB' % ((blocknum * bs) / 1e6) + print('\rdownloaded %s / %s' % (current_sz_mb, total_sz_mb), end='') + print("Downloading %s" % url) + urllib.request.urlretrieve(url, filename=archive_filename, reporthook=progress) + print("") + +def download_file_if_not_exists(url, archive_path): + if exists(archive_path): return + makedirs_if_not_exist(dirname(archive_path)) + download_file(url,archive_path) + + +def ls(dir, typecheck): + el = [f for f in listdir(dir) if typecheck(join(dir, f))] + el.sort() + return el + +def list_dirs(dir): + return ls(dir, typecheck=isdir) + +def list_files(dir): + return ls(dir, typecheck=isfile) + +def 
makedirs_if_not_exist(path): + if not exists(path): makedirs(path) + +def create_if_not_exist(path): + if not exists(path): makedirs(path) + +def get_parent_name(path): + return Path(path).parent + +def get_file_name(path): + return Path(path).name diff --git a/refactor/util/metrics.py b/refactor/util/metrics.py new file mode 100644 index 0000000..7a6079e --- /dev/null +++ b/refactor/util/metrics.py @@ -0,0 +1,152 @@ +import numpy as np + + +class ContTable: + def __init__(self, tp=0, tn=0, fp=0, fn=0): + self.tp = tp + self.tn = tn + self.fp = fp + self.fn = fn + + def get_d(self): return self.tp + self.tn + self.fp + self.fn + + def get_c(self): return self.tp + self.fn + + def get_not_c(self): return self.tn + self.fp + + def get_f(self): return self.tp + self.fp + + def get_not_f(self): return self.tn + self.fn + + def p_c(self): return (1.0*self.get_c())/self.get_d() + + def p_not_c(self): return 1.0-self.p_c() + + def p_f(self): return (1.0*self.get_f())/self.get_d() + + def p_not_f(self): return 1.0-self.p_f() + + def p_tp(self): return (1.0*self.tp) / self.get_d() + + def p_tn(self): return (1.0*self.tn) / self.get_d() + + def p_fp(self): return (1.0*self.fp) / self.get_d() + + def p_fn(self): return (1.0*self.fn) / self.get_d() + + def tpr(self): + c = 1.0*self.get_c() + return self.tp / c if c > 0.0 else 0.0 + + def fpr(self): + _c = 1.0*self.get_not_c() + return self.fp / _c if _c > 0.0 else 0.0 + + def __add__(self, other): + return ContTable(tp=self.tp + other.tp, tn=self.tn + other.tn, fp=self.fp + other.fp, fn=self.fn + other.fn) + + +def accuracy(cell): + return (cell.tp + cell.tn)*1.0 / (cell.tp + cell.fp + cell.fn + cell.tn) + + +def f1(cell): + num = 2.0 * cell.tp + den = 2.0 * cell.tp + cell.fp + cell.fn + if den > 0: + return num / den + # we define f1 to be 1 if den==0 since the classifier has correctly classified all instances as negative + return 1.0 + + +def K(cell): + specificity, recall = 0., 0. + + AN = cell.tn + cell.fp + if AN != 0: + specificity = cell.tn*1. / AN + + AP = cell.tp + cell.fn + if AP != 0: + recall = cell.tp*1. / AP + + if AP == 0: + return 2. * specificity - 1. + elif AN == 0: + return 2. * recall - 1. + else: + return specificity + recall - 1. + + +# if the classifier is single class, then the prediction is a vector of shape=(nD,) which causes issues when compared +# to the true labels (of shape=(nD,1)). This method increases the dimensions of the predictions. +def __check_consistency_and_adapt(true_labels, predictions): + if predictions.ndim == 1: + return __check_consistency_and_adapt(true_labels, np.expand_dims(predictions, axis=1)) + if true_labels.ndim == 1: + return __check_consistency_and_adapt(np.expand_dims(true_labels, axis=1), predictions) + if true_labels.shape != predictions.shape: + raise ValueError("True and predicted label matrices shapes are inconsistent %s %s." + % (true_labels.shape, predictions.shape)) + _, nC = true_labels.shape + return true_labels, predictions, nC + + +# computes the (soft) contingency table where tp, fp, fn, and tn are the cumulative masses for the posterioir +# probabilitiesfron with respect to the true binary labels +# true_labels and posterior_probabilities are two vectors of shape (number_documents,) +def soft_single_metric_statistics(true_labels, posterior_probabilities): + assert len(true_labels) == len(posterior_probabilities), "Format not consistent between true and predicted labels." + tp = np.sum(posterior_probabilities[true_labels == 1]) + fn = np.sum(1. 
- posterior_probabilities[true_labels == 1]) + fp = np.sum(posterior_probabilities[true_labels == 0]) + tn = np.sum(1. - posterior_probabilities[true_labels == 0]) + return ContTable(tp=tp, tn=tn, fp=fp, fn=fn) + + +# computes the (hard) counters tp, fp, fn, and tn fron a true and predicted vectors of hard decisions +# true_labels and predicted_labels are two vectors of shape (number_documents,) +def hard_single_metric_statistics(true_labels, predicted_labels): + assert len(true_labels) == len(predicted_labels), "Format not consistent between true and predicted labels." + nd = len(true_labels) + tp = np.sum(predicted_labels[true_labels == 1]) + fp = np.sum(predicted_labels[true_labels == 0]) + fn = np.sum(true_labels[predicted_labels == 0]) + tn = nd - (tp+fp+fn) + return ContTable(tp=tp, tn=tn, fp=fp, fn=fn) + + +def macro_average(true_labels, predicted_labels, metric, metric_statistics=hard_single_metric_statistics): + true_labels, predicted_labels, nC = __check_consistency_and_adapt(true_labels, predicted_labels) + return np.mean([metric(metric_statistics(true_labels[:, c], predicted_labels[:, c])) for c in range(nC)]) + + +def micro_average(true_labels, predicted_labels, metric, metric_statistics=hard_single_metric_statistics): + true_labels, predicted_labels, nC = __check_consistency_and_adapt(true_labels, predicted_labels) + + accum = ContTable() + for c in range(nC): + other = metric_statistics(true_labels[:, c], predicted_labels[:, c]) + accum = accum + other + + return metric(accum) + + +# true_labels and predicted_labels are two matrices in sklearn.preprocessing.MultiLabelBinarizer format +def macroF1(true_labels, predicted_labels): + return macro_average(true_labels, predicted_labels, f1) + + +# true_labels and predicted_labels are two matrices in sklearn.preprocessing.MultiLabelBinarizer format +def microF1(true_labels, predicted_labels): + return micro_average(true_labels, predicted_labels, f1) + + +# true_labels and predicted_labels are two matrices in sklearn.preprocessing.MultiLabelBinarizer format +def macroK(true_labels, predicted_labels): + return macro_average(true_labels, predicted_labels, K) + + +# true_labels and predicted_labels are two matrices in sklearn.preprocessing.MultiLabelBinarizer format +def microK(true_labels, predicted_labels): + return micro_average(true_labels, predicted_labels, K) diff --git a/refactor/view_generators.py b/refactor/view_generators.py new file mode 100644 index 0000000..0ea3323 --- /dev/null +++ b/refactor/view_generators.py @@ -0,0 +1,258 @@ +""" +This module contains the view generators that take care of computing the view specific document embeddings: + +- VanillaFunGen (-X) cast document representations encoded via TFIDF into posterior probabilities by means of SVM. + +- WordClassGen (-W): generates document representation via Word-Class-Embeddings. + Document embeddings are obtained via weighted sum of document's constituent embeddings. + +- MuseGen (-M): + +- RecurrentGen (-G): generates document embedding by means of a Gated Recurrent Units. The model can be + initialized with different (multilingual/aligned) word representations (e.g., MUSE, WCE, ecc.,). + Output dimension is (n_docs, 512). + +- View generator (-B): generates document embedding via mBERT model. 
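+
+All view generators expose the same scikit-style interface (fit / transform / fit_transform).
+A minimal, hypothetical usage sketch (get_learner is the learner factory used in main.py; lX and
+ly are the usual dicts {lang : documents} and {lang : labels}):
+
+    vg = VanillaFunGen(base_learner=get_learner(calibrate=True), n_jobs=-1)
+    lZ = vg.fit_transform(lX, ly)   # dict {lang : posterior probabilities of the documents}
+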
+""" +from abc import ABC, abstractmethod +from models.learners import * +from util.embeddings_manager import MuseLoader, XdotM, wce_matrix +from util.common import TfidfVectorizerMultilingual, _normalize +from models.pl_gru import RecurrentModel +from models.pl_bert import BertModel +from models.lstm_class import RNNMultilingualClassifier +from pytorch_lightning import Trainer +from data.datamodule import GfunDataModule, BertDataModule +from pytorch_lightning.loggers import TensorBoardLogger +import torch + + +class ViewGen(ABC): + @abstractmethod + def fit(self, lX, ly): + pass + + @abstractmethod + def transform(self, lX): + pass + + @abstractmethod + def fit_transform(self, lX, ly): + pass + + +class VanillaFunGen(ViewGen): + def __init__(self, base_learner, n_jobs=-1): + """ + Original funnelling architecture proposed by Moreo, Esuli and Sebastiani in DOI: https://doi.org/10.1145/3326065 + :param base_learner: naive monolingual learners to be deployed as first-tier learners. Should be able to + return posterior probabilities. + :param n_jobs: integer, number of concurrent workers + """ + super().__init__() + self.learners = base_learner + self.n_jobs = n_jobs + self.doc_projector = NaivePolylingualClassifier(self.learners) + self.vectorizer = TfidfVectorizerMultilingual(sublinear_tf=True, use_idf=True) + + def fit(self, lX, lY): + lX = self.vectorizer.fit_transform(lX) + self.doc_projector.fit(lX, lY) + return self + + def transform(self, lX): + lX = self.vectorizer.transform(lX) + lZ = self.doc_projector.predict_proba(lX) + return lZ + + def fit_transform(self, lX, ly): + return self.fit(lX, ly).transform(lX) + + +class MuseGen(ViewGen): + def __init__(self, muse_dir='../embeddings', n_jobs=-1): + """ + generates document representation via MUSE embeddings (Fasttext multilingual word + embeddings). Document embeddings are obtained via weighted sum of document's constituent embeddings. + :param muse_dir: string, path to folder containing muse embeddings + :param n_jobs: int, number of concurrent workers + """ + super().__init__() + self.muse_dir = muse_dir + self.n_jobs = n_jobs + self.langs = None + self.lMuse = None + self.vectorizer = TfidfVectorizerMultilingual(sublinear_tf=True, use_idf=True) + + def fit(self, lX, ly): + self.vectorizer.fit(lX) + self.langs = sorted(lX.keys()) + self.lMuse = MuseLoader(langs=self.langs, cache=self.muse_dir) + lVoc = self.vectorizer.vocabulary() + self.lMuse = self.lMuse.extract(lVoc) # overwriting lMuse with dict {lang : embed_matrix} with only known words + # TODO: featureweight.fit + return self + + def transform(self, lX): + lX = self.vectorizer.transform(lX) + XdotMUSE = Parallel(n_jobs=self.n_jobs)( + delayed(XdotM)(lX[lang], self.lMuse[lang], sif=True) for lang in self.langs) + lZ = {lang: XdotMUSE[i] for i, lang in enumerate(self.langs)} + lZ = _normalize(lZ, l2=True) + return lZ + + def fit_transform(self, lX, ly): + return self.fit(lX, ly).transform(lX) + + +class WordClassGen(ViewGen): + + def __init__(self, n_jobs=-1): + """ + generates document representation via Word-Class-Embeddings. + Document embeddings are obtained via weighted sum of document's constituent embeddings. 
+ :param n_jobs: int, number of concurrent workers + """ + super().__init__() + self.n_jobs = n_jobs + self.langs = None + self.lWce = None + self.vectorizer = TfidfVectorizerMultilingual(sublinear_tf=True, use_idf=True) + + def fit(self, lX, ly): + lX = self.vectorizer.fit_transform(lX) + self.langs = sorted(lX.keys()) + wce = Parallel(n_jobs=self.n_jobs)( + delayed(wce_matrix)(lX[lang], ly[lang]) for lang in self.langs) + self.lWce = {l: wce[i] for i, l in enumerate(self.langs)} + # TODO: featureweight.fit() + return self + + def transform(self, lX): + lX = self.vectorizer.transform(lX) + XdotWce = Parallel(n_jobs=self.n_jobs)( + delayed(XdotM)(lX[lang], self.lWce[lang], sif=True) for lang in self.langs) + lWce = {l: XdotWce[i] for i, l in enumerate(self.langs)} + lWce = _normalize(lWce, l2=True) + return lWce + + def fit_transform(self, lX, ly): + return self.fit(lX, ly).transform(lX) + + +class RecurrentGen(ViewGen): + # TODO: save model https://forums.pytorchlightning.ai/t/how-to-save-hparams-when-not-provided-as-argument-apparently-assigning-to-hparams-is-not-recomended/339/5 + # Problem: we are passing lPretrained to init the RecurrentModel -> incredible slow at saving (checkpoint). + # if we do not save it is impossible to init RecurrentModel by calling RecurrentModel.load_from_checkpoint() + def __init__(self, multilingualIndex, pretrained_embeddings, wce, batch_size=512, gpus=0, n_jobs=-1, stored_path=None): + """ + generates document embedding by means of a Gated Recurrent Units. The model can be + initialized with different (multilingual/aligned) word representations (e.g., MUSE, WCE, ecc.,). + Output dimension is (n_docs, 512). + :param multilingualIndex: + :param pretrained_embeddings: + :param wce: + :param gpus: + :param n_jobs: + """ + super().__init__() + self.multilingualIndex = multilingualIndex + self.langs = multilingualIndex.langs + self.batch_size = batch_size + self.gpus = gpus + self.n_jobs = n_jobs + self.stored_path = stored_path + + # EMBEDDINGS to be deployed + self.pretrained = pretrained_embeddings + self.wce = wce + + self.multilingualIndex.train_val_split(val_prop=0.2, max_val=2000, seed=1) + self.multilingualIndex.embedding_matrices(self.pretrained, supervised=self.wce) + self.model = self._init_model() + # hp_tuning with Tensorboard: check https://www.tensorflow.org/tensorboard/hyperparameter_tuning_with_hparams + # however, setting it to False at the moment! + self.logger = TensorBoardLogger(save_dir='tb_logs', name='gfun_rnn_dev', default_hp_metric=False) + + def _init_model(self): + if self.stored_path: + lpretrained = self.multilingualIndex.l_embeddings() + return RecurrentModel.load_from_checkpoint(self.stored_path, lPretrained=lpretrained) + else: + lpretrained = self.multilingualIndex.l_embeddings() + langs = self.multilingualIndex.langs + output_size = self.multilingualIndex.get_target_dim() + hidden_size = 512 + lvocab_size = self.multilingualIndex.l_vocabsize() + learnable_length = 0 + return RecurrentModel( + lPretrained=lpretrained, + langs=langs, + output_size=output_size, + hidden_size=hidden_size, + lVocab_size=lvocab_size, + learnable_length=learnable_length, + drop_embedding_range=self.multilingualIndex.sup_range, + drop_embedding_prop=0.5 + ) + + def fit(self, lX, ly): + """ + lX and ly are not directly used. We rather get them from the multilingual index used in the instatiation + of the Dataset object (RecurrentDataset) in the GfunDataModule class. 
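+        Note (clarification): the Trainer is fed through the GfunDataModule built on
+        self.multilingualIndex, so lX and ly are accepted here only to comply with the common
+        ViewGen fit(lX, ly) interface.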
+ :param lX: + :param ly: + :return: + """ + recurrentDataModule = GfunDataModule(self.multilingualIndex, batchsize=self.batch_size) + trainer = Trainer(gradient_clip_val=1e-1, gpus=self.gpus, logger=self.logger, max_epochs=50) + + # vanilla_torch_model = torch.load( + # '/home/andreapdr/funneling_pdr/checkpoint/gru_viewgen_-jrc_doclist_1958-2005vs2006_all_top300_noparallel_processed_run0.pickle') + # self.model.linear0 = vanilla_torch_model.linear0 + # self.model.linear1 = vanilla_torch_model.linear1 + # self.model.linear2 = vanilla_torch_model.linear2 + # self.model.rnn = vanilla_torch_model.rnn + + trainer.fit(self.model, datamodule=recurrentDataModule) + trainer.test(self.model, datamodule=recurrentDataModule) + return self + + def transform(self, lX): + pass + + def fit_transform(self, lX, ly): + pass + + +class BertGen(ViewGen): + + def __init__(self, multilingualIndex, batch_size=128, gpus=0, n_jobs=-1, stored_path=None): + super().__init__() + self.multilingualIndex = multilingualIndex + self.gpus = gpus + self.batch_size = batch_size + self.n_jobs = n_jobs + self.stored_path = stored_path + self.logger = TensorBoardLogger(save_dir='tb_logs', name='bert_dev', default_hp_metric=False) + self.model = self._init_model() + self.multilingualIndex.train_val_split(val_prop=0.2, max_val=2000, seed=1) + + def _init_model(self): + output_size = self.multilingualIndex.get_target_dim() + return BertModel(output_size=output_size, stored_path=self.stored_path) + + def fit(self, lX, ly): + bertDataModule = BertDataModule(self.multilingualIndex, batchsize=self.batch_size, max_len=512) + trainer = Trainer(default_root_dir='checkpoints/bert/', gradient_clip_val=1e-1, gpus=self.gpus, logger=self.logger) + trainer.fit(self.model, bertDataModule) + # trainer.test(self.model, bertDataModule) + pass + + def transform(self, lX): + pass + + def fit_transform(self, lX, ly): + pass + + diff --git a/test.py b/test.py deleted file mode 100644 index 3fbc4f8..0000000 --- a/test.py +++ /dev/null @@ -1 +0,0 @@ -# preparing refactor From 34676167e84ae02596e14e4173321cde3d423a45 Mon Sep 17 00:00:00 2001 From: andrea Date: Tue, 19 Jan 2021 13:11:16 +0100 Subject: [PATCH 09/55] cleared up folders --- refactor/devel_ideas.py | 95 ----------------------------------------- 1 file changed, 95 deletions(-) delete mode 100644 refactor/devel_ideas.py diff --git a/refactor/devel_ideas.py b/refactor/devel_ideas.py deleted file mode 100644 index bf5690a..0000000 --- a/refactor/devel_ideas.py +++ /dev/null @@ -1,95 +0,0 @@ -class CustomMetrics(Metric): - def __init__( - self, - num_classes: int, - beta: float = 1.0, - threshold: float = 0.5, - average: str = "micro", - multilabel: bool = False, - compute_on_step: bool = True, - dist_sync_on_step: bool = False, - process_group: Optional[Any] = None, - ): - super().__init__( - compute_on_step=compute_on_step, dist_sync_on_step=dist_sync_on_step, process_group=process_group, - ) - - self.num_classes = num_classes - self.beta = beta - self.threshold = threshold - self.average = average - self.multilabel = multilabel - - allowed_average = ("micro", "macro", "weighted", None) - if self.average not in allowed_average: - raise ValueError('Argument `average` expected to be one of the following:' - f' {allowed_average} but got {self.average}') - - self.add_state("true_positives", default=torch.zeros(num_classes), dist_reduce_fx="sum") - self.add_state("predicted_positives", default=torch.zeros(num_classes), dist_reduce_fx="sum") - self.add_state("actual_positives", 
default=torch.zeros(num_classes), dist_reduce_fx="sum") - - def update(self, preds: torch.Tensor, target: torch.Tensor): - """ - Update state with predictions and targets. - - Args: - preds: Predictions from model - target: Ground truth values - """ - true_positives, predicted_positives, actual_positives = _fbeta_update( - preds, target, self.num_classes, self.threshold, self.multilabel - ) - - self.true_positives += true_positives - self.predicted_positives += predicted_positives - self.actual_positives += actual_positives - - def compute(self): - """ - Computes metrics over state. - """ - return _fbeta_compute(self.true_positives, self.predicted_positives, - self.actual_positives, self.beta, self.average) - - -def _fbeta_update( - preds: torch.Tensor, - target: torch.Tensor, - num_classes: int, - threshold: float = 0.5, - multilabel: bool = False -) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - preds, target = _input_format_classification_one_hot( - num_classes, preds, target, threshold, multilabel - ) - true_positives = torch.sum(preds * target, dim=1) - predicted_positives = torch.sum(preds, dim=1) - actual_positives = torch.sum(target, dim=1) - return true_positives, predicted_positives, actual_positives - - -def _fbeta_compute( - true_positives: torch.Tensor, - predicted_positives: torch.Tensor, - actual_positives: torch.Tensor, - beta: float = 1.0, - average: str = "micro" -) -> torch.Tensor: - if average == "micro": - precision = true_positives.sum().float() / predicted_positives.sum() - recall = true_positives.sum().float() / actual_positives.sum() - else: - precision = true_positives.float() / predicted_positives - recall = true_positives.float() / actual_positives - - num = (1 + beta ** 2) * precision * recall - denom = beta ** 2 * precision + recall - new_num = 2 * true_positives - new_fp = predicted_positives - true_positives - new_fn = actual_positives - true_positives - new_den = 2 * true_positives + new_fp + new_fn - if new_den.sum() == 0: - # whats is the correct return type ? TODO - return 1. 
- return class_reduce(num, denom, weights=actual_positives, class_reduction=average) From 294d7c3be72415c054e3ea899f6fe159ed83cfc7 Mon Sep 17 00:00:00 2001 From: andrea Date: Tue, 19 Jan 2021 15:30:15 +0100 Subject: [PATCH 10/55] refactor --- refactor/data/datamodule.py | 12 ++---- refactor/main.py | 13 +++---- refactor/models/helpers.py | 14 ++++--- refactor/models/pl_gru.py | 73 ++++++------------------------------- refactor/util/common.py | 13 +++++-- refactor/view_generators.py | 2 +- 6 files changed, 42 insertions(+), 85 deletions(-) diff --git a/refactor/data/datamodule.py b/refactor/data/datamodule.py index bbb7cc1..67a83d6 100644 --- a/refactor/data/datamodule.py +++ b/refactor/data/datamodule.py @@ -103,7 +103,6 @@ class GfunDataModule(pl.LightningDataModule): pass def setup(self, stage=None): - # Assign train/val datasets for use in dataloaders if stage == 'fit' or stage is None: l_train_index, l_train_target = self.multilingualIndex.l_train() self.training_dataset = RecurrentDataset(l_train_index, l_train_target, @@ -111,9 +110,8 @@ class GfunDataModule(pl.LightningDataModule): l_val_index, l_val_target = self.multilingualIndex.l_val() self.val_dataset = RecurrentDataset(l_val_index, l_val_target, lPad_index=self.multilingualIndex.l_pad()) - # Assign test dataset for use in dataloader(s) if stage == 'test' or stage is None: - l_test_index, l_test_target = self.multilingualIndex.l_val() + l_test_index, l_test_target = self.multilingualIndex.l_test() self.test_dataset = RecurrentDataset(l_test_index, l_test_target, lPad_index=self.multilingualIndex.l_pad()) @@ -136,7 +134,6 @@ class BertDataModule(GfunDataModule): self.max_len = max_len def setup(self, stage=None): - # Assign train/val datasets for use in dataloaders if stage == 'fit' or stage is None: l_train_raw, l_train_target = self.multilingualIndex.l_train_raw() l_train_index = self.tokenize(l_train_raw, max_len=self.max_len) @@ -146,12 +143,11 @@ class BertDataModule(GfunDataModule): l_val_index = self.tokenize(l_val_raw, max_len=self.max_len) self.val_dataset = RecurrentDataset(l_val_index, l_val_target, lPad_index=self.multilingualIndex.l_pad()) - # Assign test dataset for use in dataloader(s) # TODO if stage == 'test' or stage is None: - l_val_raw, l_val_target = self.multilingualIndex.l_test_raw() - l_val_index = self.tokenize(l_val_raw) - self.test_dataset = RecurrentDataset(l_val_index, l_val_target, + l_test_raw, l_test_target = self.multilingualIndex.l_test_raw() + l_test_index = self.tokenize(l_val_raw, max_len=self.max_len) + self.test_dataset = RecurrentDataset(l_test_index, l_test_target, lPad_index=self.multilingualIndex.l_pad()) @staticmethod diff --git a/refactor/main.py b/refactor/main.py index 76c5e54..42ef9c9 100644 --- a/refactor/main.py +++ b/refactor/main.py @@ -7,29 +7,28 @@ from util.common import MultilingualIndex def main(args): N_JOBS = 8 - print('Running...') + print('Running refactored...') # _DATASET = '/homenfs/a.pedrotti1/datasets/CLESA/rcv2/rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle' # EMBEDDINGS_PATH = '/homenfs/a.pedrotti1/embeddings/MUSE' _DATASET = '/home/moreo/CLESA/rcv2/rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle' - EMBEDDINGS_PATH = '/home/andreapdr/funneling_pdr/embeddings' + EMBEDDINGS_PATH = '/home/andreapdr/gfun/embeddings' data = MultilingualDataset.load(_DATASET) - # data.set_view(languages=['it']) + data.set_view(languages=['it'], categories=[0,1]) lX, ly = data.training() lXte, lyte = data.test() - # Init multilingualIndex - mandatory when 
deploying Neural View Generators... + # Init multilingualIndex - mandatory when deploying Neural View Generators... multilingualIndex = MultilingualIndex() # lMuse = MuseLoader(langs=sorted(lX.keys()), cache=) lMuse = MuseLoader(langs=sorted(lX.keys()), cache=EMBEDDINGS_PATH) - multilingualIndex.index(lX, ly, lXte, l_pretrained_vocabulary=lMuse.vocabulary()) + multilingualIndex.index(lX, ly, lXte, lyte, l_pretrained_vocabulary=lMuse.vocabulary()) # gFun = VanillaFunGen(base_learner=get_learner(calibrate=True), n_jobs=N_JOBS) # gFun = MuseGen(muse_dir='/home/andreapdr/funneling_pdr/embeddings', n_jobs=N_JOBS) # gFun = WordClassGen(n_jobs=N_JOBS) - gFun = RecurrentGen(multilingualIndex, pretrained_embeddings=lMuse, wce=False, gpus=args.gpus, n_jobs=N_JOBS, - stored_path='/home/andreapdr/gfun_refactor/tb_logs/gfun_rnn_dev/version_19/checkpoints/epoch=0-step=14.ckpt') + gFun = RecurrentGen(multilingualIndex, pretrained_embeddings=lMuse, wce=False, batch_size=512, gpus=args.gpus, n_jobs=N_JOBS) # gFun = BertGen(multilingualIndex, gpus=args.gpus, batch_size=128, n_jobs=N_JOBS) gFun.fit(lX, ly) diff --git a/refactor/models/helpers.py b/refactor/models/helpers.py index 93e5805..b466f28 100755 --- a/refactor/models/helpers.py +++ b/refactor/models/helpers.py @@ -3,25 +3,29 @@ import torch.nn as nn from torch.nn import functional as F - -def init_embeddings(pretrained, vocab_size, learnable_length, device='cuda'): +def init_embeddings(pretrained, vocab_size, learnable_length): + """ + Compute the embedding matrix + :param pretrained: + :param vocab_size: + :param learnable_length: + :return: + """ pretrained_embeddings = None pretrained_length = 0 if pretrained is not None: pretrained_length = pretrained.shape[1] assert pretrained.shape[0] == vocab_size, 'pre-trained matrix does not match with the vocabulary size' pretrained_embeddings = nn.Embedding(vocab_size, pretrained_length) + # requires_grad=False sets the embedding layer as NOT trainable pretrained_embeddings.weight = nn.Parameter(pretrained, requires_grad=False) - # pretrained_embeddings.to(device) learnable_embeddings = None if learnable_length > 0: learnable_embeddings = nn.Embedding(vocab_size, learnable_length) - # learnable_embeddings.to(device) embedding_length = learnable_length + pretrained_length assert embedding_length > 0, '0-size embeddings' - return pretrained_embeddings, learnable_embeddings, embedding_length diff --git a/refactor/models/pl_gru.py b/refactor/models/pl_gru.py index 7987156..268a694 100644 --- a/refactor/models/pl_gru.py +++ b/refactor/models/pl_gru.py @@ -1,43 +1,17 @@ +# Lightning modules, see https://pytorch-lightning.readthedocs.io/en/latest/lightning_module.html import torch from torch import nn -from torch.optim import Adam from transformers import AdamW import torch.nn.functional as F from torch.autograd import Variable import pytorch_lightning as pl from pytorch_lightning.metrics import F1, Accuracy, Metric from torch.optim.lr_scheduler import StepLR - -from util.evaluation import evaluate from typing import Any, Optional, Tuple from pytorch_lightning.metrics.utils import _input_format_classification_one_hot, class_reduce +from models.helpers import init_embeddings import numpy as np - - -def init_embeddings(pretrained, vocab_size, learnable_length): - """ - Compute the embedding matrix - :param pretrained: - :param vocab_size: - :param learnable_length: - :return: - """ - pretrained_embeddings = None - pretrained_length = 0 - if pretrained is not None: - pretrained_length = pretrained.shape[1] - assert 
pretrained.shape[0] == vocab_size, 'pre-trained matrix does not match with the vocabulary size' - pretrained_embeddings = nn.Embedding(vocab_size, pretrained_length) - # requires_grad=False sets the embedding layer as NOT trainable - pretrained_embeddings.weight = nn.Parameter(pretrained, requires_grad=False) - - learnable_embeddings = None - if learnable_length > 0: - learnable_embeddings = nn.Embedding(vocab_size, learnable_length) - - embedding_length = learnable_length + pretrained_length - assert embedding_length > 0, '0-size embeddings' - return pretrained_embeddings, learnable_embeddings, embedding_length +from util.evaluation import evaluate class RecurrentModel(pl.LightningModule): @@ -97,7 +71,7 @@ class RecurrentModel(pl.LightningModule): self.label = nn.Linear(ff2, self.output_size) lPretrained = None # TODO: setting lPretrained to None, letting it to its original value will bug first - # validation step (i.e., checkpoint will store also its ++ value, I guess, making the saving process too slow) + # validation step (i.e., checkpoint will store also its ++ value, I guess, making the saving process too slow) self.save_hyperparameters() def forward(self, lX): @@ -124,7 +98,6 @@ class RecurrentModel(pl.LightningModule): return output def training_step(self, train_batch, batch_idx): - # TODO: double check StepLR scheduler... lX, ly = train_batch logits = self.forward(lX) _ly = [] @@ -132,20 +105,14 @@ class RecurrentModel(pl.LightningModule): _ly.append(ly[lang]) ly = torch.cat(_ly, dim=0) loss = self.loss(logits, ly) - # Squashing logits through Sigmoid in order to get confidence score predictions = torch.sigmoid(logits) > 0.5 - - # microf1 = self.microf1(predictions, ly) - # macrof1 = self.macrof1(predictions, ly) accuracy = self.accuracy(predictions, ly) - # l_pred = {lang: predictions.detach().cpu().numpy()} - # l_labels = {lang: ly.detach().cpu().numpy()} - # l_eval = evaluate(l_labels, l_pred, n_jobs=1) - - self.log('train-loss', loss, on_step=True, on_epoch=True, prog_bar=True, logger=True) + custom = self.customMetrics(predictions, ly) + self.log('train-loss', loss, on_step=True, on_epoch=True, prog_bar=False, logger=True) self.log('train-accuracy', accuracy, on_step=True, on_epoch=True, prog_bar=False, logger=True) - return loss + self.log('custom', custom, on_step=False, on_epoch=True, prog_bar=True, logger=True) + return {'loss': loss} def validation_step(self, val_batch, batch_idx): lX, ly = val_batch @@ -156,17 +123,10 @@ class RecurrentModel(pl.LightningModule): ly = torch.cat(_ly, dim=0) loss = self.loss(logits, ly) predictions = torch.sigmoid(logits) > 0.5 - # microf1 = self.microf1(predictions, ly) - # macrof1 = self.macrof1(predictions, ly) accuracy = self.accuracy(predictions, ly) - - # l_pred = {lang: predictions.detach().cpu().numpy()} - # l_labels = {lang: y.detach().cpu().numpy()} - # l_eval = evaluate(l_labels, l_pred, n_jobs=1) - - self.log('val-loss', loss, on_step=True, on_epoch=True, prog_bar=True, logger=True) + self.log('val-loss', loss, on_step=True, on_epoch=True, prog_bar=False, logger=True) self.log('val-accuracy', accuracy, on_step=True, on_epoch=True, prog_bar=False, logger=True) - return + return {'loss': loss} def test_step(self, test_batch, batch_idx): lX, ly = test_batch @@ -177,18 +137,9 @@ class RecurrentModel(pl.LightningModule): ly = torch.cat(_ly, dim=0) predictions = torch.sigmoid(logits) > 0.5 accuracy = self.accuracy(predictions, ly) - custom_metric = self.customMetrics(logits, ly) # TODO self.log('test-accuracy', accuracy, 
on_step=False, on_epoch=True, prog_bar=False, logger=True) - self.log('test-custom', custom_metric, on_step=False, on_epoch=True, prog_bar=False, logger=True) - return {'pred': predictions, 'target': ly} - - def test_epoch_end(self, outputs): - # all_pred = torch.vstack([out['pred'] for out in outputs]) # TODO - # all_y = torch.vstack([out['target'] for out in outputs]) # TODO - # r = eval(all_y, all_pred) - # print(r) - # X = torch.cat(X).view([X[0].shape[0], len(X)]) return + # return {'pred': predictions, 'target': ly} def embed(self, X, lang): input_list = [] @@ -308,5 +259,5 @@ def _fbeta_compute( new_den = 2 * true_positives + new_fp + new_fn if new_den.sum() == 0: # whats is the correct return type ? TODO - return 1. + return class_reduce(new_num, new_den, weights=actual_positives, class_reduction=average) return class_reduce(num, denom, weights=actual_positives, class_reduction=average) diff --git a/refactor/util/common.py b/refactor/util/common.py index 7792b1c..4bd0c20 100644 --- a/refactor/util/common.py +++ b/refactor/util/common.py @@ -52,7 +52,7 @@ class MultilingualIndex: self.l_index = {} self.l_vectorizer = TfidfVectorizerMultilingual(sublinear_tf=True, use_idf=True) - def index(self, l_devel_raw, l_devel_target, l_test_raw, l_pretrained_vocabulary=None): + def index(self, l_devel_raw, l_devel_target, l_test_raw, l_test_target, l_pretrained_vocabulary=None): self.langs = sorted(l_devel_raw.keys()) self.l_vectorizer.fit(l_devel_raw) l_vocabulary = self.l_vectorizer.vocabulary() @@ -62,7 +62,7 @@ class MultilingualIndex: for lang in self.langs: # Init monolingual Index - self.l_index[lang] = Index(l_devel_raw[lang], l_devel_target[lang], l_test_raw[lang], lang) + self.l_index[lang] = Index(l_devel_raw[lang], l_devel_target[lang], l_test_raw[lang], l_test_target[lang], lang) # call to index() function of monolingual Index self.l_index[lang].index(l_pretrained_vocabulary[lang], l_analyzer[lang], l_vocabulary[lang]) @@ -163,6 +163,9 @@ class MultilingualIndex: def l_val_target(self): return {l: index.val_target for l, index in self.l_index.items()} + def l_test_target(self): + return {l: index.test_target for l, index in self.l_index.items()} + def l_test_index(self): return {l: index.test_index for l, index in self.l_index.items()} @@ -182,6 +185,9 @@ class MultilingualIndex: def l_val(self): return self.l_val_index(), self.l_val_target() + def l_test(self): + return self.l_test_index(), self.l_test_target() + def l_train_raw(self): return self.l_train_raw_index(), self.l_train_target() @@ -193,7 +199,7 @@ class MultilingualIndex: class Index: - def __init__(self, devel_raw, devel_target, test_raw, lang): + def __init__(self, devel_raw, devel_target, test_raw, test_target, lang): """ Monolingual Index, takes care of tokenizing raw data, converting strings to ids, splitting the data into training and validation. 
@@ -206,6 +212,7 @@ class Index: self.devel_raw = devel_raw self.devel_target = devel_target self.test_raw = test_raw + self.test_target = test_target def index(self, pretrained_vocabulary, analyzer, vocabulary): self.word2index = dict(vocabulary) diff --git a/refactor/view_generators.py b/refactor/view_generators.py index 0ea3323..abe2442 100644 --- a/refactor/view_generators.py +++ b/refactor/view_generators.py @@ -205,7 +205,7 @@ class RecurrentGen(ViewGen): :return: """ recurrentDataModule = GfunDataModule(self.multilingualIndex, batchsize=self.batch_size) - trainer = Trainer(gradient_clip_val=1e-1, gpus=self.gpus, logger=self.logger, max_epochs=50) + trainer = Trainer(gradient_clip_val=1e-1, gpus=self.gpus, logger=self.logger, max_epochs=50, checkpoint_callback=False) # vanilla_torch_model = torch.load( # '/home/andreapdr/funneling_pdr/checkpoint/gru_viewgen_-jrc_doclist_1958-2005vs2006_all_top300_noparallel_processed_run0.pickle') From 8dbe48ff7a454aed9c28757757089ed6634a1418 Mon Sep 17 00:00:00 2001 From: andrea Date: Wed, 20 Jan 2021 11:47:51 +0100 Subject: [PATCH 11/55] Implemented custom micro F1 in pl (cpu and gpu) --- refactor/data/datamodule.py | 13 +++- refactor/main.py | 5 +- refactor/models/pl_gru.py | 150 ++++++++++++------------------------ refactor/util/common.py | 9 +++ refactor/view_generators.py | 14 ++-- 5 files changed, 83 insertions(+), 108 deletions(-) diff --git a/refactor/data/datamodule.py b/refactor/data/datamodule.py index 67a83d6..29020dc 100644 --- a/refactor/data/datamodule.py +++ b/refactor/data/datamodule.py @@ -88,7 +88,7 @@ class RecurrentDataset(Dataset): return index_list -class GfunDataModule(pl.LightningDataModule): +class RecurrentDataModule(pl.LightningDataModule): def __init__(self, multilingualIndex, batchsize=64): """ Pytorch-lightning DataModule: https://pytorch-lightning.readthedocs.io/en/latest/datamodules.html @@ -105,9 +105,18 @@ class GfunDataModule(pl.LightningDataModule): def setup(self, stage=None): if stage == 'fit' or stage is None: l_train_index, l_train_target = self.multilingualIndex.l_train() + + # l_train_index = {l: train[:50] for l, train in l_train_index.items()} + # l_train_target = {l: target[:50] for l, target in l_train_target.items()} + self.training_dataset = RecurrentDataset(l_train_index, l_train_target, lPad_index=self.multilingualIndex.l_pad()) + l_val_index, l_val_target = self.multilingualIndex.l_val() + + # l_val_index = {l: train[:50] for l, train in l_val_index.items()} + # l_val_target = {l: target[:50] for l, target in l_val_target.items()} + self.val_dataset = RecurrentDataset(l_val_index, l_val_target, lPad_index=self.multilingualIndex.l_pad()) if stage == 'test' or stage is None: @@ -128,7 +137,7 @@ class GfunDataModule(pl.LightningDataModule): collate_fn=self.test_dataset.collate_fn) -class BertDataModule(GfunDataModule): +class BertDataModule(RecurrentDataModule): def __init__(self, multilingualIndex, batchsize=64, max_len=512): super().__init__(multilingualIndex, batchsize) self.max_len = max_len diff --git a/refactor/main.py b/refactor/main.py index 42ef9c9..45487b1 100644 --- a/refactor/main.py +++ b/refactor/main.py @@ -15,7 +15,7 @@ def main(args): _DATASET = '/home/moreo/CLESA/rcv2/rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle' EMBEDDINGS_PATH = '/home/andreapdr/gfun/embeddings' data = MultilingualDataset.load(_DATASET) - data.set_view(languages=['it'], categories=[0,1]) + data.set_view(languages=['it'], categories=[0, 1]) lX, ly = data.training() lXte, lyte = data.test() @@ -28,7 
+28,8 @@ def main(args): # gFun = VanillaFunGen(base_learner=get_learner(calibrate=True), n_jobs=N_JOBS) # gFun = MuseGen(muse_dir='/home/andreapdr/funneling_pdr/embeddings', n_jobs=N_JOBS) # gFun = WordClassGen(n_jobs=N_JOBS) - gFun = RecurrentGen(multilingualIndex, pretrained_embeddings=lMuse, wce=False, batch_size=512, gpus=args.gpus, n_jobs=N_JOBS) + gFun = RecurrentGen(multilingualIndex, pretrained_embeddings=lMuse, wce=False, batch_size=5, nepochs=100, + gpus=args.gpus, n_jobs=N_JOBS) # gFun = BertGen(multilingualIndex, gpus=args.gpus, batch_size=128, n_jobs=N_JOBS) gFun.fit(lX, ly) diff --git a/refactor/models/pl_gru.py b/refactor/models/pl_gru.py index 268a694..2e3ecf1 100644 --- a/refactor/models/pl_gru.py +++ b/refactor/models/pl_gru.py @@ -5,12 +5,10 @@ from transformers import AdamW import torch.nn.functional as F from torch.autograd import Variable import pytorch_lightning as pl -from pytorch_lightning.metrics import F1, Accuracy, Metric +from pytorch_lightning.metrics import Metric, F1, Accuracy from torch.optim.lr_scheduler import StepLR -from typing import Any, Optional, Tuple -from pytorch_lightning.metrics.utils import _input_format_classification_one_hot, class_reduce from models.helpers import init_embeddings -import numpy as np +from util.common import is_true, is_false from util.evaluation import evaluate @@ -20,8 +18,9 @@ class RecurrentModel(pl.LightningModule): """ def __init__(self, lPretrained, langs, output_size, hidden_size, lVocab_size, learnable_length, - drop_embedding_range, drop_embedding_prop, lMuse_debug=None, multilingual_index_debug=None): + drop_embedding_range, drop_embedding_prop, gpus=None): super().__init__() + self.gpus = gpus self.langs = langs self.lVocab_size = lVocab_size self.learnable_length = learnable_length @@ -33,7 +32,7 @@ class RecurrentModel(pl.LightningModule): self.microf1 = F1(num_classes=output_size, multilabel=True, average='micro') self.macrof1 = F1(num_classes=output_size, multilabel=True, average='macro') self.accuracy = Accuracy() - self.customMetrics = CustomMetrics(num_classes=output_size, multilabel=True, average='micro') + self.customMetrics = CustomF1(num_classes=output_size, device=self.gpus) self.lPretrained_embeddings = nn.ModuleDict() self.lLearnable_embeddings = nn.ModuleDict() @@ -42,10 +41,6 @@ class RecurrentModel(pl.LightningModule): self.n_directions = 1 self.dropout = nn.Dropout(0.6) - # TODO: debug setting - self.lMuse = lMuse_debug - self.multilingual_index_debug = multilingual_index_debug - lstm_out = 256 ff1 = 512 ff2 = 256 @@ -111,7 +106,7 @@ class RecurrentModel(pl.LightningModule): custom = self.customMetrics(predictions, ly) self.log('train-loss', loss, on_step=True, on_epoch=True, prog_bar=False, logger=True) self.log('train-accuracy', accuracy, on_step=True, on_epoch=True, prog_bar=False, logger=True) - self.log('custom', custom, on_step=False, on_epoch=True, prog_bar=True, logger=True) + self.log('custom', custom, on_step=True, on_epoch=True, prog_bar=True, logger=True) return {'loss': loss} def validation_step(self, val_batch, batch_idx): @@ -139,7 +134,6 @@ class RecurrentModel(pl.LightningModule): accuracy = self.accuracy(predictions, ly) self.log('test-accuracy', accuracy, on_step=False, on_epoch=True, prog_bar=False, logger=True) return - # return {'pred': predictions, 'target': ly} def embed(self, X, lang): input_list = [] @@ -166,98 +160,56 @@ class RecurrentModel(pl.LightningModule): return [optimizer], [scheduler] -class CustomMetrics(Metric): - def __init__( - self, - num_classes: int, - 
beta: float = 1.0, - threshold: float = 0.5, - average: str = "micro", - multilabel: bool = False, - compute_on_step: bool = True, - dist_sync_on_step: bool = False, - process_group: Optional[Any] = None, - ): - super().__init__( - compute_on_step=compute_on_step, dist_sync_on_step=dist_sync_on_step, process_group=process_group, - ) - +class CustomF1(Metric): + def __init__(self, num_classes, device, average='micro'): + """ + Custom F1 metric. + Scikit learn provides a full set of evaluation metrics, but they treat special cases differently. + I.e., when the number of true positives, false positives, and false negatives amount to 0, all + affected metrics (precision, recall, and thus f1) output 0 in Scikit learn. + We adhere to the common practice of outputting 1 in this case since the classifier has correctly + classified all examples as negatives. + :param num_classes: + :param device: + :param average: + """ + super().__init__() self.num_classes = num_classes - self.beta = beta - self.threshold = threshold self.average = average - self.multilabel = multilabel + self.device = 'cuda' if device else 'cpu' + self.add_state('true_positive', default=torch.zeros(self.num_classes)) + self.add_state('true_negative', default=torch.zeros(self.num_classes)) + self.add_state('false_positive', default=torch.zeros(self.num_classes)) + self.add_state('false_negative', default=torch.zeros(self.num_classes)) - allowed_average = ("micro", "macro", "weighted", None) - if self.average not in allowed_average: - raise ValueError('Argument `average` expected to be one of the following:' - f' {allowed_average} but got {self.average}') + def update(self, preds, target): + true_positive, true_negative, false_positive, false_negative = self._update(preds, target) - self.add_state("true_positives", default=torch.zeros(num_classes), dist_reduce_fx="sum") - self.add_state("predicted_positives", default=torch.zeros(num_classes), dist_reduce_fx="sum") - self.add_state("actual_positives", default=torch.zeros(num_classes), dist_reduce_fx="sum") + self.true_positive += true_positive + self.true_negative += true_negative + self.false_positive += false_positive + self.false_negative += false_negative - def update(self, preds: torch.Tensor, target: torch.Tensor): - """ - Update state with predictions and targets. + def _update(self, pred, target): + assert pred.shape == target.shape + # preparing preds and targets for count + true_pred = is_true(pred, self.device) + false_pred = is_false(pred, self.device) + true_target = is_true(target, self.device) + false_target = is_false(target, self.device) - Args: - preds: Predictions from model - target: Ground truth values - """ - true_positives, predicted_positives, actual_positives = _fbeta_update( - preds, target, self.num_classes, self.threshold, self.multilabel - ) - - self.true_positives += true_positives - self.predicted_positives += predicted_positives - self.actual_positives += actual_positives + tp = torch.sum(true_pred * true_target, dim=0) + tn = torch.sum(false_pred * false_target, dim=0) + fp = torch.sum(true_pred * false_target, dim=0) + fn = torch.sum(false_pred * target, dim=0) + return tp, tn, fp, fn def compute(self): - """ - Computes metrics over state. 
- """ - return _fbeta_compute(self.true_positives, self.predicted_positives, - self.actual_positives, self.beta, self.average) - - -def _fbeta_update( - preds: torch.Tensor, - target: torch.Tensor, - num_classes: int, - threshold: float = 0.5, - multilabel: bool = False -) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - preds, target = _input_format_classification_one_hot( - num_classes, preds, target, threshold, multilabel - ) - true_positives = torch.sum(preds * target, dim=1) - predicted_positives = torch.sum(preds, dim=1) - actual_positives = torch.sum(target, dim=1) - return true_positives, predicted_positives, actual_positives - - -def _fbeta_compute( - true_positives: torch.Tensor, - predicted_positives: torch.Tensor, - actual_positives: torch.Tensor, - beta: float = 1.0, - average: str = "micro" -) -> torch.Tensor: - if average == "micro": - precision = true_positives.sum().float() / predicted_positives.sum() - recall = true_positives.sum().float() / actual_positives.sum() - else: - precision = true_positives.float() / predicted_positives - recall = true_positives.float() / actual_positives - - num = (1 + beta ** 2) * precision * recall - denom = beta ** 2 * precision + recall - new_num = 2 * true_positives - new_fp = predicted_positives - true_positives - new_fn = actual_positives - true_positives - new_den = 2 * true_positives + new_fp + new_fn - if new_den.sum() == 0: - # whats is the correct return type ? TODO - return class_reduce(new_num, new_den, weights=actual_positives, class_reduction=average) - return class_reduce(num, denom, weights=actual_positives, class_reduction=average) + if self.average == 'micro': + num = 2.0 * self.true_positive.sum() + den = 2.0 * self.true_positive.sum() + self.false_positive.sum() + self.false_negative.sum() + if den > 0: + return (num / den).to(self.device) + return torch.FloatTensor([1.]).to(self.device) + if self.average == 'macro': + raise NotImplementedError diff --git a/refactor/util/common.py b/refactor/util/common.py index 4bd0c20..c6f6610 100644 --- a/refactor/util/common.py +++ b/refactor/util/common.py @@ -327,3 +327,12 @@ def index(data, vocab, known_words, analyzer, unk_index, out_of_vocabulary): # pbar.set_description(f'[unk = {unk_count}/{knw_count}={(100.*unk_count/knw_count):.2f}%]' # f'[out = {out_count}/{knw_count}={(100.*out_count/knw_count):.2f}%]') return indexes + + +def is_true(tensor, device): + return torch.where(tensor == 1, torch.Tensor([1]).to(device), torch.Tensor([0]).to(device)) + + +def is_false(tensor, device): + return torch.where(tensor == 0, torch.Tensor([1]).to(device), torch.Tensor([0]).to(device)) + diff --git a/refactor/view_generators.py b/refactor/view_generators.py index abe2442..9ea91fa 100644 --- a/refactor/view_generators.py +++ b/refactor/view_generators.py @@ -22,7 +22,7 @@ from models.pl_gru import RecurrentModel from models.pl_bert import BertModel from models.lstm_class import RNNMultilingualClassifier from pytorch_lightning import Trainer -from data.datamodule import GfunDataModule, BertDataModule +from data.datamodule import RecurrentDataModule, BertDataModule from pytorch_lightning.loggers import TensorBoardLogger import torch @@ -144,7 +144,8 @@ class RecurrentGen(ViewGen): # TODO: save model https://forums.pytorchlightning.ai/t/how-to-save-hparams-when-not-provided-as-argument-apparently-assigning-to-hparams-is-not-recomended/339/5 # Problem: we are passing lPretrained to init the RecurrentModel -> incredible slow at saving (checkpoint). 
# if we do not save it is impossible to init RecurrentModel by calling RecurrentModel.load_from_checkpoint() - def __init__(self, multilingualIndex, pretrained_embeddings, wce, batch_size=512, gpus=0, n_jobs=-1, stored_path=None): + def __init__(self, multilingualIndex, pretrained_embeddings, wce, batch_size=512, nepochs=50, + gpus=0, n_jobs=-1, stored_path=None): """ generates document embedding by means of a Gated Recurrent Units. The model can be initialized with different (multilingual/aligned) word representations (e.g., MUSE, WCE, ecc.,). @@ -162,6 +163,7 @@ class RecurrentGen(ViewGen): self.gpus = gpus self.n_jobs = n_jobs self.stored_path = stored_path + self.nepochs = nepochs # EMBEDDINGS to be deployed self.pretrained = pretrained_embeddings @@ -193,7 +195,8 @@ class RecurrentGen(ViewGen): lVocab_size=lvocab_size, learnable_length=learnable_length, drop_embedding_range=self.multilingualIndex.sup_range, - drop_embedding_prop=0.5 + drop_embedding_prop=0.5, + gpus=self.gpus ) def fit(self, lX, ly): @@ -204,8 +207,9 @@ class RecurrentGen(ViewGen): :param ly: :return: """ - recurrentDataModule = GfunDataModule(self.multilingualIndex, batchsize=self.batch_size) - trainer = Trainer(gradient_clip_val=1e-1, gpus=self.gpus, logger=self.logger, max_epochs=50, checkpoint_callback=False) + recurrentDataModule = RecurrentDataModule(self.multilingualIndex, batchsize=self.batch_size) + trainer = Trainer(gradient_clip_val=1e-1, gpus=self.gpus, logger=self.logger, max_epochs=self.nepochs, + checkpoint_callback=False) # vanilla_torch_model = torch.load( # '/home/andreapdr/funneling_pdr/checkpoint/gru_viewgen_-jrc_doclist_1958-2005vs2006_all_top300_noparallel_processed_run0.pickle') From 91666bd26331c909d687cfc0948b7d1dd5b3c0d9 Mon Sep 17 00:00:00 2001 From: andrea Date: Wed, 20 Jan 2021 12:13:51 +0100 Subject: [PATCH 12/55] Implemented custom micro and macro F1 in pl (cpu and gpu) --- refactor/main.py | 4 ++-- refactor/models/pl_gru.py | 28 ++++++++++++++++++++++------ 2 files changed, 24 insertions(+), 8 deletions(-) diff --git a/refactor/main.py b/refactor/main.py index 45487b1..bea0067 100644 --- a/refactor/main.py +++ b/refactor/main.py @@ -15,7 +15,7 @@ def main(args): _DATASET = '/home/moreo/CLESA/rcv2/rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle' EMBEDDINGS_PATH = '/home/andreapdr/gfun/embeddings' data = MultilingualDataset.load(_DATASET) - data.set_view(languages=['it'], categories=[0, 1]) + # data.set_view(languages=['it'], categories=[0, 1]) lX, ly = data.training() lXte, lyte = data.test() @@ -28,7 +28,7 @@ def main(args): # gFun = VanillaFunGen(base_learner=get_learner(calibrate=True), n_jobs=N_JOBS) # gFun = MuseGen(muse_dir='/home/andreapdr/funneling_pdr/embeddings', n_jobs=N_JOBS) # gFun = WordClassGen(n_jobs=N_JOBS) - gFun = RecurrentGen(multilingualIndex, pretrained_embeddings=lMuse, wce=False, batch_size=5, nepochs=100, + gFun = RecurrentGen(multilingualIndex, pretrained_embeddings=lMuse, wce=False, batch_size=256, nepochs=50, gpus=args.gpus, n_jobs=N_JOBS) # gFun = BertGen(multilingualIndex, gpus=args.gpus, batch_size=128, n_jobs=N_JOBS) diff --git a/refactor/models/pl_gru.py b/refactor/models/pl_gru.py index 2e3ecf1..1ed8314 100644 --- a/refactor/models/pl_gru.py +++ b/refactor/models/pl_gru.py @@ -29,10 +29,11 @@ class RecurrentModel(pl.LightningModule): self.drop_embedding_range = drop_embedding_range self.drop_embedding_prop = drop_embedding_prop self.loss = torch.nn.BCEWithLogitsLoss() - self.microf1 = F1(num_classes=output_size, multilabel=True, 
average='micro') - self.macrof1 = F1(num_classes=output_size, multilabel=True, average='macro') + # self.microf1 = F1(num_classes=output_size, multilabel=True, average='micro') + # self.macrof1 = F1(num_classes=output_size, multilabel=True, average='macro') self.accuracy = Accuracy() - self.customMetrics = CustomF1(num_classes=output_size, device=self.gpus) + self.customMicroF1 = CustomF1(num_classes=output_size, average='micro', device=self.gpus) + self.customMacroF1 = CustomF1(num_classes=output_size, average='macro', device=self.gpus) self.lPretrained_embeddings = nn.ModuleDict() self.lLearnable_embeddings = nn.ModuleDict() @@ -103,10 +104,12 @@ class RecurrentModel(pl.LightningModule): # Squashing logits through Sigmoid in order to get confidence score predictions = torch.sigmoid(logits) > 0.5 accuracy = self.accuracy(predictions, ly) - custom = self.customMetrics(predictions, ly) + microF1 = self.customMicroF1(predictions, ly) + macroF1 = self.customMacroF1(predictions, ly) self.log('train-loss', loss, on_step=True, on_epoch=True, prog_bar=False, logger=True) self.log('train-accuracy', accuracy, on_step=True, on_epoch=True, prog_bar=False, logger=True) - self.log('custom', custom, on_step=True, on_epoch=True, prog_bar=True, logger=True) + self.log('microF1', microF1, on_step=False, on_epoch=True, prog_bar=True, logger=True) + self.log('macroF1', macroF1, on_step=False, on_epoch=True, prog_bar=True, logger=True) return {'loss': loss} def validation_step(self, val_batch, batch_idx): @@ -212,4 +215,17 @@ class CustomF1(Metric): return (num / den).to(self.device) return torch.FloatTensor([1.]).to(self.device) if self.average == 'macro': - raise NotImplementedError + class_specific = [] + for i in range(self.num_classes): + class_tp = self.true_positive[i] + # class_tn = self.true_negative[i] + class_fp = self.false_positive[i] + class_fn = self.false_negative[i] + num = 2.0 * class_tp + den = 2.0 * class_tp + class_fp + class_fn + if den > 0: + class_specific.append(num / den) + else: + class_specific.append(1.) 
+ average = torch.sum(torch.Tensor(class_specific))/self.num_classes + return average.to(self.device) From 7c73aa214903f8cc76a41ef6dbec25b6912d111d Mon Sep 17 00:00:00 2001 From: andrea Date: Wed, 20 Jan 2021 12:24:55 +0100 Subject: [PATCH 13/55] Implemented custom micro and macro F1 in pl (cpu and gpu) --- refactor/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/refactor/main.py b/refactor/main.py index bea0067..e44adcd 100644 --- a/refactor/main.py +++ b/refactor/main.py @@ -28,7 +28,7 @@ def main(args): # gFun = VanillaFunGen(base_learner=get_learner(calibrate=True), n_jobs=N_JOBS) # gFun = MuseGen(muse_dir='/home/andreapdr/funneling_pdr/embeddings', n_jobs=N_JOBS) # gFun = WordClassGen(n_jobs=N_JOBS) - gFun = RecurrentGen(multilingualIndex, pretrained_embeddings=lMuse, wce=False, batch_size=256, nepochs=50, + gFun = RecurrentGen(multilingualIndex, pretrained_embeddings=lMuse, wce=False, batch_size=256, nepochs=100, gpus=args.gpus, n_jobs=N_JOBS) # gFun = BertGen(multilingualIndex, gpus=args.gpus, batch_size=128, n_jobs=N_JOBS) From a60e2cfc0952cdb99afcbcf00f196794536b0392 Mon Sep 17 00:00:00 2001 From: andrea Date: Wed, 20 Jan 2021 14:55:09 +0100 Subject: [PATCH 14/55] Implemented custom micro and macro F1 in pl (cpu and gpu) --- refactor/main.py | 4 +- refactor/models/pl_gru.py | 105 +++++++++--------------------------- refactor/util/pl_metrics.py | 71 ++++++++++++++++++++++++ 3 files changed, 97 insertions(+), 83 deletions(-) create mode 100644 refactor/util/pl_metrics.py diff --git a/refactor/main.py b/refactor/main.py index e44adcd..8791d6d 100644 --- a/refactor/main.py +++ b/refactor/main.py @@ -28,8 +28,8 @@ def main(args): # gFun = VanillaFunGen(base_learner=get_learner(calibrate=True), n_jobs=N_JOBS) # gFun = MuseGen(muse_dir='/home/andreapdr/funneling_pdr/embeddings', n_jobs=N_JOBS) # gFun = WordClassGen(n_jobs=N_JOBS) - gFun = RecurrentGen(multilingualIndex, pretrained_embeddings=lMuse, wce=False, batch_size=256, nepochs=100, - gpus=args.gpus, n_jobs=N_JOBS) + gFun = RecurrentGen(multilingualIndex, pretrained_embeddings=lMuse, wce=True, batch_size=128, + nepochs=100, gpus=args.gpus, n_jobs=N_JOBS) # gFun = BertGen(multilingualIndex, gpus=args.gpus, batch_size=128, n_jobs=N_JOBS) gFun.fit(lX, ly) diff --git a/refactor/models/pl_gru.py b/refactor/models/pl_gru.py index 1ed8314..690843d 100644 --- a/refactor/models/pl_gru.py +++ b/refactor/models/pl_gru.py @@ -5,10 +5,10 @@ from transformers import AdamW import torch.nn.functional as F from torch.autograd import Variable import pytorch_lightning as pl -from pytorch_lightning.metrics import Metric, F1, Accuracy +from pytorch_lightning.metrics import F1, Accuracy from torch.optim.lr_scheduler import StepLR from models.helpers import init_embeddings -from util.common import is_true, is_false +from util.pl_metrics import CustomF1 from util.evaluation import evaluate @@ -29,11 +29,14 @@ class RecurrentModel(pl.LightningModule): self.drop_embedding_range = drop_embedding_range self.drop_embedding_prop = drop_embedding_prop self.loss = torch.nn.BCEWithLogitsLoss() - # self.microf1 = F1(num_classes=output_size, multilabel=True, average='micro') - # self.macrof1 = F1(num_classes=output_size, multilabel=True, average='macro') + self.accuracy = Accuracy() - self.customMicroF1 = CustomF1(num_classes=output_size, average='micro', device=self.gpus) - self.customMacroF1 = CustomF1(num_classes=output_size, average='macro', device=self.gpus) + self.microF1_tr = CustomF1(num_classes=output_size, average='micro', 
device=self.gpus) + self.macroF1_tr = CustomF1(num_classes=output_size, average='macro', device=self.gpus) + self.microF1_va = CustomF1(num_classes=output_size, average='micro', device=self.gpus) + self.macroF1_va = CustomF1(num_classes=output_size, average='macro', device=self.gpus) + self.microF1_te = CustomF1(num_classes=output_size, average='micro', device=self.gpus) + self.macroF1_te = CustomF1(num_classes=output_size, average='macro', device=self.gpus) self.lPretrained_embeddings = nn.ModuleDict() self.lLearnable_embeddings = nn.ModuleDict() @@ -104,12 +107,12 @@ class RecurrentModel(pl.LightningModule): # Squashing logits through Sigmoid in order to get confidence score predictions = torch.sigmoid(logits) > 0.5 accuracy = self.accuracy(predictions, ly) - microF1 = self.customMicroF1(predictions, ly) - macroF1 = self.customMacroF1(predictions, ly) - self.log('train-loss', loss, on_step=True, on_epoch=True, prog_bar=False, logger=True) + microF1 = self.microF1_tr(predictions, ly) + macroF1 = self.macroF1_tr(predictions, ly) + self.log('train-loss', loss, on_step=True, on_epoch=True, prog_bar=False, logger=True) self.log('train-accuracy', accuracy, on_step=True, on_epoch=True, prog_bar=False, logger=True) - self.log('microF1', microF1, on_step=False, on_epoch=True, prog_bar=True, logger=True) - self.log('macroF1', macroF1, on_step=False, on_epoch=True, prog_bar=True, logger=True) + self.log('train-macroF1', macroF1, on_step=True, on_epoch=True, prog_bar=False, logger=True) + self.log('train-microF1', microF1, on_step=True, on_epoch=True, prog_bar=False, logger=True) return {'loss': loss} def validation_step(self, val_batch, batch_idx): @@ -122,8 +125,12 @@ class RecurrentModel(pl.LightningModule): loss = self.loss(logits, ly) predictions = torch.sigmoid(logits) > 0.5 accuracy = self.accuracy(predictions, ly) - self.log('val-loss', loss, on_step=True, on_epoch=True, prog_bar=False, logger=True) + microF1 = self.microF1_va(predictions, ly) + macroF1 = self.macroF1_va(predictions, ly) + self.log('val-loss', loss, on_step=True, on_epoch=True, prog_bar=False, logger=True) self.log('val-accuracy', accuracy, on_step=True, on_epoch=True, prog_bar=False, logger=True) + self.log('val-macroF1', macroF1, on_step=False, on_epoch=True, prog_bar=True, logger=True) + self.log('val-microF1', microF1, on_step=False, on_epoch=True, prog_bar=True, logger=True) return {'loss': loss} def test_step(self, test_batch, batch_idx): @@ -135,7 +142,11 @@ class RecurrentModel(pl.LightningModule): ly = torch.cat(_ly, dim=0) predictions = torch.sigmoid(logits) > 0.5 accuracy = self.accuracy(predictions, ly) - self.log('test-accuracy', accuracy, on_step=False, on_epoch=True, prog_bar=False, logger=True) + microF1 = self.microF1_te(predictions, ly) + macroF1 = self.macroF1_te(predictions, ly) + self.log('test-accuracy', accuracy, on_step=False, on_epoch=True, prog_bar=False, logger=True) + self.log('test-macroF1', macroF1, on_step=False, on_epoch=True, prog_bar=False, logger=True) + self.log('test-microF1', microF1, on_step=False, on_epoch=True, prog_bar=False, logger=True) return def embed(self, X, lang): @@ -161,71 +172,3 @@ class RecurrentModel(pl.LightningModule): optimizer = AdamW(self.parameters(), lr=1e-3) scheduler = StepLR(optimizer, step_size=25, gamma=0.5) return [optimizer], [scheduler] - - -class CustomF1(Metric): - def __init__(self, num_classes, device, average='micro'): - """ - Custom F1 metric. - Scikit learn provides a full set of evaluation metrics, but they treat special cases differently. 
- I.e., when the number of true positives, false positives, and false negatives amount to 0, all - affected metrics (precision, recall, and thus f1) output 0 in Scikit learn. - We adhere to the common practice of outputting 1 in this case since the classifier has correctly - classified all examples as negatives. - :param num_classes: - :param device: - :param average: - """ - super().__init__() - self.num_classes = num_classes - self.average = average - self.device = 'cuda' if device else 'cpu' - self.add_state('true_positive', default=torch.zeros(self.num_classes)) - self.add_state('true_negative', default=torch.zeros(self.num_classes)) - self.add_state('false_positive', default=torch.zeros(self.num_classes)) - self.add_state('false_negative', default=torch.zeros(self.num_classes)) - - def update(self, preds, target): - true_positive, true_negative, false_positive, false_negative = self._update(preds, target) - - self.true_positive += true_positive - self.true_negative += true_negative - self.false_positive += false_positive - self.false_negative += false_negative - - def _update(self, pred, target): - assert pred.shape == target.shape - # preparing preds and targets for count - true_pred = is_true(pred, self.device) - false_pred = is_false(pred, self.device) - true_target = is_true(target, self.device) - false_target = is_false(target, self.device) - - tp = torch.sum(true_pred * true_target, dim=0) - tn = torch.sum(false_pred * false_target, dim=0) - fp = torch.sum(true_pred * false_target, dim=0) - fn = torch.sum(false_pred * target, dim=0) - return tp, tn, fp, fn - - def compute(self): - if self.average == 'micro': - num = 2.0 * self.true_positive.sum() - den = 2.0 * self.true_positive.sum() + self.false_positive.sum() + self.false_negative.sum() - if den > 0: - return (num / den).to(self.device) - return torch.FloatTensor([1.]).to(self.device) - if self.average == 'macro': - class_specific = [] - for i in range(self.num_classes): - class_tp = self.true_positive[i] - # class_tn = self.true_negative[i] - class_fp = self.false_positive[i] - class_fn = self.false_negative[i] - num = 2.0 * class_tp - den = 2.0 * class_tp + class_fp + class_fn - if den > 0: - class_specific.append(num / den) - else: - class_specific.append(1.) - average = torch.sum(torch.Tensor(class_specific))/self.num_classes - return average.to(self.device) diff --git a/refactor/util/pl_metrics.py b/refactor/util/pl_metrics.py new file mode 100644 index 0000000..a54bacb --- /dev/null +++ b/refactor/util/pl_metrics.py @@ -0,0 +1,71 @@ +import torch +from pytorch_lightning.metrics import Metric +from util.common import is_false, is_true + + +class CustomF1(Metric): + def __init__(self, num_classes, device, average='micro'): + """ + Custom F1 metric. + Scikit learn provides a full set of evaluation metrics, but they treat special cases differently. + I.e., when the number of true positives, false positives, and false negatives amount to 0, all + affected metrics (precision, recall, and thus f1) output 0 in Scikit learn. + We adhere to the common practice of outputting 1 in this case since the classifier has correctly + classified all examples as negatives. 
+ :param num_classes: + :param device: + :param average: + """ + super().__init__() + self.num_classes = num_classes + self.average = average + self.device = 'cuda' if device else 'cpu' + self.add_state('true_positive', default=torch.zeros(self.num_classes)) + self.add_state('true_negative', default=torch.zeros(self.num_classes)) + self.add_state('false_positive', default=torch.zeros(self.num_classes)) + self.add_state('false_negative', default=torch.zeros(self.num_classes)) + + def update(self, preds, target): + true_positive, true_negative, false_positive, false_negative = self._update(preds, target) + + self.true_positive += true_positive + self.true_negative += true_negative + self.false_positive += false_positive + self.false_negative += false_negative + + def _update(self, pred, target): + assert pred.shape == target.shape + # preparing preds and targets for count + true_pred = is_true(pred, self.device) + false_pred = is_false(pred, self.device) + true_target = is_true(target, self.device) + false_target = is_false(target, self.device) + + tp = torch.sum(true_pred * true_target, dim=0) + tn = torch.sum(false_pred * false_target, dim=0) + fp = torch.sum(true_pred * false_target, dim=0) + fn = torch.sum(false_pred * target, dim=0) + return tp, tn, fp, fn + + def compute(self): + if self.average == 'micro': + num = 2.0 * self.true_positive.sum() + den = 2.0 * self.true_positive.sum() + self.false_positive.sum() + self.false_negative.sum() + if den > 0: + return (num / den).to(self.device) + return torch.FloatTensor([1.]).to(self.device) + if self.average == 'macro': + class_specific = [] + for i in range(self.num_classes): + class_tp = self.true_positive[i] + class_tn = self.true_negative[i] + class_fp = self.false_positive[i] + class_fn = self.false_negative[i] + num = 2.0 * class_tp + den = 2.0 * class_tp + class_fp + class_fn + if den > 0: + class_specific.append(num / den) + else: + class_specific.append(1.) 
+ average = torch.sum(torch.Tensor(class_specific))/self.num_classes + return average.to(self.device) From faa387f6965aad2f50dd83cae4c60d656a10a310 Mon Sep 17 00:00:00 2001 From: andrea Date: Wed, 20 Jan 2021 14:56:06 +0100 Subject: [PATCH 15/55] Implemented custom micro and macro F1 in pl (cpu and gpu) --- refactor/models/pl_gru.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/refactor/models/pl_gru.py b/refactor/models/pl_gru.py index 690843d..c810220 100644 --- a/refactor/models/pl_gru.py +++ b/refactor/models/pl_gru.py @@ -69,8 +69,9 @@ class RecurrentModel(pl.LightningModule): self.linear2 = nn.Linear(ff1, ff2) self.label = nn.Linear(ff2, self.output_size) - lPretrained = None # TODO: setting lPretrained to None, letting it to its original value will bug first - # validation step (i.e., checkpoint will store also its ++ value, I guess, making the saving process too slow) + # TODO: setting lPretrained to None, letting it to its original value will bug first validation + # step (i.e., checkpoint will store also its ++ value, I guess, making the saving process too slow) + lPretrained = None self.save_hyperparameters() def forward(self, lX): From d6eeabe6abd17545031fe4c1f341dc0e85cb5349 Mon Sep 17 00:00:00 2001 From: andrea Date: Wed, 20 Jan 2021 14:57:31 +0100 Subject: [PATCH 16/55] Implemented custom micro and macro F1 in pl (cpu and gpu) + various TODO --- refactor/models/pl_gru.py | 1 + 1 file changed, 1 insertion(+) diff --git a/refactor/models/pl_gru.py b/refactor/models/pl_gru.py index c810220..9883b92 100644 --- a/refactor/models/pl_gru.py +++ b/refactor/models/pl_gru.py @@ -11,6 +11,7 @@ from models.helpers import init_embeddings from util.pl_metrics import CustomF1 from util.evaluation import evaluate +# TODO: it should also be possible to compute metrics independently for each language! 
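# A self-contained sketch (not part of the patch) of the workaround applied in the hunk above:
# the heavy pretrained matrices are dropped from the constructor locals *before*
# save_hyperparameters(), so they are not serialized into every checkpoint; the embedding itself
# is kept as a regular (frozen) layer. Class name and sizes are illustrative assumptions; newer
# Lightning releases may also offer save_hyperparameters(ignore=...) for the same purpose.
import torch
from torch import nn
import pytorch_lightning as pl

class TinyModule(pl.LightningModule):
    def __init__(self, pretrained, hidden_size=16):
        super().__init__()
        # keep the heavy tensor as a proper module parameter ...
        self.embed = nn.Embedding.from_pretrained(pretrained, freeze=True)
        # ... and null the local before it gets captured as a hyperparameter
        pretrained = None
        self.save_hyperparameters()   # stores hidden_size and pretrained=None only

module = TinyModule(pretrained=torch.randn(100, 8))
print(module.hparams)   # "hidden_size": 16, "pretrained": None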
class RecurrentModel(pl.LightningModule): """ From 6ed7712979bb06f012722c9bf21671e17d8fc1b8 Mon Sep 17 00:00:00 2001 From: andrea Date: Wed, 20 Jan 2021 15:13:39 +0100 Subject: [PATCH 17/55] Implemented custom micro and macro F1 in pl (cpu and gpu) + various TODO --- refactor/data/datamodule.py | 15 +++++++---- refactor/main.py | 8 +++--- refactor/models/pl_bert.py | 50 +++++++++++++++++++++++++++++++------ refactor/models/pl_gru.py | 5 ++-- refactor/util/common.py | 2 +- refactor/view_generators.py | 8 +++--- 6 files changed, 65 insertions(+), 23 deletions(-) diff --git a/refactor/data/datamodule.py b/refactor/data/datamodule.py index 29020dc..621bee5 100644 --- a/refactor/data/datamodule.py +++ b/refactor/data/datamodule.py @@ -105,18 +105,16 @@ class RecurrentDataModule(pl.LightningDataModule): def setup(self, stage=None): if stage == 'fit' or stage is None: l_train_index, l_train_target = self.multilingualIndex.l_train() - + # Debug settings: reducing number of samples # l_train_index = {l: train[:50] for l, train in l_train_index.items()} # l_train_target = {l: target[:50] for l, target in l_train_target.items()} - self.training_dataset = RecurrentDataset(l_train_index, l_train_target, lPad_index=self.multilingualIndex.l_pad()) l_val_index, l_val_target = self.multilingualIndex.l_val() - + # Debug settings: reducing number of samples # l_val_index = {l: train[:50] for l, train in l_val_index.items()} # l_val_target = {l: target[:50] for l, target in l_val_target.items()} - self.val_dataset = RecurrentDataset(l_val_index, l_val_target, lPad_index=self.multilingualIndex.l_pad()) if stage == 'test' or stage is None: @@ -145,14 +143,21 @@ class BertDataModule(RecurrentDataModule): def setup(self, stage=None): if stage == 'fit' or stage is None: l_train_raw, l_train_target = self.multilingualIndex.l_train_raw() + # Debug settings: reducing number of samples + # l_train_raw = {l: train[:50] for l, train in l_train_raw.items()} + # l_train_target = {l: target[:50] for l, target in l_train_target.items()} l_train_index = self.tokenize(l_train_raw, max_len=self.max_len) self.training_dataset = RecurrentDataset(l_train_index, l_train_target, lPad_index=self.multilingualIndex.l_pad()) + l_val_raw, l_val_target = self.multilingualIndex.l_val_raw() + # Debug settings: reducing number of samples + # l_val_raw = {l: train[:50] for l, train in l_val_raw.items()} + # l_val_target = {l: target[:50] for l, target in l_val_target.items()} l_val_index = self.tokenize(l_val_raw, max_len=self.max_len) self.val_dataset = RecurrentDataset(l_val_index, l_val_target, lPad_index=self.multilingualIndex.l_pad()) - # TODO + if stage == 'test' or stage is None: l_test_raw, l_test_target = self.multilingualIndex.l_test_raw() l_test_index = self.tokenize(l_val_raw, max_len=self.max_len) diff --git a/refactor/main.py b/refactor/main.py index 8791d6d..eb48cb1 100644 --- a/refactor/main.py +++ b/refactor/main.py @@ -15,7 +15,7 @@ def main(args): _DATASET = '/home/moreo/CLESA/rcv2/rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle' EMBEDDINGS_PATH = '/home/andreapdr/gfun/embeddings' data = MultilingualDataset.load(_DATASET) - # data.set_view(languages=['it'], categories=[0, 1]) + data.set_view(languages=['it'], categories=[0, 1]) lX, ly = data.training() lXte, lyte = data.test() @@ -28,9 +28,9 @@ def main(args): # gFun = VanillaFunGen(base_learner=get_learner(calibrate=True), n_jobs=N_JOBS) # gFun = MuseGen(muse_dir='/home/andreapdr/funneling_pdr/embeddings', n_jobs=N_JOBS) # gFun = WordClassGen(n_jobs=N_JOBS) 
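# The commented "Debug settings" blocks in the datamodule hunks above all subsample the
# per-language dictionaries by hand. A self-contained sketch (not part of the patch) of a helper
# that could do the same in one place; the name subsample_per_language and the n=50 default are
# assumptions.
def subsample_per_language(l_data, l_target, n=50):
    """Keep only the first n samples per language (debug runs only)."""
    l_data = {lang: data[:n] for lang, data in l_data.items()}
    l_target = {lang: target[:n] for lang, target in l_target.items()}
    return l_data, l_target

# usage sketch on toy data
l_idx = {'it': list(range(1000)), 'fr': list(range(800))}
l_tgt = {'it': list(range(1000)), 'fr': list(range(800))}
l_idx, l_tgt = subsample_per_language(l_idx, l_tgt)
assert len(l_idx['it']) == 50 and len(l_tgt['fr']) == 50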
- gFun = RecurrentGen(multilingualIndex, pretrained_embeddings=lMuse, wce=True, batch_size=128, - nepochs=100, gpus=args.gpus, n_jobs=N_JOBS) - # gFun = BertGen(multilingualIndex, gpus=args.gpus, batch_size=128, n_jobs=N_JOBS) + # gFun = RecurrentGen(multilingualIndex, pretrained_embeddings=lMuse, wce=True, batch_size=128, + # nepochs=100, gpus=args.gpus, n_jobs=N_JOBS) + gFun = BertGen(multilingualIndex, batch_size=4, nepochs=10, gpus=args.gpus, n_jobs=N_JOBS) gFun.fit(lX, ly) diff --git a/refactor/models/pl_bert.py b/refactor/models/pl_bert.py index 4561004..61c2748 100644 --- a/refactor/models/pl_bert.py +++ b/refactor/models/pl_bert.py @@ -1,15 +1,25 @@ import torch import pytorch_lightning as pl from torch.optim.lr_scheduler import StepLR -from transformers import BertForSequenceClassification, BertTokenizer, AdamW, BertConfig -from pytorch_lightning.metrics import F1, Accuracy, Metric +from transformers import BertForSequenceClassification, AdamW +from pytorch_lightning.metrics import Accuracy +from util.pl_metrics import CustomF1 class BertModel(pl.LightningModule): - def __init__(self, output_size, stored_path): + def __init__(self, output_size, stored_path, gpus=None): super().__init__() self.loss = torch.nn.BCEWithLogitsLoss() + self.gpus = gpus + self.accuracy = Accuracy() + self.microF1_tr = CustomF1(num_classes=output_size, average='micro', device=self.gpus) + self.macroF1_tr = CustomF1(num_classes=output_size, average='macro', device=self.gpus) + self.microF1_va = CustomF1(num_classes=output_size, average='micro', device=self.gpus) + self.macroF1_va = CustomF1(num_classes=output_size, average='macro', device=self.gpus) + self.microF1_te = CustomF1(num_classes=output_size, average='micro', device=self.gpus) + self.macroF1_te = CustomF1(num_classes=output_size, average='macro', device=self.gpus) + if stored_path: self.bert = BertForSequenceClassification.from_pretrained(stored_path, num_labels=output_size, @@ -18,7 +28,6 @@ class BertModel(pl.LightningModule): self.bert = BertForSequenceClassification.from_pretrained('bert-base-multilingual-cased', num_labels=output_size, output_hidden_states=True) - self.accuracy = Accuracy() self.save_hyperparameters() def forward(self, X): @@ -31,11 +40,16 @@ class BertModel(pl.LightningModule): y = y.type(torch.cuda.FloatTensor) logits, _ = self.forward(X) loss = self.loss(logits, y) + # Squashing logits through Sigmoid in order to get confidence score predictions = torch.sigmoid(logits) > 0.5 accuracy = self.accuracy(predictions, y) - self.log('train-loss', loss, on_step=True, on_epoch=True, prog_bar=True, logger=True) + microF1 = self.microF1_tr(predictions, y) + macroF1 = self.macroF1_tr(predictions, y) + self.log('train-loss', loss, on_step=True, on_epoch=True, prog_bar=False, logger=True) self.log('train-accuracy', accuracy, on_step=True, on_epoch=True, prog_bar=False, logger=True) - return loss + self.log('train-macroF1', macroF1, on_step=True, on_epoch=True, prog_bar=False, logger=True) + self.log('train-microF1', microF1, on_step=True, on_epoch=True, prog_bar=False, logger=True) + return {'loss': loss} def validation_step(self, val_batch, batch_idx): X, y, _, batch_langs = val_batch @@ -45,9 +59,29 @@ class BertModel(pl.LightningModule): loss = self.loss(logits, y) predictions = torch.sigmoid(logits) > 0.5 accuracy = self.accuracy(predictions, y) - self.log('val-loss', loss, on_step=True, on_epoch=True, prog_bar=True, logger=True) + microF1 = self.microF1_va(predictions, y) + macroF1 = self.macroF1_va(predictions, y) + 
self.log('val-loss', loss, on_step=True, on_epoch=True, prog_bar=False, logger=True) self.log('val-accuracy', accuracy, on_step=True, on_epoch=True, prog_bar=False, logger=True) - return + self.log('val-macroF1', macroF1, on_step=False, on_epoch=True, prog_bar=True, logger=True) + self.log('val-microF1', microF1, on_step=False, on_epoch=True, prog_bar=True, logger=True) + return {'loss': loss} + + # def test_step(self, test_batch, batch_idx): + # lX, ly = test_batch + # logits = self.forward(lX) + # _ly = [] + # for lang in sorted(lX.keys()): + # _ly.append(ly[lang]) + # ly = torch.cat(_ly, dim=0) + # predictions = torch.sigmoid(logits) > 0.5 + # accuracy = self.accuracy(predictions, ly) + # microF1 = self.microF1_te(predictions, ly) + # macroF1 = self.macroF1_te(predictions, ly) + # self.log('test-accuracy', accuracy, on_step=False, on_epoch=True, prog_bar=False, logger=True) + # self.log('test-macroF1', macroF1, on_step=False, on_epoch=True, prog_bar=False, logger=True) + # self.log('test-microF1', microF1, on_step=False, on_epoch=True, prog_bar=False, logger=True) + # return def configure_optimizers(self, lr=3e-5, weight_decay=0.01): no_decay = ['bias', 'LayerNorm.weight'] diff --git a/refactor/models/pl_gru.py b/refactor/models/pl_gru.py index 9883b92..a0584f2 100644 --- a/refactor/models/pl_gru.py +++ b/refactor/models/pl_gru.py @@ -1,18 +1,19 @@ # Lightning modules, see https://pytorch-lightning.readthedocs.io/en/latest/lightning_module.html import torch from torch import nn -from transformers import AdamW import torch.nn.functional as F from torch.autograd import Variable +from torch.optim.lr_scheduler import StepLR +from transformers import AdamW import pytorch_lightning as pl from pytorch_lightning.metrics import F1, Accuracy -from torch.optim.lr_scheduler import StepLR from models.helpers import init_embeddings from util.pl_metrics import CustomF1 from util.evaluation import evaluate # TODO: it should also be possible to compute metrics independently for each language! 
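# The BertModel.configure_optimizers hunk above stops right after the no_decay list. A
# self-contained sketch (not part of the patch) of the standard AdamW grouping that list is
# normally used for: weight decay on weights, none on biases and LayerNorm parameters.
# torch.optim.AdamW is used here for self-containment (the file itself imports AdamW from
# transformers); TinyEncoder and its sizes are illustrative assumptions.
import torch
from torch import nn

class TinyEncoder(nn.Module):
    def __init__(self):
        super().__init__()
        self.dense = nn.Linear(8, 8)
        self.LayerNorm = nn.LayerNorm(8)   # attribute named as in the HuggingFace modules

model = TinyEncoder()
no_decay = ['bias', 'LayerNorm.weight']
grouped_params = [
    {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
     'weight_decay': 0.01},
    {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
     'weight_decay': 0.0},
]
optimizer = torch.optim.AdamW(grouped_params, lr=3e-5)
print([n for n, _ in model.named_parameters() if any(nd in n for nd in no_decay)])
# ['dense.bias', 'LayerNorm.weight', 'LayerNorm.bias']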
+ class RecurrentModel(pl.LightningModule): """ Check out for logging insight https://www.learnopencv.com/tensorboard-with-pytorch-lightning/ diff --git a/refactor/util/common.py b/refactor/util/common.py index c6f6610..d24707a 100644 --- a/refactor/util/common.py +++ b/refactor/util/common.py @@ -171,7 +171,7 @@ class MultilingualIndex: def l_test_raw(self): print('TODO: implement MultilingualIndex method to return RAW test data!') - return NotImplementedError + return {l: index.test_raw for l, index in self.l_index.items()} def l_devel_index(self): return {l: index.devel_index for l, index in self.l_index.items()} diff --git a/refactor/view_generators.py b/refactor/view_generators.py index 9ea91fa..5628da2 100644 --- a/refactor/view_generators.py +++ b/refactor/view_generators.py @@ -231,9 +231,10 @@ class RecurrentGen(ViewGen): class BertGen(ViewGen): - def __init__(self, multilingualIndex, batch_size=128, gpus=0, n_jobs=-1, stored_path=None): + def __init__(self, multilingualIndex, batch_size=128, nepochs=50, gpus=0, n_jobs=-1, stored_path=None): super().__init__() self.multilingualIndex = multilingualIndex + self.nepochs = nepochs self.gpus = gpus self.batch_size = batch_size self.n_jobs = n_jobs @@ -244,11 +245,12 @@ class BertGen(ViewGen): def _init_model(self): output_size = self.multilingualIndex.get_target_dim() - return BertModel(output_size=output_size, stored_path=self.stored_path) + return BertModel(output_size=output_size, stored_path=self.stored_path, gpus=self.gpus) def fit(self, lX, ly): bertDataModule = BertDataModule(self.multilingualIndex, batchsize=self.batch_size, max_len=512) - trainer = Trainer(default_root_dir='checkpoints/bert/', gradient_clip_val=1e-1, gpus=self.gpus, logger=self.logger) + trainer = Trainer(default_root_dir='checkpoints/bert/', gradient_clip_val=1e-1, max_epochs=self.nepochs, + gpus=self.gpus, logger=self.logger, checkpoint_callback=False) trainer.fit(self.model, bertDataModule) # trainer.test(self.model, bertDataModule) pass From 5ce1203942417d012097d0f92fdb6b98060a831a Mon Sep 17 00:00:00 2001 From: andrea Date: Thu, 21 Jan 2021 10:13:03 +0100 Subject: [PATCH 18/55] Implemented micro and macro K in pl (cpu and gpu) --- refactor/main.py | 6 +-- refactor/models/pl_gru.py | 38 ++++++++------ refactor/util/pl_metrics.py | 99 +++++++++++++++++++++++++++++++------ 3 files changed, 109 insertions(+), 34 deletions(-) diff --git a/refactor/main.py b/refactor/main.py index eb48cb1..2c88f7d 100644 --- a/refactor/main.py +++ b/refactor/main.py @@ -28,9 +28,9 @@ def main(args): # gFun = VanillaFunGen(base_learner=get_learner(calibrate=True), n_jobs=N_JOBS) # gFun = MuseGen(muse_dir='/home/andreapdr/funneling_pdr/embeddings', n_jobs=N_JOBS) # gFun = WordClassGen(n_jobs=N_JOBS) - # gFun = RecurrentGen(multilingualIndex, pretrained_embeddings=lMuse, wce=True, batch_size=128, - # nepochs=100, gpus=args.gpus, n_jobs=N_JOBS) - gFun = BertGen(multilingualIndex, batch_size=4, nepochs=10, gpus=args.gpus, n_jobs=N_JOBS) + gFun = RecurrentGen(multilingualIndex, pretrained_embeddings=lMuse, wce=True, batch_size=128, + nepochs=100, gpus=args.gpus, n_jobs=N_JOBS) + # gFun = BertGen(multilingualIndex, batch_size=4, nepochs=10, gpus=args.gpus, n_jobs=N_JOBS) gFun.fit(lX, ly) diff --git a/refactor/models/pl_gru.py b/refactor/models/pl_gru.py index a0584f2..0fe5c6a 100644 --- a/refactor/models/pl_gru.py +++ b/refactor/models/pl_gru.py @@ -6,9 +6,9 @@ from torch.autograd import Variable from torch.optim.lr_scheduler import StepLR from transformers import AdamW import 
pytorch_lightning as pl -from pytorch_lightning.metrics import F1, Accuracy +from pytorch_lightning.metrics import Accuracy from models.helpers import init_embeddings -from util.pl_metrics import CustomF1 +from util.pl_metrics import CustomF1, CustomK from util.evaluation import evaluate # TODO: it should also be possible to compute metrics independently for each language! @@ -33,12 +33,10 @@ class RecurrentModel(pl.LightningModule): self.loss = torch.nn.BCEWithLogitsLoss() self.accuracy = Accuracy() - self.microF1_tr = CustomF1(num_classes=output_size, average='micro', device=self.gpus) - self.macroF1_tr = CustomF1(num_classes=output_size, average='macro', device=self.gpus) - self.microF1_va = CustomF1(num_classes=output_size, average='micro', device=self.gpus) - self.macroF1_va = CustomF1(num_classes=output_size, average='macro', device=self.gpus) - self.microF1_te = CustomF1(num_classes=output_size, average='micro', device=self.gpus) - self.macroF1_te = CustomF1(num_classes=output_size, average='macro', device=self.gpus) + self.microF1 = CustomF1(num_classes=output_size, average='micro', device=self.gpus) + self.macroF1 = CustomF1(num_classes=output_size, average='macro', device=self.gpus) + self.microK = CustomK(num_classes=output_size, average='micro', device=self.gpus) + self.macroK = CustomK(num_classes=output_size, average='macro', device=self.gpus) self.lPretrained_embeddings = nn.ModuleDict() self.lLearnable_embeddings = nn.ModuleDict() @@ -110,12 +108,16 @@ class RecurrentModel(pl.LightningModule): # Squashing logits through Sigmoid in order to get confidence score predictions = torch.sigmoid(logits) > 0.5 accuracy = self.accuracy(predictions, ly) - microF1 = self.microF1_tr(predictions, ly) - macroF1 = self.macroF1_tr(predictions, ly) + microF1 = self.microF1(predictions, ly) + macroF1 = self.macroF1(predictions, ly) + microK = self.microK(predictions, ly) + macroK = self.macroK(predictions, ly) self.log('train-loss', loss, on_step=True, on_epoch=True, prog_bar=False, logger=True) self.log('train-accuracy', accuracy, on_step=True, on_epoch=True, prog_bar=False, logger=True) self.log('train-macroF1', macroF1, on_step=True, on_epoch=True, prog_bar=False, logger=True) self.log('train-microF1', microF1, on_step=True, on_epoch=True, prog_bar=False, logger=True) + self.log('train-macroK', macroK, on_step=True, on_epoch=True, prog_bar=False, logger=True) + self.log('train-microK', microK, on_step=True, on_epoch=True, prog_bar=False, logger=True) return {'loss': loss} def validation_step(self, val_batch, batch_idx): @@ -128,12 +130,16 @@ class RecurrentModel(pl.LightningModule): loss = self.loss(logits, ly) predictions = torch.sigmoid(logits) > 0.5 accuracy = self.accuracy(predictions, ly) - microF1 = self.microF1_va(predictions, ly) - macroF1 = self.macroF1_va(predictions, ly) - self.log('val-loss', loss, on_step=True, on_epoch=True, prog_bar=False, logger=True) - self.log('val-accuracy', accuracy, on_step=True, on_epoch=True, prog_bar=False, logger=True) + microF1 = self.microF1(predictions, ly) + macroF1 = self.macroF1(predictions, ly) + microK = self.microK(predictions, ly) + macroK = self.macroK(predictions, ly) + self.log('val-loss', loss, on_step=False, on_epoch=True, prog_bar=False, logger=True) + self.log('val-accuracy', accuracy, on_step=False, on_epoch=True, prog_bar=False, logger=True) self.log('val-macroF1', macroF1, on_step=False, on_epoch=True, prog_bar=True, logger=True) self.log('val-microF1', microF1, on_step=False, on_epoch=True, prog_bar=True, logger=True) + 
self.log('val-macroK', macroK, on_step=False, on_epoch=True, prog_bar=True, logger=True) + self.log('val-microK', microK, on_step=False, on_epoch=True, prog_bar=True, logger=True) return {'loss': loss} def test_step(self, test_batch, batch_idx): @@ -145,8 +151,8 @@ class RecurrentModel(pl.LightningModule): ly = torch.cat(_ly, dim=0) predictions = torch.sigmoid(logits) > 0.5 accuracy = self.accuracy(predictions, ly) - microF1 = self.microF1_te(predictions, ly) - macroF1 = self.macroF1_te(predictions, ly) + microF1 = self.microF1(predictions, ly) + macroF1 = self.macroF1(predictions, ly) self.log('test-accuracy', accuracy, on_step=False, on_epoch=True, prog_bar=False, logger=True) self.log('test-macroF1', macroF1, on_step=False, on_epoch=True, prog_bar=False, logger=True) self.log('test-microF1', microF1, on_step=False, on_epoch=True, prog_bar=False, logger=True) diff --git a/refactor/util/pl_metrics.py b/refactor/util/pl_metrics.py index a54bacb..6781d09 100644 --- a/refactor/util/pl_metrics.py +++ b/refactor/util/pl_metrics.py @@ -3,6 +3,21 @@ from pytorch_lightning.metrics import Metric from util.common import is_false, is_true +def _update(pred, target, device): + assert pred.shape == target.shape + # preparing preds and targets for count + true_pred = is_true(pred, device) + false_pred = is_false(pred, device) + true_target = is_true(target, device) + false_target = is_false(target, device) + + tp = torch.sum(true_pred * true_target, dim=0) + tn = torch.sum(false_pred * false_target, dim=0) + fp = torch.sum(true_pred * false_target, dim=0) + fn = torch.sum(false_pred * target, dim=0) + return tp, tn, fp, fn + + class CustomF1(Metric): def __init__(self, num_classes, device, average='micro'): """ @@ -26,27 +41,13 @@ class CustomF1(Metric): self.add_state('false_negative', default=torch.zeros(self.num_classes)) def update(self, preds, target): - true_positive, true_negative, false_positive, false_negative = self._update(preds, target) + true_positive, true_negative, false_positive, false_negative = _update(preds, target, self.device) self.true_positive += true_positive self.true_negative += true_negative self.false_positive += false_positive self.false_negative += false_negative - def _update(self, pred, target): - assert pred.shape == target.shape - # preparing preds and targets for count - true_pred = is_true(pred, self.device) - false_pred = is_false(pred, self.device) - true_target = is_true(target, self.device) - false_target = is_false(target, self.device) - - tp = torch.sum(true_pred * true_target, dim=0) - tn = torch.sum(false_pred * false_target, dim=0) - fp = torch.sum(true_pred * false_target, dim=0) - fn = torch.sum(false_pred * target, dim=0) - return tp, tn, fp, fn - def compute(self): if self.average == 'micro': num = 2.0 * self.true_positive.sum() @@ -69,3 +70,71 @@ class CustomF1(Metric): class_specific.append(1.) average = torch.sum(torch.Tensor(class_specific))/self.num_classes return average.to(self.device) + + +class CustomK(Metric): + def __init__(self, num_classes, device, average='micro'): + """ + K metric. 
https://dl.acm.org/doi/10.1145/2808194.2809449 + :param num_classes: + :param device: + :param average: + """ + super().__init__() + self.num_classes = num_classes + self.average = average + self.device = 'cuda' if device else 'cpu' + self.add_state('true_positive', default=torch.zeros(self.num_classes)) + self.add_state('true_negative', default=torch.zeros(self.num_classes)) + self.add_state('false_positive', default=torch.zeros(self.num_classes)) + self.add_state('false_negative', default=torch.zeros(self.num_classes)) + + def update(self, preds, target): + true_positive, true_negative, false_positive, false_negative = _update(preds, target, self.device) + + self.true_positive += true_positive + self.true_negative += true_negative + self.false_positive += false_positive + self.false_negative += false_negative + + def compute(self): + if self.average == 'micro': + specificity, recall = 0., 0. + absolute_negatives = self.true_negative.sum() + self.false_positive.sum() + if absolute_negatives != 0: + specificity = self.true_negative.sum()/absolute_negatives # Todo check if it is float + absolute_positives = self.true_positive.sum() + self.false_negative.sum() + if absolute_positives != 0: + recall = self.true_positive.sum()/absolute_positives # Todo check if it is float + + if absolute_positives == 0: + return 2. * specificity - 1 + elif absolute_negatives == 0: + return 2. * recall - 1 + else: + return specificity + recall - 1 + + if self.average == 'macro': + class_specific = [] + for i in range(self.num_classes): + class_tp = self.true_positive[i] + class_tn = self.true_negative[i] + class_fp = self.false_positive[i] + class_fn = self.false_negative[i] + + specificity, recall = 0., 0. + absolute_negatives = class_tn + class_fp + if absolute_negatives != 0: + specificity = class_tn / absolute_negatives # Todo check if it is float + absolute_positives = class_tp + class_fn + if absolute_positives != 0: + recall = class_tp / absolute_positives # Todo check if it is float + + if absolute_positives == 0: + class_specific.append(2. * specificity - 1) + elif absolute_negatives == 0: + class_specific.append(2. 
* recall - 1) + else: + class_specific.append(specificity + recall - 1) + average = torch.sum(torch.Tensor(class_specific)) / self.num_classes + return average.to(self.device) From 472b64ee0ebb588d037424fa2f086c2ce46e68dd Mon Sep 17 00:00:00 2001 From: andrea Date: Thu, 21 Jan 2021 15:41:56 +0100 Subject: [PATCH 19/55] Implemented metrics logging --- refactor/data/datamodule.py | 4 ++ refactor/main.py | 6 +-- refactor/models/pl_gru.py | 91 +++++++++++++++++++++++++++---------- refactor/util/common.py | 27 ++++++----- 4 files changed, 89 insertions(+), 39 deletions(-) diff --git a/refactor/data/datamodule.py b/refactor/data/datamodule.py index 621bee5..c87b0de 100644 --- a/refactor/data/datamodule.py +++ b/refactor/data/datamodule.py @@ -108,6 +108,7 @@ class RecurrentDataModule(pl.LightningDataModule): # Debug settings: reducing number of samples # l_train_index = {l: train[:50] for l, train in l_train_index.items()} # l_train_target = {l: target[:50] for l, target in l_train_target.items()} + self.training_dataset = RecurrentDataset(l_train_index, l_train_target, lPad_index=self.multilingualIndex.l_pad()) @@ -115,6 +116,7 @@ class RecurrentDataModule(pl.LightningDataModule): # Debug settings: reducing number of samples # l_val_index = {l: train[:50] for l, train in l_val_index.items()} # l_val_target = {l: target[:50] for l, target in l_val_target.items()} + self.val_dataset = RecurrentDataset(l_val_index, l_val_target, lPad_index=self.multilingualIndex.l_pad()) if stage == 'test' or stage is None: @@ -146,6 +148,7 @@ class BertDataModule(RecurrentDataModule): # Debug settings: reducing number of samples # l_train_raw = {l: train[:50] for l, train in l_train_raw.items()} # l_train_target = {l: target[:50] for l, target in l_train_target.items()} + l_train_index = self.tokenize(l_train_raw, max_len=self.max_len) self.training_dataset = RecurrentDataset(l_train_index, l_train_target, lPad_index=self.multilingualIndex.l_pad()) @@ -154,6 +157,7 @@ class BertDataModule(RecurrentDataModule): # Debug settings: reducing number of samples # l_val_raw = {l: train[:50] for l, train in l_val_raw.items()} # l_val_target = {l: target[:50] for l, target in l_val_target.items()} + l_val_index = self.tokenize(l_val_raw, max_len=self.max_len) self.val_dataset = RecurrentDataset(l_val_index, l_val_target, lPad_index=self.multilingualIndex.l_pad()) diff --git a/refactor/main.py b/refactor/main.py index 2c88f7d..a9840a1 100644 --- a/refactor/main.py +++ b/refactor/main.py @@ -15,7 +15,7 @@ def main(args): _DATASET = '/home/moreo/CLESA/rcv2/rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle' EMBEDDINGS_PATH = '/home/andreapdr/gfun/embeddings' data = MultilingualDataset.load(_DATASET) - data.set_view(languages=['it'], categories=[0, 1]) + # data.set_view(languages=['it', 'fr']) lX, ly = data.training() lXte, lyte = data.test() @@ -28,8 +28,8 @@ def main(args): # gFun = VanillaFunGen(base_learner=get_learner(calibrate=True), n_jobs=N_JOBS) # gFun = MuseGen(muse_dir='/home/andreapdr/funneling_pdr/embeddings', n_jobs=N_JOBS) # gFun = WordClassGen(n_jobs=N_JOBS) - gFun = RecurrentGen(multilingualIndex, pretrained_embeddings=lMuse, wce=True, batch_size=128, - nepochs=100, gpus=args.gpus, n_jobs=N_JOBS) + gFun = RecurrentGen(multilingualIndex, pretrained_embeddings=lMuse, wce=False, batch_size=128, + nepochs=50, gpus=args.gpus, n_jobs=N_JOBS) # gFun = BertGen(multilingualIndex, batch_size=4, nepochs=10, gpus=args.gpus, n_jobs=N_JOBS) gFun.fit(lX, ly) diff --git a/refactor/models/pl_gru.py 
b/refactor/models/pl_gru.py index 0fe5c6a..411e438 100644 --- a/refactor/models/pl_gru.py +++ b/refactor/models/pl_gru.py @@ -6,21 +6,25 @@ from torch.autograd import Variable from torch.optim.lr_scheduler import StepLR from transformers import AdamW import pytorch_lightning as pl -from pytorch_lightning.metrics import Accuracy from models.helpers import init_embeddings from util.pl_metrics import CustomF1, CustomK -from util.evaluation import evaluate - -# TODO: it should also be possible to compute metrics independently for each language! class RecurrentModel(pl.LightningModule): - """ - Check out for logging insight https://www.learnopencv.com/tensorboard-with-pytorch-lightning/ - """ - def __init__(self, lPretrained, langs, output_size, hidden_size, lVocab_size, learnable_length, drop_embedding_range, drop_embedding_prop, gpus=None): + """ + + :param lPretrained: + :param langs: + :param output_size: + :param hidden_size: + :param lVocab_size: + :param learnable_length: + :param drop_embedding_range: + :param drop_embedding_prop: + :param gpus: + """ super().__init__() self.gpus = gpus self.langs = langs @@ -32,11 +36,16 @@ class RecurrentModel(pl.LightningModule): self.drop_embedding_prop = drop_embedding_prop self.loss = torch.nn.BCEWithLogitsLoss() - self.accuracy = Accuracy() self.microF1 = CustomF1(num_classes=output_size, average='micro', device=self.gpus) self.macroF1 = CustomF1(num_classes=output_size, average='macro', device=self.gpus) self.microK = CustomK(num_classes=output_size, average='micro', device=self.gpus) self.macroK = CustomK(num_classes=output_size, average='macro', device=self.gpus) + # Language specific metrics - I am not really sure if they should be initialized + # independently or we can use the metrics init above... # TODO: check it + self.lang_macroF1 = CustomF1(num_classes=output_size, average='macro', device=self.gpus) + self.lang_microF1 = CustomF1(num_classes=output_size, average='micro', device=self.gpus) + self.lang_macroK = CustomF1(num_classes=output_size, average='macro', device=self.gpus) + self.lang_microK = CustomF1(num_classes=output_size, average='micro', device=self.gpus) self.lPretrained_embeddings = nn.ModuleDict() self.lLearnable_embeddings = nn.ModuleDict() @@ -103,22 +112,60 @@ class RecurrentModel(pl.LightningModule): _ly = [] for lang in sorted(lX.keys()): _ly.append(ly[lang]) - ly = torch.cat(_ly, dim=0) - loss = self.loss(logits, ly) + y = torch.cat(_ly, dim=0) + loss = self.loss(logits, y) # Squashing logits through Sigmoid in order to get confidence score predictions = torch.sigmoid(logits) > 0.5 - accuracy = self.accuracy(predictions, ly) - microF1 = self.microF1(predictions, ly) - macroF1 = self.macroF1(predictions, ly) - microK = self.microK(predictions, ly) - macroK = self.macroK(predictions, ly) + microF1 = self.microF1(predictions, y) + macroF1 = self.macroF1(predictions, y) + microK = self.microK(predictions, y) + macroK = self.macroK(predictions, y) self.log('train-loss', loss, on_step=True, on_epoch=True, prog_bar=False, logger=True) - self.log('train-accuracy', accuracy, on_step=True, on_epoch=True, prog_bar=False, logger=True) self.log('train-macroF1', macroF1, on_step=True, on_epoch=True, prog_bar=False, logger=True) self.log('train-microF1', microF1, on_step=True, on_epoch=True, prog_bar=False, logger=True) self.log('train-macroK', macroK, on_step=True, on_epoch=True, prog_bar=False, logger=True) self.log('train-microK', microK, on_step=True, on_epoch=True, prog_bar=False, logger=True) - return {'loss': loss} + re_lX = 
self._reconstruct_dict(predictions, ly) + return {'loss': loss, 'pred': re_lX, 'target': ly} + + def _reconstruct_dict(self, X, ly): + reconstructed = {} + _start = 0 + for lang in sorted(ly.keys()): + lang_batchsize = len(ly[lang]) + reconstructed[lang] = X[_start:_start+lang_batchsize] + _start += lang_batchsize + return reconstructed + + def training_epoch_end(self, outputs): + # outputs is a of n dicts of m elements, where n is equal to the number of epoch steps and m is batchsize. + # here we save epoch level metric values and compute them specifically for each language + res_macroF1 = {lang: [] for lang in self.langs} + res_microF1 = {lang: [] for lang in self.langs} + res_macroK = {lang: [] for lang in self.langs} + res_microK = {lang: [] for lang in self.langs} + for output in outputs: + lX, ly = output['pred'], output['target'] + for lang in lX.keys(): + X, y = lX[lang], ly[lang] + lang_macroF1 = self.lang_macroF1(X, y) + lang_microF1 = self.lang_microF1(X, y) + lang_macroK = self.lang_macroK(X, y) + lang_microK = self.lang_microK(X, y) + + res_macroF1[lang].append(lang_macroF1) + res_microF1[lang].append(lang_microF1) + res_macroK[lang].append(lang_macroK) + res_microK[lang].append(lang_microK) + for lang in self.langs: + avg_macroF1 = torch.mean(torch.Tensor(res_macroF1[lang])) + avg_microF1 = torch.mean(torch.Tensor(res_microF1[lang])) + avg_macroK = torch.mean(torch.Tensor(res_macroK[lang])) + avg_microK = torch.mean(torch.Tensor(res_microK[lang])) + self.logger.experiment.add_scalars('train-langs-macroF1', {f'{lang}': avg_macroF1}, self.current_epoch) + self.logger.experiment.add_scalars('train-langs-microF1', {f'{lang}': avg_microF1}, self.current_epoch) + self.logger.experiment.add_scalars('train-langs-macroK', {f'{lang}': avg_macroK}, self.current_epoch) + self.logger.experiment.add_scalars('train-langs-microK', {f'{lang}': avg_microK}, self.current_epoch) def validation_step(self, val_batch, batch_idx): lX, ly = val_batch @@ -129,13 +176,11 @@ class RecurrentModel(pl.LightningModule): ly = torch.cat(_ly, dim=0) loss = self.loss(logits, ly) predictions = torch.sigmoid(logits) > 0.5 - accuracy = self.accuracy(predictions, ly) microF1 = self.microF1(predictions, ly) macroF1 = self.macroF1(predictions, ly) microK = self.microK(predictions, ly) macroK = self.macroK(predictions, ly) self.log('val-loss', loss, on_step=False, on_epoch=True, prog_bar=False, logger=True) - self.log('val-accuracy', accuracy, on_step=False, on_epoch=True, prog_bar=False, logger=True) self.log('val-macroF1', macroF1, on_step=False, on_epoch=True, prog_bar=True, logger=True) self.log('val-microF1', microF1, on_step=False, on_epoch=True, prog_bar=True, logger=True) self.log('val-macroK', macroK, on_step=False, on_epoch=True, prog_bar=True, logger=True) @@ -150,12 +195,10 @@ class RecurrentModel(pl.LightningModule): _ly.append(ly[lang]) ly = torch.cat(_ly, dim=0) predictions = torch.sigmoid(logits) > 0.5 - accuracy = self.accuracy(predictions, ly) microF1 = self.microF1(predictions, ly) macroF1 = self.macroF1(predictions, ly) - self.log('test-accuracy', accuracy, on_step=False, on_epoch=True, prog_bar=False, logger=True) - self.log('test-macroF1', macroF1, on_step=False, on_epoch=True, prog_bar=False, logger=True) - self.log('test-microF1', microF1, on_step=False, on_epoch=True, prog_bar=False, logger=True) + self.log('test-macroF1', macroF1, on_step=False, on_epoch=True, prog_bar=False, logger=False) + self.log('test-microF1', microF1, on_step=False, on_epoch=True, prog_bar=False, logger=False) return 
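# forward() concatenates the per-language batches in sorted(lX.keys()) order, and
# _reconstruct_dict above undoes that concatenation by slicing with the per-language batch
# sizes. A self-contained sketch (not part of the patch) of that round trip; the toy languages
# and shapes are assumptions.
import torch

def reconstruct_dict(X, ly):
    reconstructed, start = {}, 0
    for lang in sorted(ly.keys()):
        size = len(ly[lang])
        reconstructed[lang] = X[start:start + size]
        start += size
    return reconstructed

ly = {'fr': torch.zeros(3, 5), 'it': torch.zeros(2, 5)}
logits = torch.cat([torch.zeros(3, 5), torch.ones(2, 5)], dim=0)   # 'fr' rows first, then 'it'
per_lang = reconstruct_dict(logits, ly)
assert per_lang['fr'].shape == (3, 5) and per_lang['it'].shape == (2, 5)
assert torch.all(per_lang['it'] == 1.)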
def embed(self, X, lang): diff --git a/refactor/util/common.py b/refactor/util/common.py index d24707a..f5ec1a9 100644 --- a/refactor/util/common.py +++ b/refactor/util/common.py @@ -41,7 +41,7 @@ def _normalize(lX, l2=True): def none_dict(langs): - return {l:None for l in langs} + return {l: None for l in langs} class MultilingualIndex: @@ -62,12 +62,13 @@ class MultilingualIndex: for lang in self.langs: # Init monolingual Index - self.l_index[lang] = Index(l_devel_raw[lang], l_devel_target[lang], l_test_raw[lang], l_test_target[lang], lang) + self.l_index[lang] = Index(l_devel_raw[lang], l_devel_target[lang], l_test_raw[lang], l_test_target[lang], + lang) # call to index() function of monolingual Index self.l_index[lang].index(l_pretrained_vocabulary[lang], l_analyzer[lang], l_vocabulary[lang]) def train_val_split(self, val_prop=0.2, max_val=2000, seed=42): - for l,index in self.l_index.items(): + for l, index in self.l_index.items(): index.train_val_split(val_prop, max_val, seed=seed) def embedding_matrices(self, lpretrained, supervised): @@ -97,7 +98,7 @@ class MultilingualIndex: return wordlist def get_raw_lXtr(self): - lXtr_raw = {k:[] for k in self.langs} + lXtr_raw = {k: [] for k in self.langs} lYtr_raw = {k: [] for k in self.langs} for lang in self.langs: lXtr_raw[lang] = self.l_index[lang].train_raw @@ -137,10 +138,10 @@ class MultilingualIndex: return self.l_index[self.langs[0]].devel_target.shape[1] def l_vocabsize(self): - return {l:index.vocabsize for l,index in self.l_index.items()} + return {l: index.vocabsize for l, index in self.l_index.items()} def l_embeddings(self): - return {l:index.embedding_matrix for l,index in self.l_index.items()} + return {l: index.embedding_matrix for l, index in self.l_index.items()} def l_pad(self): return {l: index.pad_index for l, index in self.l_index.items()} @@ -227,8 +228,10 @@ class Index: # index documents and keep track of test terms outside the development vocabulary that are in Muse (if available) self.out_of_vocabulary = dict() - self.devel_index = index(self.devel_raw, self.word2index, known_words, analyzer, self.unk_index, self.out_of_vocabulary) - self.test_index = index(self.test_raw, self.word2index, known_words, analyzer, self.unk_index, self.out_of_vocabulary) + self.devel_index = index(self.devel_raw, self.word2index, known_words, analyzer, self.unk_index, + self.out_of_vocabulary) + self.test_index = index(self.test_raw, self.word2index, known_words, analyzer, self.unk_index, + self.out_of_vocabulary) self.vocabsize = len(self.word2index) + len(self.out_of_vocabulary) @@ -248,7 +251,8 @@ class Index: train_test_split( devel, target, devel_raw, test_size=val_size, random_state=seed, shuffle=True) - print(f'split lang {self.lang}: train={len(self.train_index)} val={len(self.val_index)} test={len(self.test_index)}') + print( + f'split lang {self.lang}: train={len(self.train_index)} val={len(self.val_index)} test={len(self.test_index)}') def get_word_list(self): def extract_word_list(word2index): @@ -300,7 +304,7 @@ def index(data, vocab, known_words, analyzer, unk_index, out_of_vocabulary): are not in the original vocab but that are in the known_words :return: """ - indexes=[] + indexes = [] vocabsize = len(vocab) unk_count = 0 knw_count = 0 @@ -315,7 +319,7 @@ def index(data, vocab, known_words, analyzer, unk_index, out_of_vocabulary): else: if word in known_words: if word not in out_of_vocabulary: - out_of_vocabulary[word] = vocabsize+len(out_of_vocabulary) + out_of_vocabulary[word] = vocabsize + len(out_of_vocabulary) idx 
= out_of_vocabulary[word] out_count += 1 else: @@ -335,4 +339,3 @@ def is_true(tensor, device): def is_false(tensor, device): return torch.where(tensor == 0, torch.Tensor([1]).to(device), torch.Tensor([0]).to(device)) - From 4d3ef41a07068ed3c07031454056830808b88cfa Mon Sep 17 00:00:00 2001 From: andrea Date: Fri, 22 Jan 2021 16:23:38 +0100 Subject: [PATCH 20/55] Implementing inference functions --- refactor/data/datamodule.py | 5 ++-- refactor/debug_notebook.ipynb | 36 ------------------------- refactor/main.py | 15 ++++++----- refactor/models/pl_gru.py | 49 ++++++++++++++++++++++++++++++----- refactor/util/common.py | 14 ++++++++++ refactor/view_generators.py | 43 +++++++++++++++++++----------- 6 files changed, 95 insertions(+), 67 deletions(-) delete mode 100644 refactor/debug_notebook.ipynb diff --git a/refactor/data/datamodule.py b/refactor/data/datamodule.py index c87b0de..13319f7 100644 --- a/refactor/data/datamodule.py +++ b/refactor/data/datamodule.py @@ -65,9 +65,8 @@ class RecurrentDataset(Dataset): ly_batch[current_lang].append(d[1]) for lang in lX_batch.keys(): - # TODO: double check padding function (too many left pad tokens?) - lX_batch[lang] = self.pad(lX_batch[lang], pad_index=self.lPad_index[lang], max_pad_length=70) - # max_pad_length=self.define_pad_length(lX_batch[lang])) + lX_batch[lang] = self.pad(lX_batch[lang], pad_index=self.lPad_index[lang], + max_pad_length=self.define_pad_length(lX_batch[lang])) lX_batch[lang] = torch.LongTensor(lX_batch[lang]) ly_batch[lang] = torch.FloatTensor(ly_batch[lang]) diff --git a/refactor/debug_notebook.ipynb b/refactor/debug_notebook.ipynb deleted file mode 100644 index f574694..0000000 --- a/refactor/debug_notebook.ipynb +++ /dev/null @@ -1,36 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 2 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython2", - "version": "2.7.6" - } - }, - "nbformat": 4, - "nbformat_minor": 0 -} \ No newline at end of file diff --git a/refactor/main.py b/refactor/main.py index a9840a1..ec2dc60 100644 --- a/refactor/main.py +++ b/refactor/main.py @@ -3,6 +3,7 @@ from util.embeddings_manager import MuseLoader from view_generators import RecurrentGen, BertGen from data.dataset_builder import MultilingualDataset from util.common import MultilingualIndex +from time import time def main(args): @@ -21,23 +22,23 @@ def main(args): # Init multilingualIndex - mandatory when deploying Neural View Generators... 
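# The collate change above replaces the fixed max_pad_length=70 with a per-batch value. The
# helpers added later in this patch (define_pad_length and pad in refactor/util/common.py)
# derive that value from the batch length statistics (mean plus one standard deviation), then
# left-pad short documents and truncate long ones. A self-contained sketch (not part of the
# patch) of that behaviour; the toy index lists and pad_index=0 are assumptions.
import numpy as np

def define_pad_length(index_list):
    lengths = [len(index) for index in index_list]
    return int(np.mean(lengths) + np.std(lengths))

def pad(index_list, pad_index, max_pad_length=None):
    pad_length = np.max([len(index) for index in index_list])
    if max_pad_length is not None:
        pad_length = min(pad_length, max_pad_length)
    return [[pad_index] * (pad_length - len(idx)) + idx[:pad_length] for idx in index_list]

batch = [[1, 2, 3], [4, 5, 6, 7, 8, 9, 10], [11, 12]]
pad_len = define_pad_length(batch)              # mean=4, std~2.16 -> 6
print(pad(batch, pad_index=0, max_pad_length=pad_len))
# [[0, 0, 0, 1, 2, 3], [4, 5, 6, 7, 8, 9], [0, 0, 0, 0, 11, 12]]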
multilingualIndex = MultilingualIndex() - # lMuse = MuseLoader(langs=sorted(lX.keys()), cache=) lMuse = MuseLoader(langs=sorted(lX.keys()), cache=EMBEDDINGS_PATH) multilingualIndex.index(lX, ly, lXte, lyte, l_pretrained_vocabulary=lMuse.vocabulary()) # gFun = VanillaFunGen(base_learner=get_learner(calibrate=True), n_jobs=N_JOBS) # gFun = MuseGen(muse_dir='/home/andreapdr/funneling_pdr/embeddings', n_jobs=N_JOBS) # gFun = WordClassGen(n_jobs=N_JOBS) - gFun = RecurrentGen(multilingualIndex, pretrained_embeddings=lMuse, wce=False, batch_size=128, + gFun = RecurrentGen(multilingualIndex, pretrained_embeddings=lMuse, wce=False, batch_size=256, nepochs=50, gpus=args.gpus, n_jobs=N_JOBS) # gFun = BertGen(multilingualIndex, batch_size=4, nepochs=10, gpus=args.gpus, n_jobs=N_JOBS) - gFun.fit(lX, ly) + time_init = time() + # gFun.fit(lX, ly) - # print('Projecting...') - # y_ = gFun.transform(lX) - - exit('Executed!') + print('Projecting...') + y_ = gFun.transform(lX) + train_time = round(time() - time_init, 3) + exit(f'Executed! Training time: {train_time}!') if __name__ == '__main__': diff --git a/refactor/models/pl_gru.py b/refactor/models/pl_gru.py index 411e438..8e474dc 100644 --- a/refactor/models/pl_gru.py +++ b/refactor/models/pl_gru.py @@ -8,6 +8,7 @@ from transformers import AdamW import pytorch_lightning as pl from models.helpers import init_embeddings from util.pl_metrics import CustomF1, CustomK +from util.common import define_pad_length, pad class RecurrentModel(pl.LightningModule): @@ -78,17 +79,17 @@ class RecurrentModel(pl.LightningModule): self.linear2 = nn.Linear(ff1, ff2) self.label = nn.Linear(ff2, self.output_size) - # TODO: setting lPretrained to None, letting it to its original value will bug first validation + # TODO: setting lPretrained to None, letting it to its original value will "bug" first validation # step (i.e., checkpoint will store also its ++ value, I guess, making the saving process too slow) lPretrained = None self.save_hyperparameters() def forward(self, lX): - _tmp = [] + l_embed = [] for lang in sorted(lX.keys()): doc_embedding = self.transform(lX[lang], lang) - _tmp.append(doc_embedding) - embed = torch.cat(_tmp, dim=0) + l_embed.append(doc_embedding) + embed = torch.cat(l_embed, dim=0) logits = self.label(embed) return logits @@ -106,6 +107,37 @@ class RecurrentModel(pl.LightningModule): output = self.dropout(F.relu(self.linear2(output))) return output + def encode(self, lX, l_pad, batch_size=128): + """ + Returns encoded data (i.e, RNN hidden state at second feed-forward layer - linear1). Dimensionality is 512. 
+ :param lX: + :return: + """ + l_embed = {lang: [] for lang in lX.keys()} + for lang in sorted(lX.keys()): + for i in range(0, len(lX[lang]), batch_size): + if i+batch_size > len(lX[lang]): + batch = lX[lang][i:len(lX[lang])] + else: + batch = lX[lang][i:i+batch_size] + max_pad_len = define_pad_length(batch) + batch = pad(batch, pad_index=l_pad[lang], max_pad_length=max_pad_len) + X = torch.LongTensor(batch) + _batch_size = X.shape[0] + X = self.embed(X, lang) + X = self.embedding_dropout(X, drop_range=self.drop_embedding_range, p_drop=self.drop_embedding_prop, + training=self.training) + X = X.permute(1, 0, 2) + h_0 = Variable(torch.zeros(self.n_layers * self.n_directions, _batch_size, self.hidden_size).to(self.device)) + output, _ = self.rnn(X, h_0) + output = output[-1, :, :] + output = F.relu(self.linear0(output)) + output = self.dropout(F.relu(self.linear1(output))) + l_embed[lang].append(output) + for k, v in l_embed.items(): + l_embed[k] = torch.cat(v, dim=0) + return l_embed + def training_step(self, train_batch, batch_idx): lX, ly = train_batch logits = self.forward(lX) @@ -140,6 +172,7 @@ class RecurrentModel(pl.LightningModule): def training_epoch_end(self, outputs): # outputs is a of n dicts of m elements, where n is equal to the number of epoch steps and m is batchsize. # here we save epoch level metric values and compute them specifically for each language + # TODO: this is horrible... res_macroF1 = {lang: [] for lang in self.langs} res_microF1 = {lang: [] for lang in self.langs} res_macroK = {lang: [] for lang in self.langs} @@ -197,8 +230,12 @@ class RecurrentModel(pl.LightningModule): predictions = torch.sigmoid(logits) > 0.5 microF1 = self.microF1(predictions, ly) macroF1 = self.macroF1(predictions, ly) - self.log('test-macroF1', macroF1, on_step=False, on_epoch=True, prog_bar=False, logger=False) - self.log('test-microF1', microF1, on_step=False, on_epoch=True, prog_bar=False, logger=False) + microK = self.microK(predictions, ly) + macroK = self.macroK(predictions, ly) + self.log('test-macroF1', macroF1, on_step=False, on_epoch=True, prog_bar=False, logger=True) + self.log('test-microF1', microF1, on_step=False, on_epoch=True, prog_bar=False, logger=True) + self.log('test-macroK', macroK, on_step=False, on_epoch=True, prog_bar=True, logger=True) + self.log('test-microK', microK, on_step=False, on_epoch=True, prog_bar=True, logger=True) return def embed(self, X, lang): diff --git a/refactor/util/common.py b/refactor/util/common.py index f5ec1a9..88e4630 100644 --- a/refactor/util/common.py +++ b/refactor/util/common.py @@ -339,3 +339,17 @@ def is_true(tensor, device): def is_false(tensor, device): return torch.where(tensor == 0, torch.Tensor([1]).to(device), torch.Tensor([0]).to(device)) + + +def define_pad_length(index_list): + lengths = [len(index) for index in index_list] + return int(np.mean(lengths) + np.std(lengths)) + + +def pad(index_list, pad_index, max_pad_length=None): + pad_length = np.max([len(index) for index in index_list]) + if max_pad_length is not None: + pad_length = min(pad_length, max_pad_length) + for i, indexes in enumerate(index_list): + index_list[i] = [pad_index] * (pad_length - len(indexes)) + indexes[:pad_length] + return index_list \ No newline at end of file diff --git a/refactor/view_generators.py b/refactor/view_generators.py index 5628da2..a403368 100644 --- a/refactor/view_generators.py +++ b/refactor/view_generators.py @@ -20,11 +20,10 @@ from util.embeddings_manager import MuseLoader, XdotM, wce_matrix from util.common import 
TfidfVectorizerMultilingual, _normalize from models.pl_gru import RecurrentModel from models.pl_bert import BertModel -from models.lstm_class import RNNMultilingualClassifier from pytorch_lightning import Trainer from data.datamodule import RecurrentDataModule, BertDataModule -from pytorch_lightning.loggers import TensorBoardLogger -import torch +from pytorch_lightning.loggers import TensorBoardLogger, CSVLogger +from time import time class ViewGen(ABC): @@ -172,9 +171,8 @@ class RecurrentGen(ViewGen): self.multilingualIndex.train_val_split(val_prop=0.2, max_val=2000, seed=1) self.multilingualIndex.embedding_matrices(self.pretrained, supervised=self.wce) self.model = self._init_model() - # hp_tuning with Tensorboard: check https://www.tensorflow.org/tensorboard/hyperparameter_tuning_with_hparams - # however, setting it to False at the moment! - self.logger = TensorBoardLogger(save_dir='tb_logs', name='gfun_rnn_dev', default_hp_metric=False) + self.logger = TensorBoardLogger(save_dir='tb_logs', name='rnn_dev', default_hp_metric=False) + # self.logger = CSVLogger(save_dir='csv_logs', name='rnn_dev') def _init_model(self): if self.stored_path: @@ -201,7 +199,7 @@ class RecurrentGen(ViewGen): def fit(self, lX, ly): """ - lX and ly are not directly used. We rather get them from the multilingual index used in the instatiation + lX and ly are not directly used. We rather get them from the multilingual index used in the instantiation of the Dataset object (RecurrentDataset) in the GfunDataModule class. :param lX: :param ly: @@ -223,7 +221,20 @@ class RecurrentGen(ViewGen): return self def transform(self, lX): - pass + """ + Project documents to the common latent space + :param lX: + :return: + """ + l_pad = self.multilingualIndex.l_pad() + data = self.multilingualIndex.l_devel_index() + # trainer = Trainer(gpus=self.gpus) + # self.model.eval() + time_init = time() + l_embeds = self.model.encode(data, l_pad, batch_size=256) + transform_time = round(time() - time_init, 3) + print(f'Executed! Transform took: {transform_time}') + return l_embeds def fit_transform(self, lX, ly): pass @@ -239,26 +250,28 @@ class BertGen(ViewGen): self.batch_size = batch_size self.n_jobs = n_jobs self.stored_path = stored_path - self.logger = TensorBoardLogger(save_dir='tb_logs', name='bert_dev', default_hp_metric=False) self.model = self._init_model() - self.multilingualIndex.train_val_split(val_prop=0.2, max_val=2000, seed=1) + self.logger = TensorBoardLogger(save_dir='tb_logs', name='bert_dev', default_hp_metric=False) def _init_model(self): output_size = self.multilingualIndex.get_target_dim() return BertModel(output_size=output_size, stored_path=self.stored_path, gpus=self.gpus) def fit(self, lX, ly): + self.multilingualIndex.train_val_split(val_prop=0.2, max_val=2000, seed=1) bertDataModule = BertDataModule(self.multilingualIndex, batchsize=self.batch_size, max_len=512) - trainer = Trainer(default_root_dir='checkpoints/bert/', gradient_clip_val=1e-1, max_epochs=self.nepochs, - gpus=self.gpus, logger=self.logger, checkpoint_callback=False) - trainer.fit(self.model, bertDataModule) - # trainer.test(self.model, bertDataModule) - pass + trainer = Trainer(gradient_clip_val=1e-1, max_epochs=self.nepochs, gpus=self.gpus, + logger=self.logger, checkpoint_callback=False) + trainer.fit(self.model, datamodule=bertDataModule) + trainer.test(self.model, datamodule=bertDataModule) + return self def transform(self, lX): + # lX is raw text data. It has to be first indexed via multilingualIndex Vectorizer. 
pass def fit_transform(self, lX, ly): + # we can assume that we have already indexed data for transform() since we are first calling fit() pass From 9af9347531d019bad5a691ff90a869da483c7af8 Mon Sep 17 00:00:00 2001 From: andrea Date: Fri, 22 Jan 2021 16:54:56 +0100 Subject: [PATCH 21/55] Implementing inference functions --- refactor/models/pl_gru.py | 52 ++++++++++++++++++++----------------- refactor/view_generators.py | 2 +- 2 files changed, 29 insertions(+), 25 deletions(-) diff --git a/refactor/models/pl_gru.py b/refactor/models/pl_gru.py index 8e474dc..c81f959 100644 --- a/refactor/models/pl_gru.py +++ b/refactor/models/pl_gru.py @@ -110,33 +110,37 @@ class RecurrentModel(pl.LightningModule): def encode(self, lX, l_pad, batch_size=128): """ Returns encoded data (i.e, RNN hidden state at second feed-forward layer - linear1). Dimensionality is 512. + # TODO: does not run on gpu.. :param lX: + :param l_pad: + :param batch_size: :return: """ - l_embed = {lang: [] for lang in lX.keys()} - for lang in sorted(lX.keys()): - for i in range(0, len(lX[lang]), batch_size): - if i+batch_size > len(lX[lang]): - batch = lX[lang][i:len(lX[lang])] - else: - batch = lX[lang][i:i+batch_size] - max_pad_len = define_pad_length(batch) - batch = pad(batch, pad_index=l_pad[lang], max_pad_length=max_pad_len) - X = torch.LongTensor(batch) - _batch_size = X.shape[0] - X = self.embed(X, lang) - X = self.embedding_dropout(X, drop_range=self.drop_embedding_range, p_drop=self.drop_embedding_prop, - training=self.training) - X = X.permute(1, 0, 2) - h_0 = Variable(torch.zeros(self.n_layers * self.n_directions, _batch_size, self.hidden_size).to(self.device)) - output, _ = self.rnn(X, h_0) - output = output[-1, :, :] - output = F.relu(self.linear0(output)) - output = self.dropout(F.relu(self.linear1(output))) - l_embed[lang].append(output) - for k, v in l_embed.items(): - l_embed[k] = torch.cat(v, dim=0) - return l_embed + with torch.no_grad(): + l_embed = {lang: [] for lang in lX.keys()} + for lang in sorted(lX.keys()): + for i in range(0, len(lX[lang]), batch_size): + if i+batch_size > len(lX[lang]): + batch = lX[lang][i:len(lX[lang])] + else: + batch = lX[lang][i:i+batch_size] + max_pad_len = define_pad_length(batch) + batch = pad(batch, pad_index=l_pad[lang], max_pad_length=max_pad_len) + X = torch.LongTensor(batch) + _batch_size = X.shape[0] + X = self.embed(X, lang) + X = self.embedding_dropout(X, drop_range=self.drop_embedding_range, p_drop=self.drop_embedding_prop, + training=self.training) + X = X.permute(1, 0, 2) + h_0 = Variable(torch.zeros(self.n_layers * self.n_directions, _batch_size, self.hidden_size).to(self.device)) + output, _ = self.rnn(X, h_0) + output = output[-1, :, :] + output = F.relu(self.linear0(output)) + output = self.dropout(F.relu(self.linear1(output))) + l_embed[lang].append(output) + for k, v in l_embed.items(): + l_embed[k] = torch.cat(v, dim=0).cpu().numpy() + return l_embed def training_step(self, train_batch, batch_idx): lX, ly = train_batch diff --git a/refactor/view_generators.py b/refactor/view_generators.py index a403368..d5f7ce8 100644 --- a/refactor/view_generators.py +++ b/refactor/view_generators.py @@ -229,7 +229,7 @@ class RecurrentGen(ViewGen): l_pad = self.multilingualIndex.l_pad() data = self.multilingualIndex.l_devel_index() # trainer = Trainer(gpus=self.gpus) - # self.model.eval() + self.model.eval() time_init = time() l_embeds = self.model.encode(data, l_pad, batch_size=256) transform_time = round(time() - time_init, 3) From 01bd85d15659c49a56540931a8a90c3052fda46f 
Mon Sep 17 00:00:00 2001 From: andrea Date: Fri, 22 Jan 2021 18:00:41 +0100 Subject: [PATCH 22/55] Implementing inference functions --- refactor/main.py | 2 +- refactor/models/pl_gru.py | 2 +- refactor/view_generators.py | 1 + 3 files changed, 3 insertions(+), 2 deletions(-) diff --git a/refactor/main.py b/refactor/main.py index ec2dc60..610defe 100644 --- a/refactor/main.py +++ b/refactor/main.py @@ -16,7 +16,7 @@ def main(args): _DATASET = '/home/moreo/CLESA/rcv2/rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle' EMBEDDINGS_PATH = '/home/andreapdr/gfun/embeddings' data = MultilingualDataset.load(_DATASET) - # data.set_view(languages=['it', 'fr']) + data.set_view(languages=['it', 'fr']) lX, ly = data.training() lXte, lyte = data.test() diff --git a/refactor/models/pl_gru.py b/refactor/models/pl_gru.py index c81f959..ed70e80 100644 --- a/refactor/models/pl_gru.py +++ b/refactor/models/pl_gru.py @@ -126,7 +126,7 @@ class RecurrentModel(pl.LightningModule): batch = lX[lang][i:i+batch_size] max_pad_len = define_pad_length(batch) batch = pad(batch, pad_index=l_pad[lang], max_pad_length=max_pad_len) - X = torch.LongTensor(batch) + X = torch.LongTensor(batch).to('cuda' if self.gpus else 'cpu') _batch_size = X.shape[0] X = self.embed(X, lang) X = self.embedding_dropout(X, drop_range=self.drop_embedding_range, p_drop=self.drop_embedding_prop, diff --git a/refactor/view_generators.py b/refactor/view_generators.py index d5f7ce8..8f1f191 100644 --- a/refactor/view_generators.py +++ b/refactor/view_generators.py @@ -229,6 +229,7 @@ class RecurrentGen(ViewGen): l_pad = self.multilingualIndex.l_pad() data = self.multilingualIndex.l_devel_index() # trainer = Trainer(gpus=self.gpus) + self.model.to('cuda' if self.gpus else 'cpu') self.model.eval() time_init = time() l_embeds = self.model.encode(data, l_pad, batch_size=256) From 6e0b66e13e20d1cfa1e2bf8141acb93970d78320 Mon Sep 17 00:00:00 2001 From: andrea Date: Mon, 25 Jan 2021 12:28:58 +0100 Subject: [PATCH 23/55] Implemented inference functions for bert (cpu and gpu) --- refactor/data/datamodule.py | 10 +-- refactor/main.py | 13 ++-- refactor/models/pl_bert.py | 150 ++++++++++++++++++++++++++---------- refactor/models/pl_gru.py | 2 +- refactor/util/common.py | 10 ++- refactor/view_generators.py | 10 ++- 6 files changed, 135 insertions(+), 60 deletions(-) diff --git a/refactor/data/datamodule.py b/refactor/data/datamodule.py index 13319f7..c80fc84 100644 --- a/refactor/data/datamodule.py +++ b/refactor/data/datamodule.py @@ -105,16 +105,16 @@ class RecurrentDataModule(pl.LightningDataModule): if stage == 'fit' or stage is None: l_train_index, l_train_target = self.multilingualIndex.l_train() # Debug settings: reducing number of samples - # l_train_index = {l: train[:50] for l, train in l_train_index.items()} - # l_train_target = {l: target[:50] for l, target in l_train_target.items()} + l_train_index = {l: train[:50] for l, train in l_train_index.items()} + l_train_target = {l: target[:50] for l, target in l_train_target.items()} self.training_dataset = RecurrentDataset(l_train_index, l_train_target, lPad_index=self.multilingualIndex.l_pad()) l_val_index, l_val_target = self.multilingualIndex.l_val() # Debug settings: reducing number of samples - # l_val_index = {l: train[:50] for l, train in l_val_index.items()} - # l_val_target = {l: target[:50] for l, target in l_val_target.items()} + l_val_index = {l: train[:50] for l, train in l_val_index.items()} + l_val_target = {l: target[:50] for l, target in l_val_target.items()} self.val_dataset = 
RecurrentDataset(l_val_index, l_val_target, lPad_index=self.multilingualIndex.l_pad()) @@ -163,7 +163,7 @@ class BertDataModule(RecurrentDataModule): if stage == 'test' or stage is None: l_test_raw, l_test_target = self.multilingualIndex.l_test_raw() - l_test_index = self.tokenize(l_val_raw, max_len=self.max_len) + l_test_index = self.tokenize(l_test_raw, max_len=self.max_len) self.test_dataset = RecurrentDataset(l_test_index, l_test_target, lPad_index=self.multilingualIndex.l_pad()) diff --git a/refactor/main.py b/refactor/main.py index 610defe..bb71bd1 100644 --- a/refactor/main.py +++ b/refactor/main.py @@ -28,15 +28,16 @@ def main(args): # gFun = VanillaFunGen(base_learner=get_learner(calibrate=True), n_jobs=N_JOBS) # gFun = MuseGen(muse_dir='/home/andreapdr/funneling_pdr/embeddings', n_jobs=N_JOBS) # gFun = WordClassGen(n_jobs=N_JOBS) - gFun = RecurrentGen(multilingualIndex, pretrained_embeddings=lMuse, wce=False, batch_size=256, - nepochs=50, gpus=args.gpus, n_jobs=N_JOBS) - # gFun = BertGen(multilingualIndex, batch_size=4, nepochs=10, gpus=args.gpus, n_jobs=N_JOBS) + # gFun = RecurrentGen(multilingualIndex, pretrained_embeddings=lMuse, wce=False, batch_size=256, + # nepochs=50, gpus=args.gpus, n_jobs=N_JOBS) + gFun = BertGen(multilingualIndex, batch_size=4, nepochs=1, gpus=args.gpus, n_jobs=N_JOBS) time_init = time() - # gFun.fit(lX, ly) + gFun.fit(lX, ly) + + # print('Projecting...') + # y_ = gFun.transform(lX) - print('Projecting...') - y_ = gFun.transform(lX) train_time = round(time() - time_init, 3) exit(f'Executed! Training time: {train_time}!') diff --git a/refactor/models/pl_bert.py b/refactor/models/pl_bert.py index 61c2748..7503a47 100644 --- a/refactor/models/pl_bert.py +++ b/refactor/models/pl_bert.py @@ -2,23 +2,31 @@ import torch import pytorch_lightning as pl from torch.optim.lr_scheduler import StepLR from transformers import BertForSequenceClassification, AdamW -from pytorch_lightning.metrics import Accuracy -from util.pl_metrics import CustomF1 +from util.pl_metrics import CustomF1, CustomK class BertModel(pl.LightningModule): def __init__(self, output_size, stored_path, gpus=None): + """ + Init Bert model. + :param output_size: + :param stored_path: + :param gpus: + """ super().__init__() self.loss = torch.nn.BCEWithLogitsLoss() self.gpus = gpus - self.accuracy = Accuracy() - self.microF1_tr = CustomF1(num_classes=output_size, average='micro', device=self.gpus) - self.macroF1_tr = CustomF1(num_classes=output_size, average='macro', device=self.gpus) - self.microF1_va = CustomF1(num_classes=output_size, average='micro', device=self.gpus) - self.macroF1_va = CustomF1(num_classes=output_size, average='macro', device=self.gpus) - self.microF1_te = CustomF1(num_classes=output_size, average='micro', device=self.gpus) - self.macroF1_te = CustomF1(num_classes=output_size, average='macro', device=self.gpus) + self.microF1 = CustomF1(num_classes=output_size, average='micro', device=self.gpus) + self.macroF1 = CustomF1(num_classes=output_size, average='macro', device=self.gpus) + self.microK = CustomK(num_classes=output_size, average='micro', device=self.gpus) + self.macroK = CustomK(num_classes=output_size, average='macro', device=self.gpus) + # Language specific metrics - I am not really sure if they should be initialized + # independently or we can use the metrics init above... 
# TODO: check it + self.lang_macroF1 = CustomF1(num_classes=output_size, average='macro', device=self.gpus) + self.lang_microF1 = CustomF1(num_classes=output_size, average='micro', device=self.gpus) + self.lang_macroK = CustomF1(num_classes=output_size, average='macro', device=self.gpus) + self.lang_microK = CustomF1(num_classes=output_size, average='micro', device=self.gpus) if stored_path: self.bert = BertForSequenceClassification.from_pretrained(stored_path, @@ -37,51 +45,111 @@ class BertModel(pl.LightningModule): def training_step(self, train_batch, batch_idx): X, y, _, batch_langs = train_batch X = torch.cat(X).view([X[0].shape[0], len(X)]) - y = y.type(torch.cuda.FloatTensor) + # y = y.type(torch.cuda.FloatTensor) + y = y.type(torch.FloatTensor) + y.to('cuda' if self.gpus else 'cpu') logits, _ = self.forward(X) loss = self.loss(logits, y) # Squashing logits through Sigmoid in order to get confidence score predictions = torch.sigmoid(logits) > 0.5 - accuracy = self.accuracy(predictions, y) - microF1 = self.microF1_tr(predictions, y) - macroF1 = self.macroF1_tr(predictions, y) - self.log('train-loss', loss, on_step=True, on_epoch=True, prog_bar=False, logger=True) - self.log('train-accuracy', accuracy, on_step=True, on_epoch=True, prog_bar=False, logger=True) - self.log('train-macroF1', macroF1, on_step=True, on_epoch=True, prog_bar=False, logger=True) - self.log('train-microF1', microF1, on_step=True, on_epoch=True, prog_bar=False, logger=True) - return {'loss': loss} + microF1 = self.microF1(predictions, y) + macroF1 = self.macroF1(predictions, y) + microK = self.microK(predictions, y) + macroK = self.macroK(predictions, y) + self.log('train-loss', loss, on_step=True, on_epoch=True, prog_bar=False, logger=True) + self.log('train-macroF1', macroF1, on_step=True, on_epoch=True, prog_bar=False, logger=True) + self.log('train-microF1', microF1, on_step=True, on_epoch=True, prog_bar=False, logger=True) + self.log('train-macroK', macroK, on_step=True, on_epoch=True, prog_bar=False, logger=True) + self.log('train-microK', microK, on_step=True, on_epoch=True, prog_bar=False, logger=True) + lX, ly = self._reconstruct_dict(predictions, y, batch_langs) + return {'loss': loss, 'pred': lX, 'target': ly} + + def _reconstruct_dict(self, predictions, y, batch_langs): + reconstructed_x = {lang: [] for lang in set(batch_langs)} + reconstructed_y = {lang: [] for lang in set(batch_langs)} + for i, pred in enumerate(predictions): + reconstructed_x[batch_langs[i]].append(pred) + reconstructed_y[batch_langs[i]].append(y[i]) + for k, v in reconstructed_x.items(): + reconstructed_x[k] = torch.cat(v).view(-1, predictions.shape[1]) + for k, v in reconstructed_y.items(): + reconstructed_y[k] = torch.cat(v).view(-1, predictions.shape[1]) + return reconstructed_x, reconstructed_y + + def training_epoch_end(self, outputs): + langs = [] + for output in outputs: + langs.extend(list(output['pred'].keys())) + langs = set(langs) + # outputs is a of n dicts of m elements, where n is equal to the number of epoch steps and m is batchsize. + # here we save epoch level metric values and compute them specifically for each language + # TODO: this is horrible... 
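# What follows aggregates the language-specific metrics: for every training step the per-language
# predictions/targets returned by training_step are scored with the lang_* metric objects, the
# per-step values are averaged over the epoch, and the averages are logged to TensorBoard as one
# scalar group per metric, keyed by language.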
+ res_macroF1 = {lang: [] for lang in langs} + res_microF1 = {lang: [] for lang in langs} + res_macroK = {lang: [] for lang in langs} + res_microK = {lang: [] for lang in langs} + for output in outputs: + lX, ly = output['pred'], output['target'] + for lang in lX.keys(): + X, y = lX[lang], ly[lang] + lang_macroF1 = self.lang_macroF1(X, y) + lang_microF1 = self.lang_microF1(X, y) + lang_macroK = self.lang_macroK(X, y) + lang_microK = self.lang_microK(X, y) + + res_macroF1[lang].append(lang_macroF1) + res_microF1[lang].append(lang_microF1) + res_macroK[lang].append(lang_macroK) + res_microK[lang].append(lang_microK) + for lang in langs: + avg_macroF1 = torch.mean(torch.Tensor(res_macroF1[lang])) + avg_microF1 = torch.mean(torch.Tensor(res_microF1[lang])) + avg_macroK = torch.mean(torch.Tensor(res_macroK[lang])) + avg_microK = torch.mean(torch.Tensor(res_microK[lang])) + self.logger.experiment.add_scalars('train-langs-macroF1', {f'{lang}': avg_macroF1}, self.current_epoch) + self.logger.experiment.add_scalars('train-langs-microF1', {f'{lang}': avg_microF1}, self.current_epoch) + self.logger.experiment.add_scalars('train-langs-macroK', {f'{lang}': avg_macroK}, self.current_epoch) + self.logger.experiment.add_scalars('train-langs-microK', {f'{lang}': avg_microK}, self.current_epoch) def validation_step(self, val_batch, batch_idx): X, y, _, batch_langs = val_batch X = torch.cat(X).view([X[0].shape[0], len(X)]) - y = y.type(torch.cuda.FloatTensor) + # y = y.type(torch.cuda.FloatTensor) + y = y.type(torch.FloatTensor) + y.to('cuda' if self.gpus else 'cpu') logits, _ = self.forward(X) loss = self.loss(logits, y) predictions = torch.sigmoid(logits) > 0.5 - accuracy = self.accuracy(predictions, y) - microF1 = self.microF1_va(predictions, y) - macroF1 = self.macroF1_va(predictions, y) - self.log('val-loss', loss, on_step=True, on_epoch=True, prog_bar=False, logger=True) - self.log('val-accuracy', accuracy, on_step=True, on_epoch=True, prog_bar=False, logger=True) - self.log('val-macroF1', macroF1, on_step=False, on_epoch=True, prog_bar=True, logger=True) - self.log('val-microF1', microF1, on_step=False, on_epoch=True, prog_bar=True, logger=True) + microF1 = self.microF1(predictions, y) + macroF1 = self.macroF1(predictions, y) + microK = self.microK(predictions, y) + macroK = self.macroK(predictions, y) + self.log('val-loss', loss, on_step=False, on_epoch=True, prog_bar=False, logger=True) + self.log('val-macroF1', macroF1, on_step=False, on_epoch=True, prog_bar=True, logger=True) + self.log('val-microF1', microF1, on_step=False, on_epoch=True, prog_bar=True, logger=True) + self.log('val-macroK', macroK, on_step=False, on_epoch=True, prog_bar=True, logger=True) + self.log('val-microK', microK, on_step=False, on_epoch=True, prog_bar=True, logger=True) return {'loss': loss} - # def test_step(self, test_batch, batch_idx): - # lX, ly = test_batch - # logits = self.forward(lX) - # _ly = [] - # for lang in sorted(lX.keys()): - # _ly.append(ly[lang]) - # ly = torch.cat(_ly, dim=0) - # predictions = torch.sigmoid(logits) > 0.5 - # accuracy = self.accuracy(predictions, ly) - # microF1 = self.microF1_te(predictions, ly) - # macroF1 = self.macroF1_te(predictions, ly) - # self.log('test-accuracy', accuracy, on_step=False, on_epoch=True, prog_bar=False, logger=True) - # self.log('test-macroF1', macroF1, on_step=False, on_epoch=True, prog_bar=False, logger=True) - # self.log('test-microF1', microF1, on_step=False, on_epoch=True, prog_bar=False, logger=True) - # return + def test_step(self, test_batch, batch_idx): + 
X, y, _, batch_langs = test_batch + X = torch.cat(X).view([X[0].shape[0], len(X)]) + # y = y.type(torch.cuda.FloatTensor) + y = y.type(torch.FloatTensor) + y.to('cuda' if self.gpus else 'cpu') + logits, _ = self.forward(X) + loss = self.loss(logits, y) + # Squashing logits through Sigmoid in order to get confidence score + predictions = torch.sigmoid(logits) > 0.5 + microF1 = self.microF1(predictions, y) + macroF1 = self.macroF1(predictions, y) + microK = self.microK(predictions, y) + macroK = self.macroK(predictions, y) + self.log('test-macroF1', macroF1, on_step=False, on_epoch=True, prog_bar=False, logger=True) + self.log('test-microF1', microF1, on_step=False, on_epoch=True, prog_bar=False, logger=True) + self.log('test-macroK', macroK, on_step=False, on_epoch=True, prog_bar=True, logger=True) + self.log('test-microK', microK, on_step=False, on_epoch=True, prog_bar=True, logger=True) + return def configure_optimizers(self, lr=3e-5, weight_decay=0.01): no_decay = ['bias', 'LayerNorm.weight'] diff --git a/refactor/models/pl_gru.py b/refactor/models/pl_gru.py index ed70e80..ad3cc99 100644 --- a/refactor/models/pl_gru.py +++ b/refactor/models/pl_gru.py @@ -15,7 +15,7 @@ class RecurrentModel(pl.LightningModule): def __init__(self, lPretrained, langs, output_size, hidden_size, lVocab_size, learnable_length, drop_embedding_range, drop_embedding_prop, gpus=None): """ - + Init RNN model. :param lPretrained: :param langs: :param output_size: diff --git a/refactor/util/common.py b/refactor/util/common.py index 88e4630..56ca47d 100644 --- a/refactor/util/common.py +++ b/refactor/util/common.py @@ -161,6 +161,9 @@ class MultilingualIndex: def l_val_raw_index(self): return {l: index.val_raw for l, index in self.l_index.items()} + def l_test_raw_index(self): + return {l: index.test_raw for l, index in self.l_index.items()} + def l_val_target(self): return {l: index.val_target for l, index in self.l_index.items()} @@ -170,10 +173,6 @@ class MultilingualIndex: def l_test_index(self): return {l: index.test_index for l, index in self.l_index.items()} - def l_test_raw(self): - print('TODO: implement MultilingualIndex method to return RAW test data!') - return {l: index.test_raw for l, index in self.l_index.items()} - def l_devel_index(self): return {l: index.devel_index for l, index in self.l_index.items()} @@ -195,6 +194,9 @@ class MultilingualIndex: def l_val_raw(self): return self.l_val_raw_index(), self.l_val_target() + def l_test_raw(self): + return self.l_test_raw_index(), self.l_test_target() + def get_l_pad_index(self): return {l: index.get_pad_index() for l, index in self.l_index.items()} diff --git a/refactor/view_generators.py b/refactor/view_generators.py index 8f1f191..d228653 100644 --- a/refactor/view_generators.py +++ b/refactor/view_generators.py @@ -228,7 +228,6 @@ class RecurrentGen(ViewGen): """ l_pad = self.multilingualIndex.l_pad() data = self.multilingualIndex.l_devel_index() - # trainer = Trainer(gpus=self.gpus) self.model.to('cuda' if self.gpus else 'cpu') self.model.eval() time_init = time() @@ -238,7 +237,7 @@ class RecurrentGen(ViewGen): return l_embeds def fit_transform(self, lX, ly): - pass + return self.fit(lX, ly).transform(lX) class BertGen(ViewGen): @@ -268,7 +267,12 @@ class BertGen(ViewGen): return self def transform(self, lX): - # lX is raw text data. It has to be first indexed via multilingualIndex Vectorizer. + # lX is raw text data. It has to be first indexed via Bert Tokenizer. 
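# 'TOKENIZE THIS' below is a placeholder: the raw documents would first have to go through the
# multilingual Bert tokenizer, mirroring BertDataModule.tokenize. A sketch of what that step
# could look like (not yet wired in here):
#   tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
#   data = {lang: tokenizer(docs, truncation=True, max_length=512,
#                           padding='max_length')['input_ids']
#           for lang, docs in lX.items()}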
+ data = 'TOKENIZE THIS' + self.model.to('cuda' if self.gpus else 'cpu') + self.model.eval() + time_init = time() + l_emebds = self.model.encode(data) pass def fit_transform(self, lX, ly): From ae0ea1e68c56fe4e37466b90905c493277a0e926 Mon Sep 17 00:00:00 2001 From: andrea Date: Mon, 25 Jan 2021 12:48:02 +0100 Subject: [PATCH 24/55] Implemented inference functions for bert (cpu and gpu) --- refactor/models/pl_bert.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/refactor/models/pl_bert.py b/refactor/models/pl_bert.py index 7503a47..965690c 100644 --- a/refactor/models/pl_bert.py +++ b/refactor/models/pl_bert.py @@ -47,7 +47,7 @@ class BertModel(pl.LightningModule): X = torch.cat(X).view([X[0].shape[0], len(X)]) # y = y.type(torch.cuda.FloatTensor) y = y.type(torch.FloatTensor) - y.to('cuda' if self.gpus else 'cpu') + y = y.to('cuda' if self.gpus else 'cpu') logits, _ = self.forward(X) loss = self.loss(logits, y) # Squashing logits through Sigmoid in order to get confidence score @@ -116,7 +116,7 @@ class BertModel(pl.LightningModule): X = torch.cat(X).view([X[0].shape[0], len(X)]) # y = y.type(torch.cuda.FloatTensor) y = y.type(torch.FloatTensor) - y.to('cuda' if self.gpus else 'cpu') + y = y.to('cuda' if self.gpus else 'cpu') logits, _ = self.forward(X) loss = self.loss(logits, y) predictions = torch.sigmoid(logits) > 0.5 @@ -136,7 +136,7 @@ class BertModel(pl.LightningModule): X = torch.cat(X).view([X[0].shape[0], len(X)]) # y = y.type(torch.cuda.FloatTensor) y = y.type(torch.FloatTensor) - y.to('cuda' if self.gpus else 'cpu') + y = y.to('cuda' if self.gpus else 'cpu') logits, _ = self.forward(X) loss = self.loss(logits, y) # Squashing logits through Sigmoid in order to get confidence score From 0b54864514809ff709bf02bc49a6d5d69356009a Mon Sep 17 00:00:00 2001 From: andrea Date: Mon, 25 Jan 2021 16:20:57 +0100 Subject: [PATCH 25/55] Implemented funnelling architecture --- refactor/data/datamodule.py | 24 ++++++--- refactor/funnelling.py | 95 +++++++++++++++++++++++++++++++++++ refactor/main.py | 44 +++++++++++----- refactor/models/learners.py | 39 +++++++++++++- refactor/models/pl_bert.py | 28 +++++------ refactor/models/pl_gru.py | 19 +++---- refactor/util/common.py | 4 +- refactor/util/standardizer.py | 36 +++++++++++++ refactor/view_generators.py | 20 +++++--- 9 files changed, 254 insertions(+), 55 deletions(-) create mode 100644 refactor/funnelling.py create mode 100644 refactor/util/standardizer.py diff --git a/refactor/data/datamodule.py b/refactor/data/datamodule.py index c80fc84..7329f08 100644 --- a/refactor/data/datamodule.py +++ b/refactor/data/datamodule.py @@ -105,21 +105,25 @@ class RecurrentDataModule(pl.LightningDataModule): if stage == 'fit' or stage is None: l_train_index, l_train_target = self.multilingualIndex.l_train() # Debug settings: reducing number of samples - l_train_index = {l: train[:50] for l, train in l_train_index.items()} - l_train_target = {l: target[:50] for l, target in l_train_target.items()} + l_train_index = {l: train[:5] for l, train in l_train_index.items()} + l_train_target = {l: target[:5] for l, target in l_train_target.items()} self.training_dataset = RecurrentDataset(l_train_index, l_train_target, lPad_index=self.multilingualIndex.l_pad()) l_val_index, l_val_target = self.multilingualIndex.l_val() # Debug settings: reducing number of samples - l_val_index = {l: train[:50] for l, train in l_val_index.items()} - l_val_target = {l: target[:50] for l, target in l_val_target.items()} + l_val_index = {l: train[:5] for l, 
train in l_val_index.items()} + l_val_target = {l: target[:5] for l, target in l_val_target.items()} self.val_dataset = RecurrentDataset(l_val_index, l_val_target, lPad_index=self.multilingualIndex.l_pad()) if stage == 'test' or stage is None: l_test_index, l_test_target = self.multilingualIndex.l_test() + # Debug settings: reducing number of samples + l_test_index = {l: train[:5] for l, train in l_test_index.items()} + l_test_target = {l: target[:5] for l, target in l_test_target.items()} + self.test_dataset = RecurrentDataset(l_test_index, l_test_target, lPad_index=self.multilingualIndex.l_pad()) @@ -145,8 +149,8 @@ class BertDataModule(RecurrentDataModule): if stage == 'fit' or stage is None: l_train_raw, l_train_target = self.multilingualIndex.l_train_raw() # Debug settings: reducing number of samples - # l_train_raw = {l: train[:50] for l, train in l_train_raw.items()} - # l_train_target = {l: target[:50] for l, target in l_train_target.items()} + l_train_raw = {l: train[:5] for l, train in l_train_raw.items()} + l_train_target = {l: target[:5] for l, target in l_train_target.items()} l_train_index = self.tokenize(l_train_raw, max_len=self.max_len) self.training_dataset = RecurrentDataset(l_train_index, l_train_target, @@ -154,8 +158,8 @@ class BertDataModule(RecurrentDataModule): l_val_raw, l_val_target = self.multilingualIndex.l_val_raw() # Debug settings: reducing number of samples - # l_val_raw = {l: train[:50] for l, train in l_val_raw.items()} - # l_val_target = {l: target[:50] for l, target in l_val_target.items()} + l_val_raw = {l: train[:5] for l, train in l_val_raw.items()} + l_val_target = {l: target[:5] for l, target in l_val_target.items()} l_val_index = self.tokenize(l_val_raw, max_len=self.max_len) self.val_dataset = RecurrentDataset(l_val_index, l_val_target, @@ -163,6 +167,10 @@ class BertDataModule(RecurrentDataModule): if stage == 'test' or stage is None: l_test_raw, l_test_target = self.multilingualIndex.l_test_raw() + # Debug settings: reducing number of samples + l_test_raw = {l: train[:5] for l, train in l_test_raw.items()} + l_test_target = {l: target[:5] for l, target in l_test_target.items()} + l_test_index = self.tokenize(l_test_raw, max_len=self.max_len) self.test_dataset = RecurrentDataset(l_test_index, l_test_target, lPad_index=self.multilingualIndex.l_pad()) diff --git a/refactor/funnelling.py b/refactor/funnelling.py new file mode 100644 index 0000000..33fcce3 --- /dev/null +++ b/refactor/funnelling.py @@ -0,0 +1,95 @@ +from models.learners import * +from view_generators import VanillaFunGen +from util.common import _normalize + + +class DocEmbedderList: + def __init__(self, embedder_list, probabilistic=True): + """ + Class that takes care of calling fit and transform function for every init embedder. + :param embedder_list: list of embedders to be deployed + :param probabilistic: whether to recast view generators output to vectors of posterior probabilities or not + """ + assert len(embedder_list) != 0, 'Embedder list cannot be empty!' 
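# When 'probabilistic' is True, every view generator except VanillaFunGen (whose output is already
# a space of posterior probabilities) gets wrapped in FeatureSet2Posteriors below, so that all
# first-tier views hand the meta-classifier L2-normalized vectors of posterior probabilities of
# the same dimensionality (one score per class), which can then be averaged in transform().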
+ self.embedders = embedder_list + self.probabilistic = probabilistic + if probabilistic: + _tmp = [] + for embedder in self.embedders: + if isinstance(embedder, VanillaFunGen): + _tmp.append(embedder) + else: + _tmp.append(FeatureSet2Posteriors(embedder)) + self.embedders = _tmp + + def fit(self, lX, ly): + for embedder in self.embedders: + embedder.fit(lX, ly) + return self + + def transform(self, lX): + langs = sorted(lX.keys()) + lZparts = {lang: None for lang in langs} + + for embedder in self.embedders: + lZ = embedder.transform(lX) + for lang in langs: + Z = lZ[lang] + if lZparts[lang] is None: + lZparts[lang] = Z + else: + lZparts[lang] += Z + n_embedders = len(self.embedders) + return {lang: lZparts[lang]/n_embedders for lang in langs} + + def fit_transform(self, lX, ly): + return self.fit(lX, ly).transform(lX) + + +class FeatureSet2Posteriors: + def __init__(self, embedder, l2=True, n_jobs=-1): + self.embedder = embedder + self.l2 = l2 + self.n_jobs = n_jobs + self.prob_classifier = MetaClassifier( + SVC(kernel='rbf', gamma='auto', probability=True, cache_size=1000, random_state=1), n_jobs=n_jobs) + + def fit(self, lX, ly): + lZ = self.embedder.fit_transform(lX, ly) + self.prob_classifier.fit(lZ, ly) + return self + + def transform(self, lX): + lP = self.predict_proba(lX) + lP = _normalize(lP, self.l2) + return lP + + def fit_transform(self, lX, ly): + return self.fit(lX, ly).transform(lX) + + def predict(self, lX): + lZ = self.embedder.transform(lX) + return self.prob_classifier.predict(lZ) + + def predict_proba(self, lX): + lZ = self.embedder.transform(lX) + return self.prob_classifier.predict_proba(lZ) + + +class Funnelling: + def __init__(self, first_tier: DocEmbedderList, n_jobs=-1): + self.first_tier = first_tier + self.meta = MetaClassifier( + SVC(kernel='rbf', gamma='auto', probability=True, cache_size=1000, random_state=1), n_jobs=n_jobs) + self.n_jobs = n_jobs + + def fit(self, lX, ly): + print('## Fitting first-tier learners!') + lZ = self.first_tier.fit_transform(lX, ly) + print('## Fitting meta-learner!') + self.meta.fit(lZ, ly) + + def predict(self, lX): + lZ = self.first_tier.transform(lX) + ly = self.meta.predict(lZ) + return ly diff --git a/refactor/main.py b/refactor/main.py index bb71bd1..2ccb0b2 100644 --- a/refactor/main.py +++ b/refactor/main.py @@ -1,8 +1,9 @@ from argparse import ArgumentParser -from util.embeddings_manager import MuseLoader -from view_generators import RecurrentGen, BertGen +from funnelling import * +from view_generators import * from data.dataset_builder import MultilingualDataset from util.common import MultilingualIndex +from util.evaluation import evaluate from time import time @@ -25,21 +26,38 @@ def main(args): lMuse = MuseLoader(langs=sorted(lX.keys()), cache=EMBEDDINGS_PATH) multilingualIndex.index(lX, ly, lXte, lyte, l_pretrained_vocabulary=lMuse.vocabulary()) - # gFun = VanillaFunGen(base_learner=get_learner(calibrate=True), n_jobs=N_JOBS) - # gFun = MuseGen(muse_dir='/home/andreapdr/funneling_pdr/embeddings', n_jobs=N_JOBS) - # gFun = WordClassGen(n_jobs=N_JOBS) - # gFun = RecurrentGen(multilingualIndex, pretrained_embeddings=lMuse, wce=False, batch_size=256, - # nepochs=50, gpus=args.gpus, n_jobs=N_JOBS) - gFun = BertGen(multilingualIndex, batch_size=4, nepochs=1, gpus=args.gpus, n_jobs=N_JOBS) + # posteriorEmbedder = VanillaFunGen(base_learner=get_learner(calibrate=True), n_jobs=N_JOBS) + museEmbedder = MuseGen(muse_dir=EMBEDDINGS_PATH, n_jobs=N_JOBS) + wceEmbedder = WordClassGen(n_jobs=N_JOBS) + # rnnEmbedder = 
RecurrentGen(multilingualIndex, pretrained_embeddings=lMuse, wce=False, batch_size=256, + # nepochs=250, gpus=args.gpus, n_jobs=N_JOBS) + # bertEmbedder = BertGen(multilingualIndex, batch_size=4, nepochs=1, gpus=args.gpus, n_jobs=N_JOBS) + docEmbedders = DocEmbedderList([museEmbedder, wceEmbedder]) + + gfun = Funnelling(first_tier=docEmbedders) + + # Training --------------------------------------- + print('\n[Training Generalized Funnelling]') time_init = time() - gFun.fit(lX, ly) + time_tr = time() + gfun.fit(lX, ly) + time_tr = round(time() - time_tr, 3) + print(f'Training completed in {time_tr} seconds!') - # print('Projecting...') - # y_ = gFun.transform(lX) + # Testing ---------------------------------------- + print('\n[Testing Generalized Funnelling]') + time_te = time() + ly_ = gfun.predict(lXte) - train_time = round(time() - time_init, 3) - exit(f'Executed! Training time: {train_time}!') + l_eval = evaluate(ly_true=ly, ly_pred=ly_) + print(l_eval) + + time_te = round(time() - time_te, 3) + print(f'Testing completed in {time_te} seconds!') + + overall_time = round(time() - time_init, 3) + exit(f'\nExecuted in: {overall_time } seconds!') if __name__ == '__main__': diff --git a/refactor/models/learners.py b/refactor/models/learners.py index fcd4249..1c60072 100644 --- a/refactor/models/learners.py +++ b/refactor/models/learners.py @@ -5,6 +5,7 @@ from sklearn.multiclass import OneVsRestClassifier from sklearn.model_selection import GridSearchCV from sklearn.svm import SVC from joblib import Parallel, delayed +from util.standardizer import StandardizeTransformer def get_learner(calibrate=False, kernel='linear', C=1): @@ -156,7 +157,6 @@ class MonolingualClassifier: self.model = GridSearchCV(self.model, param_grid=self.parameters, refit=True, cv=5, n_jobs=self.n_jobs, error_score=0, verbose=10) - # print(f'fitting: {self.model} on matrices of shape X={X.shape} Y={y.shape}') print(f'fitting: Mono-lingual Classifier on matrices of shape X={X.shape} Y={y.shape}') self.model.fit(X, y) if isinstance(self.model, GridSearchCV): @@ -183,3 +183,40 @@ class MonolingualClassifier: def best_params(self): return self.best_params_ + + +class MetaClassifier: + + def __init__(self, meta_learner, meta_parameters=None, n_jobs=-1, standardize_range=None): + self.n_jobs = n_jobs + self.model = MonolingualClassifier(base_learner=meta_learner, parameters=meta_parameters, n_jobs=n_jobs) + self.standardize_range = standardize_range + + def fit(self, lZ, ly): + tinit = time.time() + Z, y = self.stack(lZ, ly) + + self.standardizer = StandardizeTransformer(range=self.standardize_range) + Z = self.standardizer.fit_transform(Z) + + print('fitting the Z-space of shape={}'.format(Z.shape)) + self.model.fit(Z, y) + self.time = time.time() - tinit + + def stack(self, lZ, ly=None): + langs = list(lZ.keys()) + Z = np.vstack([lZ[lang] for lang in langs]) + if ly is not None: + y = np.vstack([ly[lang] for lang in langs]) + return Z, y + else: + return Z + + def predict(self, lZ): + lZ = _joblib_transform_multiling(self.standardizer.transform, lZ, n_jobs=self.n_jobs) + return _joblib_transform_multiling(self.model.predict, lZ, n_jobs=self.n_jobs) + + def predict_proba(self, lZ): + lZ = _joblib_transform_multiling(self.standardizer.transform, lZ, n_jobs=self.n_jobs) + return _joblib_transform_multiling(self.model.predict_proba, lZ, n_jobs=self.n_jobs) + diff --git a/refactor/models/pl_bert.py b/refactor/models/pl_bert.py index 965690c..c19f455 100644 --- a/refactor/models/pl_bert.py +++ b/refactor/models/pl_bert.py @@ 
-45,7 +45,6 @@ class BertModel(pl.LightningModule): def training_step(self, train_batch, batch_idx): X, y, _, batch_langs = train_batch X = torch.cat(X).view([X[0].shape[0], len(X)]) - # y = y.type(torch.cuda.FloatTensor) y = y.type(torch.FloatTensor) y = y.to('cuda' if self.gpus else 'cpu') logits, _ = self.forward(X) @@ -64,18 +63,6 @@ class BertModel(pl.LightningModule): lX, ly = self._reconstruct_dict(predictions, y, batch_langs) return {'loss': loss, 'pred': lX, 'target': ly} - def _reconstruct_dict(self, predictions, y, batch_langs): - reconstructed_x = {lang: [] for lang in set(batch_langs)} - reconstructed_y = {lang: [] for lang in set(batch_langs)} - for i, pred in enumerate(predictions): - reconstructed_x[batch_langs[i]].append(pred) - reconstructed_y[batch_langs[i]].append(y[i]) - for k, v in reconstructed_x.items(): - reconstructed_x[k] = torch.cat(v).view(-1, predictions.shape[1]) - for k, v in reconstructed_y.items(): - reconstructed_y[k] = torch.cat(v).view(-1, predictions.shape[1]) - return reconstructed_x, reconstructed_y - def training_epoch_end(self, outputs): langs = [] for output in outputs: @@ -114,7 +101,6 @@ class BertModel(pl.LightningModule): def validation_step(self, val_batch, batch_idx): X, y, _, batch_langs = val_batch X = torch.cat(X).view([X[0].shape[0], len(X)]) - # y = y.type(torch.cuda.FloatTensor) y = y.type(torch.FloatTensor) y = y.to('cuda' if self.gpus else 'cpu') logits, _ = self.forward(X) @@ -134,7 +120,6 @@ class BertModel(pl.LightningModule): def test_step(self, test_batch, batch_idx): X, y, _, batch_langs = test_batch X = torch.cat(X).view([X[0].shape[0], len(X)]) - # y = y.type(torch.cuda.FloatTensor) y = y.type(torch.FloatTensor) y = y.to('cuda' if self.gpus else 'cpu') logits, _ = self.forward(X) @@ -164,3 +149,16 @@ class BertModel(pl.LightningModule): optimizer = AdamW(optimizer_grouped_parameters, lr=lr) scheduler = StepLR(optimizer, step_size=25, gamma=0.1) return [optimizer], [scheduler] + + @staticmethod + def _reconstruct_dict(predictions, y, batch_langs): + reconstructed_x = {lang: [] for lang in set(batch_langs)} + reconstructed_y = {lang: [] for lang in set(batch_langs)} + for i, pred in enumerate(predictions): + reconstructed_x[batch_langs[i]].append(pred) + reconstructed_y[batch_langs[i]].append(y[i]) + for k, v in reconstructed_x.items(): + reconstructed_x[k] = torch.cat(v).view(-1, predictions.shape[1]) + for k, v in reconstructed_y.items(): + reconstructed_y[k] = torch.cat(v).view(-1, predictions.shape[1]) + return reconstructed_x, reconstructed_y diff --git a/refactor/models/pl_gru.py b/refactor/models/pl_gru.py index ad3cc99..a13990c 100644 --- a/refactor/models/pl_gru.py +++ b/refactor/models/pl_gru.py @@ -164,15 +164,6 @@ class RecurrentModel(pl.LightningModule): re_lX = self._reconstruct_dict(predictions, ly) return {'loss': loss, 'pred': re_lX, 'target': ly} - def _reconstruct_dict(self, X, ly): - reconstructed = {} - _start = 0 - for lang in sorted(ly.keys()): - lang_batchsize = len(ly[lang]) - reconstructed[lang] = X[_start:_start+lang_batchsize] - _start += lang_batchsize - return reconstructed - def training_epoch_end(self, outputs): # outputs is a of n dicts of m elements, where n is equal to the number of epoch steps and m is batchsize. 
# here we save epoch level metric values and compute them specifically for each language @@ -265,3 +256,13 @@ class RecurrentModel(pl.LightningModule): optimizer = AdamW(self.parameters(), lr=1e-3) scheduler = StepLR(optimizer, step_size=25, gamma=0.5) return [optimizer], [scheduler] + + @staticmethod + def _reconstruct_dict(X, ly): + reconstructed = {} + _start = 0 + for lang in sorted(ly.keys()): + lang_batchsize = len(ly[lang]) + reconstructed[lang] = X[_start:_start+lang_batchsize] + _start += lang_batchsize + return reconstructed diff --git a/refactor/util/common.py b/refactor/util/common.py index 56ca47d..1b84d60 100644 --- a/refactor/util/common.py +++ b/refactor/util/common.py @@ -311,8 +311,8 @@ def index(data, vocab, known_words, analyzer, unk_index, out_of_vocabulary): unk_count = 0 knw_count = 0 out_count = 0 - pbar = tqdm(data, desc=f'indexing') - for text in pbar: + # pbar = tqdm(data, desc=f'indexing') + for text in data: words = analyzer(text) index = [] for word in words: diff --git a/refactor/util/standardizer.py b/refactor/util/standardizer.py new file mode 100644 index 0000000..429bccd --- /dev/null +++ b/refactor/util/standardizer.py @@ -0,0 +1,36 @@ +import numpy as np + + +class StandardizeTransformer: + def __init__(self, axis=0, range=None): + """ + + :param axis: + :param range: + """ + assert range is None or isinstance(range, slice), 'wrong format for range, should either be None or a slice' + self.axis = axis + self.yetfit = False + self.range = range + + def fit(self, X): + print('Applying z-score standardization...') + std=np.std(X, axis=self.axis, ddof=1) + self.std = np.clip(std, 1e-5, None) + self.mean = np.mean(X, axis=self.axis) + if self.range is not None: + ones = np.ones_like(self.std) + zeros = np.zeros_like(self.mean) + ones[self.range] = self.std[self.range] + zeros[self.range] = self.mean[self.range] + self.std = ones + self.mean = zeros + self.yetfit=True + return self + + def transform(self, X): + if not self.yetfit: 'transform called before fit' + return (X - self.mean) / self.std + + def fit_transform(self, X): + return self.fit(X).transform(X) \ No newline at end of file diff --git a/refactor/view_generators.py b/refactor/view_generators.py index d228653..3b3d811 100644 --- a/refactor/view_generators.py +++ b/refactor/view_generators.py @@ -55,6 +55,7 @@ class VanillaFunGen(ViewGen): self.vectorizer = TfidfVectorizerMultilingual(sublinear_tf=True, use_idf=True) def fit(self, lX, lY): + print('# Fitting VanillaFunGen...') lX = self.vectorizer.fit_transform(lX) self.doc_projector.fit(lX, lY) return self @@ -84,6 +85,7 @@ class MuseGen(ViewGen): self.vectorizer = TfidfVectorizerMultilingual(sublinear_tf=True, use_idf=True) def fit(self, lX, ly): + print('# Fitting MuseGen...') self.vectorizer.fit(lX) self.langs = sorted(lX.keys()) self.lMuse = MuseLoader(langs=self.langs, cache=self.muse_dir) @@ -105,7 +107,6 @@ class MuseGen(ViewGen): class WordClassGen(ViewGen): - def __init__(self, n_jobs=-1): """ generates document representation via Word-Class-Embeddings. 
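The StandardizeTransformer introduced above (util/standardizer.py) is what MetaClassifier now uses to z-score the stacked Z-space before fitting the meta SVM. A minimal usage sketch, with toy arrays standing in for the real stacked first-tier outputs:

import numpy as np
from util.standardizer import StandardizeTransformer

Z_train = np.random.rand(100, 10)   # stacked first-tier outputs (documents x classes), illustrative only
Z_test = np.random.rand(20, 10)

standardizer = StandardizeTransformer(axis=0)       # column-wise z-scoring
Z_train_std = standardizer.fit_transform(Z_train)   # mean/std estimated on the training Z-space
Z_test_std = standardizer.transform(Z_test)         # reuse the training statistics at prediction time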
@@ -119,6 +120,7 @@ class WordClassGen(ViewGen): self.vectorizer = TfidfVectorizerMultilingual(sublinear_tf=True, use_idf=True) def fit(self, lX, ly): + print('# Fitting WordClassGen...') lX = self.vectorizer.fit_transform(lX) self.langs = sorted(lX.keys()) wce = Parallel(n_jobs=self.n_jobs)( @@ -171,7 +173,7 @@ class RecurrentGen(ViewGen): self.multilingualIndex.train_val_split(val_prop=0.2, max_val=2000, seed=1) self.multilingualIndex.embedding_matrices(self.pretrained, supervised=self.wce) self.model = self._init_model() - self.logger = TensorBoardLogger(save_dir='tb_logs', name='rnn_dev', default_hp_metric=False) + self.logger = TensorBoardLogger(save_dir='tb_logs', name='rnn', default_hp_metric=False) # self.logger = CSVLogger(save_dir='csv_logs', name='rnn_dev') def _init_model(self): @@ -205,6 +207,7 @@ class RecurrentGen(ViewGen): :param ly: :return: """ + print('# Fitting RecurrentGen...') recurrentDataModule = RecurrentDataModule(self.multilingualIndex, batchsize=self.batch_size) trainer = Trainer(gradient_clip_val=1e-1, gpus=self.gpus, logger=self.logger, max_epochs=self.nepochs, checkpoint_callback=False) @@ -241,7 +244,6 @@ class RecurrentGen(ViewGen): class BertGen(ViewGen): - def __init__(self, multilingualIndex, batch_size=128, nepochs=50, gpus=0, n_jobs=-1, stored_path=None): super().__init__() self.multilingualIndex = multilingualIndex @@ -251,13 +253,14 @@ class BertGen(ViewGen): self.n_jobs = n_jobs self.stored_path = stored_path self.model = self._init_model() - self.logger = TensorBoardLogger(save_dir='tb_logs', name='bert_dev', default_hp_metric=False) + self.logger = TensorBoardLogger(save_dir='tb_logs', name='bert', default_hp_metric=False) def _init_model(self): output_size = self.multilingualIndex.get_target_dim() return BertModel(output_size=output_size, stored_path=self.stored_path, gpus=self.gpus) def fit(self, lX, ly): + print('# Fitting BertGen...') self.multilingualIndex.train_val_split(val_prop=0.2, max_val=2000, seed=1) bertDataModule = BertDataModule(self.multilingualIndex, batchsize=self.batch_size, max_len=512) trainer = Trainer(gradient_clip_val=1e-1, max_epochs=self.nepochs, gpus=self.gpus, @@ -272,11 +275,14 @@ class BertGen(ViewGen): self.model.to('cuda' if self.gpus else 'cpu') self.model.eval() time_init = time() - l_emebds = self.model.encode(data) - pass + l_emebds = self.model.encode(data) # TODO + transform_time = round(time() - time_init, 3) + print(f'Executed! 
Transform took: {transform_time}') + exit('BERT VIEWGEN TRANSFORM NOT IMPLEMENTED!') + return l_emebds def fit_transform(self, lX, ly): # we can assume that we have already indexed data for transform() since we are first calling fit() - pass + return self.fit(lX, ly).transform(lX) From 8fa8ae5989bf1c097f7072d9ea54d9c8cb2c4905 Mon Sep 17 00:00:00 2001 From: andrea Date: Mon, 25 Jan 2021 16:38:05 +0100 Subject: [PATCH 26/55] Implemented funnelling architecture --- refactor/main.py | 28 +++++++++++++++++--- refactor/util/results_csv.py | 51 ++++++++++++++++++++++++++++++++++++ 2 files changed, 76 insertions(+), 3 deletions(-) create mode 100644 refactor/util/results_csv.py diff --git a/refactor/main.py b/refactor/main.py index 2ccb0b2..d2ab71b 100644 --- a/refactor/main.py +++ b/refactor/main.py @@ -4,6 +4,7 @@ from view_generators import * from data.dataset_builder import MultilingualDataset from util.common import MultilingualIndex from util.evaluation import evaluate +from util.results_csv import CSVlog from time import time @@ -49,13 +50,34 @@ def main(args): print('\n[Testing Generalized Funnelling]') time_te = time() ly_ = gfun.predict(lXte) - l_eval = evaluate(ly_true=ly, ly_pred=ly_) - print(l_eval) - time_te = round(time() - time_te, 3) print(f'Testing completed in {time_te} seconds!') + # Logging --------------------------------------- + print('\n[Results]') + results = CSVlog('test_log.csv') + metrics = [] + for lang in lXte.keys(): + macrof1, microf1, macrok, microk = l_eval[lang] + metrics.append([macrof1, microf1, macrok, microk]) + print(f'Lang {lang}: macro-F1 = {macrof1:.3f} micro-F1 = {microf1:.3f}') + results.add_row(method='gfun', + setting='TODO', + sif='TODO', + zscore='TRUE', + l2='TRUE', + dataset='TODO', + time_tr=time_tr, + time_te=time_te, + lang=lang, + macrof1=macrof1, + microf1=microf1, + macrok=macrok, + microk=microk, + notes='') + print('Averages: MF1, mF1, MK, mK', np.round(np.mean(np.array(metrics), axis=0), 3)) + overall_time = round(time() - time_init, 3) exit(f'\nExecuted in: {overall_time } seconds!') diff --git a/refactor/util/results_csv.py b/refactor/util/results_csv.py new file mode 100644 index 0000000..85a7de1 --- /dev/null +++ b/refactor/util/results_csv.py @@ -0,0 +1,51 @@ +import os +import pandas as pd +import numpy as np + + +class CSVlog: + def __init__(self, file, autoflush=True, verbose=False): + self.file = file + self.columns = ['method', + 'setting', + 'sif', + 'zscore', + 'l2', + 'dataset', + 'time_tr', + 'time_te', + 'lang', + 'macrof1', + 'microf1', + 'macrok', + 'microk', + 'notes'] + self.autoflush = autoflush + self.verbose = verbose + if os.path.exists(file): + self.tell('Loading existing file from {}'.format(file)) + self.df = pd.read_csv(file, sep='\t') + else: + self.tell('File {} does not exist. 
Creating new frame.'.format(file)) + dir = os.path.dirname(self.file) + if dir and not os.path.exists(dir): os.makedirs(dir) + self.df = pd.DataFrame(columns=self.columns) + + def already_calculated(self, id): + return (self.df['id'] == id).any() + + def add_row(self, method, setting, sif, zscore, l2, dataset, time_tr, time_te, lang, + macrof1, microf1, macrok=np.nan, microk=np.nan, notes=''): + s = pd.Series([method, setting,sif, zscore, l2, dataset, time_tr, time_te, lang, + macrof1, microf1, macrok, microk, notes], + index=self.columns) + self.df = self.df.append(s, ignore_index=True) + if self.autoflush: self.flush() + self.tell(s.to_string()) + + def flush(self): + self.df.to_csv(self.file, index=False, sep='\t') + + def tell(self, msg): + if self.verbose: + print(msg) From 93436fc596507470a83da1197564f94ffef52c71 Mon Sep 17 00:00:00 2001 From: andrea Date: Mon, 25 Jan 2021 17:20:17 +0100 Subject: [PATCH 27/55] Implemented funnelling architecture --- refactor/data/datamodule.py | 32 +++++++++++++++++++------------- refactor/main.py | 8 +++++--- refactor/models/pl_bert.py | 22 +++++++++++++++++++++- refactor/models/pl_gru.py | 4 ++-- refactor/util/common.py | 6 ++++++ refactor/view_generators.py | 8 ++++---- 6 files changed, 57 insertions(+), 23 deletions(-) diff --git a/refactor/data/datamodule.py b/refactor/data/datamodule.py index 7329f08..711d5a3 100644 --- a/refactor/data/datamodule.py +++ b/refactor/data/datamodule.py @@ -140,6 +140,22 @@ class RecurrentDataModule(pl.LightningDataModule): collate_fn=self.test_dataset.collate_fn) +def tokenize(l_raw, max_len): + """ + run Bert tokenization on dict {lang: list of samples}. + :param l_raw: + :param max_len: + :return: + """ + # TODO: check BertTokenizerFast https://huggingface.co/transformers/model_doc/bert.html#berttokenizerfast + tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased') + l_tokenized = {} + for lang in l_raw.keys(): + output_tokenizer = tokenizer(l_raw[lang], truncation=True, max_length=max_len, padding='max_length') + l_tokenized[lang] = output_tokenizer['input_ids'] + return l_tokenized + + class BertDataModule(RecurrentDataModule): def __init__(self, multilingualIndex, batchsize=64, max_len=512): super().__init__(multilingualIndex, batchsize) @@ -152,7 +168,7 @@ class BertDataModule(RecurrentDataModule): l_train_raw = {l: train[:5] for l, train in l_train_raw.items()} l_train_target = {l: target[:5] for l, target in l_train_target.items()} - l_train_index = self.tokenize(l_train_raw, max_len=self.max_len) + l_train_index = tokenize(l_train_raw, max_len=self.max_len) self.training_dataset = RecurrentDataset(l_train_index, l_train_target, lPad_index=self.multilingualIndex.l_pad()) @@ -161,7 +177,7 @@ class BertDataModule(RecurrentDataModule): l_val_raw = {l: train[:5] for l, train in l_val_raw.items()} l_val_target = {l: target[:5] for l, target in l_val_target.items()} - l_val_index = self.tokenize(l_val_raw, max_len=self.max_len) + l_val_index = tokenize(l_val_raw, max_len=self.max_len) self.val_dataset = RecurrentDataset(l_val_index, l_val_target, lPad_index=self.multilingualIndex.l_pad()) @@ -171,20 +187,10 @@ class BertDataModule(RecurrentDataModule): l_test_raw = {l: train[:5] for l, train in l_test_raw.items()} l_test_target = {l: target[:5] for l, target in l_test_target.items()} - l_test_index = self.tokenize(l_test_raw, max_len=self.max_len) + l_test_index = tokenize(l_test_raw, max_len=self.max_len) self.test_dataset = RecurrentDataset(l_test_index, l_test_target, 
lPad_index=self.multilingualIndex.l_pad()) - @staticmethod - def tokenize(l_raw, max_len): - # TODO: check BertTokenizerFast https://huggingface.co/transformers/model_doc/bert.html#berttokenizerfast - tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased') - l_tokenized = {} - for lang in l_raw.keys(): - output_tokenizer = tokenizer(l_raw[lang], truncation=True, max_length=max_len, padding='max_length') - l_tokenized[lang] = output_tokenizer['input_ids'] - return l_tokenized - def train_dataloader(self): """ NB: Setting n_workers to > 0 will cause "OSError: [Errno 24] Too many open files" diff --git a/refactor/main.py b/refactor/main.py index d2ab71b..17f5a95 100644 --- a/refactor/main.py +++ b/refactor/main.py @@ -28,12 +28,14 @@ def main(args): multilingualIndex.index(lX, ly, lXte, lyte, l_pretrained_vocabulary=lMuse.vocabulary()) # posteriorEmbedder = VanillaFunGen(base_learner=get_learner(calibrate=True), n_jobs=N_JOBS) - museEmbedder = MuseGen(muse_dir=EMBEDDINGS_PATH, n_jobs=N_JOBS) - wceEmbedder = WordClassGen(n_jobs=N_JOBS) + # museEmbedder = MuseGen(muse_dir=EMBEDDINGS_PATH, n_jobs=N_JOBS) + # wceEmbedder = WordClassGen(n_jobs=N_JOBS) # rnnEmbedder = RecurrentGen(multilingualIndex, pretrained_embeddings=lMuse, wce=False, batch_size=256, # nepochs=250, gpus=args.gpus, n_jobs=N_JOBS) - # bertEmbedder = BertGen(multilingualIndex, batch_size=4, nepochs=1, gpus=args.gpus, n_jobs=N_JOBS) + bertEmbedder = BertGen(multilingualIndex, batch_size=4, nepochs=1, gpus=args.gpus, n_jobs=N_JOBS) + bertEmbedder.transform(lX) + exit() docEmbedders = DocEmbedderList([museEmbedder, wceEmbedder]) gfun = Funnelling(first_tier=docEmbedders) diff --git a/refactor/models/pl_bert.py b/refactor/models/pl_bert.py index c19f455..11fe0ce 100644 --- a/refactor/models/pl_bert.py +++ b/refactor/models/pl_bert.py @@ -3,6 +3,7 @@ import pytorch_lightning as pl from torch.optim.lr_scheduler import StepLR from transformers import BertForSequenceClassification, AdamW from util.pl_metrics import CustomF1, CustomK +from util.common import define_pad_length, pad class BertModel(pl.LightningModule): @@ -70,7 +71,7 @@ class BertModel(pl.LightningModule): langs = set(langs) # outputs is a of n dicts of m elements, where n is equal to the number of epoch steps and m is batchsize. # here we save epoch level metric values and compute them specifically for each language - # TODO: this is horrible... + # TODO: make this a function (reused in pl_gru epoch_end) res_macroF1 = {lang: [] for lang in langs} res_microF1 = {lang: [] for lang in langs} res_macroK = {lang: [] for lang in langs} @@ -150,6 +151,25 @@ class BertModel(pl.LightningModule): scheduler = StepLR(optimizer, step_size=25, gamma=0.1) return [optimizer], [scheduler] + def encode(self, lX, batch_size=64): + with torch.no_grad(): + l_embed = {lang: [] for lang in lX.keys()} + for lang in sorted(lX.keys()): + for i in range(0, len(lX[lang]), batch_size): + if i + batch_size > len(lX[lang]): + batch = lX[lang][i:len(lX[lang])] + else: + batch = lX[lang][i:i + batch_size] + max_pad_len = define_pad_length(batch) + batch = pad(batch, pad_index='101', max_pad_length=max_pad_len) # TODO: check pad index! 
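The encode() batching above slices each language's token-id lists into fixed-size chunks and pads every chunk to its longest sequence before building a tensor; note that plain slicing already yields a shorter final batch, so the explicit boundary check is not strictly needed. A minimal self-contained sketch of the same pattern, with a toy pad helper standing in for util.common.pad and made-up token ids:

    import torch

    def pad_to_longest(batch, pad_index=0):
        # pad every sequence in the batch to the length of its longest member
        max_len = max(len(seq) for seq in batch)
        return [seq + [pad_index] * (max_len - len(seq)) for seq in batch]

    token_ids = [[101, 7592, 102], [101, 7592, 2088, 999, 102], [101, 102]]
    batch_size = 2
    for i in range(0, len(token_ids), batch_size):
        batch = token_ids[i:i + batch_size]              # the last batch may simply be smaller
        batch = torch.LongTensor(pad_to_longest(batch))
        print(batch.shape)                               # torch.Size([2, 5]), then torch.Size([1, 2])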
+ batch = torch.LongTensor(batch).to('cuda' if self.gpus else 'cpu') + _, output = self.forward(batch) + doc_embeds = output[-1][:, 0, :] + l_embed[lang].append(doc_embeds.cpu()) + for k, v in l_embed.items(): + l_embed[k] = torch.cat(v, dim=0).numpy() + return l_embed + @staticmethod def _reconstruct_dict(predictions, y, batch_langs): reconstructed_x = {lang: [] for lang in set(batch_langs)} diff --git a/refactor/models/pl_gru.py b/refactor/models/pl_gru.py index a13990c..ca4f8da 100644 --- a/refactor/models/pl_gru.py +++ b/refactor/models/pl_gru.py @@ -137,9 +137,9 @@ class RecurrentModel(pl.LightningModule): output = output[-1, :, :] output = F.relu(self.linear0(output)) output = self.dropout(F.relu(self.linear1(output))) - l_embed[lang].append(output) + l_embed[lang].append(output.cpu()) for k, v in l_embed.items(): - l_embed[k] = torch.cat(v, dim=0).cpu().numpy() + l_embed[k] = torch.cat(v, dim=0).numpy() return l_embed def training_step(self, train_batch, batch_idx): diff --git a/refactor/util/common.py b/refactor/util/common.py index 1b84d60..575570a 100644 --- a/refactor/util/common.py +++ b/refactor/util/common.py @@ -164,6 +164,9 @@ class MultilingualIndex: def l_test_raw_index(self): return {l: index.test_raw for l, index in self.l_index.items()} + def l_devel_raw_index(self): + return {l: index.devel_raw for l, index in self.l_index.items()} + def l_val_target(self): return {l: index.val_target for l, index in self.l_index.items()} @@ -197,6 +200,9 @@ class MultilingualIndex: def l_test_raw(self): return self.l_test_raw_index(), self.l_test_target() + def l_devel_raw(self): + return self.l_devel_raw_index(), self.l_devel_target() + def get_l_pad_index(self): return {l: index.get_pad_index() for l, index in self.l_index.items()} diff --git a/refactor/view_generators.py b/refactor/view_generators.py index 3b3d811..ca4ff93 100644 --- a/refactor/view_generators.py +++ b/refactor/view_generators.py @@ -21,7 +21,7 @@ from util.common import TfidfVectorizerMultilingual, _normalize from models.pl_gru import RecurrentModel from models.pl_bert import BertModel from pytorch_lightning import Trainer -from data.datamodule import RecurrentDataModule, BertDataModule +from data.datamodule import RecurrentDataModule, BertDataModule, tokenize from pytorch_lightning.loggers import TensorBoardLogger, CSVLogger from time import time @@ -271,14 +271,14 @@ class BertGen(ViewGen): def transform(self, lX): # lX is raw text data. It has to be first indexed via Bert Tokenizer. - data = 'TOKENIZE THIS' + data = self.multilingualIndex.l_devel_raw_index() + data = tokenize(data, max_len=512) self.model.to('cuda' if self.gpus else 'cpu') self.model.eval() time_init = time() - l_emebds = self.model.encode(data) # TODO + l_emebds = self.model.encode(data, batch_size=64) transform_time = round(time() - time_init, 3) print(f'Executed! 
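The .cpu() calls introduced above matter when encoding a whole dataset: keeping every per-batch output on the GPU holds the full embedding matrix in device memory, and .numpy() cannot be called on a CUDA tensor in any case. A minimal sketch of the accumulate-on-CPU pattern, with random tensors standing in for encoder outputs:

    import torch

    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    chunks = []
    for _ in range(3):                                # stand-in for iterating over batches
        out = torch.randn(4, 512, device=device)      # one batch of encoded documents
        chunks.append(out.cpu())                      # move off the GPU as soon as it is produced
    embeddings = torch.cat(chunks, dim=0).numpy()     # (12, 512) numpy array
    print(embeddings.shape)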
Transform took: {transform_time}') - exit('BERT VIEWGEN TRANSFORM NOT IMPLEMENTED!') return l_emebds def fit_transform(self, lX, ly): From 111f759cd41a752972a265adf5c0b273e932bd7c Mon Sep 17 00:00:00 2001 From: andrea Date: Mon, 25 Jan 2021 17:20:52 +0100 Subject: [PATCH 28/55] Implemented funnelling architecture --- refactor/main.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/refactor/main.py b/refactor/main.py index 17f5a95..d2ab71b 100644 --- a/refactor/main.py +++ b/refactor/main.py @@ -28,14 +28,12 @@ def main(args): multilingualIndex.index(lX, ly, lXte, lyte, l_pretrained_vocabulary=lMuse.vocabulary()) # posteriorEmbedder = VanillaFunGen(base_learner=get_learner(calibrate=True), n_jobs=N_JOBS) - # museEmbedder = MuseGen(muse_dir=EMBEDDINGS_PATH, n_jobs=N_JOBS) - # wceEmbedder = WordClassGen(n_jobs=N_JOBS) + museEmbedder = MuseGen(muse_dir=EMBEDDINGS_PATH, n_jobs=N_JOBS) + wceEmbedder = WordClassGen(n_jobs=N_JOBS) # rnnEmbedder = RecurrentGen(multilingualIndex, pretrained_embeddings=lMuse, wce=False, batch_size=256, # nepochs=250, gpus=args.gpus, n_jobs=N_JOBS) - bertEmbedder = BertGen(multilingualIndex, batch_size=4, nepochs=1, gpus=args.gpus, n_jobs=N_JOBS) - bertEmbedder.transform(lX) + # bertEmbedder = BertGen(multilingualIndex, batch_size=4, nepochs=1, gpus=args.gpus, n_jobs=N_JOBS) - exit() docEmbedders = DocEmbedderList([museEmbedder, wceEmbedder]) gfun = Funnelling(first_tier=docEmbedders) From 94866e5ad81e848973c2a54f8970dc86a8a0d009 Mon Sep 17 00:00:00 2001 From: andrea Date: Mon, 25 Jan 2021 17:46:03 +0100 Subject: [PATCH 29/55] Implemented funnelling architecture --- refactor/funnelling.py | 5 ++--- refactor/main.py | 50 +++++++++++++++++++++++++++++++---------- refactor/util/common.py | 10 ++++++++- 3 files changed, 49 insertions(+), 16 deletions(-) diff --git a/refactor/funnelling.py b/refactor/funnelling.py index 33fcce3..6c79ae9 100644 --- a/refactor/funnelling.py +++ b/refactor/funnelling.py @@ -77,10 +77,9 @@ class FeatureSet2Posteriors: class Funnelling: - def __init__(self, first_tier: DocEmbedderList, n_jobs=-1): + def __init__(self, first_tier: DocEmbedderList, meta_classifier: MetaClassifier, n_jobs=-1): self.first_tier = first_tier - self.meta = MetaClassifier( - SVC(kernel='rbf', gamma='auto', probability=True, cache_size=1000, random_state=1), n_jobs=n_jobs) + self.meta = meta_classifier self.n_jobs = n_jobs def fit(self, lX, ly): diff --git a/refactor/main.py b/refactor/main.py index d2ab71b..a1f5eef 100644 --- a/refactor/main.py +++ b/refactor/main.py @@ -2,13 +2,14 @@ from argparse import ArgumentParser from funnelling import * from view_generators import * from data.dataset_builder import MultilingualDataset -from util.common import MultilingualIndex +from util.common import MultilingualIndex, get_params from util.evaluation import evaluate from util.results_csv import CSVlog from time import time def main(args): + OPTIMC = True # TODO N_JOBS = 8 print('Running refactored...') @@ -27,16 +28,36 @@ def main(args): lMuse = MuseLoader(langs=sorted(lX.keys()), cache=EMBEDDINGS_PATH) multilingualIndex.index(lX, ly, lXte, lyte, l_pretrained_vocabulary=lMuse.vocabulary()) - # posteriorEmbedder = VanillaFunGen(base_learner=get_learner(calibrate=True), n_jobs=N_JOBS) - museEmbedder = MuseGen(muse_dir=EMBEDDINGS_PATH, n_jobs=N_JOBS) - wceEmbedder = WordClassGen(n_jobs=N_JOBS) - # rnnEmbedder = RecurrentGen(multilingualIndex, pretrained_embeddings=lMuse, wce=False, batch_size=256, - # nepochs=250, gpus=args.gpus, n_jobs=N_JOBS) - # 
bertEmbedder = BertGen(multilingualIndex, batch_size=4, nepochs=1, gpus=args.gpus, n_jobs=N_JOBS) + embedder_list = [] + if args.X: + posteriorEmbedder = VanillaFunGen(base_learner=get_learner(calibrate=True), n_jobs=N_JOBS) + embedder_list.append(posteriorEmbedder) - docEmbedders = DocEmbedderList([museEmbedder, wceEmbedder]) + if args.M: + museEmbedder = MuseGen(muse_dir=EMBEDDINGS_PATH, n_jobs=N_JOBS) + embedder_list.append(museEmbedder) - gfun = Funnelling(first_tier=docEmbedders) + if args.W: + wceEmbedder = WordClassGen(n_jobs=N_JOBS) + embedder_list.append(wceEmbedder) + + if args.G: + rnnEmbedder = RecurrentGen(multilingualIndex, pretrained_embeddings=lMuse, wce=False, batch_size=256, + nepochs=250, gpus=args.gpus, n_jobs=N_JOBS) + embedder_list.append(rnnEmbedder) + + if args.B: + bertEmbedder = BertGen(multilingualIndex, batch_size=4, nepochs=1, gpus=args.gpus, n_jobs=N_JOBS) + embedder_list.append(bertEmbedder) + + # Init DocEmbedderList + docEmbedders = DocEmbedderList(embedder_list=embedder_list, probabilistic=True) + meta_parameters = None if not OPTIMC else [{'C': [1, 1e3, 1e2, 1e1, 1e-1]}] + meta = MetaClassifier(meta_learner=get_learner(calibrate=False, kernel='rbf', C=meta_parameters), + meta_parameters=get_params(optimc=True)) + + # Init Funnelling Architecture + gfun = Funnelling(first_tier=docEmbedders, meta_classifier=meta) # Training --------------------------------------- print('\n[Training Generalized Funnelling]') @@ -64,9 +85,9 @@ def main(args): print(f'Lang {lang}: macro-F1 = {macrof1:.3f} micro-F1 = {microf1:.3f}') results.add_row(method='gfun', setting='TODO', - sif='TODO', - zscore='TRUE', - l2='TRUE', + sif='True', + zscore='True', + l2='True', dataset='TODO', time_tr=time_tr, time_te=time_te, @@ -84,6 +105,11 @@ def main(args): if __name__ == '__main__': parser = ArgumentParser() + parser.add_argument('--X') + parser.add_argument('--M') + parser.add_argument('--W') + parser.add_argument('--G') + parser.add_argument('--B') parser.add_argument('--gpus', default=None) args = parser.parse_args() main(args) diff --git a/refactor/util/common.py b/refactor/util/common.py index 575570a..3ffda78 100644 --- a/refactor/util/common.py +++ b/refactor/util/common.py @@ -360,4 +360,12 @@ def pad(index_list, pad_index, max_pad_length=None): pad_length = min(pad_length, max_pad_length) for i, indexes in enumerate(index_list): index_list[i] = [pad_index] * (pad_length - len(indexes)) + indexes[:pad_length] - return index_list \ No newline at end of file + return index_list + + +def get_params(optimc=False): + if not optimc: + return None + c_range = [1e4, 1e3, 1e2, 1e1, 1, 1e-1] + kernel = 'rbf' + return [{'kernel': [kernel], 'C': c_range, 'gamma':['auto']}] \ No newline at end of file From a5af2134bf06f56b0f0a83cff47e7cd39f1c080e Mon Sep 17 00:00:00 2001 From: andrea Date: Mon, 25 Jan 2021 18:25:08 +0100 Subject: [PATCH 30/55] Implemented funnelling architecture --- refactor/main.py | 9 +++++---- refactor/view_generators.py | 10 +++++----- 2 files changed, 10 insertions(+), 9 deletions(-) diff --git a/refactor/main.py b/refactor/main.py index a1f5eef..027649b 100644 --- a/refactor/main.py +++ b/refactor/main.py @@ -9,7 +9,7 @@ from time import time def main(args): - OPTIMC = True # TODO + OPTIMC = False # TODO N_JOBS = 8 print('Running refactored...') @@ -20,6 +20,7 @@ def main(args): EMBEDDINGS_PATH = '/home/andreapdr/gfun/embeddings' data = MultilingualDataset.load(_DATASET) data.set_view(languages=['it', 'fr']) + data.show_dimensions() lX, ly = data.training() lXte, 
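The grid returned by get_params above follows scikit-learn's convention of a list of dicts mapping hyperparameter names to candidate values, so it can be passed straight to GridSearchCV (presumably what MetaClassifier does internally when meta_parameters is not None). A minimal sketch on synthetic data:

    from sklearn.datasets import make_classification
    from sklearn.model_selection import GridSearchCV
    from sklearn.svm import SVC

    param_grid = [{'kernel': ['rbf'], 'C': [1e4, 1e3, 1e2, 1e1, 1, 1e-1], 'gamma': ['auto']}]
    X, y = make_classification(n_samples=200, n_features=20, random_state=1)
    search = GridSearchCV(SVC(), param_grid, cv=3, n_jobs=-1)
    search.fit(X, y)
    print(search.best_params_)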
lyte = data.test() @@ -53,8 +54,8 @@ def main(args): # Init DocEmbedderList docEmbedders = DocEmbedderList(embedder_list=embedder_list, probabilistic=True) meta_parameters = None if not OPTIMC else [{'C': [1, 1e3, 1e2, 1e1, 1e-1]}] - meta = MetaClassifier(meta_learner=get_learner(calibrate=False, kernel='rbf', C=meta_parameters), - meta_parameters=get_params(optimc=True)) + meta = MetaClassifier(meta_learner=get_learner(calibrate=False, kernel='rbf'), + meta_parameters=get_params(optimc=OPTIMC)) # Init Funnelling Architecture gfun = Funnelling(first_tier=docEmbedders, meta_classifier=meta) @@ -71,7 +72,7 @@ def main(args): print('\n[Testing Generalized Funnelling]') time_te = time() ly_ = gfun.predict(lXte) - l_eval = evaluate(ly_true=ly, ly_pred=ly_) + l_eval = evaluate(ly_true=lyte, ly_pred=ly_) time_te = round(time() - time_te, 3) print(f'Testing completed in {time_te} seconds!') diff --git a/refactor/view_generators.py b/refactor/view_generators.py index ca4ff93..579b8f1 100644 --- a/refactor/view_generators.py +++ b/refactor/view_generators.py @@ -55,7 +55,7 @@ class VanillaFunGen(ViewGen): self.vectorizer = TfidfVectorizerMultilingual(sublinear_tf=True, use_idf=True) def fit(self, lX, lY): - print('# Fitting VanillaFunGen...') + print('# Fitting VanillaFunGen (X)...') lX = self.vectorizer.fit_transform(lX) self.doc_projector.fit(lX, lY) return self @@ -85,7 +85,7 @@ class MuseGen(ViewGen): self.vectorizer = TfidfVectorizerMultilingual(sublinear_tf=True, use_idf=True) def fit(self, lX, ly): - print('# Fitting MuseGen...') + print('# Fitting MuseGen (M)...') self.vectorizer.fit(lX) self.langs = sorted(lX.keys()) self.lMuse = MuseLoader(langs=self.langs, cache=self.muse_dir) @@ -120,7 +120,7 @@ class WordClassGen(ViewGen): self.vectorizer = TfidfVectorizerMultilingual(sublinear_tf=True, use_idf=True) def fit(self, lX, ly): - print('# Fitting WordClassGen...') + print('# Fitting WordClassGen (W)...') lX = self.vectorizer.fit_transform(lX) self.langs = sorted(lX.keys()) wce = Parallel(n_jobs=self.n_jobs)( @@ -207,7 +207,7 @@ class RecurrentGen(ViewGen): :param ly: :return: """ - print('# Fitting RecurrentGen...') + print('# Fitting RecurrentGen (G)...') recurrentDataModule = RecurrentDataModule(self.multilingualIndex, batchsize=self.batch_size) trainer = Trainer(gradient_clip_val=1e-1, gpus=self.gpus, logger=self.logger, max_epochs=self.nepochs, checkpoint_callback=False) @@ -260,7 +260,7 @@ class BertGen(ViewGen): return BertModel(output_size=output_size, stored_path=self.stored_path, gpus=self.gpus) def fit(self, lX, ly): - print('# Fitting BertGen...') + print('# Fitting BertGen (M)...') self.multilingualIndex.train_val_split(val_prop=0.2, max_val=2000, seed=1) bertDataModule = BertDataModule(self.multilingualIndex, batchsize=self.batch_size, max_len=512) trainer = Trainer(gradient_clip_val=1e-1, max_epochs=self.nepochs, gpus=self.gpus, From 108f423d415c79e0e58b306f66ad191d85ec640d Mon Sep 17 00:00:00 2001 From: andrea Date: Tue, 26 Jan 2021 10:15:55 +0100 Subject: [PATCH 31/55] Implemented funnelling architecture --- refactor/util/SIF_embed.py | 5 ++++- refactor/util/common.py | 2 +- refactor/view_generators.py | 15 +++++++++++++-- 3 files changed, 18 insertions(+), 4 deletions(-) diff --git a/refactor/util/SIF_embed.py b/refactor/util/SIF_embed.py index cfe096e..4a3d712 100644 --- a/refactor/util/SIF_embed.py +++ b/refactor/util/SIF_embed.py @@ -1,6 +1,7 @@ import numpy as np from sklearn.decomposition import TruncatedSVD + def get_weighted_average(We, x, w): """ Compute the weighted 
average vectors @@ -15,6 +16,7 @@ def get_weighted_average(We, x, w): emb[i,:] = w[i,:].dot(We[x[i,:],:]) / np.count_nonzero(w[i,:]) return emb + def compute_pc(X,npc=1): """ Compute the principal components. @@ -26,6 +28,7 @@ def compute_pc(X,npc=1): svd.fit(X) return svd.components_ + def remove_pc(X, npc=1): """ Remove the projection on the principal components @@ -34,7 +37,7 @@ def remove_pc(X, npc=1): :return: XX[i, :] is the data point after removing its projection """ pc = compute_pc(X, npc) - if npc==1: + if npc == 1: XX = X - X.dot(pc.transpose()) * pc else: XX = X - X.dot(pc.transpose()).dot(pc) diff --git a/refactor/util/common.py b/refactor/util/common.py index 3ffda78..a624528 100644 --- a/refactor/util/common.py +++ b/refactor/util/common.py @@ -368,4 +368,4 @@ def get_params(optimc=False): return None c_range = [1e4, 1e3, 1e2, 1e1, 1, 1e-1] kernel = 'rbf' - return [{'kernel': [kernel], 'C': c_range, 'gamma':['auto']}] \ No newline at end of file + return [{'kernel': [kernel], 'C': c_range, 'gamma':['auto']}] diff --git a/refactor/view_generators.py b/refactor/view_generators.py index 579b8f1..2d82a20 100644 --- a/refactor/view_generators.py +++ b/refactor/view_generators.py @@ -41,17 +41,20 @@ class ViewGen(ABC): class VanillaFunGen(ViewGen): - def __init__(self, base_learner, n_jobs=-1): + def __init__(self, base_learner, first_tier_parameters=None, n_jobs=-1): """ Original funnelling architecture proposed by Moreo, Esuli and Sebastiani in DOI: https://doi.org/10.1145/3326065 :param base_learner: naive monolingual learners to be deployed as first-tier learners. Should be able to return posterior probabilities. + :param base_learner: :param n_jobs: integer, number of concurrent workers """ super().__init__() self.learners = base_learner + self.first_tier_parameters = first_tier_parameters self.n_jobs = n_jobs - self.doc_projector = NaivePolylingualClassifier(self.learners) + self.doc_projector = NaivePolylingualClassifier(base_learner=self.learners, + parameters=self.first_tier_parameters, n_jobs=self.n_jobs) self.vectorizer = TfidfVectorizerMultilingual(sublinear_tf=True, use_idf=True) def fit(self, lX, lY): @@ -61,8 +64,16 @@ class VanillaFunGen(ViewGen): return self def transform(self, lX): + """ + (1) Vectorize documents + (2) Project them according to the learners SVMs + (3) Apply L2 normalization to the projection + :param lX: + :return: + """ lX = self.vectorizer.transform(lX) lZ = self.doc_projector.predict_proba(lX) + lZ = _normalize(lZ, l2=True) return lZ def fit_transform(self, lX, ly): From 90e974f0a3d2eb4e2051dcf39efd918ddab50448 Mon Sep 17 00:00:00 2001 From: andrea Date: Tue, 26 Jan 2021 12:40:23 +0100 Subject: [PATCH 32/55] Parser + fixed bert pad token id --- refactor/data/datamodule.py | 1 - refactor/main.py | 149 +++++++++++++++++++++++------------ refactor/models/pl_bert.py | 2 +- refactor/util/common.py | 13 +++ refactor/util/file.py | 8 +- refactor/util/results_csv.py | 5 +- 6 files changed, 124 insertions(+), 54 deletions(-) diff --git a/refactor/data/datamodule.py b/refactor/data/datamodule.py index 711d5a3..12d7e02 100644 --- a/refactor/data/datamodule.py +++ b/refactor/data/datamodule.py @@ -147,7 +147,6 @@ def tokenize(l_raw, max_len): :param max_len: :return: """ - # TODO: check BertTokenizerFast https://huggingface.co/transformers/model_doc/bert.html#berttokenizerfast tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased') l_tokenized = {} for lang in l_raw.keys(): diff --git a/refactor/main.py b/refactor/main.py index 
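remove_pc above implements the SIF-style post-processing: fit a rank-1 truncated SVD on the embedding matrix and subtract each row's projection onto that first principal component. A minimal standalone sketch of the npc=1 case, with random vectors standing in for the weighted-average document embeddings:

    import numpy as np
    from sklearn.decomposition import TruncatedSVD

    rng = np.random.RandomState(0)
    X = rng.randn(100, 300)                      # stand-in for document embeddings

    svd = TruncatedSVD(n_components=1, n_iter=7, random_state=0)
    svd.fit(X)
    pc = svd.components_                         # (1, 300): first principal component
    X_sif = X - X.dot(pc.T) * pc                 # same formula as remove_pc(X, npc=1)
    print(X_sif.shape)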
027649b..bab9189 100644 --- a/refactor/main.py +++ b/refactor/main.py @@ -2,60 +2,57 @@ from argparse import ArgumentParser from funnelling import * from view_generators import * from data.dataset_builder import MultilingualDataset -from util.common import MultilingualIndex, get_params +from util.common import MultilingualIndex, get_params, get_method_name from util.evaluation import evaluate from util.results_csv import CSVlog from time import time def main(args): - OPTIMC = False # TODO - N_JOBS = 8 - print('Running refactored...') + assert args.post_embedder or args.muse_embedder or args.wce_embedder or args.gru_embedder or args.bert_embedder, \ + 'empty set of document embeddings is not allowed!' - # _DATASET = '/homenfs/a.pedrotti1/datasets/CLESA/rcv2/rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle' - # EMBEDDINGS_PATH = '/homenfs/a.pedrotti1/embeddings/MUSE' + print('Running generalized funnelling...') - _DATASET = '/home/moreo/CLESA/rcv2/rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle' - EMBEDDINGS_PATH = '/home/andreapdr/gfun/embeddings' - data = MultilingualDataset.load(_DATASET) + data = MultilingualDataset.load(args.dataset) data.set_view(languages=['it', 'fr']) data.show_dimensions() lX, ly = data.training() lXte, lyte = data.test() # Init multilingualIndex - mandatory when deploying Neural View Generators... - multilingualIndex = MultilingualIndex() - lMuse = MuseLoader(langs=sorted(lX.keys()), cache=EMBEDDINGS_PATH) - multilingualIndex.index(lX, ly, lXte, lyte, l_pretrained_vocabulary=lMuse.vocabulary()) + if args.gru_embedder or args.bert_embedder: + multilingualIndex = MultilingualIndex() + lMuse = MuseLoader(langs=sorted(lX.keys()), cache=args.muse_dir) + multilingualIndex.index(lX, ly, lXte, lyte, l_pretrained_vocabulary=lMuse.vocabulary()) embedder_list = [] - if args.X: - posteriorEmbedder = VanillaFunGen(base_learner=get_learner(calibrate=True), n_jobs=N_JOBS) + if args.post_embedder: + posteriorEmbedder = VanillaFunGen(base_learner=get_learner(calibrate=True), n_jobs=args.n_jobs) embedder_list.append(posteriorEmbedder) - if args.M: - museEmbedder = MuseGen(muse_dir=EMBEDDINGS_PATH, n_jobs=N_JOBS) + if args.muse_embedder: + museEmbedder = MuseGen(muse_dir=args.muse_dir, n_jobs=args.n_jobs) embedder_list.append(museEmbedder) - if args.W: - wceEmbedder = WordClassGen(n_jobs=N_JOBS) + if args.wce_embedder: + wceEmbedder = WordClassGen(n_jobs=args.n_jobs) embedder_list.append(wceEmbedder) - if args.G: - rnnEmbedder = RecurrentGen(multilingualIndex, pretrained_embeddings=lMuse, wce=False, batch_size=256, - nepochs=250, gpus=args.gpus, n_jobs=N_JOBS) + if args.gru_embedder: + rnnEmbedder = RecurrentGen(multilingualIndex, pretrained_embeddings=lMuse, wce=args.gru_wce, batch_size=256, + nepochs=args.nepochs, gpus=args.gpus, n_jobs=args.n_jobs) embedder_list.append(rnnEmbedder) - if args.B: - bertEmbedder = BertGen(multilingualIndex, batch_size=4, nepochs=1, gpus=args.gpus, n_jobs=N_JOBS) + if args.bert_embedder: + bertEmbedder = BertGen(multilingualIndex, batch_size=4, nepochs=10, gpus=args.gpus, n_jobs=args.n_jobs) + bertEmbedder.transform(lX) embedder_list.append(bertEmbedder) - # Init DocEmbedderList + # Init DocEmbedderList (i.e., first-tier learners or view generators) and metaclassifier docEmbedders = DocEmbedderList(embedder_list=embedder_list, probabilistic=True) - meta_parameters = None if not OPTIMC else [{'C': [1, 1e3, 1e2, 1e1, 1e-1]}] meta = MetaClassifier(meta_learner=get_learner(calibrate=False, kernel='rbf'), - 
meta_parameters=get_params(optimc=OPTIMC)) + meta_parameters=get_params(optimc=args.optimc)) # Init Funnelling Architecture gfun = Funnelling(first_tier=docEmbedders, meta_classifier=meta) @@ -78,39 +75,93 @@ def main(args): # Logging --------------------------------------- print('\n[Results]') - results = CSVlog('test_log.csv') + results = CSVlog(args.csv_dir) metrics = [] for lang in lXte.keys(): macrof1, microf1, macrok, microk = l_eval[lang] metrics.append([macrof1, microf1, macrok, microk]) print(f'Lang {lang}: macro-F1 = {macrof1:.3f} micro-F1 = {microf1:.3f}') - results.add_row(method='gfun', - setting='TODO', - sif='True', - zscore='True', - l2='True', - dataset='TODO', - time_tr=time_tr, - time_te=time_te, - lang=lang, - macrof1=macrof1, - microf1=microf1, - macrok=macrok, - microk=microk, - notes='') + if results is not None: + _id, _dataset = get_method_name(args) + results.add_row(method='gfun', + setting=_id, + optimc=args.optimc, + sif='True', + zscore='True', + l2='True', + dataset=_dataset, + time_tr=time_tr, + time_te=time_te, + lang=lang, + macrof1=macrof1, + microf1=microf1, + macrok=macrok, + microk=microk, + notes='') print('Averages: MF1, mF1, MK, mK', np.round(np.mean(np.array(metrics), axis=0), 3)) overall_time = round(time() - time_init, 3) - exit(f'\nExecuted in: {overall_time } seconds!') + exit(f'\nExecuted in: {overall_time} seconds!') if __name__ == '__main__': - parser = ArgumentParser() - parser.add_argument('--X') - parser.add_argument('--M') - parser.add_argument('--W') - parser.add_argument('--G') - parser.add_argument('--B') - parser.add_argument('--gpus', default=None) + parser = ArgumentParser(description='Run generalized funnelling, A. Moreo, A. Pedrotti and F. Sebastiani') + + parser.add_argument('dataset', help='Path to the dataset') + + parser.add_argument('-o', '--output', dest='csv_dir', + help='Result file (default ../csv_log/gfun_results.csv)', type=str, + default='csv_logs/gfun/gfun_results.csv') + + parser.add_argument('-x', '--post_embedder', dest='post_embedder', action='store_true', + help='deploy posterior probabilities embedder to compute document embeddings', + default=False) + + parser.add_argument('-w', '--wce_embedder', dest='wce_embedder', action='store_true', + help='deploy (supervised) Word-Class embedder to the compute document embeddings', + default=False) + + parser.add_argument('-m', '--muse_embedder', dest='muse_embedder', action='store_true', + help='deploy (pretrained) MUSE embedder to compute document embeddings', + default=False) + + parser.add_argument('-b', '--bert_embedder', dest='bert_embedder', action='store_true', + help='deploy multilingual Bert to compute document embeddings', + default=False) + + parser.add_argument('-g', '--gru_embedder', dest='gru_embedder', action='store_true', + help='deploy a GRU in order to compute document embeddings', + default=False) + + parser.add_argument('-c', '--c_optimize', dest='optimc', action='store_true', + help='Optimize SVMs C hyperparameter', + default=False) + + parser.add_argument('-n', '--nepochs', dest='nepochs', type=str, + help='Number of max epochs to train Recurrent embedder (i.e., -g)') + + parser.add_argument('-j', '--n_jobs', dest='n_jobs', type=int, + help='Number of parallel jobs (default is -1, all)', + default=-1) + + parser.add_argument('--muse_dir', dest='muse_dir', type=str, + help='Path to the MUSE polylingual word embeddings (default ../embeddings)', + default='../embeddings') + + parser.add_argument('--gru_wce', dest='gru_wce', action='store_true', + 
help='Deploy WCE embedding as embedding layer of the GRU View Generator', + default=False) + + parser.add_argument('--gru_dir', dest='gru_dir', type=str, + help='Set the path to a pretrained GRU model (i.e., -g view generator)', + default=None) + + parser.add_argument('--bert_dir', dest='bert_dir', type=str, + help='Set the path to a pretrained mBERT model (i.e., -b view generator)', + default=None) + + parser.add_argument('--gpus', help='specifies how many GPUs to use per node', + default=None) + args = parser.parse_args() main(args) diff --git a/refactor/models/pl_bert.py b/refactor/models/pl_bert.py index 11fe0ce..67f37f4 100644 --- a/refactor/models/pl_bert.py +++ b/refactor/models/pl_bert.py @@ -161,7 +161,7 @@ class BertModel(pl.LightningModule): else: batch = lX[lang][i:i + batch_size] max_pad_len = define_pad_length(batch) - batch = pad(batch, pad_index='101', max_pad_length=max_pad_len) # TODO: check pad index! + batch = pad(batch, pad_index=self.bert.config.pad_token_id, max_pad_length=max_pad_len) batch = torch.LongTensor(batch).to('cuda' if self.gpus else 'cpu') _, output = self.forward(batch) doc_embeds = output[-1][:, 0, :] diff --git a/refactor/util/common.py b/refactor/util/common.py index a624528..0cd95e6 100644 --- a/refactor/util/common.py +++ b/refactor/util/common.py @@ -369,3 +369,16 @@ def get_params(optimc=False): c_range = [1e4, 1e3, 1e2, 1e1, 1, 1e-1] kernel = 'rbf' return [{'kernel': [kernel], 'C': c_range, 'gamma':['auto']}] + + +def get_method_name(args): + _id = '' + _id_conf = [args.post_embedder, args.wce_embedder, args.muse_embedder, args.bert_embedder, args.gru_embedder] + _id_name = ['X', 'W', 'M', 'B', 'G'] + for i, conf in enumerate(_id_conf): + if conf: + _id += _id_name[i] + _id = _id if not args.gru_wce else _id + '_wce' + _dataset_path = args.dataset.split('/')[-1].split('_') + dataset_id = _dataset_path[0] + _dataset_path[-1] + return _id, dataset_id diff --git a/refactor/util/file.py b/refactor/util/file.py index a3d0a3a..98c9910 100644 --- a/refactor/util/file.py +++ b/refactor/util/file.py @@ -1,6 +1,5 @@ from os import listdir, makedirs from os.path import isdir, isfile, join, exists, dirname -#from sklearn.externals.six.moves import urllib import urllib from pathlib import Path @@ -14,6 +13,7 @@ def download_file(url, archive_filename): urllib.request.urlretrieve(url, filename=archive_filename, reporthook=progress) print("") + def download_file_if_not_exists(url, archive_path): if exists(archive_path): return makedirs_if_not_exist(dirname(archive_path)) @@ -25,20 +25,26 @@ def ls(dir, typecheck): el.sort() return el + def list_dirs(dir): return ls(dir, typecheck=isdir) + def list_files(dir): return ls(dir, typecheck=isfile) + def makedirs_if_not_exist(path): if not exists(path): makedirs(path) + def create_if_not_exist(path): if not exists(path): makedirs(path) + def get_parent_name(path): return Path(path).parent + def get_file_name(path): return Path(path).name diff --git a/refactor/util/results_csv.py b/refactor/util/results_csv.py index 85a7de1..df80c59 100644 --- a/refactor/util/results_csv.py +++ b/refactor/util/results_csv.py @@ -8,6 +8,7 @@ class CSVlog: self.file = file self.columns = ['method', 'setting', + 'optimc', 'sif', 'zscore', 'l2', @@ -34,9 +35,9 @@ class CSVlog: def already_calculated(self, id): return (self.df['id'] == id).any() - def add_row(self, method, setting, sif, zscore, l2, dataset, time_tr, time_te, lang, + def add_row(self, method, setting, optimc, sif, zscore, l2, dataset, time_tr, time_te, lang, macrof1, 
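get_method_name above condenses the chosen view generators into a short setting identifier for the CSV log, so a run launched with -x -m -g --gru_wce is recorded as XMG_wce. A minimal sketch of the same mapping over a hypothetical set of parsed flags:

    from argparse import Namespace

    # hypothetical flags corresponding to: -x -m -g --gru_wce
    args = Namespace(post_embedder=True, wce_embedder=False, muse_embedder=True,
                     bert_embedder=False, gru_embedder=True, gru_wce=True)

    _id = ''
    for flag, name in zip([args.post_embedder, args.wce_embedder, args.muse_embedder,
                           args.bert_embedder, args.gru_embedder],
                          ['X', 'W', 'M', 'B', 'G']):
        if flag:
            _id += name
    if args.gru_wce:
        _id += '_wce'
    print(_id)    # XMG_wce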
microf1, macrok=np.nan, microk=np.nan, notes=''): - s = pd.Series([method, setting,sif, zscore, l2, dataset, time_tr, time_te, lang, + s = pd.Series([method, setting, optimc, sif, zscore, l2, dataset, time_tr, time_te, lang, macrof1, microf1, macrok, microk, notes], index=self.columns) self.df = self.df.append(s, ignore_index=True) From 5958df3e3c14fab2f7ffc06b6e70df54474691b0 Mon Sep 17 00:00:00 2001 From: andrea Date: Tue, 26 Jan 2021 12:49:28 +0100 Subject: [PATCH 33/55] typos + requirements.txt --- refactor/main.py | 1 - refactor/models/pl_bert.py | 4 +--- refactor/models/pl_gru.py | 5 +---- refactor/requirements.txt | 12 ++++++++++++ refactor/util/pl_metrics.py | 8 ++++---- refactor/view_generators.py | 14 ++++++-------- 6 files changed, 24 insertions(+), 20 deletions(-) create mode 100644 refactor/requirements.txt diff --git a/refactor/main.py b/refactor/main.py index bab9189..d043d76 100644 --- a/refactor/main.py +++ b/refactor/main.py @@ -46,7 +46,6 @@ def main(args): if args.bert_embedder: bertEmbedder = BertGen(multilingualIndex, batch_size=4, nepochs=10, gpus=args.gpus, n_jobs=args.n_jobs) - bertEmbedder.transform(lX) embedder_list.append(bertEmbedder) # Init DocEmbedderList (i.e., first-tier learners or view generators) and metaclassifier diff --git a/refactor/models/pl_bert.py b/refactor/models/pl_bert.py index 67f37f4..48f5b9a 100644 --- a/refactor/models/pl_bert.py +++ b/refactor/models/pl_bert.py @@ -22,8 +22,7 @@ class BertModel(pl.LightningModule): self.macroF1 = CustomF1(num_classes=output_size, average='macro', device=self.gpus) self.microK = CustomK(num_classes=output_size, average='micro', device=self.gpus) self.macroK = CustomK(num_classes=output_size, average='macro', device=self.gpus) - # Language specific metrics - I am not really sure if they should be initialized - # independently or we can use the metrics init above... # TODO: check it + # Language specific metrics to compute metrics at epoch level self.lang_macroF1 = CustomF1(num_classes=output_size, average='macro', device=self.gpus) self.lang_microF1 = CustomF1(num_classes=output_size, average='micro', device=self.gpus) self.lang_macroK = CustomF1(num_classes=output_size, average='macro', device=self.gpus) @@ -71,7 +70,6 @@ class BertModel(pl.LightningModule): langs = set(langs) # outputs is a of n dicts of m elements, where n is equal to the number of epoch steps and m is batchsize. # here we save epoch level metric values and compute them specifically for each language - # TODO: make this a function (reused in pl_gru epoch_end) res_macroF1 = {lang: [] for lang in langs} res_microF1 = {lang: [] for lang in langs} res_macroK = {lang: [] for lang in langs} diff --git a/refactor/models/pl_gru.py b/refactor/models/pl_gru.py index ca4f8da..eaf7304 100644 --- a/refactor/models/pl_gru.py +++ b/refactor/models/pl_gru.py @@ -41,8 +41,7 @@ class RecurrentModel(pl.LightningModule): self.macroF1 = CustomF1(num_classes=output_size, average='macro', device=self.gpus) self.microK = CustomK(num_classes=output_size, average='micro', device=self.gpus) self.macroK = CustomK(num_classes=output_size, average='macro', device=self.gpus) - # Language specific metrics - I am not really sure if they should be initialized - # independently or we can use the metrics init above... 
# TODO: check it + # Language specific metrics to compute metrics at epoch level self.lang_macroF1 = CustomF1(num_classes=output_size, average='macro', device=self.gpus) self.lang_microF1 = CustomF1(num_classes=output_size, average='micro', device=self.gpus) self.lang_macroK = CustomF1(num_classes=output_size, average='macro', device=self.gpus) @@ -110,7 +109,6 @@ class RecurrentModel(pl.LightningModule): def encode(self, lX, l_pad, batch_size=128): """ Returns encoded data (i.e, RNN hidden state at second feed-forward layer - linear1). Dimensionality is 512. - # TODO: does not run on gpu.. :param lX: :param l_pad: :param batch_size: @@ -167,7 +165,6 @@ class RecurrentModel(pl.LightningModule): def training_epoch_end(self, outputs): # outputs is a of n dicts of m elements, where n is equal to the number of epoch steps and m is batchsize. # here we save epoch level metric values and compute them specifically for each language - # TODO: this is horrible... res_macroF1 = {lang: [] for lang in self.langs} res_microF1 = {lang: [] for lang in self.langs} res_macroK = {lang: [] for lang in self.langs} diff --git a/refactor/requirements.txt b/refactor/requirements.txt new file mode 100644 index 0000000..4546a4a --- /dev/null +++ b/refactor/requirements.txt @@ -0,0 +1,12 @@ +transformers==2.11.0 +pandas==0.25.3 +numpy==1.17.4 +joblib==0.14.0 +tqdm==4.50.2 +pytorch_lightning==1.1.2 +torch==1.3.1 +nltk==3.4.5 +scipy==1.3.3 +rdflib==4.2.2 +torchtext==0.4.0 +scikit_learn==0.24.1 diff --git a/refactor/util/pl_metrics.py b/refactor/util/pl_metrics.py index 6781d09..9b44eb0 100644 --- a/refactor/util/pl_metrics.py +++ b/refactor/util/pl_metrics.py @@ -102,10 +102,10 @@ class CustomK(Metric): specificity, recall = 0., 0. absolute_negatives = self.true_negative.sum() + self.false_positive.sum() if absolute_negatives != 0: - specificity = self.true_negative.sum()/absolute_negatives # Todo check if it is float + specificity = self.true_negative.sum()/absolute_negatives absolute_positives = self.true_positive.sum() + self.false_negative.sum() if absolute_positives != 0: - recall = self.true_positive.sum()/absolute_positives # Todo check if it is float + recall = self.true_positive.sum()/absolute_positives if absolute_positives == 0: return 2. * specificity - 1 @@ -125,10 +125,10 @@ class CustomK(Metric): specificity, recall = 0., 0. absolute_negatives = class_tn + class_fp if absolute_negatives != 0: - specificity = class_tn / absolute_negatives # Todo check if it is float + specificity = class_tn / absolute_negatives absolute_positives = class_tp + class_fn if absolute_positives != 0: - recall = class_tp / absolute_positives # Todo check if it is float + recall = class_tp / absolute_positives if absolute_positives == 0: class_specific.append(2. * specificity - 1) diff --git a/refactor/view_generators.py b/refactor/view_generators.py index 2d82a20..e366d7d 100644 --- a/refactor/view_generators.py +++ b/refactor/view_generators.py @@ -1,18 +1,19 @@ """ This module contains the view generators that take care of computing the view specific document embeddings: -- VanillaFunGen (-X) cast document representations encoded via TFIDF into posterior probabilities by means of SVM. +- VanillaFunGen (-x) cast document representations encoded via TFIDF into posterior probabilities by means of SVM. -- WordClassGen (-W): generates document representation via Word-Class-Embeddings. +- WordClassGen (-w): generates document representation via Word-Class-Embeddings. 
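The 2. * specificity - 1 branches above are the degenerate cases of the K measure for classes with no positive test examples; the sketch below assumes the symmetric conventions for the remaining cases (2 * recall - 1 when there are no negatives, specificity + recall - 1 otherwise), which are not visible in this hunk. A small pure-Python version computed from raw counts:

    def k_measure(tp, fp, fn, tn):
        # degenerate cases first, mirroring the branch shown in CustomK
        if tp + fn == 0:                      # no positive examples
            return 2. * (tn / (tn + fp)) - 1
        if tn + fp == 0:                      # no negative examples (assumed convention)
            return 2. * (tp / (tp + fn)) - 1
        recall = tp / (tp + fn)
        specificity = tn / (tn + fp)
        return specificity + recall - 1       # assumed general case

    print(k_measure(tp=30, fp=5, fn=10, tn=55))    # 0.666...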
Document embeddings are obtained via weighted sum of document's constituent embeddings. -- MuseGen (-M): +- MuseGen (-m): generates document representation via MUSE embeddings. + Document embeddings are obtained via weighted sum of document's constituent embeddings. -- RecurrentGen (-G): generates document embedding by means of a Gated Recurrent Units. The model can be +- RecurrentGen (-g): generates document embedding by means of a Gated Recurrent Units. The model can be initialized with different (multilingual/aligned) word representations (e.g., MUSE, WCE, ecc.,). Output dimension is (n_docs, 512). -- View generator (-B): generates document embedding via mBERT model. +- View generator (-b): generates document embedding via mBERT model. """ from abc import ABC, abstractmethod from models.learners import * @@ -153,9 +154,6 @@ class WordClassGen(ViewGen): class RecurrentGen(ViewGen): - # TODO: save model https://forums.pytorchlightning.ai/t/how-to-save-hparams-when-not-provided-as-argument-apparently-assigning-to-hparams-is-not-recomended/339/5 - # Problem: we are passing lPretrained to init the RecurrentModel -> incredible slow at saving (checkpoint). - # if we do not save it is impossible to init RecurrentModel by calling RecurrentModel.load_from_checkpoint() def __init__(self, multilingualIndex, pretrained_embeddings, wce, batch_size=512, nepochs=50, gpus=0, n_jobs=-1, stored_path=None): """ From 2a8075bbc227ff9f56527917a112382d262b1866 Mon Sep 17 00:00:00 2001 From: andrea Date: Tue, 26 Jan 2021 13:12:37 +0100 Subject: [PATCH 34/55] optimized imports --- refactor/data/datamodule.py | 4 ++-- refactor/data/dataset_builder.py | 28 +++++++++++++----------- refactor/data/reader/jrcacquis_reader.py | 21 ++++++++++-------- refactor/data/reader/rcv_reader.py | 17 ++++++-------- refactor/data/reader/wikipedia_tools.py | 13 ++++++----- refactor/data/text_preprocessor.py | 5 +++-- refactor/data/tsr_function__.py | 3 ++- refactor/funnelling.py | 2 +- refactor/main.py | 6 ++--- refactor/models/learners.py | 12 +++++----- refactor/models/lstm_class.py | 3 +-- refactor/models/pl_bert.py | 5 +++-- refactor/models/pl_gru.py | 7 +++--- refactor/util/common.py | 4 ++-- refactor/util/embeddings_manager.py | 6 +++-- refactor/util/evaluation.py | 5 +++-- refactor/util/file.py | 2 +- refactor/util/pl_metrics.py | 1 + refactor/util/results_csv.py | 3 ++- refactor/view_generators.py | 18 ++++++++------- 20 files changed, 91 insertions(+), 74 deletions(-) diff --git a/refactor/data/datamodule.py b/refactor/data/datamodule.py index 12d7e02..1121a58 100644 --- a/refactor/data/datamodule.py +++ b/refactor/data/datamodule.py @@ -1,7 +1,7 @@ -import torch -from torch.utils.data import Dataset, DataLoader import numpy as np import pytorch_lightning as pl +import torch +from torch.utils.data import Dataset, DataLoader from transformers import BertTokenizer N_WORKERS = 8 diff --git a/refactor/data/dataset_builder.py b/refactor/data/dataset_builder.py index b9650c7..0e91316 100644 --- a/refactor/data/dataset_builder.py +++ b/refactor/data/dataset_builder.py @@ -1,19 +1,21 @@ -from os.path import join, exists -from nltk.corpus import stopwords -from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer -from sklearn.preprocessing import MultiLabelBinarizer -from data.reader.jrcacquis_reader import * -from data.languages import lang_set, NLTK_LANGMAP, RCV2_LANGS_WITH_NLTK_STEMMING -from data.reader.rcv_reader import fetch_RCV1, fetch_RCV2, fetch_topic_hierarchy -from data.text_preprocessor import 
NLTKStemTokenizer, preprocess_documents -import pickle -import numpy as np -from sklearn.model_selection import train_test_split -from scipy.sparse import issparse import itertools -from tqdm import tqdm +import pickle import re +from os.path import exists + +import numpy as np +from nltk.corpus import stopwords from scipy.sparse import csr_matrix +from scipy.sparse import issparse +from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer +from sklearn.model_selection import train_test_split +from sklearn.preprocessing import MultiLabelBinarizer +from tqdm import tqdm + +from data.languages import NLTK_LANGMAP, RCV2_LANGS_WITH_NLTK_STEMMING +from data.reader.jrcacquis_reader import * +from data.reader.rcv_reader import fetch_RCV1, fetch_RCV2 +from data.text_preprocessor import NLTKStemTokenizer, preprocess_documents class MultilingualDataset: diff --git a/refactor/data/reader/jrcacquis_reader.py b/refactor/data/reader/jrcacquis_reader.py index c0441ed..e911996 100644 --- a/refactor/data/reader/jrcacquis_reader.py +++ b/refactor/data/reader/jrcacquis_reader.py @@ -1,19 +1,22 @@ from __future__ import print_function -import os, sys -from os.path import join + +import os +import pickle +import sys import tarfile import xml.etree.ElementTree as ET -from sklearn.datasets import get_data_home -import pickle -from util.file import download_file, list_dirs, list_files +import zipfile +from collections import Counter +from os.path import join +from random import shuffle + import rdflib from rdflib.namespace import RDF, SKOS -from rdflib import URIRef -import zipfile +from sklearn.datasets import get_data_home + from data.languages import JRC_LANGS -from collections import Counter -from random import shuffle from data.languages import lang_set +from util.file import download_file, list_dirs, list_files """ JRC Acquis' Nomenclature: diff --git a/refactor/data/reader/rcv_reader.py b/refactor/data/reader/rcv_reader.py index cd4b416..b3db098 100644 --- a/refactor/data/reader/rcv_reader.py +++ b/refactor/data/reader/rcv_reader.py @@ -1,15 +1,12 @@ -from zipfile import ZipFile -import xml.etree.ElementTree as ET -from data.languages import RCV2_LANGS_WITH_NLTK_STEMMING, RCV2_LANGS -from util.file import list_files -from sklearn.datasets import get_data_home -import gzip -from os.path import join, exists -from util.file import download_file_if_not_exists import re -from collections import Counter +import xml.etree.ElementTree as ET +from os.path import join, exists +from zipfile import ZipFile + import numpy as np -import sys + +from util.file import download_file_if_not_exists +from util.file import list_files """ RCV2's Nomenclature: diff --git a/refactor/data/reader/wikipedia_tools.py b/refactor/data/reader/wikipedia_tools.py index 83e11e3..9558fb6 100644 --- a/refactor/data/reader/wikipedia_tools.py +++ b/refactor/data/reader/wikipedia_tools.py @@ -1,16 +1,19 @@ from __future__ import print_function + # import ijson # from ijson.common import ObjectBuilder -import os, sys -from os.path import join -from bz2 import BZ2File +import os import pickle -from util.file import list_dirs, list_files, makedirs_if_not_exist -from itertools import islice import re +from bz2 import BZ2File +from itertools import islice +from os.path import join from xml.sax.saxutils import escape + import numpy as np +from util.file import list_dirs, list_files + policies = ["IN_ALL_LANGS", "IN_ANY_LANG"] """ diff --git a/refactor/data/text_preprocessor.py b/refactor/data/text_preprocessor.py index 
1a6e3ae..fcfddba 100644 --- a/refactor/data/text_preprocessor.py +++ b/refactor/data/text_preprocessor.py @@ -1,8 +1,9 @@ -from nltk.corpus import stopwords -from data.languages import NLTK_LANGMAP from nltk import word_tokenize +from nltk.corpus import stopwords from nltk.stem import SnowballStemmer +from data.languages import NLTK_LANGMAP + def preprocess_documents(documents, lang): tokens = NLTKStemTokenizer(lang, verbose=True) diff --git a/refactor/data/tsr_function__.py b/refactor/data/tsr_function__.py index 0af8690..c458029 100755 --- a/refactor/data/tsr_function__.py +++ b/refactor/data/tsr_function__.py @@ -1,8 +1,9 @@ import math + import numpy as np -from scipy.stats import t from joblib import Parallel, delayed from scipy.sparse import csr_matrix, csc_matrix +from scipy.stats import t def get_probs(tpr, fpr, pc): diff --git a/refactor/funnelling.py b/refactor/funnelling.py index 6c79ae9..4d19e1a 100644 --- a/refactor/funnelling.py +++ b/refactor/funnelling.py @@ -1,6 +1,6 @@ from models.learners import * -from view_generators import VanillaFunGen from util.common import _normalize +from view_generators import VanillaFunGen class DocEmbedderList: diff --git a/refactor/main.py b/refactor/main.py index d043d76..48936d0 100644 --- a/refactor/main.py +++ b/refactor/main.py @@ -1,11 +1,11 @@ from argparse import ArgumentParser -from funnelling import * -from view_generators import * + from data.dataset_builder import MultilingualDataset +from funnelling import * from util.common import MultilingualIndex, get_params, get_method_name from util.evaluation import evaluate from util.results_csv import CSVlog -from time import time +from view_generators import * def main(args): diff --git a/refactor/models/learners.py b/refactor/models/learners.py index 1c60072..2654109 100644 --- a/refactor/models/learners.py +++ b/refactor/models/learners.py @@ -1,10 +1,12 @@ -import numpy as np import time -from scipy.sparse import issparse -from sklearn.multiclass import OneVsRestClassifier -from sklearn.model_selection import GridSearchCV -from sklearn.svm import SVC + +import numpy as np from joblib import Parallel, delayed +from scipy.sparse import issparse +from sklearn.model_selection import GridSearchCV +from sklearn.multiclass import OneVsRestClassifier +from sklearn.svm import SVC + from util.standardizer import StandardizeTransformer diff --git a/refactor/models/lstm_class.py b/refactor/models/lstm_class.py index 98424f1..7f2cf59 100755 --- a/refactor/models/lstm_class.py +++ b/refactor/models/lstm_class.py @@ -1,7 +1,6 @@ #taken from https://github.com/prakashpandey9/Text-Classification-Pytorch/blob/master/models/LSTM.py -import torch -import torch.nn as nn from torch.autograd import Variable + from models.helpers import * diff --git a/refactor/models/pl_bert.py b/refactor/models/pl_bert.py index 48f5b9a..afb28b5 100644 --- a/refactor/models/pl_bert.py +++ b/refactor/models/pl_bert.py @@ -1,9 +1,10 @@ -import torch import pytorch_lightning as pl +import torch from torch.optim.lr_scheduler import StepLR from transformers import BertForSequenceClassification, AdamW -from util.pl_metrics import CustomF1, CustomK + from util.common import define_pad_length, pad +from util.pl_metrics import CustomF1, CustomK class BertModel(pl.LightningModule): diff --git a/refactor/models/pl_gru.py b/refactor/models/pl_gru.py index eaf7304..afb12e6 100644 --- a/refactor/models/pl_gru.py +++ b/refactor/models/pl_gru.py @@ -1,14 +1,15 @@ # Lightning modules, see 
https://pytorch-lightning.readthedocs.io/en/latest/lightning_module.html +import pytorch_lightning as pl import torch -from torch import nn import torch.nn.functional as F +from torch import nn from torch.autograd import Variable from torch.optim.lr_scheduler import StepLR from transformers import AdamW -import pytorch_lightning as pl + from models.helpers import init_embeddings -from util.pl_metrics import CustomF1, CustomK from util.common import define_pad_length, pad +from util.pl_metrics import CustomF1, CustomK class RecurrentModel(pl.LightningModule): diff --git a/refactor/util/common.py b/refactor/util/common.py index 0cd95e6..61ac52f 100644 --- a/refactor/util/common.py +++ b/refactor/util/common.py @@ -1,9 +1,9 @@ import numpy as np import torch -from tqdm import tqdm from sklearn.feature_extraction.text import TfidfVectorizer -from sklearn.preprocessing import normalize from sklearn.model_selection import train_test_split +from sklearn.preprocessing import normalize + from util.embeddings_manager import supervised_embeddings_tfidf diff --git a/refactor/util/embeddings_manager.py b/refactor/util/embeddings_manager.py index c0aca54..1d708fa 100644 --- a/refactor/util/embeddings_manager.py +++ b/refactor/util/embeddings_manager.py @@ -1,7 +1,9 @@ -from torchtext.vocab import Vectors -import torch from abc import ABC, abstractmethod + import numpy as np +import torch +from torchtext.vocab import Vectors + from util.SIF_embed import remove_pc diff --git a/refactor/util/evaluation.py b/refactor/util/evaluation.py index 03c1792..010d0e9 100644 --- a/refactor/util/evaluation.py +++ b/refactor/util/evaluation.py @@ -1,6 +1,7 @@ -from joblib import Parallel, delayed -from util.metrics import * import numpy as np +from joblib import Parallel, delayed + +from util.metrics import * def evaluation_metrics(y, y_): diff --git a/refactor/util/file.py b/refactor/util/file.py index 98c9910..8754f5a 100644 --- a/refactor/util/file.py +++ b/refactor/util/file.py @@ -1,6 +1,6 @@ +import urllib from os import listdir, makedirs from os.path import isdir, isfile, join, exists, dirname -import urllib from pathlib import Path diff --git a/refactor/util/pl_metrics.py b/refactor/util/pl_metrics.py index 9b44eb0..bf8aa99 100644 --- a/refactor/util/pl_metrics.py +++ b/refactor/util/pl_metrics.py @@ -1,5 +1,6 @@ import torch from pytorch_lightning.metrics import Metric + from util.common import is_false, is_true diff --git a/refactor/util/results_csv.py b/refactor/util/results_csv.py index df80c59..be0ff84 100644 --- a/refactor/util/results_csv.py +++ b/refactor/util/results_csv.py @@ -1,6 +1,7 @@ import os -import pandas as pd + import numpy as np +import pandas as pd class CSVlog: diff --git a/refactor/view_generators.py b/refactor/view_generators.py index e366d7d..6cdd4a9 100644 --- a/refactor/view_generators.py +++ b/refactor/view_generators.py @@ -16,16 +16,18 @@ This module contains the view generators that take care of computing the view sp - View generator (-b): generates document embedding via mBERT model. 
""" from abc import ABC, abstractmethod -from models.learners import * -from util.embeddings_manager import MuseLoader, XdotM, wce_matrix -from util.common import TfidfVectorizerMultilingual, _normalize -from models.pl_gru import RecurrentModel -from models.pl_bert import BertModel -from pytorch_lightning import Trainer -from data.datamodule import RecurrentDataModule, BertDataModule, tokenize -from pytorch_lightning.loggers import TensorBoardLogger, CSVLogger from time import time +from pytorch_lightning import Trainer +from pytorch_lightning.loggers import TensorBoardLogger + +from data.datamodule import RecurrentDataModule, BertDataModule, tokenize +from models.learners import * +from models.pl_bert import BertModel +from models.pl_gru import RecurrentModel +from util.common import TfidfVectorizerMultilingual, _normalize +from util.embeddings_manager import MuseLoader, XdotM, wce_matrix + class ViewGen(ABC): @abstractmethod From 30d2be245ce2eda087bde76de81b41168c6825a5 Mon Sep 17 00:00:00 2001 From: andrea Date: Tue, 26 Jan 2021 14:02:51 +0100 Subject: [PATCH 35/55] sketched out documentation --- refactor/data/datamodule.py | 32 +++++++-- refactor/funnelling.py | 34 +++++++++- refactor/main.py | 1 + refactor/view_generators.py | 128 ++++++++++++++++++++++++++++-------- 4 files changed, 160 insertions(+), 35 deletions(-) diff --git a/refactor/data/datamodule.py b/refactor/data/datamodule.py index 1121a58..da6ec92 100644 --- a/refactor/data/datamodule.py +++ b/refactor/data/datamodule.py @@ -88,14 +88,21 @@ class RecurrentDataset(Dataset): class RecurrentDataModule(pl.LightningDataModule): - def __init__(self, multilingualIndex, batchsize=64): + """ + Pytorch Lightning Datamodule to be deployed with RecurrentGen. + https://pytorch-lightning.readthedocs.io/en/latest/datamodules.html + """ + def __init__(self, multilingualIndex, batchsize=64, n_jobs=-1): """ - Pytorch-lightning DataModule: https://pytorch-lightning.readthedocs.io/en/latest/datamodules.html - :param multilingualIndex: - :param batchsize: + Init RecurrentDataModule. + :param multilingualIndex: MultilingualIndex, it is a dictionary of training and test documents + indexed by language code. + :param batchsize: int, number of sample per batch. + :param n_jobs: int, number of concurrent workers to be deployed (i.e., parallelizing data loading). """ self.multilingualIndex = multilingualIndex self.batchsize = batchsize + self.n_jobs = n_jobs super().__init__() def prepare_data(self, *args, **kwargs): @@ -128,15 +135,15 @@ class RecurrentDataModule(pl.LightningDataModule): lPad_index=self.multilingualIndex.l_pad()) def train_dataloader(self): - return DataLoader(self.training_dataset, batch_size=self.batchsize, num_workers=N_WORKERS, + return DataLoader(self.training_dataset, batch_size=self.batchsize, num_workers=self.n_jobs, collate_fn=self.training_dataset.collate_fn) def val_dataloader(self): - return DataLoader(self.val_dataset, batch_size=self.batchsize, num_workers=N_WORKERS, + return DataLoader(self.val_dataset, batch_size=self.batchsize, num_workers=self.n_jobs, collate_fn=self.val_dataset.collate_fn) def test_dataloader(self): - return DataLoader(self.test_dataset, batch_size=self.batchsize, num_workers=N_WORKERS, + return DataLoader(self.test_dataset, batch_size=self.batchsize, num_workers=self.n_jobs, collate_fn=self.test_dataset.collate_fn) @@ -156,7 +163,18 @@ def tokenize(l_raw, max_len): class BertDataModule(RecurrentDataModule): + """ + Pytorch Lightning Datamodule to be deployed with BertGen. 
+ https://pytorch-lightning.readthedocs.io/en/latest/datamodules.html + """ def __init__(self, multilingualIndex, batchsize=64, max_len=512): + """ + Init BertDataModule. + :param multilingualIndex: MultilingualIndex, it is a dictionary of training and test documents + indexed by language code. + :param batchsize: int, number of sample per batch. + :param max_len: int, max number of token per document. Absolute cap is 512. + """ super().__init__(multilingualIndex, batchsize) self.max_len = max_len diff --git a/refactor/funnelling.py b/refactor/funnelling.py index 4d19e1a..812a937 100644 --- a/refactor/funnelling.py +++ b/refactor/funnelling.py @@ -4,9 +4,13 @@ from view_generators import VanillaFunGen class DocEmbedderList: + """ + Class that takes care of calling fit and transform function for every init embedder. Every ViewGenerator should be + contained by this class in order to seamlessly train the overall architecture. + """ def __init__(self, embedder_list, probabilistic=True): """ - Class that takes care of calling fit and transform function for every init embedder. + Init the DocEmbedderList. :param embedder_list: list of embedders to be deployed :param probabilistic: whether to recast view generators output to vectors of posterior probabilities or not """ @@ -23,11 +27,22 @@ class DocEmbedderList: self.embedders = _tmp def fit(self, lX, ly): + """ + Fit all the ViewGenerators contained by DocEmbedderList. + :param lX: + :param ly: + :return: self + """ for embedder in self.embedders: embedder.fit(lX, ly) return self def transform(self, lX): + """ + Project documents by means of every ViewGenerators. Projections are then averaged together and returned. + :param lX: + :return: common latent space (averaged). + """ langs = sorted(lX.keys()) lZparts = {lang: None for lang in langs} @@ -40,14 +55,24 @@ class DocEmbedderList: else: lZparts[lang] += Z n_embedders = len(self.embedders) - return {lang: lZparts[lang]/n_embedders for lang in langs} + return {lang: lZparts[lang]/n_embedders for lang in langs} # Averaging feature spaces def fit_transform(self, lX, ly): return self.fit(lX, ly).transform(lX) class FeatureSet2Posteriors: + """ + Takes care of recasting features outputted by the embedders to vecotrs of posterior probabilities by means of + a multiclass SVM. + """ def __init__(self, embedder, l2=True, n_jobs=-1): + """ + Init the class. + :param embedder: ViewGen, view generators which does not natively outputs posterior probabilities. + :param l2: bool, whether to apply or not L2 normalization to the projection + :param n_jobs: int, number of concurrent workers. + """ self.embedder = embedder self.l2 = l2 self.n_jobs = n_jobs @@ -77,6 +102,11 @@ class FeatureSet2Posteriors: class Funnelling: + """ + Funnelling Architecture. It is composed by two tiers. The first-tier is a set of heterogeneous document embedders. + The second-tier (i.e., the metaclassifier), operates the classification of the common latent space computed by + the first-tier learners. 
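DocEmbedderList.transform above sums each view generator's language-wise projection and divides by the number of embedders, which assumes every view has already been recast to a common space (e.g., posterior probabilities over the same label set). A small numeric sketch of that aggregation with two fabricated views over two languages:

    import numpy as np

    views = [
        {'en': np.array([[0.2, 0.8], [0.6, 0.4]]), 'it': np.array([[0.1, 0.9]])},   # view generator 1
        {'en': np.array([[0.4, 0.6], [0.2, 0.8]]), 'it': np.array([[0.3, 0.7]])},   # view generator 2
    ]

    lZ = {lang: None for lang in views[0]}
    for lZ_view in views:
        for lang, Z in lZ_view.items():
            lZ[lang] = Z if lZ[lang] is None else lZ[lang] + Z
    lZ = {lang: Z / len(views) for lang, Z in lZ.items()}    # averaged common space
    print(lZ['en'])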
+ """ def __init__(self, first_tier: DocEmbedderList, meta_classifier: MetaClassifier, n_jobs=-1): self.first_tier = first_tier self.meta = meta_classifier diff --git a/refactor/main.py b/refactor/main.py index 48936d0..ebc43a3 100644 --- a/refactor/main.py +++ b/refactor/main.py @@ -26,6 +26,7 @@ def main(args): lMuse = MuseLoader(langs=sorted(lX.keys()), cache=args.muse_dir) multilingualIndex.index(lX, ly, lXte, lyte, l_pretrained_vocabulary=lMuse.vocabulary()) + # Init ViewGenerators and append them to embedder_list embedder_list = [] if args.post_embedder: posteriorEmbedder = VanillaFunGen(base_learner=get_learner(calibrate=True), n_jobs=args.n_jobs) diff --git a/refactor/view_generators.py b/refactor/view_generators.py index 6cdd4a9..384ec76 100644 --- a/refactor/view_generators.py +++ b/refactor/view_generators.py @@ -30,6 +30,10 @@ from util.embeddings_manager import MuseLoader, XdotM, wce_matrix class ViewGen(ABC): + """ + Abstract class for ViewGenerators implementations. Every ViewGen should implement these three methods in order to + be seamlessly integrated in the overall architecture. + """ @abstractmethod def fit(self, lX, ly): pass @@ -44,9 +48,13 @@ class ViewGen(ABC): class VanillaFunGen(ViewGen): + """ + View Generator (x): original funnelling architecture proposed by Moreo, Esuli and + Sebastiani in DOI: https://doi.org/10.1145/3326065 + """ def __init__(self, base_learner, first_tier_parameters=None, n_jobs=-1): """ - Original funnelling architecture proposed by Moreo, Esuli and Sebastiani in DOI: https://doi.org/10.1145/3326065 + Init Posterior Probabilities embedder (i.e., VanillaFunGen) :param base_learner: naive monolingual learners to be deployed as first-tier learners. Should be able to return posterior probabilities. :param base_learner: @@ -68,11 +76,10 @@ class VanillaFunGen(ViewGen): def transform(self, lX): """ - (1) Vectorize documents - (2) Project them according to the learners SVMs - (3) Apply L2 normalization to the projection - :param lX: - :return: + (1) Vectorize documents; (2) Project them according to the learners SVMs, finally (3) Apply L2 normalization + to the projection and returns it. + :param lX: dict {lang: indexed documents} + :return: document projection to the common latent space. """ lX = self.vectorizer.transform(lX) lZ = self.doc_projector.predict_proba(lX) @@ -84,10 +91,13 @@ class VanillaFunGen(ViewGen): class MuseGen(ViewGen): + """ + View Generator (m): generates document representation via MUSE embeddings (Fasttext multilingual word + embeddings). Document embeddings are obtained via weighted sum of document's constituent embeddings. + """ def __init__(self, muse_dir='../embeddings', n_jobs=-1): """ - generates document representation via MUSE embeddings (Fasttext multilingual word - embeddings). Document embeddings are obtained via weighted sum of document's constituent embeddings. + Init the MuseGen. :param muse_dir: string, path to folder containing muse embeddings :param n_jobs: int, number of concurrent workers """ @@ -99,6 +109,12 @@ class MuseGen(ViewGen): self.vectorizer = TfidfVectorizerMultilingual(sublinear_tf=True, use_idf=True) def fit(self, lX, ly): + """ + (1) Vectorize documents; (2) Load muse embeddings for words encountered while vectorizing. + :param lX: dict {lang: indexed documents} + :param ly: dict {lang: target vectors} + :return: self. 
+ """ print('# Fitting MuseGen (M)...') self.vectorizer.fit(lX) self.langs = sorted(lX.keys()) @@ -109,6 +125,12 @@ class MuseGen(ViewGen): return self def transform(self, lX): + """ + (1) Vectorize documents; (2) computes the weighted sum of MUSE embeddings found at document level, + finally (3) Apply L2 normalization embedding and returns it. + :param lX: dict {lang: indexed documents} + :return: document projection to the common latent space. + """ lX = self.vectorizer.transform(lX) XdotMUSE = Parallel(n_jobs=self.n_jobs)( delayed(XdotM)(lX[lang], self.lMuse[lang], sif=True) for lang in self.langs) @@ -121,10 +143,13 @@ class MuseGen(ViewGen): class WordClassGen(ViewGen): + """ + View Generator (w): generates document representation via Word-Class-Embeddings. + Document embeddings are obtained via weighted sum of document's constituent embeddings. + """ def __init__(self, n_jobs=-1): """ - generates document representation via Word-Class-Embeddings. - Document embeddings are obtained via weighted sum of document's constituent embeddings. + Init WordClassGen. :param n_jobs: int, number of concurrent workers """ super().__init__() @@ -134,6 +159,12 @@ class WordClassGen(ViewGen): self.vectorizer = TfidfVectorizerMultilingual(sublinear_tf=True, use_idf=True) def fit(self, lX, ly): + """ + (1) Vectorize documents; (2) Load muse embeddings for words encountered while vectorizing. + :param lX: dict {lang: indexed documents} + :param ly: dict {lang: target vectors} + :return: self. + """ print('# Fitting WordClassGen (W)...') lX = self.vectorizer.fit_transform(lX) self.langs = sorted(lX.keys()) @@ -144,6 +175,12 @@ class WordClassGen(ViewGen): return self def transform(self, lX): + """ + (1) Vectorize documents; (2) computes the weighted sum of Word-Class Embeddings found at document level, + finally (3) Apply L2 normalization embedding and returns it. + :param lX: dict {lang: indexed documents} + :return: document projection to the common latent space. + """ lX = self.vectorizer.transform(lX) XdotWce = Parallel(n_jobs=self.n_jobs)( delayed(XdotM)(lX[lang], self.lWce[lang], sif=True) for lang in self.langs) @@ -156,17 +193,28 @@ class WordClassGen(ViewGen): class RecurrentGen(ViewGen): + """ + View Generator (G): generates document embedding by means of a Gated Recurrent Units. The model can be + initialized with different (multilingual/aligned) word representations (e.g., MUSE, WCE, ecc.,). + Output dimension is (n_docs, 512). The training will happen end-to-end. At inference time, the model returns + the network internal state at the second feed-forward layer level. Training metrics are logged via TensorBoard. + """ def __init__(self, multilingualIndex, pretrained_embeddings, wce, batch_size=512, nepochs=50, gpus=0, n_jobs=-1, stored_path=None): """ - generates document embedding by means of a Gated Recurrent Units. The model can be - initialized with different (multilingual/aligned) word representations (e.g., MUSE, WCE, ecc.,). - Output dimension is (n_docs, 512). - :param multilingualIndex: - :param pretrained_embeddings: - :param wce: - :param gpus: - :param n_jobs: + Init RecurrentGen. + :param multilingualIndex: MultilingualIndex, it is a dictionary of training and test documents + indexed by language code. + :param pretrained_embeddings: dict {lang: tensor of embeddings}, it contains the pretrained embeddings to use + as embedding layer. + :param wce: Bool, whether to deploy Word-Class Embeddings (as proposed by A. Moreo). 
If True, supervised + embeddings are concatenated to the deployed supervised embeddings. WCE dimensionality is equal to + the number of target classes. + :param batch_size: int, number of samples in a batch. + :param nepochs: int, number of max epochs to train the model. + :param gpus: int, specifies how many GPUs to use per node. If False computation will take place on cpu. + :param n_jobs: int, number of concurrent workers (i.e., parallelizing data loading). + :param stored_path: str, path to a pretrained model. If None the model will be trained from scratch. """ super().__init__() self.multilingualIndex = multilingualIndex @@ -212,14 +260,15 @@ class RecurrentGen(ViewGen): def fit(self, lX, ly): """ + Train the Neural Network end-to-end. lX and ly are not directly used. We rather get them from the multilingual index used in the instantiation of the Dataset object (RecurrentDataset) in the GfunDataModule class. - :param lX: - :param ly: - :return: + :param lX: dict {lang: indexed documents} + :param ly: dict {lang: target vectors} + :return: self. """ print('# Fitting RecurrentGen (G)...') - recurrentDataModule = RecurrentDataModule(self.multilingualIndex, batchsize=self.batch_size) + recurrentDataModule = RecurrentDataModule(self.multilingualIndex, batchsize=self.batch_size, n_jobs=self.n_jobs) trainer = Trainer(gradient_clip_val=1e-1, gpus=self.gpus, logger=self.logger, max_epochs=self.nepochs, checkpoint_callback=False) @@ -236,9 +285,9 @@ class RecurrentGen(ViewGen): def transform(self, lX): """ - Project documents to the common latent space - :param lX: - :return: + Project documents to the common latent space. Output dimensionality is 512. + :param lX: dict {lang: indexed documents} + :return: documents projected to the common latent space. """ l_pad = self.multilingualIndex.l_pad() data = self.multilingualIndex.l_devel_index() @@ -255,7 +304,22 @@ class RecurrentGen(ViewGen): class BertGen(ViewGen): + """ + View Generator (b): generates document embedding via Bert model. The training happens end-to-end. + At inference time, the model returns the network internal state at the last original layer (i.e. 12th). Document + embeddings are the state associated with the "start" token. Training metrics are logged via TensorBoard. + """ def __init__(self, multilingualIndex, batch_size=128, nepochs=50, gpus=0, n_jobs=-1, stored_path=None): + """ + Init Bert model + :param multilingualIndex: MultilingualIndex, it is a dictionary of training and test documents + indexed by language code. + :param batch_size: int, number of samples per batch. + :param nepochs: int, number of max epochs to train the model. + :param gpus: int, specifies how many GPUs to use per node. If False computation will take place on cpu. + :param n_jobs: int, number of concurrent workers. + :param stored_path: str, path to a pretrained model. If None the model will be trained from scratch. + """ super().__init__() self.multilingualIndex = multilingualIndex self.nepochs = nepochs @@ -271,6 +335,14 @@ class BertGen(ViewGen): return BertModel(output_size=output_size, stored_path=self.stored_path, gpus=self.gpus) def fit(self, lX, ly): + """ + Train the Neural Network end-to-end. + lX and ly are not directly used. We rather get them from the multilingual index used in the instantiation + of the Dataset object (RecurrentDataset) in the GfunDataModule class. + :param lX: dict {lang: indexed documents} + :param ly: dict {lang: target vectors} + :return: self. 
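+        A minimal sketch of the intended call sequence (as noted above, the data is read from the
+        multilingual index rather than from lX/ly; batch_size and nepochs here are illustrative):
+            >>> bert_gen = BertGen(multilingualIndex, batch_size=4, nepochs=1, gpus=0)
+            >>> bert_gen.fit(lX, ly)
+            >>> lZ = bert_gen.transform(lX)   # dict {lang: (n_docs, 768) document embeddings}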
+ """ print('# Fitting BertGen (M)...') self.multilingualIndex.train_val_split(val_prop=0.2, max_val=2000, seed=1) bertDataModule = BertDataModule(self.multilingualIndex, batchsize=self.batch_size, max_len=512) @@ -281,7 +353,11 @@ class BertGen(ViewGen): return self def transform(self, lX): - # lX is raw text data. It has to be first indexed via Bert Tokenizer. + """ + Project documents to the common latent space. Output dimensionality is 768. + :param lX: dict {lang: indexed documents} + :return: documents projected to the common latent space. + """ data = self.multilingualIndex.l_devel_raw_index() data = tokenize(data, max_len=512) self.model.to('cuda' if self.gpus else 'cpu') From a5912a22a993560c1dfd732ed2b218d7e8f3e995 Mon Sep 17 00:00:00 2001 From: andrea Date: Tue, 26 Jan 2021 15:10:32 +0100 Subject: [PATCH 36/55] merged with refactor --- src/data/__init__.py | 0 src/data/datamodule.py | 222 +++++++++ src/data/dataset_builder.py | 712 ++++++++++++++++++++++++++++ src/data/languages.py | 42 ++ src/data/reader/__init__.py | 0 src/data/reader/jrcacquis_reader.py | 324 +++++++++++++ src/data/reader/rcv_reader.py | 222 +++++++++ src/data/reader/wikipedia_tools.py | 307 ++++++++++++ src/data/text_preprocessor.py | 34 ++ src/data/tsr_function__.py | 271 +++++++++++ src/funnelling.py | 124 +++++ src/main.py | 167 +++++++ src/models/helpers.py | 51 ++ src/models/learners.py | 224 +++++++++ src/models/lstm_class.py | 113 +++++ src/models/pl_bert.py | 183 +++++++ src/models/pl_gru.py | 266 +++++++++++ src/requirements.txt | 12 + src/run.sh | 6 + src/util/SIF_embed.py | 59 +++ src/util/common.py | 384 +++++++++++++++ src/util/embeddings_manager.py | 104 ++++ src/util/evaluation.py | 20 + src/util/file.py | 50 ++ src/util/metrics.py | 152 ++++++ src/util/pl_metrics.py | 141 ++++++ src/util/results_csv.py | 53 +++ src/util/standardizer.py | 36 ++ src/view_generators.py | 375 +++++++++++++++ 29 files changed, 4654 insertions(+) create mode 100644 src/data/__init__.py create mode 100644 src/data/datamodule.py create mode 100644 src/data/dataset_builder.py create mode 100644 src/data/languages.py create mode 100644 src/data/reader/__init__.py create mode 100644 src/data/reader/jrcacquis_reader.py create mode 100644 src/data/reader/rcv_reader.py create mode 100644 src/data/reader/wikipedia_tools.py create mode 100644 src/data/text_preprocessor.py create mode 100755 src/data/tsr_function__.py create mode 100644 src/funnelling.py create mode 100644 src/main.py create mode 100755 src/models/helpers.py create mode 100644 src/models/learners.py create mode 100755 src/models/lstm_class.py create mode 100644 src/models/pl_bert.py create mode 100644 src/models/pl_gru.py create mode 100644 src/requirements.txt create mode 100644 src/run.sh create mode 100644 src/util/SIF_embed.py create mode 100644 src/util/common.py create mode 100644 src/util/embeddings_manager.py create mode 100644 src/util/evaluation.py create mode 100644 src/util/file.py create mode 100644 src/util/metrics.py create mode 100644 src/util/pl_metrics.py create mode 100644 src/util/results_csv.py create mode 100644 src/util/standardizer.py create mode 100644 src/view_generators.py diff --git a/src/data/__init__.py b/src/data/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/data/datamodule.py b/src/data/datamodule.py new file mode 100644 index 0000000..da6ec92 --- /dev/null +++ b/src/data/datamodule.py @@ -0,0 +1,222 @@ +import numpy as np +import pytorch_lightning as pl +import torch +from torch.utils.data import 
Dataset, DataLoader +from transformers import BertTokenizer + +N_WORKERS = 8 + + +class RecurrentDataset(Dataset): + def __init__(self, lX, ly, lPad_index): + """ + :param lX: dict {lang_id : np.ndarray} + :param ly: + """ + self.lX = [] + self.ly = [] + self.lOffset = {} + self.lPad_index = lPad_index + + for lang, data in lX.items(): + offset = [len(self.lX)] + self.lX.extend(data) + offset.append(len(self.lX)) + self.lOffset[lang] = offset + + for lang, target in ly.items(): + self.ly.extend(target) + + def __len__(self): + return len(self.lX) + + def __getitem__(self, index): + X = self.lX[index] + y = self.ly[index] + return X, y, index, self._get_lang(index) + + def _get_lang(self, index): + for lang, l_range in self.lOffset.items(): + if index in range(l_range[0], l_range[1]): + return lang + + def collate_fn(self, data): + """ + Takes care of padding the batch and also check consistency of batch languages. Groups into dict {lang : lang_batch} + items sampled from the Dataset class. + :param data: + :return: + """ + lX_batch = {} + ly_batch = {} + current_lang = data[0][-1] + for d in data: + if d[-1] == current_lang: + if current_lang not in lX_batch.keys(): + lX_batch[current_lang] = [] + ly_batch[current_lang] = [] + lX_batch[current_lang].append(d[0]) + ly_batch[current_lang].append(d[1]) + else: + current_lang = d[-1] + lX_batch[current_lang] = [] + ly_batch[current_lang] = [] + lX_batch[current_lang].append(d[0]) + ly_batch[current_lang].append(d[1]) + + for lang in lX_batch.keys(): + lX_batch[lang] = self.pad(lX_batch[lang], pad_index=self.lPad_index[lang], + max_pad_length=self.define_pad_length(lX_batch[lang])) + lX_batch[lang] = torch.LongTensor(lX_batch[lang]) + ly_batch[lang] = torch.FloatTensor(ly_batch[lang]) + + return lX_batch, ly_batch + + @staticmethod + def define_pad_length(index_list): + lengths = [len(index) for index in index_list] + return int(np.mean(lengths) + np.std(lengths)) + + @staticmethod + def pad(index_list, pad_index, max_pad_length=None): + pad_length = np.max([len(index) for index in index_list]) + if max_pad_length is not None: + pad_length = min(pad_length, max_pad_length) + for i, indexes in enumerate(index_list): + index_list[i] = [pad_index] * (pad_length - len(indexes)) + indexes[:pad_length] + return index_list + + +class RecurrentDataModule(pl.LightningDataModule): + """ + Pytorch Lightning Datamodule to be deployed with RecurrentGen. + https://pytorch-lightning.readthedocs.io/en/latest/datamodules.html + """ + def __init__(self, multilingualIndex, batchsize=64, n_jobs=-1): + """ + Init RecurrentDataModule. + :param multilingualIndex: MultilingualIndex, it is a dictionary of training and test documents + indexed by language code. + :param batchsize: int, number of sample per batch. + :param n_jobs: int, number of concurrent workers to be deployed (i.e., parallelizing data loading). 
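+        A minimal usage sketch (the module is normally driven by a pytorch_lightning Trainer, as done
+        in RecurrentGen.fit; model is assumed to be a RecurrentModel LightningModule):
+            >>> dm = RecurrentDataModule(multilingualIndex, batchsize=64, n_jobs=4)
+            >>> trainer = Trainer(max_epochs=1)
+            >>> trainer.fit(model, datamodule=dm)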
+ """ + self.multilingualIndex = multilingualIndex + self.batchsize = batchsize + self.n_jobs = n_jobs + super().__init__() + + def prepare_data(self, *args, **kwargs): + pass + + def setup(self, stage=None): + if stage == 'fit' or stage is None: + l_train_index, l_train_target = self.multilingualIndex.l_train() + # Debug settings: reducing number of samples + l_train_index = {l: train[:5] for l, train in l_train_index.items()} + l_train_target = {l: target[:5] for l, target in l_train_target.items()} + + self.training_dataset = RecurrentDataset(l_train_index, l_train_target, + lPad_index=self.multilingualIndex.l_pad()) + + l_val_index, l_val_target = self.multilingualIndex.l_val() + # Debug settings: reducing number of samples + l_val_index = {l: train[:5] for l, train in l_val_index.items()} + l_val_target = {l: target[:5] for l, target in l_val_target.items()} + + self.val_dataset = RecurrentDataset(l_val_index, l_val_target, + lPad_index=self.multilingualIndex.l_pad()) + if stage == 'test' or stage is None: + l_test_index, l_test_target = self.multilingualIndex.l_test() + # Debug settings: reducing number of samples + l_test_index = {l: train[:5] for l, train in l_test_index.items()} + l_test_target = {l: target[:5] for l, target in l_test_target.items()} + + self.test_dataset = RecurrentDataset(l_test_index, l_test_target, + lPad_index=self.multilingualIndex.l_pad()) + + def train_dataloader(self): + return DataLoader(self.training_dataset, batch_size=self.batchsize, num_workers=self.n_jobs, + collate_fn=self.training_dataset.collate_fn) + + def val_dataloader(self): + return DataLoader(self.val_dataset, batch_size=self.batchsize, num_workers=self.n_jobs, + collate_fn=self.val_dataset.collate_fn) + + def test_dataloader(self): + return DataLoader(self.test_dataset, batch_size=self.batchsize, num_workers=self.n_jobs, + collate_fn=self.test_dataset.collate_fn) + + +def tokenize(l_raw, max_len): + """ + run Bert tokenization on dict {lang: list of samples}. + :param l_raw: + :param max_len: + :return: + """ + tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased') + l_tokenized = {} + for lang in l_raw.keys(): + output_tokenizer = tokenizer(l_raw[lang], truncation=True, max_length=max_len, padding='max_length') + l_tokenized[lang] = output_tokenizer['input_ids'] + return l_tokenized + + +class BertDataModule(RecurrentDataModule): + """ + Pytorch Lightning Datamodule to be deployed with BertGen. + https://pytorch-lightning.readthedocs.io/en/latest/datamodules.html + """ + def __init__(self, multilingualIndex, batchsize=64, max_len=512): + """ + Init BertDataModule. + :param multilingualIndex: MultilingualIndex, it is a dictionary of training and test documents + indexed by language code. + :param batchsize: int, number of sample per batch. + :param max_len: int, max number of token per document. Absolute cap is 512. 
+ """ + super().__init__(multilingualIndex, batchsize) + self.max_len = max_len + + def setup(self, stage=None): + if stage == 'fit' or stage is None: + l_train_raw, l_train_target = self.multilingualIndex.l_train_raw() + # Debug settings: reducing number of samples + l_train_raw = {l: train[:5] for l, train in l_train_raw.items()} + l_train_target = {l: target[:5] for l, target in l_train_target.items()} + + l_train_index = tokenize(l_train_raw, max_len=self.max_len) + self.training_dataset = RecurrentDataset(l_train_index, l_train_target, + lPad_index=self.multilingualIndex.l_pad()) + + l_val_raw, l_val_target = self.multilingualIndex.l_val_raw() + # Debug settings: reducing number of samples + l_val_raw = {l: train[:5] for l, train in l_val_raw.items()} + l_val_target = {l: target[:5] for l, target in l_val_target.items()} + + l_val_index = tokenize(l_val_raw, max_len=self.max_len) + self.val_dataset = RecurrentDataset(l_val_index, l_val_target, + lPad_index=self.multilingualIndex.l_pad()) + + if stage == 'test' or stage is None: + l_test_raw, l_test_target = self.multilingualIndex.l_test_raw() + # Debug settings: reducing number of samples + l_test_raw = {l: train[:5] for l, train in l_test_raw.items()} + l_test_target = {l: target[:5] for l, target in l_test_target.items()} + + l_test_index = tokenize(l_test_raw, max_len=self.max_len) + self.test_dataset = RecurrentDataset(l_test_index, l_test_target, + lPad_index=self.multilingualIndex.l_pad()) + + def train_dataloader(self): + """ + NB: Setting n_workers to > 0 will cause "OSError: [Errno 24] Too many open files" + :return: + """ + return DataLoader(self.training_dataset, batch_size=self.batchsize) + + def val_dataloader(self): + return DataLoader(self.val_dataset, batch_size=self.batchsize) + + def test_dataloader(self): + return DataLoader(self.test_dataset, batch_size=self.batchsize) diff --git a/src/data/dataset_builder.py b/src/data/dataset_builder.py new file mode 100644 index 0000000..0e91316 --- /dev/null +++ b/src/data/dataset_builder.py @@ -0,0 +1,712 @@ +import itertools +import pickle +import re +from os.path import exists + +import numpy as np +from nltk.corpus import stopwords +from scipy.sparse import csr_matrix +from scipy.sparse import issparse +from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer +from sklearn.model_selection import train_test_split +from sklearn.preprocessing import MultiLabelBinarizer +from tqdm import tqdm + +from data.languages import NLTK_LANGMAP, RCV2_LANGS_WITH_NLTK_STEMMING +from data.reader.jrcacquis_reader import * +from data.reader.rcv_reader import fetch_RCV1, fetch_RCV2 +from data.text_preprocessor import NLTKStemTokenizer, preprocess_documents + + +class MultilingualDataset: + """ + A multilingual dataset is a dictionary of training and test documents indexed by language code. + Train and test sets are represented as tuples of the type (X,Y,ids), where X is a matrix representation of the + documents (e.g., a document-by-term sparse csr_matrix), Y is a document-by-label binary np.array indicating the + labels of each document, and ids is a list of document-identifiers from the original collection. 
+ """ + + def __init__(self): + self.dataset_name = "" + self.multiling_dataset = {} + + def add(self, lang, Xtr, Ytr, Xte, Yte, tr_ids=None, te_ids=None): + self.multiling_dataset[lang] = ((Xtr, Ytr, tr_ids), (Xte, Yte, te_ids)) + + def save(self, file): + self.sort_indexes() + pickle.dump(self, open(file, 'wb'), pickle.HIGHEST_PROTOCOL) + return self + + def __getitem__(self, item): + if item in self.langs(): + return self.multiling_dataset[item] + return None + + @classmethod + def load(cls, file): + data = pickle.load(open(file, 'rb')) + data.sort_indexes() + return data + + @classmethod + def load_ids(cls, file): + data = pickle.load(open(file, 'rb')) + tr_ids = {lang:tr_ids for (lang,((_,_,tr_ids), (_,_,_))) in data.multiling_dataset.items()} + te_ids = {lang: te_ids for (lang, ((_, _, _), (_, _, te_ids))) in data.multiling_dataset.items()} + return tr_ids, te_ids + + def sort_indexes(self): + for (lang, ((Xtr,_,_),(Xte,_,_))) in self.multiling_dataset.items(): + if issparse(Xtr): Xtr.sort_indices() + if issparse(Xte): Xte.sort_indices() + + def set_view(self, categories=None, languages=None): + if categories is not None: + if isinstance(categories, int): + categories = np.array([categories]) + elif isinstance(categories, list): + categories = np.array(categories) + self.categories_view = categories + if languages is not None: + self.languages_view = languages + + def training(self, mask_numbers=False, target_as_csr=False): + return self.lXtr(mask_numbers), self.lYtr(as_csr=target_as_csr) + + def test(self, mask_numbers=False, target_as_csr=False): + return self.lXte(mask_numbers), self.lYte(as_csr=target_as_csr) + + def lXtr(self, mask_numbers=False): + proc = lambda x:_mask_numbers(x) if mask_numbers else x + # return {lang: Xtr for (lang, ((Xtr, _, _), _)) in self.multiling_dataset.items() if lang in self.langs()} + return {lang:proc(Xtr) for (lang, ((Xtr,_,_),_)) in self.multiling_dataset.items() if lang in self.langs()} + + def lXte(self, mask_numbers=False): + proc = lambda x: _mask_numbers(x) if mask_numbers else x + # return {lang: Xte for (lang, (_, (Xte, _, _))) in self.multiling_dataset.items() if lang in self.langs()} + return {lang:proc(Xte) for (lang, (_,(Xte,_,_))) in self.multiling_dataset.items() if lang in self.langs()} + + def lYtr(self, as_csr=False): + lY = {lang:self.cat_view(Ytr) for (lang, ((_,Ytr,_),_)) in self.multiling_dataset.items() if lang in self.langs()} + if as_csr: + lY = {l:csr_matrix(Y) for l,Y in lY.items()} + return lY + + def lYte(self, as_csr=False): + lY = {lang:self.cat_view(Yte) for (lang, (_,(_,Yte,_))) in self.multiling_dataset.items() if lang in self.langs()} + if as_csr: + lY = {l:csr_matrix(Y) for l,Y in lY.items()} + return lY + + def cat_view(self, Y): + if hasattr(self, 'categories_view'): + return Y[:,self.categories_view] + else: + return Y + + def langs(self): + if hasattr(self, 'languages_view'): + langs = self.languages_view + else: + langs = sorted(self.multiling_dataset.keys()) + return langs + + def num_categories(self): + return self.lYtr()[self.langs()[0]].shape[1] + + def show_dimensions(self): + def shape(X): + return X.shape if hasattr(X, 'shape') else len(X) + for (lang, ((Xtr, Ytr, IDtr), (Xte, Yte, IDte))) in self.multiling_dataset.items(): + if lang not in self.langs(): continue + print("Lang {}, Xtr={}, ytr={}, Xte={}, yte={}".format(lang, shape(Xtr), self.cat_view(Ytr).shape, shape(Xte), self.cat_view(Yte).shape)) + + def show_category_prevalences(self): + nC = self.num_categories() + accum_tr = np.zeros(nC, 
dtype=np.int) + accum_te = np.zeros(nC, dtype=np.int) + in_langs = np.zeros(nC, dtype=np.int) # count languages with at least one positive example (per category) + for (lang, ((Xtr, Ytr, IDtr), (Xte, Yte, IDte))) in self.multiling_dataset.items(): + if lang not in self.langs(): continue + prev_train = np.sum(self.cat_view(Ytr), axis=0) + prev_test = np.sum(self.cat_view(Yte), axis=0) + accum_tr += prev_train + accum_te += prev_test + in_langs += (prev_train>0)*1 + print(lang+'-train', prev_train) + print(lang+'-test', prev_test) + print('all-train', accum_tr) + print('all-test', accum_te) + + return accum_tr, accum_te, in_langs + + def set_labels(self, labels): + self.labels = labels + +def _mask_numbers(data): + mask_moredigit = re.compile(r'\s[\+-]?\d{5,}([\.,]\d*)*\b') + mask_4digit = re.compile(r'\s[\+-]?\d{4}([\.,]\d*)*\b') + mask_3digit = re.compile(r'\s[\+-]?\d{3}([\.,]\d*)*\b') + mask_2digit = re.compile(r'\s[\+-]?\d{2}([\.,]\d*)*\b') + mask_1digit = re.compile(r'\s[\+-]?\d{1}([\.,]\d*)*\b') + masked = [] + for text in tqdm(data, desc='masking numbers'): + text = ' ' + text + text = mask_moredigit.sub(' MoreDigitMask', text) + text = mask_4digit.sub(' FourDigitMask', text) + text = mask_3digit.sub(' ThreeDigitMask', text) + text = mask_2digit.sub(' TwoDigitMask', text) + text = mask_1digit.sub(' OneDigitMask', text) + masked.append(text.replace('.','').replace(',','').strip()) + return masked + + + + +# ---------------------------------------------------------------------------------------------------------------------- +# Helpers +# ---------------------------------------------------------------------------------------------------------------------- +def get_active_labels(doclist): + cat_list = set() + for d in doclist: + cat_list.update(d.categories) + return list(cat_list) + +def filter_by_categories(doclist, keep_categories): + catset = frozenset(keep_categories) + for d in doclist: + d.categories = list(set(d.categories).intersection(catset)) + +def __years_to_str(years): + if isinstance(years, list): + if len(years) > 1: + return str(years[0])+'-'+str(years[-1]) + return str(years[0]) + return str(years) + + +# ---------------------------------------------------------------------------------------------------------------------- +# Matrix builders +# ---------------------------------------------------------------------------------------------------------------------- +def build_independent_matrices(dataset_name, langs, training_docs, test_docs, label_names, wiki_docs=[], preprocess=True): + """ + Builds the document-by-term weighted matrices for each language. Representations are independent of each other, + i.e., each language-specific matrix lies in a dedicate feature space. + :param dataset_name: the name of the dataset (str) + :param langs: list of languages (str) + :param training_docs: map {lang:doc-list} where each doc is a tuple (text, categories, id) + :param test_docs: map {lang:doc-list} where each doc is a tuple (text, categories, id) + :param label_names: list of names of labels (str) + :param wiki_docs: doc-list (optional), if specified, project all wiki docs in the feature spaces built for the languages + :param preprocess: whether or not to apply language-specific text preprocessing (stopword removal and stemming) + :return: a MultilingualDataset. 
If wiki_docs has been specified, a dictionary lW is also returned, which indexes + by language the processed wikipedia documents in their respective language-specific feature spaces + """ + + mlb = MultiLabelBinarizer() + mlb.fit([label_names]) + + lW = {} + + multilingual_dataset = MultilingualDataset() + multilingual_dataset.dataset_name = dataset_name + multilingual_dataset.set_labels(mlb.classes_) + for lang in langs: + print("\nprocessing %d training, %d test, %d wiki for language <%s>" % + (len(training_docs[lang]), len(test_docs[lang]), len(wiki_docs[lang]) if wiki_docs else 0, lang)) + + tr_data, tr_labels, IDtr = zip(*training_docs[lang]) + te_data, te_labels, IDte = zip(*test_docs[lang]) + + if preprocess: + tfidf = TfidfVectorizer(strip_accents='unicode', min_df=3, sublinear_tf=True, + tokenizer=NLTKStemTokenizer(lang, verbose=True), + stop_words=stopwords.words(NLTK_LANGMAP[lang])) + else: + tfidf = TfidfVectorizer(strip_accents='unicode', min_df=3, sublinear_tf=True) + + Xtr = tfidf.fit_transform(tr_data) + Xte = tfidf.transform(te_data) + if wiki_docs: + lW[lang] = tfidf.transform(wiki_docs[lang]) + + Ytr = mlb.transform(tr_labels) + Yte = mlb.transform(te_labels) + + multilingual_dataset.add(lang, Xtr, Ytr, Xte, Yte, IDtr, IDte) + + multilingual_dataset.show_dimensions() + multilingual_dataset.show_category_prevalences() + + if wiki_docs: + return multilingual_dataset, lW + else: + return multilingual_dataset + + +# creates a MultilingualDataset where matrices shares a single yuxtaposed feature space +def build_juxtaposed_matrices(dataset_name, langs, training_docs, test_docs, label_names, preprocess=True): + """ + Builds the document-by-term weighted matrices for each language. Representations are not independent of each other, + since all of them lie on the same yuxtaposed feature space. + :param dataset_name: the name of the dataset (str) + :param langs: list of languages (str) + :param training_docs: map {lang:doc-list} where each doc is a tuple (text, categories, id) + :param test_docs: map {lang:doc-list} where each doc is a tuple (text, categories, id) + :param label_names: list of names of labels (str) + :param preprocess: whether or not to apply language-specific text preprocessing (stopword removal and stemming) + :return: a MultilingualDataset. 
If wiki_docs has been specified, a dictionary lW is also returned, which indexes + by language the processed wikipedia documents in their respective language-specific feature spaces + """ + + multiling_dataset = MultilingualDataset() + multiling_dataset.dataset_name = dataset_name + + mlb = MultiLabelBinarizer() + mlb.fit([label_names]) + + multiling_dataset.set_labels(mlb.classes_) + + tr_data_stack = [] + for lang in langs: + print("\nprocessing %d training and %d test for language <%s>" % (len(training_docs[lang]), len(test_docs[lang]), lang)) + tr_data, tr_labels, tr_ID = zip(*training_docs[lang]) + te_data, te_labels, te_ID = zip(*test_docs[lang]) + if preprocess: + tr_data = preprocess_documents(tr_data, lang) + te_data = preprocess_documents(te_data, lang) + tr_data_stack.extend(tr_data) + multiling_dataset.add(lang, tr_data, tr_labels, te_data, te_labels, tr_ID, te_ID) + + tfidf = TfidfVectorizer(strip_accents='unicode', min_df=3, sublinear_tf=True) + tfidf.fit(tr_data_stack) + + for lang in langs: + print("\nweighting documents for language <%s>" % (lang)) + (tr_data, tr_labels, tr_ID), (te_data, te_labels, te_ID) = multiling_dataset[lang] + Xtr = tfidf.transform(tr_data) + Xte = tfidf.transform(te_data) + Ytr = mlb.transform(tr_labels) + Yte = mlb.transform(te_labels) + multiling_dataset.add(lang,Xtr,Ytr,Xte,Yte,tr_ID,te_ID) + + multiling_dataset.show_dimensions() + return multiling_dataset + + +# ---------------------------------------------------------------------------------------------------------------------- +# Methods to recover the original documents from the MultilingualDataset's ids +# ---------------------------------------------------------------------------------------------------------------------- +""" +This method has been added a posteriori, to create document embeddings using the polylingual embeddings of the recent +article 'Word Translation without Parallel Data'; basically, it takes one of the splits and retrieves the RCV documents +from the doc ids and then pickles an object (tr_docs, te_docs, label_names) in the outpath +""" +def retrieve_rcv_documents_from_dataset(datasetpath, rcv1_data_home, rcv2_data_home, outpath): + + tr_ids, te_ids = MultilingualDataset.load_ids(datasetpath) + assert tr_ids.keys() == te_ids.keys(), 'inconsistent keys tr vs te' + langs = list(tr_ids.keys()) + + print('fetching the datasets') + rcv1_documents, labels_rcv1 = fetch_RCV1(rcv1_data_home, split='train') + rcv2_documents, labels_rcv2 = fetch_RCV2(rcv2_data_home, [l for l in langs if l != 'en']) + + filter_by_categories(rcv1_documents, labels_rcv2) + filter_by_categories(rcv2_documents, labels_rcv1) + + label_names = get_active_labels(rcv1_documents + rcv2_documents) + print('Active labels in RCV1/2 {}'.format(len(label_names))) + + print('rcv1: {} train, {} test, {} categories'.format(len(rcv1_documents), 0, len(label_names))) + print('rcv2: {} documents'.format(len(rcv2_documents)), Counter([doc.lang for doc in rcv2_documents])) + + all_docs = rcv1_documents + rcv2_documents + mlb = MultiLabelBinarizer() + mlb.fit([label_names]) + + dataset = MultilingualDataset() + for lang in langs: + analyzer = CountVectorizer(strip_accents='unicode', min_df=3, + stop_words=stopwords.words(NLTK_LANGMAP[lang])).build_analyzer() + + Xtr,Ytr,IDtr = zip(*[(d.text,d.categories,d.id) for d in all_docs if d.lang == lang and d.id in tr_ids[lang]]) + Xte,Yte,IDte = zip(*[(d.text,d.categories,d.id) for d in all_docs if d.lang == lang and d.id in te_ids[lang]]) + Xtr = [' '.join(analyzer(d)) for d 
in Xtr] + Xte = [' '.join(analyzer(d)) for d in Xte] + Ytr = mlb.transform(Ytr) + Yte = mlb.transform(Yte) + dataset.add(lang, Xtr, Ytr, Xte, Yte, IDtr, IDte) + + dataset.save(outpath) + +""" +Same thing but for JRC-Acquis +""" +def retrieve_jrc_documents_from_dataset(datasetpath, jrc_data_home, train_years, test_years, cat_policy, most_common_cat, outpath): + + tr_ids, te_ids = MultilingualDataset.load_ids(datasetpath) + assert tr_ids.keys() == te_ids.keys(), 'inconsistent keys tr vs te' + langs = list(tr_ids.keys()) + + print('fetching the datasets') + + cat_list = inspect_eurovoc(jrc_data_home, select=cat_policy) + training_docs, label_names = fetch_jrcacquis(langs=langs, data_path=jrc_data_home, years=train_years, + cat_filter=cat_list, cat_threshold=1, parallel=None, + most_frequent=most_common_cat) + test_docs, _ = fetch_jrcacquis(langs=langs, data_path=jrc_data_home, years=test_years, cat_filter=label_names, + parallel='force') + + def filter_by_id(doclist, ids): + ids_set = frozenset(itertools.chain.from_iterable(ids.values())) + return [x for x in doclist if (x.parallel_id+'__'+x.id) in ids_set] + + training_docs = filter_by_id(training_docs, tr_ids) + test_docs = filter_by_id(test_docs, te_ids) + + print('jrc: {} train, {} test, {} categories'.format(len(training_docs), len(test_docs), len(label_names))) + + mlb = MultiLabelBinarizer() + mlb.fit([label_names]) + + dataset = MultilingualDataset() + for lang in langs: + analyzer = CountVectorizer(strip_accents='unicode', min_df=3, + stop_words=stopwords.words(NLTK_LANGMAP[lang])).build_analyzer() + + Xtr,Ytr,IDtr = zip(*[(d.text,d.categories,d.parallel_id+'__'+d.id) for d in training_docs if d.lang == lang]) + Xte,Yte,IDte = zip(*[(d.text,d.categories,d.parallel_id+'__'+d.id) for d in test_docs if d.lang == lang]) + Xtr = [' '.join(analyzer(d)) for d in Xtr] + Xte = [' '.join(analyzer(d)) for d in Xte] + Ytr = mlb.transform(Ytr) + Yte = mlb.transform(Yte) + dataset.add(lang, Xtr, Ytr, Xte, Yte, IDtr, IDte) + + dataset.save(outpath) + +# ---------------------------------------------------------------------------------------------------------------------- +# Dataset Generators +# ---------------------------------------------------------------------------------------------------------------------- +def prepare_jrc_datasets(jrc_data_home, wiki_data_home, langs, train_years, test_years, cat_policy, most_common_cat=-1, max_wiki=5000, run=0): + from data.reader.wikipedia_tools import fetch_wikipedia_multilingual, random_wiki_sample + + + """ + Prepare all datasets for JRC-Acquis. The datasets include the "feature-independent" version, the + "feature-yuxtaposed" version, the monolingual version for the UpperBound, and the processed wikipedia matrices. + In all cases, training documents are strictly non-parallel, and test documents are strictly parallel + :param jrc_data_home: path to the raw JRC-Acquis documents (it will be downloaded if not found), and the path where + all splits will be generated + :param wiki_data_home: path to the wikipedia dump (see data/readers/wikipedia_tools.py) + :param langs: the list of languages to consider (as defined in data/languages.py) + :param train_years: a list of ints containing the years to be considered as training documents + :param test_years: a list of ints containing the years to be considered as test documents + :param cat_policy: a string indicating which category selection policy to apply. 
Valid policies are, e.g., "all" + (select all categories), "broadest" (select only the broadest concepts in the taxonomy), or "leaves" (select the + leaves concepts in the taxonomy). See inspect_eurovoc from data/reader/jrcacquis_reader.py for more details + :param most_common_cat: the maximum number of most common categories to consider, or -1 to keep them all + :param max_wiki: the maximum number of wikipedia documents to consider (default 5000) + :param run: a numeric label naming the random split (useful to keep track of different runs) + :return: None + """ + + name = 'JRCacquis' + run = '_run' + str(run) + config_name = 'jrc_nltk_' + __years_to_str(train_years) + \ + 'vs' + __years_to_str(test_years) + \ + '_' + cat_policy + \ + ('_top' + str(most_common_cat) if most_common_cat!=-1 else '') + \ + '_noparallel_processed' + + indep_path = join(jrc_data_home, config_name + run + '.pickle') + upper_path = join(jrc_data_home, config_name + run + '_upper.pickle') + yuxta_path = join(jrc_data_home, config_name + run + '_yuxtaposed.pickle') + wiki_path = join(jrc_data_home, config_name + run + '.wiki.pickle') + wiki_docs_path = join(jrc_data_home, config_name + '.wiki.raw.pickle') + + cat_list = inspect_eurovoc(jrc_data_home, select=cat_policy) + training_docs, label_names = fetch_jrcacquis(langs=langs, data_path=jrc_data_home, years=train_years, + cat_filter=cat_list, cat_threshold=1, parallel=None, + most_frequent=most_common_cat) + test_docs, _ = fetch_jrcacquis(langs=langs, data_path=jrc_data_home, years=test_years, cat_filter=label_names, + parallel='force') + + print('Generating feature-independent dataset...') + training_docs_no_parallel = random_sampling_avoiding_parallel(training_docs) + + def _group_by_lang(doc_list, langs): + return {lang: [(d.text, d.categories, d.parallel_id + '__' + d.id) for d in doc_list if d.lang == lang] + for lang in langs} + + training_docs = _group_by_lang(training_docs, langs) + training_docs_no_parallel = _group_by_lang(training_docs_no_parallel, langs) + test_docs = _group_by_lang(test_docs, langs) + if not exists(indep_path): + wiki_docs=None + if max_wiki>0: + if not exists(wiki_docs_path): + wiki_docs = fetch_wikipedia_multilingual(wiki_data_home, langs, min_words=50, deletions=False) + wiki_docs = random_wiki_sample(wiki_docs, max_wiki) + pickle.dump(wiki_docs, open(wiki_docs_path, 'wb'), pickle.HIGHEST_PROTOCOL) + else: + wiki_docs = pickle.load(open(wiki_docs_path, 'rb')) + wiki_docs = random_wiki_sample(wiki_docs, max_wiki) + + if wiki_docs: + lang_data, wiki_docs = build_independent_matrices(name, langs, training_docs_no_parallel, test_docs, label_names, wiki_docs) + pickle.dump(wiki_docs, open(wiki_path, 'wb'), pickle.HIGHEST_PROTOCOL) + else: + lang_data = build_independent_matrices(name, langs, training_docs_no_parallel, test_docs, label_names) + + lang_data.save(indep_path) + + print('Generating upper-bound (English-only) dataset...') + if not exists(upper_path): + training_docs_eng_only = {'en':training_docs['en']} + test_docs_eng_only = {'en':test_docs['en']} + build_independent_matrices(name, ['en'], training_docs_eng_only, test_docs_eng_only, label_names).save(upper_path) + + print('Generating yuxtaposed dataset...') + if not exists(yuxta_path): + build_juxtaposed_matrices(name, langs, training_docs_no_parallel, test_docs, label_names).save(yuxta_path) + + +def prepare_rcv_datasets(outpath, rcv1_data_home, rcv2_data_home, wiki_data_home, langs, + train_for_lang=1000, test_for_lang=1000, max_wiki=5000, preprocess=True, run=0): + from 
data.reader.wikipedia_tools import fetch_wikipedia_multilingual, random_wiki_sample + """ + Prepare all datasets for RCV1/RCV2. The datasets include the "feature-independent" version, the + "feature-yuxtaposed" version, the monolingual version for the UpperBound, and the processed wikipedia matrices. + + :param outpath: path where all splits will be dumped + :param rcv1_data_home: path to the RCV1-v2 dataset (English only) + :param rcv2_data_home: path to the RCV2 dataset (all languages other than English) + :param wiki_data_home: path to the wikipedia dump (see data/readers/wikipedia_tools.py) + :param langs: the list of languages to consider (as defined in data/languages.py) + :param train_for_lang: maximum number of training documents per language + :param test_for_lang: maximum number of test documents per language + :param max_wiki: the maximum number of wikipedia documents to consider (default 5000) + :param preprocess: whether or not to apply language-specific preprocessing (stopwords removal and stemming) + :param run: a numeric label naming the random split (useful to keep track of different runs) + :return: None + """ + + assert 'en' in langs, 'English is not in requested languages, but is needed for some datasets' + assert len(langs)>1, 'the multilingual dataset cannot be built with only one dataset' + assert not preprocess or set(langs).issubset(set(RCV2_LANGS_WITH_NLTK_STEMMING+['en'])), \ + "languages not in RCV1-v2/RCV2 scope or not in valid for NLTK's processing" + + name = 'RCV1/2' + run = '_run' + str(run) + config_name = 'rcv1-2_nltk_trByLang'+str(train_for_lang)+'_teByLang'+str(test_for_lang)+\ + ('_processed' if preprocess else '_raw') + + indep_path = join(outpath, config_name + run + '.pickle') + upper_path = join(outpath, config_name + run +'_upper.pickle') + yuxta_path = join(outpath, config_name + run +'_yuxtaposed.pickle') + wiki_path = join(outpath, config_name + run + '.wiki.pickle') + wiki_docs_path = join(outpath, config_name + '.wiki.raw.pickle') + + print('fetching the datasets') + rcv1_documents, labels_rcv1 = fetch_RCV1(rcv1_data_home, split='train') + rcv2_documents, labels_rcv2 = fetch_RCV2(rcv2_data_home, [l for l in langs if l!='en']) + filter_by_categories(rcv1_documents, labels_rcv2) + filter_by_categories(rcv2_documents, labels_rcv1) + + label_names = get_active_labels(rcv1_documents+rcv2_documents) + print('Active labels in RCV1/2 {}'.format(len(label_names))) + + print('rcv1: {} train, {} test, {} categories'.format(len(rcv1_documents), 0, len(label_names))) + print('rcv2: {} documents'.format(len(rcv2_documents)), Counter([doc.lang for doc in rcv2_documents])) + + lang_docs = {lang: [d for d in rcv1_documents + rcv2_documents if d.lang == lang] for lang in langs} + + # for the upper bound there are no parallel versions, so for the English case, we take as many documents as there + # would be in the multilingual case -- then we will extract from them only train_for_lang for the other cases + print('Generating upper-bound (English-only) dataset...') + train, test = train_test_split(lang_docs['en'], train_size=train_for_lang*len(langs), test_size=test_for_lang, shuffle=True) + train_lang_doc_map = {'en':[(d.text, d.categories, d.id) for d in train]} + test_lang_doc_map = {'en':[(d.text, d.categories, d.id) for d in test]} + build_independent_matrices(name, ['en'], train_lang_doc_map, test_lang_doc_map, label_names).save(upper_path) + + train_lang_doc_map['en'] = train_lang_doc_map['en'][:train_for_lang] + for lang in langs: + if lang=='en': 
continue # already split + test_take = min(test_for_lang, len(lang_docs[lang])-train_for_lang) + train, test = train_test_split(lang_docs[lang], train_size=train_for_lang, test_size=test_take, shuffle=True) + train_lang_doc_map[lang] = [(d.text, d.categories, d.id) for d in train] + test_lang_doc_map[lang] = [(d.text, d.categories, d.id) for d in test] + + print('Generating feature-independent dataset...') + wiki_docs=None + if max_wiki>0: + if not exists(wiki_docs_path): + wiki_docs = fetch_wikipedia_multilingual(wiki_data_home, langs, min_words=50, deletions=False) + wiki_docs = random_wiki_sample(wiki_docs, max_wiki) + pickle.dump(wiki_docs, open(wiki_docs_path, 'wb'), pickle.HIGHEST_PROTOCOL) + else: + wiki_docs = pickle.load(open(wiki_docs_path, 'rb')) + wiki_docs = random_wiki_sample(wiki_docs, max_wiki) + + if wiki_docs: + lang_data, wiki_docs_matrix = build_independent_matrices(name, langs, train_lang_doc_map, test_lang_doc_map, label_names, wiki_docs, preprocess) + pickle.dump(wiki_docs_matrix, open(wiki_path, 'wb'), pickle.HIGHEST_PROTOCOL) + else: + lang_data = build_independent_matrices(name, langs, train_lang_doc_map, test_lang_doc_map, label_names, wiki_docs, preprocess) + + lang_data.save(indep_path) + + print('Generating yuxtaposed dataset...') + build_juxtaposed_matrices(name, langs, train_lang_doc_map, test_lang_doc_map, label_names, preprocess).save(yuxta_path) + + +# ---------------------------------------------------------------------------------------------------------------------- +# Methods to generate full RCV and JRC datasets +# ---------------------------------------------------------------------------------------------------------------------- +def full_rcv_(rcv1_data_home, rcv2_data_home, outpath, langs): + + + print('fetching the datasets') + rcv1_train_documents, labels_rcv1 = fetch_RCV1(rcv1_data_home, split='train') + rcv1_test_documents, labels_rcv1_test = fetch_RCV1(rcv1_data_home, split='test') + rcv2_documents, labels_rcv2 = fetch_RCV2(rcv2_data_home, [l for l in langs if l != 'en']) + + filter_by_categories(rcv1_train_documents, labels_rcv2) + filter_by_categories(rcv1_test_documents, labels_rcv2) + filter_by_categories(rcv2_documents, labels_rcv1) + + label_names = get_active_labels(rcv1_train_documents + rcv2_documents) + print('Active labels in RCV1/2 {}'.format(len(label_names))) + + print('rcv1: {} train, {} test, {} categories'.format(len(rcv1_train_documents), len(rcv1_test_documents), len(label_names))) + print('rcv2: {} documents'.format(len(rcv2_documents)), Counter([doc.lang for doc in rcv2_documents])) + + mlb = MultiLabelBinarizer() + mlb.fit([label_names]) + + all_docs = rcv1_train_documents + rcv1_test_documents + rcv2_documents + lang_docs = {lang: [d for d in all_docs if d.lang == lang] for lang in langs} + + def get_ids(doclist): + return frozenset([d.id for d in doclist]) + + tr_ids = {'en': get_ids(rcv1_train_documents)} + te_ids = {'en': get_ids(rcv1_test_documents)} + for lang in langs: + if lang == 'en': continue + tr_ids[lang], te_ids[lang] = train_test_split([d.id for d in lang_docs[lang]], test_size=.3) + + dataset = MultilingualDataset() + dataset.dataset_name = 'RCV1/2-full' + for lang in langs: + print(f'processing {lang} with {len(tr_ids[lang])} training documents and {len(te_ids[lang])} documents') + analyzer = CountVectorizer( + strip_accents='unicode', min_df=3, stop_words=stopwords.words(NLTK_LANGMAP[lang]) + ).build_analyzer() + + Xtr,Ytr,IDtr = zip(*[(d.text,d.categories,d.id) for d in lang_docs[lang] if d.id in 
tr_ids[lang]]) + Xte,Yte,IDte = zip(*[(d.text,d.categories,d.id) for d in lang_docs[lang] if d.id in te_ids[lang]]) + Xtr = [' '.join(analyzer(d)) for d in Xtr] + Xte = [' '.join(analyzer(d)) for d in Xte] + Ytr = mlb.transform(Ytr) + Yte = mlb.transform(Yte) + dataset.add(lang, _mask_numbers(Xtr), Ytr, _mask_numbers(Xte), Yte, IDtr, IDte) + + dataset.save(outpath) + + +def full_jrc_(jrc_data_home, langs, train_years, test_years, outpath, cat_policy='all', most_common_cat=300): + + print('fetching the datasets') + cat_list = inspect_eurovoc(jrc_data_home, select=cat_policy) + training_docs, label_names = fetch_jrcacquis( + langs=langs, data_path=jrc_data_home, years=train_years, cat_filter=cat_list, cat_threshold=1, parallel=None, most_frequent=most_common_cat + ) + test_docs, _ = fetch_jrcacquis( + langs=langs, data_path=jrc_data_home, years=test_years, cat_filter=label_names, parallel='force' + ) + + def _group_by_lang(doc_list, langs): + return {lang: [d for d in doc_list if d.lang == lang] for lang in langs} + + training_docs = _group_by_lang(training_docs, langs) + test_docs = _group_by_lang(test_docs, langs) + + mlb = MultiLabelBinarizer() + mlb.fit([label_names]) + + dataset = MultilingualDataset() + data.dataset_name = 'JRC-Acquis-full' + for lang in langs: + analyzer = CountVectorizer( + strip_accents='unicode', min_df=3, stop_words=stopwords.words(NLTK_LANGMAP[lang]) + ).build_analyzer() + + Xtr, Ytr, IDtr = zip(*[(d.text, d.categories, d.parallel_id + '__' + d.id) for d in training_docs[lang] if d.lang == lang]) + Xte, Yte, IDte = zip(*[(d.text, d.categories, d.parallel_id + '__' + d.id) for d in test_docs[lang] if d.lang == lang]) + Xtr = [' '.join(analyzer(d)) for d in Xtr] + Xte = [' '.join(analyzer(d)) for d in Xte] + Ytr = mlb.transform(Ytr) + Yte = mlb.transform(Yte) + dataset.add(lang, _mask_numbers(Xtr), Ytr, _mask_numbers(Xte), Yte, IDtr, IDte) + + dataset.save(outpath) + + +#----------------------------------------------------------------------------------------------------------------------- +# MAIN BUILDER +#----------------------------------------------------------------------------------------------------------------------- + +if __name__=='__main__': + import sys + RCV1_PATH = '../Datasets/RCV1-v2/unprocessed_corpus' + RCV2_PATH = '../Datasets/RCV2' + JRC_DATAPATH = "../Datasets/JRC_Acquis_v3" + full_rcv_(RCV1_PATH, RCV2_PATH, outpath='../rcv2/rcv1-2_doclist_full_processed.pickle', langs=RCV2_LANGS_WITH_NLTK_STEMMING + ['en']) + # full_jrc_(JRC_DATAPATH, lang_set['JRC_NLTK'], train_years=list(range(1958, 2006)), test_years=[2006], outpath='../jrc_acquis/jrc_doclist_1958-2005vs2006_all_top300_full_processed.pickle', cat_policy='all', most_common_cat=300) + sys.exit(0) + + # datasetpath = '../jrc_acquis/jrc_doclist_1958-2005vs2006_all_top300_full_processed.pickle' # '../rcv2/rcv1-2_doclist_full_processed.pickle' + # data = MultilingualDataset.load(datasetpath) + # data.dataset_name='JRC-Acquis-full'#'RCV1/2-full' + # for lang in RCV2_LANGS_WITH_NLTK_STEMMING + ['en']: + # (Xtr, ytr, idtr), (Xte, yte, idte) = data.multiling_dataset[lang] + # data.multiling_dataset[lang] = ((_mask_numbers(Xtr), ytr, idtr), (_mask_numbers(Xte), yte, idte)) + # data.save('../jrc_acquis/jrc_doclist_1958-2005vs2006_all_top300_full_processed.pickle')#'../rcv2/rcv1-2_doclist_full_processed_2.pickle') + # sys.exit(0) + + assert len(sys.argv) == 5, "wrong number of arguments; required: " \ + " " + + JRC_DATAPATH = sys.argv[1] # "../Datasets/JRC_Acquis_v3" + RCV1_PATH = sys.argv[2] 
#'../Datasets/RCV1-v2/unprocessed_corpus' + RCV2_PATH = sys.argv[3] #'../Datasets/RCV2' + WIKI_DATAPATH = sys.argv[4] #"../Datasets/Wikipedia/multilingual_docs_JRC_NLTK" + + langs = lang_set['JRC_NLTK'] + max_wiki = 5000 + + for run in range(0,10): + print('Building JRC-Acquis datasets run', run) + prepare_jrc_datasets(JRC_DATAPATH, WIKI_DATAPATH, langs, + train_years=list(range(1958, 2006)), test_years=[2006], max_wiki=max_wiki, + cat_policy='all', most_common_cat=300, run=run) + + print('Building RCV1-v2/2 datasets run', run) + prepare_rcv_datasets(RCV2_PATH, RCV1_PATH, RCV2_PATH, WIKI_DATAPATH, RCV2_LANGS_WITH_NLTK_STEMMING + ['en'], + train_for_lang=1000, test_for_lang=1000, max_wiki=max_wiki, run=run) + + # uncomment this code if you want to retrieve the original documents to generate the data splits for PLE + # (make sure you have not modified the above parameters, or adapt the following paths accordingly...) + # datasetpath = join(RCV2_PATH,'rcv1-2_nltk_trByLang1000_teByLang1000_processed_run{}.pickle'.format(run)) + # outpath = datasetpath.replace('_nltk_','_doclist_') + # retrieve_rcv_documents_from_dataset(datasetpath, RCV1_PATH, RCV2_PATH, outpath) + + # datasetpath = join(JRC_DATAPATH, 'jrc_nltk_1958-2005vs2006_all_top300_noparallel_processed_run{}.pickle'.format(run)) + # outpath = datasetpath.replace('_nltk_', '_doclist_') + # retrieve_jrc_documents_from_dataset(datasetpath, JRC_DATAPATH, train_years=list(range(1958, 2006)), test_years=[2006], cat_policy='all', most_common_cat=300, outpath=outpath) + + + diff --git a/src/data/languages.py b/src/data/languages.py new file mode 100644 index 0000000..2d03d8e --- /dev/null +++ b/src/data/languages.py @@ -0,0 +1,42 @@ +""" +bg = Bulgarian +cs = Czech +da = Danish +de = German +el = Greek +en = English +es = Spanish +et = Estonian +fi = Finnish +fr = French +hu = Hungarian +it = Italian +lt = Lithuanian +lv = Latvian +nl = Dutch +mt = Maltese +pl = Polish +pt = Portuguese +ro = Romanian +sk = Slovak +sl = Slovene +sv = Swedish +""" + +NLTK_LANGMAP = {'da': 'danish', 'nl': 'dutch', 'en': 'english', 'fi': 'finnish', 'fr': 'french', 'de': 'german', + 'hu': 'hungarian', 'it': 'italian', 'pt': 'portuguese', 'ro': 'romanian', 'es': 'spanish', 'sv': 'swedish'} + + +#top 10 languages in wikipedia order by the number of articles +#LANGS_10_MOST_WIKI = ['en','fr','sv','de','es','it','pt','nl','pl','ro'] + +#all languages in JRC-acquis v3 +JRC_LANGS = ['bg','cs','da','de','el','en','es','et','fi','fr','hu','it','lt','lv','mt','nl','pl','pt','ro','sk','sl','sv'] +JRC_LANGS_WITH_NLTK_STEMMING = ['da', 'nl', 'en', 'fi', 'fr', 'de', 'hu', 'it', 'pt', 'es', 'sv'] # 'romanian deleted for incompatibility issues' + +RCV2_LANGS = ['ru', 'de', 'fr', 'sv', 'no', 'da', 'pt', 'it', 'es', 'jp', 'htw', 'nl'] +RCV2_LANGS_WITH_NLTK_STEMMING = ['de', 'fr', 'sv', 'da', 'pt', 'it', 'es', 'nl'] + +lang_set = {'JRC_NLTK':JRC_LANGS_WITH_NLTK_STEMMING, 'JRC':JRC_LANGS, + 'RCV2_NLTK':RCV2_LANGS_WITH_NLTK_STEMMING, 'RCV2':RCV2_LANGS} + diff --git a/src/data/reader/__init__.py b/src/data/reader/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/data/reader/jrcacquis_reader.py b/src/data/reader/jrcacquis_reader.py new file mode 100644 index 0000000..e911996 --- /dev/null +++ b/src/data/reader/jrcacquis_reader.py @@ -0,0 +1,324 @@ +from __future__ import print_function + +import os +import pickle +import sys +import tarfile +import xml.etree.ElementTree as ET +import zipfile +from collections import Counter +from os.path import join +from random 
import shuffle + +import rdflib +from rdflib.namespace import RDF, SKOS +from sklearn.datasets import get_data_home + +from data.languages import JRC_LANGS +from data.languages import lang_set +from util.file import download_file, list_dirs, list_files + +""" +JRC Acquis' Nomenclature: +bg = Bulgarian +cs = Czech +da = Danish +de = German +el = Greek +en = English +es = Spanish +et = Estonian +fi = Finnish +fr = French +hu = Hungarian +it = Italian +lt = Lithuanian +lv = Latvian +nl = Dutch +mt = Maltese +pl = Polish +pt = Portuguese +ro = Romanian +sk = Slovak +sl = Slovene +sv = Swedish +""" + +class JRCAcquis_Document: + def __init__(self, id, name, lang, year, head, body, categories): + self.id = id + self.parallel_id = name + self.lang = lang + self.year = year + self.text = body if not head else head + "\n" + body + self.categories = categories + +# this is a workaround... for some reason, acutes are codified in a non-standard manner in titles +# however, it seems that the title is often appearing as the first paragraph in the text/body (with +# standard codification), so it might be preferable not to read the header after all (as here by default) +def _proc_acute(text): + for ch in ['a','e','i','o','u']: + text = text.replace('%'+ch+'acute%',ch) + return text + +def parse_document(file, year, head=False): + root = ET.parse(file).getroot() + + doc_name = root.attrib['n'] # e.g., '22006A0211(01)' + doc_lang = root.attrib['lang'] # e.g., 'es' + doc_id = root.attrib['id'] # e.g., 'jrc22006A0211_01-es' + doc_categories = [cat.text for cat in root.findall('.//teiHeader/profileDesc/textClass/classCode[@scheme="eurovoc"]')] + doc_head = _proc_acute(root.find('.//text/body/head').text) if head else '' + doc_body = '\n'.join([p.text for p in root.findall('.//text/body/div[@type="body"]/p')]) + + def raise_if_empty(field, from_file): + if isinstance(field, str): + if not field.strip(): + raise ValueError("Empty field in file %s" % from_file) + + raise_if_empty(doc_name, file) + raise_if_empty(doc_lang, file) + raise_if_empty(doc_id, file) + if head: raise_if_empty(doc_head, file) + raise_if_empty(doc_body, file) + + return JRCAcquis_Document(id=doc_id, name=doc_name, lang=doc_lang, year=year, head=doc_head, body=doc_body, categories=doc_categories) + +# removes documents without a counterpart in all other languages +def _force_parallel(doclist, langs): + n_langs = len(langs) + par_id_count = Counter([d.parallel_id for d in doclist]) + parallel_doc_ids = set([id for id,count in par_id_count.items() if count==n_langs]) + return [doc for doc in doclist if doc.parallel_id in parallel_doc_ids] + +def random_sampling_avoiding_parallel(doclist): + random_order = list(range(len(doclist))) + shuffle(random_order) + sampled_request = [] + parallel_ids = set() + for ind in random_order: + pid = doclist[ind].parallel_id + if pid not in parallel_ids: + sampled_request.append(doclist[ind]) + parallel_ids.add(pid) + print('random_sampling_no_parallel:: from {} documents to {} documents'.format(len(doclist), len(sampled_request))) + return sampled_request + + +#filters out documents which do not contain any category in the cat_filter list, and filter all labels not in cat_filter +def _filter_by_category(doclist, cat_filter): + if not isinstance(cat_filter, frozenset): + cat_filter = frozenset(cat_filter) + filtered = [] + for doc in doclist: + doc.categories = list(cat_filter & set(doc.categories)) + if doc.categories: + doc.categories.sort() + filtered.append(doc) + print("filtered %d documents out without 
categories in the filter list" % (len(doclist) - len(filtered))) + return filtered + +#filters out categories with less than cat_threshold documents (and filters documents containing those categories) +def _filter_by_frequency(doclist, cat_threshold): + cat_count = Counter() + for d in doclist: + cat_count.update(d.categories) + + freq_categories = [cat for cat,count in cat_count.items() if count>cat_threshold] + freq_categories.sort() + return _filter_by_category(doclist, freq_categories), freq_categories + +#select top most_frequent categories (and filters documents containing those categories) +def _most_common(doclist, most_frequent): + cat_count = Counter() + for d in doclist: + cat_count.update(d.categories) + + freq_categories = [cat for cat,count in cat_count.most_common(most_frequent)] + freq_categories.sort() + return _filter_by_category(doclist, freq_categories), freq_categories + +def _get_categories(request): + final_cats = set() + for d in request: + final_cats.update(d.categories) + return list(final_cats) + +def fetch_jrcacquis(langs=None, data_path=None, years=None, ignore_unclassified=True, cat_filter=None, cat_threshold=0, + parallel=None, most_frequent=-1, DOWNLOAD_URL_BASE ='http://optima.jrc.it/Acquis/JRC-Acquis.3.0/corpus/'): + + assert parallel in [None, 'force', 'avoid'], 'parallel mode not supported' + if not langs: + langs = JRC_LANGS + else: + if isinstance(langs, str): langs = [langs] + for l in langs: + if l not in JRC_LANGS: + raise ValueError('Language %s is not among the valid languages in JRC-Acquis v3' % l) + + if not data_path: + data_path = get_data_home() + + if not os.path.exists(data_path): + os.mkdir(data_path) + + request = [] + total_read = 0 + for l in langs: + file_name = 'jrc-'+l+'.tgz' + archive_path = join(data_path, file_name) + + if not os.path.exists(archive_path): + print("downloading language-specific dataset (once and for all) into %s" % data_path) + DOWNLOAD_URL = join(DOWNLOAD_URL_BASE, file_name) + download_file(DOWNLOAD_URL, archive_path) + print("untarring dataset...") + tarfile.open(archive_path, 'r:gz').extractall(data_path) + + documents_dir = join(data_path, l) + + print("Reading documents...") + read = 0 + for dir in list_dirs(documents_dir): + year = int(dir) + if years==None or year in years: + year_dir = join(documents_dir,dir) + pickle_name = join(data_path, 'jrc_' + l + '_' + dir + '.pickle') + if os.path.exists(pickle_name): + print("loading from file %s" % pickle_name) + l_y_documents = pickle.load(open(pickle_name, "rb")) + read += len(l_y_documents) + else: + l_y_documents = [] + all_documents = list_files(year_dir) + empty = 0 + for i,doc_file in enumerate(all_documents): + try: + jrc_doc = parse_document(join(year_dir, doc_file), year) + except ValueError: + jrc_doc = None + + if jrc_doc and (not ignore_unclassified or jrc_doc.categories): + l_y_documents.append(jrc_doc) + else: empty += 1 + if len(all_documents)>50 and ((i+1) % (len(all_documents)/50) == 0): + print('\r\tfrom %s: completed %d%%' % (year_dir, int((i+1)*100.0/len(all_documents))), end='') + read+=1 + print('\r\tfrom %s: completed 100%% read %d documents (discarded %d without categories or empty fields)\n' % (year_dir, i+1, empty), end='') + print("\t\t(Pickling object for future runs in %s)" % pickle_name) + pickle.dump(l_y_documents, open(pickle_name, 'wb'), pickle.HIGHEST_PROTOCOL) + request += l_y_documents + print("Read %d documents for language %s\n" % (read, l)) + total_read += read + print("Read %d documents in total" % (total_read)) + + if 
parallel=='force': + request = _force_parallel(request, langs) + elif parallel == 'avoid': + request = random_sampling_avoiding_parallel(request) + + final_cats = _get_categories(request) + + if cat_filter: + request = _filter_by_category(request, cat_filter) + final_cats = _get_categories(request) + if cat_threshold > 0: + request, final_cats = _filter_by_frequency(request, cat_threshold) + if most_frequent != -1 and len(final_cats) > most_frequent: + request, final_cats = _most_common(request, most_frequent) + + return request, final_cats + +def print_cat_analysis(request): + cat_count = Counter() + for d in request: + cat_count.update(d.categories) + print("Number of active categories: {}".format(len(cat_count))) + print(cat_count.most_common()) + +# inspects the Eurovoc thesaurus in order to select a subset of categories +# currently, only 'broadest' policy (i.e., take all categories with no parent category), and 'all' is implemented +def inspect_eurovoc(data_path, eurovoc_skos_core_concepts_filename='eurovoc_in_skos_core_concepts.rdf', + eurovoc_url="http://publications.europa.eu/mdr/resource/thesaurus/eurovoc-20160630-0/skos/eurovoc_in_skos_core_concepts.zip", + select="broadest"): + + fullpath_pickle = join(data_path, select+'_concepts.pickle') + if os.path.exists(fullpath_pickle): + print("Pickled object found in %s. Loading it." % fullpath_pickle) + return pickle.load(open(fullpath_pickle,'rb')) + + fullpath = join(data_path, eurovoc_skos_core_concepts_filename) + if not os.path.exists(fullpath): + print("Path %s does not exist. Trying to download the skos EuroVoc file from %s" % (data_path, eurovoc_url)) + download_file(eurovoc_url, fullpath) + print("Unzipping file...") + zipped = zipfile.ZipFile(data_path + '.zip', 'r') + zipped.extract("eurovoc_in_skos_core_concepts.rdf", data_path) + zipped.close() + + print("Parsing %s" %fullpath) + g = rdflib.Graph() + g.parse(location=fullpath, format="application/rdf+xml") + + if select == "all": + print("Selecting all concepts") + all_concepts = list(g.subjects(RDF.type, SKOS.Concept)) + all_concepts = [c.toPython().split('/')[-1] for c in all_concepts] + all_concepts.sort() + selected_concepts = all_concepts + elif select=="broadest": + print("Selecting broadest concepts (those without any other broader concept linked to it)") + all_concepts = set(g.subjects(RDF.type, SKOS.Concept)) + narrower_concepts = set(g.subjects(SKOS.broader, None)) + broadest_concepts = [c.toPython().split('/')[-1] for c in (all_concepts - narrower_concepts)] + broadest_concepts.sort() + selected_concepts = broadest_concepts + elif select=="leaves": + print("Selecting leaves concepts (those not linked as broader of any other concept)") + all_concepts = set(g.subjects(RDF.type, SKOS.Concept)) + broad_concepts = set(g.objects(None, SKOS.broader)) + leave_concepts = [c.toPython().split('/')[-1] for c in (all_concepts - broad_concepts)] + leave_concepts.sort() + selected_concepts = leave_concepts + else: + raise ValueError("Selection policy %s is not currently supported" % select) + + print("%d %s concepts found" % (len(selected_concepts), leave_concepts)) + print("Pickling concept list for faster further requests in %s" % fullpath_pickle) + pickle.dump(selected_concepts, open(fullpath_pickle, 'wb'), pickle.HIGHEST_PROTOCOL) + + return selected_concepts + +if __name__ == '__main__': + + def single_label_fragment(doclist): + single = [d for d in doclist if len(d.categories) < 2] + final_categories = set([d.categories[0] if d.categories else [] for d in single]) + 
print('{} single-label documents ({} categories) from the original {} documents'.format(len(single), + len(final_categories), + len(doclist))) + return single, list(final_categories) + + train_years = list(range(1986, 2006)) + test_years = [2006] + cat_policy = 'leaves' + most_common_cat = 300 + # JRC_DATAPATH = "/media/moreo/1TB Volume/Datasets/JRC_Acquis_v3" + JRC_DATAPATH = "/storage/andrea/FUNNELING/data/JRC_Acquis_v3" + langs = lang_set['JRC_NLTK'] + cat_list = inspect_eurovoc(JRC_DATAPATH, select=cat_policy) + sys.exit() + + training_docs, label_names = fetch_jrcacquis(langs=langs, data_path=JRC_DATAPATH, years=train_years,cat_filter=cat_list, cat_threshold=1, parallel=None,most_frequent=most_common_cat) + test_docs, label_namestest = fetch_jrcacquis(langs=langs, data_path=JRC_DATAPATH, years=test_years, cat_filter=label_names,parallel='force') + + print('JRC-train: {} documents, {} labels'.format(len(training_docs), len(label_names))) + print('JRC-test: {} documents, {} labels'.format(len(test_docs), len(label_namestest))) + + training_docs, label_names = single_label_fragment(training_docs) + test_docs, label_namestest = single_label_fragment(test_docs) + + print('JRC-train: {} documents, {} labels'.format(len(training_docs), len(label_names))) + print('JRC-test: {} documents, {} labels'.format(len(test_docs), len(label_namestest))) + + diff --git a/src/data/reader/rcv_reader.py b/src/data/reader/rcv_reader.py new file mode 100644 index 0000000..b3db098 --- /dev/null +++ b/src/data/reader/rcv_reader.py @@ -0,0 +1,222 @@ +import re +import xml.etree.ElementTree as ET +from os.path import join, exists +from zipfile import ZipFile + +import numpy as np + +from util.file import download_file_if_not_exists +from util.file import list_files + +""" +RCV2's Nomenclature: +ru = Russian +da = Danish +de = German +es = Spanish +lat = Spanish Latin-American (actually is also 'es' in the collection) +fr = French +it = Italian +nl = Dutch +pt = Portuguese +sv = Swedish +ja = Japanese +htw = Chinese +no = Norwegian +""" + +RCV1_TOPICHIER_URL = "http://www.ai.mit.edu/projects/jmlr/papers/volume5/lewis04a/a02-orig-topics-hierarchy/rcv1.topics.hier.orig" +RCV1PROC_BASE_URL= 'http://www.ai.mit.edu/projects/jmlr/papers/volume5/lewis04a/a12-token-files' +RCV1_BASE_URL = "http://www.daviddlewis.com/resources/testcollections/rcv1/" +RCV2_BASE_URL = "http://trec.nist.gov/data/reuters/reuters.html" + +rcv1_test_data_gz = ['lyrl2004_tokens_test_pt0.dat.gz', + 'lyrl2004_tokens_test_pt1.dat.gz', + 'lyrl2004_tokens_test_pt2.dat.gz', + 'lyrl2004_tokens_test_pt3.dat.gz'] + +rcv1_train_data_gz = ['lyrl2004_tokens_train.dat.gz'] + +rcv1_doc_cats_data_gz = 'rcv1-v2.topics.qrels.gz' + +RCV2_LANG_DIR = {'ru':'REUTE000', + 'de':'REUTE00A', + 'fr':'REUTE00B', + 'sv':'REUTE001', + 'no':'REUTE002', + 'da':'REUTE003', + 'pt':'REUTE004', + 'it':'REUTE005', + 'es':'REUTE006', + 'lat':'REUTE007', + 'jp':'REUTE008', + 'htw':'REUTE009', + 'nl':'REUTERS_'} + + +class RCV_Document: + + def __init__(self, id, text, categories, date='', lang=None): + self.id = id + self.date = date + self.lang = lang + self.text = text + self.categories = categories + + +class ExpectedLanguageException(Exception): pass +class IDRangeException(Exception): pass + + +nwords = [] + +def parse_document(xml_content, assert_lang=None, valid_id_range=None): + root = ET.fromstring(xml_content) + if assert_lang: + if assert_lang not in root.attrib.values(): + if assert_lang != 'jp' or 'ja' not in root.attrib.values(): # some documents are attributed to 
'ja', others to 'jp' + raise ExpectedLanguageException('error: document of a different language') + + doc_id = root.attrib['itemid'] + if valid_id_range is not None: + if not valid_id_range[0] <= int(doc_id) <= valid_id_range[1]: + raise IDRangeException + + doc_categories = [cat.attrib['code'] for cat in + root.findall('.//metadata/codes[@class="bip:topics:1.0"]/code')] + + doc_date = root.attrib['date'] + doc_title = root.find('.//title').text + doc_headline = root.find('.//headline').text + doc_body = '\n'.join([p.text for p in root.findall('.//text/p')]) + + if not doc_body: + raise ValueError('Empty document') + + if doc_title is None: doc_title = '' + if doc_headline is None or doc_headline in doc_title: doc_headline = '' + text = '\n'.join([doc_title, doc_headline, doc_body]).strip() + + text_length = len(text.split()) + global nwords + nwords.append(text_length) + + return RCV_Document(id=doc_id, text=text, categories=doc_categories, date=doc_date, lang=assert_lang) + + +def fetch_RCV1(data_path, split='all'): + + assert split in ['train', 'test', 'all'], 'split should be "train", "test", or "all"' + + request = [] + labels = set() + read_documents = 0 + lang = 'en' + + training_documents = 23149 + test_documents = 781265 + + if split == 'all': + split_range = (2286, 810596) + expected = training_documents+test_documents + elif split == 'train': + split_range = (2286, 26150) + expected = training_documents + else: + split_range = (26151, 810596) + expected = test_documents + + global nwords + nwords=[] + for part in list_files(data_path): + if not re.match('\d+\.zip', part): continue + target_file = join(data_path, part) + assert exists(target_file), \ + "You don't seem to have the file "+part+" in " + data_path + ", and the RCV1 corpus can not be downloaded"+\ + " w/o a formal permission. Please, refer to " + RCV1_BASE_URL + " for more information." + zipfile = ZipFile(target_file) + for xmlfile in zipfile.namelist(): + xmlcontent = zipfile.open(xmlfile).read() + try: + doc = parse_document(xmlcontent, assert_lang=lang, valid_id_range=split_range) + labels.update(doc.categories) + request.append(doc) + read_documents += 1 + except ValueError: + print('\n\tskipping document {} with inconsistent language label: expected language {}'.format(part+'/'+xmlfile, lang)) + except (IDRangeException, ExpectedLanguageException) as e: + pass + print('\r[{}] read {} documents'.format(part, len(request)), end='') + if read_documents == expected: break + if read_documents == expected: break + print() + print('ave:{} std {} min {} max {}'.format(np.mean(nwords), np.std(nwords), np.min(nwords), np.max(nwords))) + return request, list(labels) + + +def fetch_RCV2(data_path, languages=None): + + if not languages: + languages = list(RCV2_LANG_DIR.keys()) + else: + assert set(languages).issubset(set(RCV2_LANG_DIR.keys())), 'languages not in scope' + + request = [] + labels = set() + global nwords + nwords=[] + for lang in languages: + path = join(data_path, RCV2_LANG_DIR[lang]) + lang_docs_read = 0 + for part in list_files(path): + target_file = join(path, part) + assert exists(target_file), \ + "You don't seem to have the file "+part+" in " + path + ", and the RCV2 corpus can not be downloaded"+\ + " w/o a formal permission. Please, refer to " + RCV2_BASE_URL + " for more information." 
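+            # each archive bundles many single-document xml files; documents whose language
+            # attribute does not match `lang` are silently skipped via ExpectedLanguageException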
+ zipfile = ZipFile(target_file) + for xmlfile in zipfile.namelist(): + xmlcontent = zipfile.open(xmlfile).read() + try: + doc = parse_document(xmlcontent, assert_lang=lang) + labels.update(doc.categories) + request.append(doc) + lang_docs_read += 1 + except ValueError: + print('\n\tskipping document {} with inconsistent language label: expected language {}'.format(RCV2_LANG_DIR[lang]+'/'+part+'/'+xmlfile, lang)) + except (IDRangeException, ExpectedLanguageException) as e: + pass + print('\r[{}] read {} documents, {} for language {}'.format(RCV2_LANG_DIR[lang]+'/'+part, len(request), lang_docs_read, lang), end='') + print() + print('ave:{} std {} min {} max {}'.format(np.mean(nwords), np.std(nwords), np.min(nwords), np.max(nwords))) + return request, list(labels) + + +def fetch_topic_hierarchy(path, topics='all'): + assert topics in ['all', 'leaves'] + + download_file_if_not_exists(RCV1_TOPICHIER_URL, path) + hierarchy = {} + for line in open(path, 'rt'): + parts = line.strip().split() + parent,child = parts[1],parts[3] + if parent not in hierarchy: + hierarchy[parent]=[] + hierarchy[parent].append(child) + + del hierarchy['None'] + del hierarchy['Root'] + print(hierarchy) + + if topics=='all': + topics = set(hierarchy.keys()) + for parent in hierarchy.keys(): + topics.update(hierarchy[parent]) + return list(topics) + elif topics=='leaves': + parents = set(hierarchy.keys()) + childs = set() + for parent in hierarchy.keys(): + childs.update(hierarchy[parent]) + return list(childs.difference(parents)) + + diff --git a/src/data/reader/wikipedia_tools.py b/src/data/reader/wikipedia_tools.py new file mode 100644 index 0000000..9558fb6 --- /dev/null +++ b/src/data/reader/wikipedia_tools.py @@ -0,0 +1,307 @@ +from __future__ import print_function + +# import ijson +# from ijson.common import ObjectBuilder +import os +import pickle +import re +from bz2 import BZ2File +from itertools import islice +from os.path import join +from xml.sax.saxutils import escape + +import numpy as np + +from util.file import list_dirs, list_files + +policies = ["IN_ALL_LANGS", "IN_ANY_LANG"] + +""" +This file contains a set of tools for processing the Wikipedia multilingual documents. +In what follows, it is assumed that you have already downloaded a Wikipedia dump (https://dumps.wikimedia.org/) +and have processed each document to clean their texts with one of the tools: + - https://github.com/aesuli/wikipediatools (Python 2) + - https://github.com/aesuli/wikipedia-extractor (Python 3) +It is also assumed you have dowloaded the all-entities json file (e.g., https://dumps.wikimedia.org/wikidatawiki/entities/latest-all.json.bz2) + +This tools help you in: + - Processes the huge json file as a stream, and create a multilingual map of corresponding titles for each language. + Set the policy = "IN_ALL_LANGS" will extract only titles which appear in all (AND) languages, whereas "IN_ANY_LANG" + extracts all titles appearing in at least one (OR) language (warning: this will creates a huge dictionary). + Note: This version is quite slow. Although it is run once for all, you might be prefer to take a look at "Wikidata in BigQuery". + - Processes the huge json file as a stream a creates a simplified file which occupies much less and is far faster to be processed. + - Use the multilingual map to extract, from the clean text versions, individual xml documents containing all + language-specific versions from the document. 
+ - Fetch the multilingual documents to create, for each of the specified languages, a list containing all documents, + in a way that the i-th element from any list refers to the same element in the respective language. +""" + +def _doc_generator(text_path, langs): + dotspace = re.compile(r'\.(?!\s)') + for l,lang in enumerate(langs): + print("Processing language <%s> (%d/%d)" % (lang, l, len(langs))) + lang_dir = join(text_path, lang) + split_dirs = list_dirs(lang_dir) + for sd,split_dir in enumerate(split_dirs): + print("\tprocessing split_dir <%s> (%d/%d)" % (split_dir, sd, len(split_dirs))) + split_files = list_files(join(lang_dir, split_dir)) + for sf,split_file in enumerate(split_files): + print("\t\tprocessing split_file <%s> (%d/%d)" % (split_file, sf, len(split_files))) + with BZ2File(join(lang_dir, split_dir, split_file), 'r', buffering=1024*1024) as fi: + while True: + doc_lines = list(islice(fi, 3)) + if doc_lines: + # some sentences are not followed by a space after the dot + doc_lines[1] = dotspace.sub('. ', doc_lines[1]) + # [workaround] I found   html symbol was not treated, and unescaping it now might not help... + doc_lines[1] = escape(doc_lines[1].replace(" ", " ")) + yield doc_lines, lang + else: break + +def _extract_title(doc_lines): + m = re.search('title="(.+?)"', doc_lines[0]) + if m: return m.group(1).decode('utf-8') + else: raise ValueError("Error in xml format: document head is %s" % doc_lines[0]) + +def _create_doc(target_file, id, doc, lang): + doc[0] = doc[0][:-2] + (' lang="%s">\n'%lang) + with open(target_file, 'w') as fo: + fo.write('\n'%id) + [fo.write(line) for line in doc] + fo.write('') + +def _append_doc(target_file, doc, lang): + doc[0] = doc[0][:-2] + (' lang="%s">\n' % lang) + with open(target_file, 'r', buffering=1024*1024) as fi: + lines = fi.readlines() + if doc[0] in lines[1::3]: + return + lines[-1:-1]=doc + with open(target_file, 'w', buffering=1024*1024) as fo: + [fo.write(line) for line in lines] + +def extract_multilingual_documents(inv_dict, langs, text_path, out_path): + if not os.path.exists(out_path): + os.makedirs(out_path) + for lang in langs: + if lang not in inv_dict: + raise ValueError("Lang %s is not in the dictionary" % lang) + + docs_created = len(list_files(out_path)) + print("%d multilingual documents found." % docs_created) + for doc,lang in _doc_generator(text_path, langs): + title = _extract_title(doc) + + if title in inv_dict[lang]: + #pass + ids = inv_dict[lang][title] + for id in ids: + target_file = join(out_path, id) + ".xml" + if os.path.exists(target_file): + _append_doc(target_file, doc, lang) + else: + _create_doc(target_file, id, doc, lang) + docs_created+=1 + else: + if not re.match('[A-Za-z]+', title): + print("Title <%s> for lang <%s> not in dictionary" % (title, lang)) + + + +def extract_multilingual_titles_from_simplefile(data_dir, filename, langs, policy="IN_ALL_LANGS", return_both=True): + simplified_file = join(data_dir,filename) + + if policy not in policies: + raise ValueError("Policy %s not supported." % policy) + print("extracting multilingual titles with policy %s (%s)" % (policy,' '.join(langs))) + + lang_prefix = list(langs) + lang_prefix.sort() + pickle_prefix = "extraction_" + "_".join(lang_prefix) + "." + policy + pickle_dict = join(data_dir, pickle_prefix+".multi_dict.pickle") + pickle_invdict = join(data_dir, pickle_prefix+".multi_invdict.pickle") + if os.path.exists(pickle_invdict): + if return_both and os.path.exists(pickle_dict): + print("Pickled files found in %s. 
Loading both (direct and inverse dictionaries)." % data_dir) + return pickle.load(open(pickle_dict, 'rb')), pickle.load(open(pickle_invdict, 'rb')) + elif return_both==False: + print("Pickled file found in %s. Loading inverse dictionary only." % pickle_invdict) + return pickle.load(open(pickle_invdict, 'rb')) + + multiling_titles = {} + inv_dict = {lang:{} for lang in langs} + + def process_entry(line): + parts = line.strip().split('\t') + id = parts[0] + if id in multiling_titles: + raise ValueError("id <%s> already indexed" % id) + + titles = dict(((lang_title[:lang_title.find(':')],lang_title[lang_title.find(':')+1:].decode('utf-8')) for lang_title in parts[1:])) + for lang in titles.keys(): + if lang not in langs: + del titles[lang] + + if (policy == "IN_ALL_LANGS" and len(titles) == len(langs))\ + or (policy == "IN_ANY_LANG" and len(titles) > 0): + multiling_titles[id] = titles + for lang, title in titles.items(): + if title in inv_dict[lang]: + inv_dict[lang][title].append(id) + inv_dict[lang][title] = [id] + + with BZ2File(simplified_file, 'r', buffering=1024*1024*16) as fi: + completed = 0 + try: + for line in fi: + process_entry(line) + completed += 1 + if completed % 10 == 0: + print("\rCompleted %d\ttitles %d" % (completed,len(multiling_titles)), end="") + print("\rCompleted %d\t\ttitles %d" % (completed, len(multiling_titles)), end="\n") + except EOFError: + print("\nUnexpected file ending... saving anyway") + + print("Pickling dictionaries in %s" % data_dir) + pickle.dump(multiling_titles, open(pickle_dict,'wb'), pickle.HIGHEST_PROTOCOL) + pickle.dump(inv_dict, open(pickle_invdict, 'wb'), pickle.HIGHEST_PROTOCOL) + print("Done") + + return (multiling_titles, inv_dict) if return_both else inv_dict + + +# in https://dumps.wikimedia.org/wikidatawiki/entities/latest-all.json.bz2 +def simplify_json_file(data_dir, langs, policy="IN_ALL_LANGS", json_file = "latest-all.json.bz2"): + latest_all_json_file = join(data_dir,json_file) + + if policy not in policies: + raise ValueError("Policy %s not supported." % policy) + + print("extracting multilingual titles with policy %s (%s)" % (policy,' '.join(langs))) + + lang_prefix = list(langs) + lang_prefix.sort() + simple_titles_path = join(data_dir, "extraction_" + "_".join(lang_prefix) + "." 
+ policy) + + def process_entry(last, fo): + global written + id = last["id"] + titles = None + if policy == "IN_ALL_LANGS" and langs.issubset(last["labels"].keys()): + titles = {lang: last["labels"][lang]["value"] for lang in langs} + elif policy == "IN_ANY_LANG": + titles = {lang: last["labels"][lang]["value"] for lang in langs if lang in last["labels"]} + + if titles: + fo.write((id+'\t'+'\t'.join([lang+':'+titles[lang] for lang in titles.keys()])+'\n').encode('utf-8')) + return True + else: + return False + + written = 0 + with BZ2File(latest_all_json_file, 'r', buffering=1024*1024*16) as fi, \ + BZ2File(join(data_dir,simple_titles_path+".simple.bz2"),'w') as fo: + builder = ObjectBuilder() + completed = 0 + for event, value in ijson.basic_parse(fi, buf_size=1024*1024*16): + builder.event(event, value) + if len(builder.value)>1: + if process_entry(builder.value.pop(0), fo): written += 1 + completed += 1 + print("\rCompleted %d\ttitles %d" % (completed,written), end="") + print("") + + #process the last entry + process_entry(builder.value.pop(0)) + + return simple_titles_path + +""" +Reads all multi-lingual documents in a folder (see wikipedia_tools.py to generate them) and generates, for each of the +specified languages, a list contanining all its documents, so that the i-th element of any list refers to the language- +specific version of the same document. Documents are forced to contain version in all specified languages and to contain +a minimum number of words; otherwise it is discarded. +""" +class MinWordsNotReached(Exception): pass +class WrongDocumentFormat(Exception): pass + +def _load_multilang_doc(path, langs, min_words=100): + import xml.etree.ElementTree as ET + from xml.etree.ElementTree import Element, ParseError + try: + root = ET.parse(path).getroot() + doc = {} + for lang in langs: + doc_body = root.find('.//doc[@lang="' + lang + '"]') + if isinstance(doc_body, Element): + n_words = len(doc_body.text.split(' ')) + if n_words >= min_words: + doc[lang] = doc_body.text + else: + raise MinWordsNotReached + else: + raise WrongDocumentFormat + except ParseError: + raise WrongDocumentFormat + return doc + +#returns the multilingual documents mapped by language, and a counter with the number of documents readed +def fetch_wikipedia_multilingual(wiki_multi_path, langs, min_words=100, deletions=False, max_documents=-1, pickle_name=None): + if pickle_name and os.path.exists(pickle_name): + print("unpickling %s" % pickle_name) + return pickle.load(open(pickle_name, 'rb')) + + multi_docs = list_files(wiki_multi_path) + mling_documents = {l:[] for l in langs} + valid_documents = 0 + minwords_exception = 0 + wrongdoc_exception = 0 + for d,multi_doc in enumerate(multi_docs): + print("\rProcessed %d/%d documents, valid %d/%d, few_words=%d, few_langs=%d" % + (d, len(multi_docs), valid_documents, len(multi_docs), minwords_exception, wrongdoc_exception),end="") + doc_path = join(wiki_multi_path, multi_doc) + try: + m_doc = _load_multilang_doc(doc_path, langs, min_words) + valid_documents += 1 + for l in langs: + mling_documents[l].append(m_doc[l]) + except MinWordsNotReached: + minwords_exception += 1 + if deletions: os.remove(doc_path) + except WrongDocumentFormat: + wrongdoc_exception += 1 + if deletions: os.remove(doc_path) + if max_documents>0 and valid_documents>=max_documents: + break + + if pickle_name: + print("Pickling wikipedia documents object in %s" % pickle_name) + pickle.dump(mling_documents, open(pickle_name, 'wb'), pickle.HIGHEST_PROTOCOL) + + return mling_documents + 
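+
+def _example_fetch_and_sample(wiki_multi_path='../Datasets/Wikipedia/multilingual_docs_JRC_NLTK',
+                              langs=('en', 'it', 'fr'), max_documents=5000):
+    """Illustrative sketch (not used anywhere in the pipeline): loads the language-aligned
+    document lists produced by extract_multilingual_documents() and subsamples them.
+    The default path, languages and cap are example values, not project settings."""
+    l_wiki = fetch_wikipedia_multilingual(wiki_multi_path, list(langs), min_words=100)
+    return random_wiki_sample(l_wiki, max_documents)
+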
+def random_wiki_sample(l_wiki, max_documents): + if max_documents == 0: return None + langs = list(l_wiki.keys()) + assert len(np.unique([len(l_wiki[l]) for l in langs])) == 1, 'documents across languages do not seem to be aligned' + ndocs_per_lang = len(l_wiki[langs[0]]) + if ndocs_per_lang > max_documents: + sel = set(np.random.choice(list(range(ndocs_per_lang)), max_documents, replace=False)) + for lang in langs: + l_wiki[lang] = [d for i, d in enumerate(l_wiki[lang]) if i in sel] + return l_wiki + + +if __name__ == "__main__": + + wikipedia_home = "../Datasets/Wikipedia" + + from data.languages import JRC_LANGS_WITH_NLTK_STEMMING as langs + langs = frozenset(langs) + + simple_titles_path = simplify_json_file(wikipedia_home, langs, policy="IN_ALL_LANGS", json_file="latest-all.json.bz2") + _, inv_dict = extract_multilingual_titles_from_simplefile(wikipedia_home, simple_titles_path, langs, policy='IN_ALL_LANGS') + extract_multilingual_documents(inv_dict, langs, join(wikipedia_home,'text'), + out_path=join(wikipedia_home, 'multilingual_docs_JRC_NLTK')) + + diff --git a/src/data/text_preprocessor.py b/src/data/text_preprocessor.py new file mode 100644 index 0000000..fcfddba --- /dev/null +++ b/src/data/text_preprocessor.py @@ -0,0 +1,34 @@ +from nltk import word_tokenize +from nltk.corpus import stopwords +from nltk.stem import SnowballStemmer + +from data.languages import NLTK_LANGMAP + + +def preprocess_documents(documents, lang): + tokens = NLTKStemTokenizer(lang, verbose=True) + sw = stopwords.words(NLTK_LANGMAP[lang]) + return [' '.join([w for w in tokens(doc) if w not in sw]) for doc in documents] + + +class NLTKStemTokenizer(object): + + def __init__(self, lang, verbose=False): + if lang not in NLTK_LANGMAP: + raise ValueError('Language %s is not supported in NLTK' % lang) + self.verbose=verbose + self.called = 0 + self.wnl = SnowballStemmer(NLTK_LANGMAP[lang]) + self.cache = {} + + def __call__(self, doc): + self.called += 1 + if self.verbose: + print("\r\t\t[documents processed %d]" % (self.called), end="") + tokens = word_tokenize(doc) + stems = [] + for t in tokens: + if t not in self.cache: + self.cache[t] = self.wnl.stem(t) + stems.append(self.cache[t]) + return stems \ No newline at end of file diff --git a/src/data/tsr_function__.py b/src/data/tsr_function__.py new file mode 100755 index 0000000..c458029 --- /dev/null +++ b/src/data/tsr_function__.py @@ -0,0 +1,271 @@ +import math + +import numpy as np +from joblib import Parallel, delayed +from scipy.sparse import csr_matrix, csc_matrix +from scipy.stats import t + + +def get_probs(tpr, fpr, pc): + # tpr = p(t|c) = p(tp)/p(c) = p(tp)/(p(tp)+p(fn)) + # fpr = p(t|_c) = p(fp)/p(_c) = p(fp)/(p(fp)+p(tn)) + pnc = 1.0 - pc + tp = tpr * pc + fn = pc - tp + fp = fpr * pnc + tn = pnc - fp + return ContTable(tp=tp, fn=fn, fp=fp, tn=tn) + + +def apply_tsr(tpr, fpr, pc, tsr): + cell = get_probs(tpr, fpr, pc) + return tsr(cell) + + +def positive_information_gain(cell): + if cell.tpr() < cell.fpr(): + return 0.0 + else: + return information_gain(cell) + + +def posneg_information_gain(cell): + ig = information_gain(cell) + if cell.tpr() < cell.fpr(): + return -ig + else: + return ig + + +def __ig_factor(p_tc, p_t, p_c): + den = p_t * p_c + if den != 0.0 and p_tc != 0: + return p_tc * math.log(p_tc / den, 2) + else: + return 0.0 + + +def information_gain(cell): + return __ig_factor(cell.p_tp(), cell.p_f(), cell.p_c()) + \ + __ig_factor(cell.p_fp(), cell.p_f(), cell.p_not_c()) +\ + __ig_factor(cell.p_fn(), cell.p_not_f(), cell.p_c()) + \ + 
__ig_factor(cell.p_tn(), cell.p_not_f(), cell.p_not_c()) + + +def information_gain_mod(cell): + return (__ig_factor(cell.p_tp(), cell.p_f(), cell.p_c()) + __ig_factor(cell.p_tn(), cell.p_not_f(), cell.p_not_c())) \ + - (__ig_factor(cell.p_fp(), cell.p_f(), cell.p_not_c()) + __ig_factor(cell.p_fn(), cell.p_not_f(), cell.p_c())) + + +def pointwise_mutual_information(cell): + return __ig_factor(cell.p_tp(), cell.p_f(), cell.p_c()) + + +def gain_ratio(cell): + pc = cell.p_c() + pnc = 1.0 - pc + norm = pc * math.log(pc, 2) + pnc * math.log(pnc, 2) + return information_gain(cell) / (-norm) + + +def chi_square(cell): + den = cell.p_f() * cell.p_not_f() * cell.p_c() * cell.p_not_c() + if den==0.0: return 0.0 + num = gss(cell)**2 + return num / den + + +def relevance_frequency(cell): + a = cell.tp + c = cell.fp + if c == 0: c = 1 + return math.log(2.0 + (a * 1.0 / c), 2) + + +def idf(cell): + if cell.p_f()>0: + return math.log(1.0 / cell.p_f()) + return 0.0 + + +def gss(cell): + return cell.p_tp()*cell.p_tn() - cell.p_fp()*cell.p_fn() + + +def conf_interval(xt, n): + if n>30: + z2 = 3.84145882069 # norm.ppf(0.5+0.95/2.0)**2 + else: + z2 = t.ppf(0.5 + 0.95 / 2.0, df=max(n-1,1)) ** 2 + p = (xt + 0.5 * z2) / (n + z2) + amplitude = 0.5 * z2 * math.sqrt((p * (1.0 - p)) / (n + z2)) + return p, amplitude + +def strength(minPosRelFreq, minPos, maxNeg): + if minPos > maxNeg: + return math.log(2.0 * minPosRelFreq, 2.0) + else: + return 0.0 + + +#set cancel_features=True to allow some features to be weighted as 0 (as in the original article) +#however, for some extremely imbalanced dataset caused all documents to be 0 +def conf_weight(cell, cancel_features=False): + c = cell.get_c() + not_c = cell.get_not_c() + tp = cell.tp + fp = cell.fp + + pos_p, pos_amp = conf_interval(tp, c) + neg_p, neg_amp = conf_interval(fp, not_c) + + min_pos = pos_p-pos_amp + max_neg = neg_p+neg_amp + den = (min_pos + max_neg) + minpos_relfreq = min_pos / (den if den != 0 else 1) + + str_tplus = strength(minpos_relfreq, min_pos, max_neg); + + if str_tplus == 0 and not cancel_features: + return 1e-20 + + return str_tplus; + + +class ContTable: + + def __init__(self, tp=0, tn=0, fp=0, fn=0): + self.tp=tp + self.tn=tn + self.fp=fp + self.fn=fn + + def get_d(self): return self.tp + self.tn + self.fp + self.fn + + def get_c(self): return self.tp + self.fn + + def get_not_c(self): return self.tn + self.fp + + def get_f(self): return self.tp + self.fp + + def get_not_f(self): return self.tn + self.fn + + def p_c(self): return (1.0*self.get_c())/self.get_d() + + def p_not_c(self): return 1.0-self.p_c() + + def p_f(self): return (1.0*self.get_f())/self.get_d() + + def p_not_f(self): return 1.0-self.p_f() + + def p_tp(self): return (1.0*self.tp) / self.get_d() + + def p_tn(self): return (1.0*self.tn) / self.get_d() + + def p_fp(self): return (1.0*self.fp) / self.get_d() + + def p_fn(self): return (1.0*self.fn) / self.get_d() + + def tpr(self): + c = 1.0*self.get_c() + return self.tp / c if c > 0.0 else 0.0 + + def fpr(self): + _c = 1.0*self.get_not_c() + return self.fp / _c if _c > 0.0 else 0.0 + + +def round_robin_selection(X, Y, k, tsr_function=positive_information_gain): + print(f'[selectiong {k} terms]') + nC = Y.shape[1] + FC = get_tsr_matrix(get_supervised_matrix(X, Y), tsr_function).T + best_features_idx = np.argsort(-FC, axis=0).flatten() + tsr_values = FC.flatten() + selected_indexes_set = set() + selected_indexes = list() + selected_value = list() + from_category = list() + round_robin = iter(best_features_idx) + values_iter = 
iter(tsr_values) + round=0 + while len(selected_indexes) < k: + term_idx = next(round_robin) + term_val = next(values_iter) + if term_idx not in selected_indexes_set: + selected_indexes_set.add(term_idx) + selected_indexes.append(term_idx) + selected_value.append(term_val) + from_category.append(round) + round = (round + 1) % nC + return np.asarray(selected_indexes, dtype=int), np.asarray(selected_value, dtype=float), np.asarray(from_category) + + +def feature_label_contingency_table(positive_document_indexes, feature_document_indexes, nD): + tp_ = len(positive_document_indexes & feature_document_indexes) + fp_ = len(feature_document_indexes - positive_document_indexes) + fn_ = len(positive_document_indexes - feature_document_indexes) + tn_ = nD - (tp_ + fp_ + fn_) + return ContTable(tp=tp_, tn=tn_, fp=fp_, fn=fn_) + + +def category_tables(feature_sets, category_sets, c, nD, nF): + return [feature_label_contingency_table(category_sets[c], feature_sets[f], nD) for f in range(nF)] + + +""" +Computes the nC x nF supervised matrix M where Mcf is the 4-cell contingency table for feature f and class c. +Efficiency O(nF x nC x log(S)) where S is the sparse factor +""" +def get_supervised_matrix(coocurrence_matrix, label_matrix, n_jobs=-1): + nD, nF = coocurrence_matrix.shape + nD2, nC = label_matrix.shape + + if nD != nD2: + raise ValueError('Number of rows in coocurrence matrix shape %s and label matrix shape %s is not consistent' % + (coocurrence_matrix.shape,label_matrix.shape)) + + def nonzero_set(matrix, col): + return set(matrix[:, col].nonzero()[0]) + + if isinstance(coocurrence_matrix, csr_matrix): + coocurrence_matrix = csc_matrix(coocurrence_matrix) + feature_sets = [nonzero_set(coocurrence_matrix, f) for f in range(nF)] + category_sets = [nonzero_set(label_matrix, c) for c in range(nC)] + cell_matrix = Parallel(n_jobs=n_jobs, backend="threading")(delayed(category_tables)(feature_sets, category_sets, c, nD, nF) for c in range(nC)) + return np.array(cell_matrix) + +# obtains the matrix T where Tcf=tsr(f,c) is the tsr score for category c and feature f +def get_tsr_matrix(cell_matrix, tsr_score_funtion): + nC,nF = cell_matrix.shape + tsr_matrix = [[tsr_score_funtion(cell_matrix[c,f]) for f in range(nF)] for c in range(nC)] + return np.array(tsr_matrix) + + +""" The Fisher-score [1] is not computed on the 4-cell contingency table, but can +take as input any real-valued feature column (e.g., tf-idf weights). +feat is the feature vector, and c is a binary classification vector. +This implementation covers only the binary case, while the formula is defined for multiclass +single-label scenarios, for which the version [2] might be preferred. +[1] R.O. Duda, P.E. Hart, and D.G. Stork. Pattern classification. Wiley-interscience, 2012. +[2] Gu, Q., Li, Z., & Han, J. (2012). Generalized fisher score for feature selection. arXiv preprint arXiv:1202.3725. 
+""" +def fisher_score_binary(feat, c): + neg = np.ones_like(c) - c + + npos = np.sum(c) + nneg = np.sum(neg) + + mupos = np.mean(feat[c == 1]) + muneg = np.mean(feat[neg == 1]) + mu = np.mean(feat) + + stdpos = np.std(feat[c == 1]) + stdneg = np.std(feat[neg == 1]) + + num = npos * ((mupos - mu) ** 2) + nneg * ((muneg - mu) ** 2) + den = npos * (stdpos ** 2) + nneg * (stdneg ** 2) + + if den>0: + return num / den + else: + return num diff --git a/src/funnelling.py b/src/funnelling.py new file mode 100644 index 0000000..812a937 --- /dev/null +++ b/src/funnelling.py @@ -0,0 +1,124 @@ +from models.learners import * +from util.common import _normalize +from view_generators import VanillaFunGen + + +class DocEmbedderList: + """ + Class that takes care of calling fit and transform function for every init embedder. Every ViewGenerator should be + contained by this class in order to seamlessly train the overall architecture. + """ + def __init__(self, embedder_list, probabilistic=True): + """ + Init the DocEmbedderList. + :param embedder_list: list of embedders to be deployed + :param probabilistic: whether to recast view generators output to vectors of posterior probabilities or not + """ + assert len(embedder_list) != 0, 'Embedder list cannot be empty!' + self.embedders = embedder_list + self.probabilistic = probabilistic + if probabilistic: + _tmp = [] + for embedder in self.embedders: + if isinstance(embedder, VanillaFunGen): + _tmp.append(embedder) + else: + _tmp.append(FeatureSet2Posteriors(embedder)) + self.embedders = _tmp + + def fit(self, lX, ly): + """ + Fit all the ViewGenerators contained by DocEmbedderList. + :param lX: + :param ly: + :return: self + """ + for embedder in self.embedders: + embedder.fit(lX, ly) + return self + + def transform(self, lX): + """ + Project documents by means of every ViewGenerators. Projections are then averaged together and returned. + :param lX: + :return: common latent space (averaged). + """ + langs = sorted(lX.keys()) + lZparts = {lang: None for lang in langs} + + for embedder in self.embedders: + lZ = embedder.transform(lX) + for lang in langs: + Z = lZ[lang] + if lZparts[lang] is None: + lZparts[lang] = Z + else: + lZparts[lang] += Z + n_embedders = len(self.embedders) + return {lang: lZparts[lang]/n_embedders for lang in langs} # Averaging feature spaces + + def fit_transform(self, lX, ly): + return self.fit(lX, ly).transform(lX) + + +class FeatureSet2Posteriors: + """ + Takes care of recasting features outputted by the embedders to vecotrs of posterior probabilities by means of + a multiclass SVM. + """ + def __init__(self, embedder, l2=True, n_jobs=-1): + """ + Init the class. + :param embedder: ViewGen, view generators which does not natively outputs posterior probabilities. + :param l2: bool, whether to apply or not L2 normalization to the projection + :param n_jobs: int, number of concurrent workers. 
+ """ + self.embedder = embedder + self.l2 = l2 + self.n_jobs = n_jobs + self.prob_classifier = MetaClassifier( + SVC(kernel='rbf', gamma='auto', probability=True, cache_size=1000, random_state=1), n_jobs=n_jobs) + + def fit(self, lX, ly): + lZ = self.embedder.fit_transform(lX, ly) + self.prob_classifier.fit(lZ, ly) + return self + + def transform(self, lX): + lP = self.predict_proba(lX) + lP = _normalize(lP, self.l2) + return lP + + def fit_transform(self, lX, ly): + return self.fit(lX, ly).transform(lX) + + def predict(self, lX): + lZ = self.embedder.transform(lX) + return self.prob_classifier.predict(lZ) + + def predict_proba(self, lX): + lZ = self.embedder.transform(lX) + return self.prob_classifier.predict_proba(lZ) + + +class Funnelling: + """ + Funnelling Architecture. It is composed by two tiers. The first-tier is a set of heterogeneous document embedders. + The second-tier (i.e., the metaclassifier), operates the classification of the common latent space computed by + the first-tier learners. + """ + def __init__(self, first_tier: DocEmbedderList, meta_classifier: MetaClassifier, n_jobs=-1): + self.first_tier = first_tier + self.meta = meta_classifier + self.n_jobs = n_jobs + + def fit(self, lX, ly): + print('## Fitting first-tier learners!') + lZ = self.first_tier.fit_transform(lX, ly) + print('## Fitting meta-learner!') + self.meta.fit(lZ, ly) + + def predict(self, lX): + lZ = self.first_tier.transform(lX) + ly = self.meta.predict(lZ) + return ly diff --git a/src/main.py b/src/main.py new file mode 100644 index 0000000..ebc43a3 --- /dev/null +++ b/src/main.py @@ -0,0 +1,167 @@ +from argparse import ArgumentParser + +from data.dataset_builder import MultilingualDataset +from funnelling import * +from util.common import MultilingualIndex, get_params, get_method_name +from util.evaluation import evaluate +from util.results_csv import CSVlog +from view_generators import * + + +def main(args): + assert args.post_embedder or args.muse_embedder or args.wce_embedder or args.gru_embedder or args.bert_embedder, \ + 'empty set of document embeddings is not allowed!' + + print('Running generalized funnelling...') + + data = MultilingualDataset.load(args.dataset) + data.set_view(languages=['it', 'fr']) + data.show_dimensions() + lX, ly = data.training() + lXte, lyte = data.test() + + # Init multilingualIndex - mandatory when deploying Neural View Generators... 
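+    # (built only when needed: it indexes the training/test documents against the MUSE
+    # vocabulary and is then shared by the GRU (-g) and mBERT (-b) view generators)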
+ if args.gru_embedder or args.bert_embedder: + multilingualIndex = MultilingualIndex() + lMuse = MuseLoader(langs=sorted(lX.keys()), cache=args.muse_dir) + multilingualIndex.index(lX, ly, lXte, lyte, l_pretrained_vocabulary=lMuse.vocabulary()) + + # Init ViewGenerators and append them to embedder_list + embedder_list = [] + if args.post_embedder: + posteriorEmbedder = VanillaFunGen(base_learner=get_learner(calibrate=True), n_jobs=args.n_jobs) + embedder_list.append(posteriorEmbedder) + + if args.muse_embedder: + museEmbedder = MuseGen(muse_dir=args.muse_dir, n_jobs=args.n_jobs) + embedder_list.append(museEmbedder) + + if args.wce_embedder: + wceEmbedder = WordClassGen(n_jobs=args.n_jobs) + embedder_list.append(wceEmbedder) + + if args.gru_embedder: + rnnEmbedder = RecurrentGen(multilingualIndex, pretrained_embeddings=lMuse, wce=args.gru_wce, batch_size=256, + nepochs=args.nepochs, gpus=args.gpus, n_jobs=args.n_jobs) + embedder_list.append(rnnEmbedder) + + if args.bert_embedder: + bertEmbedder = BertGen(multilingualIndex, batch_size=4, nepochs=10, gpus=args.gpus, n_jobs=args.n_jobs) + embedder_list.append(bertEmbedder) + + # Init DocEmbedderList (i.e., first-tier learners or view generators) and metaclassifier + docEmbedders = DocEmbedderList(embedder_list=embedder_list, probabilistic=True) + meta = MetaClassifier(meta_learner=get_learner(calibrate=False, kernel='rbf'), + meta_parameters=get_params(optimc=args.optimc)) + + # Init Funnelling Architecture + gfun = Funnelling(first_tier=docEmbedders, meta_classifier=meta) + + # Training --------------------------------------- + print('\n[Training Generalized Funnelling]') + time_init = time() + time_tr = time() + gfun.fit(lX, ly) + time_tr = round(time() - time_tr, 3) + print(f'Training completed in {time_tr} seconds!') + + # Testing ---------------------------------------- + print('\n[Testing Generalized Funnelling]') + time_te = time() + ly_ = gfun.predict(lXte) + l_eval = evaluate(ly_true=lyte, ly_pred=ly_) + time_te = round(time() - time_te, 3) + print(f'Testing completed in {time_te} seconds!') + + # Logging --------------------------------------- + print('\n[Results]') + results = CSVlog(args.csv_dir) + metrics = [] + for lang in lXte.keys(): + macrof1, microf1, macrok, microk = l_eval[lang] + metrics.append([macrof1, microf1, macrok, microk]) + print(f'Lang {lang}: macro-F1 = {macrof1:.3f} micro-F1 = {microf1:.3f}') + if results is not None: + _id, _dataset = get_method_name(args) + results.add_row(method='gfun', + setting=_id, + optimc=args.optimc, + sif='True', + zscore='True', + l2='True', + dataset=_dataset, + time_tr=time_tr, + time_te=time_te, + lang=lang, + macrof1=macrof1, + microf1=microf1, + macrok=macrok, + microk=microk, + notes='') + print('Averages: MF1, mF1, MK, mK', np.round(np.mean(np.array(metrics), axis=0), 3)) + + overall_time = round(time() - time_init, 3) + exit(f'\nExecuted in: {overall_time} seconds!') + + +if __name__ == '__main__': + parser = ArgumentParser(description='Run generalized funnelling, A. Moreo, A. Pedrotti and F. 
Sebastiani') + + parser.add_argument('dataset', help='Path to the dataset') + + parser.add_argument('-o', '--output', dest='csv_dir', + help='Result file (default ../csv_log/gfun_results.csv)', type=str, + default='csv_logs/gfun/gfun_results.csv') + + parser.add_argument('-x', '--post_embedder', dest='post_embedder', action='store_true', + help='deploy posterior probabilities embedder to compute document embeddings', + default=False) + + parser.add_argument('-w', '--wce_embedder', dest='wce_embedder', action='store_true', + help='deploy (supervised) Word-Class embedder to the compute document embeddings', + default=False) + + parser.add_argument('-m', '--muse_embedder', dest='muse_embedder', action='store_true', + help='deploy (pretrained) MUSE embedder to compute document embeddings', + default=False) + + parser.add_argument('-b', '--bert_embedder', dest='bert_embedder', action='store_true', + help='deploy multilingual Bert to compute document embeddings', + default=False) + + parser.add_argument('-g', '--gru_embedder', dest='gru_embedder', action='store_true', + help='deploy a GRU in order to compute document embeddings', + default=False) + + parser.add_argument('-c', '--c_optimize', dest='optimc', action='store_true', + help='Optimize SVMs C hyperparameter', + default=False) + + parser.add_argument('-n', '--nepochs', dest='nepochs', type=str, + help='Number of max epochs to train Recurrent embedder (i.e., -g)') + + parser.add_argument('-j', '--n_jobs', dest='n_jobs', type=int, + help='Number of parallel jobs (default is -1, all)', + default=-1) + + parser.add_argument('--muse_dir', dest='muse_dir', type=str, + help='Path to the MUSE polylingual word embeddings (default ../embeddings)', + default='../embeddings') + + parser.add_argument('--gru_wce', dest='gru_wce', action='store_true', + help='Deploy WCE embedding as embedding layer of the GRU View Generator', + default=False) + + parser.add_argument('--gru_dir', dest='gru_dir', type=str, + help='Set the path to a pretrained GRU model (i.e., -g view generator)', + default=None) + + parser.add_argument('--bert_dir', dest='bert_dir', type=str, + help='Set the path to a pretrained mBERT model (i.e., -b view generator)', + default=None) + + parser.add_argument('--gpus', help='specifies how many GPUs to use per node', + default=None) + + args = parser.parse_args() + main(args) diff --git a/src/models/helpers.py b/src/models/helpers.py new file mode 100755 index 0000000..b466f28 --- /dev/null +++ b/src/models/helpers.py @@ -0,0 +1,51 @@ +import torch +import torch.nn as nn +from torch.nn import functional as F + + +def init_embeddings(pretrained, vocab_size, learnable_length): + """ + Compute the embedding matrix + :param pretrained: + :param vocab_size: + :param learnable_length: + :return: + """ + pretrained_embeddings = None + pretrained_length = 0 + if pretrained is not None: + pretrained_length = pretrained.shape[1] + assert pretrained.shape[0] == vocab_size, 'pre-trained matrix does not match with the vocabulary size' + pretrained_embeddings = nn.Embedding(vocab_size, pretrained_length) + # requires_grad=False sets the embedding layer as NOT trainable + pretrained_embeddings.weight = nn.Parameter(pretrained, requires_grad=False) + + learnable_embeddings = None + if learnable_length > 0: + learnable_embeddings = nn.Embedding(vocab_size, learnable_length) + + embedding_length = learnable_length + pretrained_length + assert embedding_length > 0, '0-size embeddings' + return pretrained_embeddings, learnable_embeddings, embedding_length + + 
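+# A minimal sketch of what init_embeddings() returns (the sizes below are made-up,
+# illustrative values):
+#
+#   pretrained = torch.randn(10, 300)  # 10-word vocabulary, 300-dim (MUSE-like) vectors
+#   pre, learn, dim = init_embeddings(pretrained, vocab_size=10, learnable_length=50)
+#   # pre   -> nn.Embedding(10, 300), frozen and initialized with `pretrained`
+#   # learn -> nn.Embedding(10, 50), trainable
+#   # dim   -> 350, the concatenated width later produced by embed() below
+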
+def embed(model, input, lang): + input_list = [] + if model.lpretrained_embeddings[lang]: + input_list.append(model.lpretrained_embeddings[lang](input)) + if model.llearnable_embeddings[lang]: + input_list.append(model.llearnable_embeddings[lang](input)) + return torch.cat(tensors=input_list, dim=2) + + +def embedding_dropout(input, drop_range, p_drop=0.5, training=True): + if p_drop > 0 and training and drop_range is not None: + p = p_drop + drop_from, drop_to = drop_range + m = drop_to - drop_from #length of the supervised embedding + l = input.shape[2] #total embedding length + corr = (1 - p) + input[:, :, drop_from:drop_to] = corr * F.dropout(input[:, :, drop_from:drop_to], p=p) + input /= (1 - (p * m / l)) + + return input diff --git a/src/models/learners.py b/src/models/learners.py new file mode 100644 index 0000000..2654109 --- /dev/null +++ b/src/models/learners.py @@ -0,0 +1,224 @@ +import time + +import numpy as np +from joblib import Parallel, delayed +from scipy.sparse import issparse +from sklearn.model_selection import GridSearchCV +from sklearn.multiclass import OneVsRestClassifier +from sklearn.svm import SVC + +from util.standardizer import StandardizeTransformer + + +def get_learner(calibrate=False, kernel='linear', C=1): + """ + instantiate scikit Support Vector Classifier + :param calibrate: boolean, whether to return posterior probabilities or not + :param kernel: string,kernel to be applied to the SVC + :param C: int or dict {'C': list of integer}, Regularization parameter + :return: Support Vector Classifier + """ + return SVC(kernel=kernel, probability=calibrate, cache_size=1000, C=C, random_state=1, gamma='auto', verbose=False) + + +def _sort_if_sparse(X): + if issparse(X) and not X.has_sorted_indices: + X.sort_indices() + + +def _joblib_transform_multiling(transformer, lX, n_jobs=-1): + if n_jobs == 1: + return {lang: transformer(lX[lang]) for lang in lX.keys()} + else: + langs = list(lX.keys()) + transformations = Parallel(n_jobs=n_jobs)(delayed(transformer)(lX[lang]) for lang in langs) + return {lang: transformations[i] for i, lang in enumerate(langs)} + + +class TrivialRejector: + def fit(self, X, y): + self.cats = y.shape[1] + return self + + def decision_function(self, X): return np.zeros((X.shape[0], self.cats)) + + def predict(self, X): return np.zeros((X.shape[0], self.cats)) + + def predict_proba(self, X): return np.zeros((X.shape[0], self.cats)) + + def best_params(self): return {} + + +class NaivePolylingualClassifier: + """ + Is a mere set of independet MonolingualClassifiers + """ + + def __init__(self, base_learner, parameters=None, n_jobs=-1): + self.base_learner = base_learner + self.parameters = parameters + self.model = None + self.n_jobs = n_jobs + + def fit(self, lX, ly): + """ + trains the independent monolingual classifiers + :param lX: a dictionary {language_label: X csr-matrix} + :param ly: a dictionary {language_label: y np.array} + :return: self + """ + tinit = time.time() + assert set(lX.keys()) == set(ly.keys()), 'inconsistent language mappings in fit' + langs = list(lX.keys()) + for lang in langs: + _sort_if_sparse(lX[lang]) + + models = Parallel(n_jobs=self.n_jobs)\ + (delayed(MonolingualClassifier(self.base_learner, parameters=self.parameters).fit)((lX[lang]), ly[lang]) for + lang in langs) + + self.model = {lang: models[i] for i, lang in enumerate(langs)} + self.empty_categories = {lang: self.model[lang].empty_categories for lang in langs} + self.time = time.time() - tinit + return self + + def decision_function(self, lX): + """ + 
:param lX: a dictionary {language_label: X csr-matrix} + :return: a dictionary of classification scores for each class + """ + assert self.model is not None, 'predict called before fit' + assert set(lX.keys()).issubset(set(self.model.keys())), 'unknown languages requested in decision function' + langs = list(lX.keys()) + scores = Parallel(n_jobs=self.n_jobs)(delayed(self.model[lang].decision_function)(lX[lang]) for lang in langs) + return {lang: scores[i] for i, lang in enumerate(langs)} + + def predict_proba(self, lX): + """ + :param lX: a dictionary {language_label: X csr-matrix} + :return: a dictionary of probabilities that each document belongs to each class + """ + assert self.model is not None, 'predict called before fit' + assert set(lX.keys()).issubset(set(self.model.keys())), 'unknown languages requested in decision function' + langs = list(lX.keys()) + scores = Parallel(n_jobs=self.n_jobs, max_nbytes=None)( + delayed(self.model[lang].predict_proba)(lX[lang]) for lang in langs) + return {lang: scores[i] for i, lang in enumerate(langs)} + + def predict(self, lX): + """ + :param lX: a dictionary {language_label: X csr-matrix} + :return: a dictionary of predictions + """ + assert self.model is not None, 'predict called before fit' + assert set(lX.keys()).issubset(set(self.model.keys())), 'unknown languages requested in predict' + if self.n_jobs == 1: + return {lang: self.model[lang].transform(lX[lang]) for lang in lX.keys()} + else: + langs = list(lX.keys()) + scores = Parallel(n_jobs=self.n_jobs)(delayed(self.model[lang].predict)(lX[lang]) for lang in langs) + return {lang: scores[i] for i, lang in enumerate(langs)} + + def best_params(self): + return {lang: model.best_params() for lang, model in self.model.items()} + + +class MonolingualClassifier: + + def __init__(self, base_learner, parameters=None, n_jobs=-1): + self.learner = base_learner + self.parameters = parameters + self.model = None + self.n_jobs = n_jobs + self.best_params_ = None + + def fit(self, X, y): + if X.shape[0] == 0: + print('Warning: X has 0 elements, a trivial rejector will be created') + self.model = TrivialRejector().fit(X, y) + self.empty_categories = np.arange(y.shape[1]) + return self + + tinit = time.time() + _sort_if_sparse(X) + self.empty_categories = np.argwhere(np.sum(y, axis=0) == 0).flatten() + # multi-class format + if len(y.shape) == 2: + if self.parameters is not None: + self.parameters = [{'estimator__' + key: params[key] for key in params.keys()} + for params in self.parameters] + self.model = OneVsRestClassifier(self.learner, n_jobs=self.n_jobs) + else: + self.model = self.learner + raise NotImplementedError('not working as a base-classifier for funneling if there are gaps in ' + 'the labels across languages') + + # parameter optimization? 
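+        # if a grid is given, wrap the (possibly OvR) model in a 5-fold grid search;
+        # the 'estimator__' prefix added above routes each parameter (e.g., C) to the
+        # inner base learner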
+ if self.parameters: + print('debug: optimizing parameters:', self.parameters) + self.model = GridSearchCV(self.model, param_grid=self.parameters, refit=True, cv=5, n_jobs=self.n_jobs, + error_score=0, verbose=10) + + print(f'fitting: Mono-lingual Classifier on matrices of shape X={X.shape} Y={y.shape}') + self.model.fit(X, y) + if isinstance(self.model, GridSearchCV): + self.best_params_ = self.model.best_params_ + print('best parameters: ', self.best_params_) + self.time = time.time() - tinit + return self + + def decision_function(self, X): + assert self.model is not None, 'predict called before fit' + _sort_if_sparse(X) + return self.model.decision_function(X) + + def predict_proba(self, X): + assert self.model is not None, 'predict called before fit' + assert hasattr(self.model, 'predict_proba'), 'the probability predictions are not enabled in this model' + _sort_if_sparse(X) + return self.model.predict_proba(X) + + def predict(self, X): + assert self.model is not None, 'predict called before fit' + _sort_if_sparse(X) + return self.model.predict(X) + + def best_params(self): + return self.best_params_ + + +class MetaClassifier: + + def __init__(self, meta_learner, meta_parameters=None, n_jobs=-1, standardize_range=None): + self.n_jobs = n_jobs + self.model = MonolingualClassifier(base_learner=meta_learner, parameters=meta_parameters, n_jobs=n_jobs) + self.standardize_range = standardize_range + + def fit(self, lZ, ly): + tinit = time.time() + Z, y = self.stack(lZ, ly) + + self.standardizer = StandardizeTransformer(range=self.standardize_range) + Z = self.standardizer.fit_transform(Z) + + print('fitting the Z-space of shape={}'.format(Z.shape)) + self.model.fit(Z, y) + self.time = time.time() - tinit + + def stack(self, lZ, ly=None): + langs = list(lZ.keys()) + Z = np.vstack([lZ[lang] for lang in langs]) + if ly is not None: + y = np.vstack([ly[lang] for lang in langs]) + return Z, y + else: + return Z + + def predict(self, lZ): + lZ = _joblib_transform_multiling(self.standardizer.transform, lZ, n_jobs=self.n_jobs) + return _joblib_transform_multiling(self.model.predict, lZ, n_jobs=self.n_jobs) + + def predict_proba(self, lZ): + lZ = _joblib_transform_multiling(self.standardizer.transform, lZ, n_jobs=self.n_jobs) + return _joblib_transform_multiling(self.model.predict_proba, lZ, n_jobs=self.n_jobs) + diff --git a/src/models/lstm_class.py b/src/models/lstm_class.py new file mode 100755 index 0000000..7f2cf59 --- /dev/null +++ b/src/models/lstm_class.py @@ -0,0 +1,113 @@ +#taken from https://github.com/prakashpandey9/Text-Classification-Pytorch/blob/master/models/LSTM.py +from torch.autograd import Variable + +from models.helpers import * + + +class RNNMultilingualClassifier(nn.Module): + + def __init__(self, output_size, hidden_size, lvocab_size, learnable_length, lpretrained=None, + drop_embedding_range=None, drop_embedding_prop=0, post_probabilities=True, only_post=False, + bert_embeddings=False): + + super(RNNMultilingualClassifier, self).__init__() + self.output_size = output_size + self.hidden_size = hidden_size + self.drop_embedding_range = drop_embedding_range + self.drop_embedding_prop = drop_embedding_prop + self.post_probabilities = post_probabilities + self.bert_embeddings = bert_embeddings + assert 0 <= drop_embedding_prop <= 1, 'drop_embedding_prop: wrong range' + + self.lpretrained_embeddings = nn.ModuleDict() + self.llearnable_embeddings = nn.ModuleDict() + self.embedding_length = None + self.langs = sorted(lvocab_size.keys()) + self.only_post = only_post + + 
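+        # architecture: single-layer, unidirectional GRU followed by a feed-forward
+        # head (256 -> 512 -> 256); posterior probabilities and/or mBERT embeddings,
+        # when enabled, are concatenated to the document embedding right before the
+        # final classification layer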
self.n_layers = 1 + self.n_directions = 1 + + self.dropout = nn.Dropout(0.6) + + lstm_out = 256 + ff1 = 512 + ff2 = 256 + + lpretrained_embeddings = {} + llearnable_embeddings = {} + if only_post==False: + for l in self.langs: + pretrained = lpretrained[l] if lpretrained else None + pretrained_embeddings, learnable_embeddings, embedding_length = init_embeddings( + pretrained, lvocab_size[l], learnable_length + ) + lpretrained_embeddings[l] = pretrained_embeddings + llearnable_embeddings[l] = learnable_embeddings + self.embedding_length = embedding_length + + # self.lstm = nn.LSTM(self.embedding_length, hidden_size, dropout=0.2 if self.n_layers>1 else 0, num_layers=self.n_layers, bidirectional=(self.n_directions==2)) + self.rnn = nn.GRU(self.embedding_length, hidden_size) + self.linear0 = nn.Linear(hidden_size * self.n_directions, lstm_out) + self.lpretrained_embeddings.update(lpretrained_embeddings) + self.llearnable_embeddings.update(llearnable_embeddings) + + self.linear1 = nn.Linear(lstm_out, ff1) + self.linear2 = nn.Linear(ff1, ff2) + + if only_post: + self.label = nn.Linear(output_size, output_size) + elif post_probabilities and not bert_embeddings: + self.label = nn.Linear(ff2 + output_size, output_size) + elif bert_embeddings and not post_probabilities: + self.label = nn.Linear(ff2 + 768, output_size) + elif post_probabilities and bert_embeddings: + self.label = nn.Linear(ff2 + output_size + 768, output_size) + else: + self.label = nn.Linear(ff2, output_size) + + def forward(self, input, post, bert_embed, lang): + if self.only_post: + doc_embedding = post + else: + doc_embedding = self.transform(input, lang) + if self.post_probabilities: + doc_embedding = torch.cat([doc_embedding, post], dim=1) + if self.bert_embeddings: + doc_embedding = torch.cat([doc_embedding, bert_embed], dim=1) + + logits = self.label(doc_embedding) + return logits + + def transform(self, input, lang): + batch_size = input.shape[0] + input = embed(self, input, lang) + input = embedding_dropout(input, drop_range=self.drop_embedding_range, p_drop=self.drop_embedding_prop, + training=self.training) + input = input.permute(1, 0, 2) + h_0 = Variable(torch.zeros(self.n_layers*self.n_directions, batch_size, self.hidden_size).cuda()) + # c_0 = Variable(torch.zeros(self.n_layers*self.n_directions, batch_size, self.hidden_size).cuda()) + # output, (_, _) = self.lstm(input, (h_0, c_0)) + output, _ = self.rnn(input, h_0) + output = output[-1, :, :] + output = F.relu(self.linear0(output)) + output = self.dropout(F.relu(self.linear1(output))) + output = self.dropout(F.relu(self.linear2(output))) + return output + + def finetune_pretrained(self): + for l in self.langs: + self.lpretrained_embeddings[l].requires_grad = True + self.lpretrained_embeddings[l].weight.requires_grad = True + + def get_embeddings(self, input, lang): + batch_size = input.shape[0] + input = embed(self, input, lang) + input = embedding_dropout(input, drop_range=self.drop_embedding_range, p_drop=self.drop_embedding_prop, + training=self.training) + input = input.permute(1, 0, 2) + h_0 = Variable(torch.zeros(self.n_layers * self.n_directions, batch_size, self.hidden_size).cuda()) + output, _ = self.rnn(input, h_0) + output = output[-1, :, :] + return output.cpu().detach().numpy() + diff --git a/src/models/pl_bert.py b/src/models/pl_bert.py new file mode 100644 index 0000000..afb28b5 --- /dev/null +++ b/src/models/pl_bert.py @@ -0,0 +1,183 @@ +import pytorch_lightning as pl +import torch +from torch.optim.lr_scheduler import StepLR +from transformers 
import BertForSequenceClassification, AdamW + +from util.common import define_pad_length, pad +from util.pl_metrics import CustomF1, CustomK + + +class BertModel(pl.LightningModule): + + def __init__(self, output_size, stored_path, gpus=None): + """ + Init Bert model. + :param output_size: + :param stored_path: + :param gpus: + """ + super().__init__() + self.loss = torch.nn.BCEWithLogitsLoss() + self.gpus = gpus + self.microF1 = CustomF1(num_classes=output_size, average='micro', device=self.gpus) + self.macroF1 = CustomF1(num_classes=output_size, average='macro', device=self.gpus) + self.microK = CustomK(num_classes=output_size, average='micro', device=self.gpus) + self.macroK = CustomK(num_classes=output_size, average='macro', device=self.gpus) + # Language specific metrics to compute metrics at epoch level + self.lang_macroF1 = CustomF1(num_classes=output_size, average='macro', device=self.gpus) + self.lang_microF1 = CustomF1(num_classes=output_size, average='micro', device=self.gpus) + self.lang_macroK = CustomF1(num_classes=output_size, average='macro', device=self.gpus) + self.lang_microK = CustomF1(num_classes=output_size, average='micro', device=self.gpus) + + if stored_path: + self.bert = BertForSequenceClassification.from_pretrained(stored_path, + num_labels=output_size, + output_hidden_states=True) + else: + self.bert = BertForSequenceClassification.from_pretrained('bert-base-multilingual-cased', + num_labels=output_size, + output_hidden_states=True) + self.save_hyperparameters() + + def forward(self, X): + logits = self.bert(X) + return logits + + def training_step(self, train_batch, batch_idx): + X, y, _, batch_langs = train_batch + X = torch.cat(X).view([X[0].shape[0], len(X)]) + y = y.type(torch.FloatTensor) + y = y.to('cuda' if self.gpus else 'cpu') + logits, _ = self.forward(X) + loss = self.loss(logits, y) + # Squashing logits through Sigmoid in order to get confidence score + predictions = torch.sigmoid(logits) > 0.5 + microF1 = self.microF1(predictions, y) + macroF1 = self.macroF1(predictions, y) + microK = self.microK(predictions, y) + macroK = self.macroK(predictions, y) + self.log('train-loss', loss, on_step=True, on_epoch=True, prog_bar=False, logger=True) + self.log('train-macroF1', macroF1, on_step=True, on_epoch=True, prog_bar=False, logger=True) + self.log('train-microF1', microF1, on_step=True, on_epoch=True, prog_bar=False, logger=True) + self.log('train-macroK', macroK, on_step=True, on_epoch=True, prog_bar=False, logger=True) + self.log('train-microK', microK, on_step=True, on_epoch=True, prog_bar=False, logger=True) + lX, ly = self._reconstruct_dict(predictions, y, batch_langs) + return {'loss': loss, 'pred': lX, 'target': ly} + + def training_epoch_end(self, outputs): + langs = [] + for output in outputs: + langs.extend(list(output['pred'].keys())) + langs = set(langs) + # outputs is a of n dicts of m elements, where n is equal to the number of epoch steps and m is batchsize. 
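+        # (i.e. one entry per training step, carrying the per-language predictions and targets
+        # that training_step rebuilt with _reconstruct_dict)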
+ # here we save epoch level metric values and compute them specifically for each language + res_macroF1 = {lang: [] for lang in langs} + res_microF1 = {lang: [] for lang in langs} + res_macroK = {lang: [] for lang in langs} + res_microK = {lang: [] for lang in langs} + for output in outputs: + lX, ly = output['pred'], output['target'] + for lang in lX.keys(): + X, y = lX[lang], ly[lang] + lang_macroF1 = self.lang_macroF1(X, y) + lang_microF1 = self.lang_microF1(X, y) + lang_macroK = self.lang_macroK(X, y) + lang_microK = self.lang_microK(X, y) + + res_macroF1[lang].append(lang_macroF1) + res_microF1[lang].append(lang_microF1) + res_macroK[lang].append(lang_macroK) + res_microK[lang].append(lang_microK) + for lang in langs: + avg_macroF1 = torch.mean(torch.Tensor(res_macroF1[lang])) + avg_microF1 = torch.mean(torch.Tensor(res_microF1[lang])) + avg_macroK = torch.mean(torch.Tensor(res_macroK[lang])) + avg_microK = torch.mean(torch.Tensor(res_microK[lang])) + self.logger.experiment.add_scalars('train-langs-macroF1', {f'{lang}': avg_macroF1}, self.current_epoch) + self.logger.experiment.add_scalars('train-langs-microF1', {f'{lang}': avg_microF1}, self.current_epoch) + self.logger.experiment.add_scalars('train-langs-macroK', {f'{lang}': avg_macroK}, self.current_epoch) + self.logger.experiment.add_scalars('train-langs-microK', {f'{lang}': avg_microK}, self.current_epoch) + + def validation_step(self, val_batch, batch_idx): + X, y, _, batch_langs = val_batch + X = torch.cat(X).view([X[0].shape[0], len(X)]) + y = y.type(torch.FloatTensor) + y = y.to('cuda' if self.gpus else 'cpu') + logits, _ = self.forward(X) + loss = self.loss(logits, y) + predictions = torch.sigmoid(logits) > 0.5 + microF1 = self.microF1(predictions, y) + macroF1 = self.macroF1(predictions, y) + microK = self.microK(predictions, y) + macroK = self.macroK(predictions, y) + self.log('val-loss', loss, on_step=False, on_epoch=True, prog_bar=False, logger=True) + self.log('val-macroF1', macroF1, on_step=False, on_epoch=True, prog_bar=True, logger=True) + self.log('val-microF1', microF1, on_step=False, on_epoch=True, prog_bar=True, logger=True) + self.log('val-macroK', macroK, on_step=False, on_epoch=True, prog_bar=True, logger=True) + self.log('val-microK', microK, on_step=False, on_epoch=True, prog_bar=True, logger=True) + return {'loss': loss} + + def test_step(self, test_batch, batch_idx): + X, y, _, batch_langs = test_batch + X = torch.cat(X).view([X[0].shape[0], len(X)]) + y = y.type(torch.FloatTensor) + y = y.to('cuda' if self.gpus else 'cpu') + logits, _ = self.forward(X) + loss = self.loss(logits, y) + # Squashing logits through Sigmoid in order to get confidence score + predictions = torch.sigmoid(logits) > 0.5 + microF1 = self.microF1(predictions, y) + macroF1 = self.macroF1(predictions, y) + microK = self.microK(predictions, y) + macroK = self.macroK(predictions, y) + self.log('test-macroF1', macroF1, on_step=False, on_epoch=True, prog_bar=False, logger=True) + self.log('test-microF1', microF1, on_step=False, on_epoch=True, prog_bar=False, logger=True) + self.log('test-macroK', macroK, on_step=False, on_epoch=True, prog_bar=True, logger=True) + self.log('test-microK', microK, on_step=False, on_epoch=True, prog_bar=True, logger=True) + return + + def configure_optimizers(self, lr=3e-5, weight_decay=0.01): + no_decay = ['bias', 'LayerNorm.weight'] + optimizer_grouped_parameters = [ + {'params': [p for n, p in self.bert.named_parameters() + if not any(nd in n for nd in no_decay)], + 'weight_decay': weight_decay}, + 
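+            # second group: parameters whose names match no_decay (biases and LayerNorm weights),
+            # which are usually exempted from weight decay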
{'params': [p for n, p in self.bert.named_parameters() + if any(nd in n for nd in no_decay)], + 'weight_decay': weight_decay} + ] + optimizer = AdamW(optimizer_grouped_parameters, lr=lr) + scheduler = StepLR(optimizer, step_size=25, gamma=0.1) + return [optimizer], [scheduler] + + def encode(self, lX, batch_size=64): + with torch.no_grad(): + l_embed = {lang: [] for lang in lX.keys()} + for lang in sorted(lX.keys()): + for i in range(0, len(lX[lang]), batch_size): + if i + batch_size > len(lX[lang]): + batch = lX[lang][i:len(lX[lang])] + else: + batch = lX[lang][i:i + batch_size] + max_pad_len = define_pad_length(batch) + batch = pad(batch, pad_index=self.bert.config.pad_token_id, max_pad_length=max_pad_len) + batch = torch.LongTensor(batch).to('cuda' if self.gpus else 'cpu') + _, output = self.forward(batch) + doc_embeds = output[-1][:, 0, :] + l_embed[lang].append(doc_embeds.cpu()) + for k, v in l_embed.items(): + l_embed[k] = torch.cat(v, dim=0).numpy() + return l_embed + + @staticmethod + def _reconstruct_dict(predictions, y, batch_langs): + reconstructed_x = {lang: [] for lang in set(batch_langs)} + reconstructed_y = {lang: [] for lang in set(batch_langs)} + for i, pred in enumerate(predictions): + reconstructed_x[batch_langs[i]].append(pred) + reconstructed_y[batch_langs[i]].append(y[i]) + for k, v in reconstructed_x.items(): + reconstructed_x[k] = torch.cat(v).view(-1, predictions.shape[1]) + for k, v in reconstructed_y.items(): + reconstructed_y[k] = torch.cat(v).view(-1, predictions.shape[1]) + return reconstructed_x, reconstructed_y diff --git a/src/models/pl_gru.py b/src/models/pl_gru.py new file mode 100644 index 0000000..afb12e6 --- /dev/null +++ b/src/models/pl_gru.py @@ -0,0 +1,266 @@ +# Lightning modules, see https://pytorch-lightning.readthedocs.io/en/latest/lightning_module.html +import pytorch_lightning as pl +import torch +import torch.nn.functional as F +from torch import nn +from torch.autograd import Variable +from torch.optim.lr_scheduler import StepLR +from transformers import AdamW + +from models.helpers import init_embeddings +from util.common import define_pad_length, pad +from util.pl_metrics import CustomF1, CustomK + + +class RecurrentModel(pl.LightningModule): + def __init__(self, lPretrained, langs, output_size, hidden_size, lVocab_size, learnable_length, + drop_embedding_range, drop_embedding_prop, gpus=None): + """ + Init RNN model. 
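+        One (pretrained + learnable) embedding bundle per language feeds a GRU and a
+        feed-forward head (256 -> 512 -> 256) that are shared across all languages.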
+ :param lPretrained: + :param langs: + :param output_size: + :param hidden_size: + :param lVocab_size: + :param learnable_length: + :param drop_embedding_range: + :param drop_embedding_prop: + :param gpus: + """ + super().__init__() + self.gpus = gpus + self.langs = langs + self.lVocab_size = lVocab_size + self.learnable_length = learnable_length + self.output_size = output_size + self.hidden_size = hidden_size + self.drop_embedding_range = drop_embedding_range + self.drop_embedding_prop = drop_embedding_prop + self.loss = torch.nn.BCEWithLogitsLoss() + + self.microF1 = CustomF1(num_classes=output_size, average='micro', device=self.gpus) + self.macroF1 = CustomF1(num_classes=output_size, average='macro', device=self.gpus) + self.microK = CustomK(num_classes=output_size, average='micro', device=self.gpus) + self.macroK = CustomK(num_classes=output_size, average='macro', device=self.gpus) + # Language specific metrics to compute metrics at epoch level + self.lang_macroF1 = CustomF1(num_classes=output_size, average='macro', device=self.gpus) + self.lang_microF1 = CustomF1(num_classes=output_size, average='micro', device=self.gpus) + self.lang_macroK = CustomF1(num_classes=output_size, average='macro', device=self.gpus) + self.lang_microK = CustomF1(num_classes=output_size, average='micro', device=self.gpus) + + self.lPretrained_embeddings = nn.ModuleDict() + self.lLearnable_embeddings = nn.ModuleDict() + + self.n_layers = 1 + self.n_directions = 1 + self.dropout = nn.Dropout(0.6) + + lstm_out = 256 + ff1 = 512 + ff2 = 256 + + lpretrained_embeddings = {} + llearnable_embeddings = {} + + for lang in self.langs: + pretrained = lPretrained[lang] if lPretrained else None + pretrained_embeddings, learnable_embeddings, embedding_length = init_embeddings( + pretrained, self.lVocab_size[lang], self.learnable_length) + lpretrained_embeddings[lang] = pretrained_embeddings + llearnable_embeddings[lang] = learnable_embeddings + self.embedding_length = embedding_length + + self.lPretrained_embeddings.update(lpretrained_embeddings) + self.lLearnable_embeddings.update(llearnable_embeddings) + + self.rnn = nn.GRU(self.embedding_length, hidden_size) + self.linear0 = nn.Linear(hidden_size * self.n_directions, lstm_out) + self.linear1 = nn.Linear(lstm_out, ff1) + self.linear2 = nn.Linear(ff1, ff2) + self.label = nn.Linear(ff2, self.output_size) + + # TODO: setting lPretrained to None, letting it to its original value will "bug" first validation + # step (i.e., checkpoint will store also its ++ value, I guess, making the saving process too slow) + lPretrained = None + self.save_hyperparameters() + + def forward(self, lX): + l_embed = [] + for lang in sorted(lX.keys()): + doc_embedding = self.transform(lX[lang], lang) + l_embed.append(doc_embedding) + embed = torch.cat(l_embed, dim=0) + logits = self.label(embed) + return logits + + def transform(self, X, lang): + batch_size = X.shape[0] + X = self.embed(X, lang) + X = self.embedding_dropout(X, drop_range=self.drop_embedding_range, p_drop=self.drop_embedding_prop, + training=self.training) + X = X.permute(1, 0, 2) + h_0 = Variable(torch.zeros(self.n_layers * self.n_directions, batch_size, self.hidden_size).to(self.device)) + output, _ = self.rnn(X, h_0) + output = output[-1, :, :] + output = F.relu(self.linear0(output)) + output = self.dropout(F.relu(self.linear1(output))) + output = self.dropout(F.relu(self.linear2(output))) + return output + + def encode(self, lX, l_pad, batch_size=128): + """ + Returns encoded data (i.e, RNN hidden state at second feed-forward 
layer - linear1). Dimensionality is 512. + :param lX: + :param l_pad: + :param batch_size: + :return: + """ + with torch.no_grad(): + l_embed = {lang: [] for lang in lX.keys()} + for lang in sorted(lX.keys()): + for i in range(0, len(lX[lang]), batch_size): + if i+batch_size > len(lX[lang]): + batch = lX[lang][i:len(lX[lang])] + else: + batch = lX[lang][i:i+batch_size] + max_pad_len = define_pad_length(batch) + batch = pad(batch, pad_index=l_pad[lang], max_pad_length=max_pad_len) + X = torch.LongTensor(batch).to('cuda' if self.gpus else 'cpu') + _batch_size = X.shape[0] + X = self.embed(X, lang) + X = self.embedding_dropout(X, drop_range=self.drop_embedding_range, p_drop=self.drop_embedding_prop, + training=self.training) + X = X.permute(1, 0, 2) + h_0 = Variable(torch.zeros(self.n_layers * self.n_directions, _batch_size, self.hidden_size).to(self.device)) + output, _ = self.rnn(X, h_0) + output = output[-1, :, :] + output = F.relu(self.linear0(output)) + output = self.dropout(F.relu(self.linear1(output))) + l_embed[lang].append(output.cpu()) + for k, v in l_embed.items(): + l_embed[k] = torch.cat(v, dim=0).numpy() + return l_embed + + def training_step(self, train_batch, batch_idx): + lX, ly = train_batch + logits = self.forward(lX) + _ly = [] + for lang in sorted(lX.keys()): + _ly.append(ly[lang]) + y = torch.cat(_ly, dim=0) + loss = self.loss(logits, y) + # Squashing logits through Sigmoid in order to get confidence score + predictions = torch.sigmoid(logits) > 0.5 + microF1 = self.microF1(predictions, y) + macroF1 = self.macroF1(predictions, y) + microK = self.microK(predictions, y) + macroK = self.macroK(predictions, y) + self.log('train-loss', loss, on_step=True, on_epoch=True, prog_bar=False, logger=True) + self.log('train-macroF1', macroF1, on_step=True, on_epoch=True, prog_bar=False, logger=True) + self.log('train-microF1', microF1, on_step=True, on_epoch=True, prog_bar=False, logger=True) + self.log('train-macroK', macroK, on_step=True, on_epoch=True, prog_bar=False, logger=True) + self.log('train-microK', microK, on_step=True, on_epoch=True, prog_bar=False, logger=True) + re_lX = self._reconstruct_dict(predictions, ly) + return {'loss': loss, 'pred': re_lX, 'target': ly} + + def training_epoch_end(self, outputs): + # outputs is a of n dicts of m elements, where n is equal to the number of epoch steps and m is batchsize. 
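+        # (each entry of 'outputs' comes from training_step, whose 'pred' field holds the
+        # predictions re-split by language via _reconstruct_dict)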
+ # here we save epoch level metric values and compute them specifically for each language + res_macroF1 = {lang: [] for lang in self.langs} + res_microF1 = {lang: [] for lang in self.langs} + res_macroK = {lang: [] for lang in self.langs} + res_microK = {lang: [] for lang in self.langs} + for output in outputs: + lX, ly = output['pred'], output['target'] + for lang in lX.keys(): + X, y = lX[lang], ly[lang] + lang_macroF1 = self.lang_macroF1(X, y) + lang_microF1 = self.lang_microF1(X, y) + lang_macroK = self.lang_macroK(X, y) + lang_microK = self.lang_microK(X, y) + + res_macroF1[lang].append(lang_macroF1) + res_microF1[lang].append(lang_microF1) + res_macroK[lang].append(lang_macroK) + res_microK[lang].append(lang_microK) + for lang in self.langs: + avg_macroF1 = torch.mean(torch.Tensor(res_macroF1[lang])) + avg_microF1 = torch.mean(torch.Tensor(res_microF1[lang])) + avg_macroK = torch.mean(torch.Tensor(res_macroK[lang])) + avg_microK = torch.mean(torch.Tensor(res_microK[lang])) + self.logger.experiment.add_scalars('train-langs-macroF1', {f'{lang}': avg_macroF1}, self.current_epoch) + self.logger.experiment.add_scalars('train-langs-microF1', {f'{lang}': avg_microF1}, self.current_epoch) + self.logger.experiment.add_scalars('train-langs-macroK', {f'{lang}': avg_macroK}, self.current_epoch) + self.logger.experiment.add_scalars('train-langs-microK', {f'{lang}': avg_microK}, self.current_epoch) + + def validation_step(self, val_batch, batch_idx): + lX, ly = val_batch + logits = self.forward(lX) + _ly = [] + for lang in sorted(lX.keys()): + _ly.append(ly[lang]) + ly = torch.cat(_ly, dim=0) + loss = self.loss(logits, ly) + predictions = torch.sigmoid(logits) > 0.5 + microF1 = self.microF1(predictions, ly) + macroF1 = self.macroF1(predictions, ly) + microK = self.microK(predictions, ly) + macroK = self.macroK(predictions, ly) + self.log('val-loss', loss, on_step=False, on_epoch=True, prog_bar=False, logger=True) + self.log('val-macroF1', macroF1, on_step=False, on_epoch=True, prog_bar=True, logger=True) + self.log('val-microF1', microF1, on_step=False, on_epoch=True, prog_bar=True, logger=True) + self.log('val-macroK', macroK, on_step=False, on_epoch=True, prog_bar=True, logger=True) + self.log('val-microK', microK, on_step=False, on_epoch=True, prog_bar=True, logger=True) + return {'loss': loss} + + def test_step(self, test_batch, batch_idx): + lX, ly = test_batch + logits = self.forward(lX) + _ly = [] + for lang in sorted(lX.keys()): + _ly.append(ly[lang]) + ly = torch.cat(_ly, dim=0) + predictions = torch.sigmoid(logits) > 0.5 + microF1 = self.microF1(predictions, ly) + macroF1 = self.macroF1(predictions, ly) + microK = self.microK(predictions, ly) + macroK = self.macroK(predictions, ly) + self.log('test-macroF1', macroF1, on_step=False, on_epoch=True, prog_bar=False, logger=True) + self.log('test-microF1', microF1, on_step=False, on_epoch=True, prog_bar=False, logger=True) + self.log('test-macroK', macroK, on_step=False, on_epoch=True, prog_bar=True, logger=True) + self.log('test-microK', microK, on_step=False, on_epoch=True, prog_bar=True, logger=True) + return + + def embed(self, X, lang): + input_list = [] + if self.lPretrained_embeddings[lang]: + input_list.append(self.lPretrained_embeddings[lang](X)) + if self.lLearnable_embeddings[lang]: + input_list.append(self.lLearnable_embeddings[lang](X)) + return torch.cat(tensors=input_list, dim=2) + + def embedding_dropout(self, X, drop_range, p_drop=0.5, training=True): + if p_drop > 0 and training and drop_range is not None: + p = p_drop + 
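+            # dropout is restricted to the [drop_from, drop_to) slice of the embedding (e.g. the WCE part);
+            # the (1 - p) factor cancels the 1/(1-p) rescaling applied internally by F.dropout, and the final
+            # division by (1 - p*m/l) compensates for the expected fraction of zeroed components over the full length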
drop_from, drop_to = drop_range + m = drop_to - drop_from # length of the supervised embedding + l = X.shape[2] # total embedding length + corr = (1 - p) + X[:, :, drop_from:drop_to] = corr * F.dropout(X[:, :, drop_from:drop_to], p=p) + X /= (1 - (p * m / l)) + return X + + def configure_optimizers(self): + optimizer = AdamW(self.parameters(), lr=1e-3) + scheduler = StepLR(optimizer, step_size=25, gamma=0.5) + return [optimizer], [scheduler] + + @staticmethod + def _reconstruct_dict(X, ly): + reconstructed = {} + _start = 0 + for lang in sorted(ly.keys()): + lang_batchsize = len(ly[lang]) + reconstructed[lang] = X[_start:_start+lang_batchsize] + _start += lang_batchsize + return reconstructed diff --git a/src/requirements.txt b/src/requirements.txt new file mode 100644 index 0000000..4546a4a --- /dev/null +++ b/src/requirements.txt @@ -0,0 +1,12 @@ +transformers==2.11.0 +pandas==0.25.3 +numpy==1.17.4 +joblib==0.14.0 +tqdm==4.50.2 +pytorch_lightning==1.1.2 +torch==1.3.1 +nltk==3.4.5 +scipy==1.3.3 +rdflib==4.2.2 +torchtext==0.4.0 +scikit_learn==0.24.1 diff --git a/src/run.sh b/src/run.sh new file mode 100644 index 0000000..04365f9 --- /dev/null +++ b/src/run.sh @@ -0,0 +1,6 @@ +#!/usr/bin/env bash + +for i in {0..10..1} +do + python main.py --gpus 0 +done \ No newline at end of file diff --git a/src/util/SIF_embed.py b/src/util/SIF_embed.py new file mode 100644 index 0000000..4a3d712 --- /dev/null +++ b/src/util/SIF_embed.py @@ -0,0 +1,59 @@ +import numpy as np +from sklearn.decomposition import TruncatedSVD + + +def get_weighted_average(We, x, w): + """ + Compute the weighted average vectors + :param We: We[i,:] is the vector for word i + :param x: x[i, :] are the indices of the words in sentence i + :param w: w[i, :] are the weights for the words in sentence i + :return: emb[i, :] are the weighted average vector for sentence i + """ + n_samples = x.shape[0] + emb = np.zeros((n_samples, We.shape[1])) + for i in range(n_samples): + emb[i,:] = w[i,:].dot(We[x[i,:],:]) / np.count_nonzero(w[i,:]) + return emb + + +def compute_pc(X,npc=1): + """ + Compute the principal components. 
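+    (the components are computed with TruncatedSVD; remove_pc below uses them to subtract the projection from X)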
+ :param X: X[i,:] is a data point + :param npc: number of principal components to remove + :return: component_[i,:] is the i-th pc + """ + svd = TruncatedSVD(n_components=npc, n_iter=7, random_state=0) + svd.fit(X) + return svd.components_ + + +def remove_pc(X, npc=1): + """ + Remove the projection on the principal components + :param X: X[i,:] is a data point + :param npc: number of principal components to remove + :return: XX[i, :] is the data point after removing its projection + """ + pc = compute_pc(X, npc) + if npc == 1: + XX = X - X.dot(pc.transpose()) * pc + else: + XX = X - X.dot(pc.transpose()).dot(pc) + return XX + + +def SIF_embedding(We, x, w, params): + """ + Compute the scores between pairs of sentences using weighted average + removing the projection on the first principal component + :param We: We[i,:] is the vector for word i + :param x: x[i, :] are the indices of the words in the i-th sentence + :param w: w[i, :] are the weights for the words in the i-th sentence + :param params.rmpc: if >0, remove the projections of the sentence embeddings to their first principal component + :return: emb, emb[i, :] is the embedding for sentence i + """ + emb = get_weighted_average(We, x, w) + if params.rmpc > 0: + emb = remove_pc(emb, params.rmpc) + return emb \ No newline at end of file diff --git a/src/util/common.py b/src/util/common.py new file mode 100644 index 0000000..61ac52f --- /dev/null +++ b/src/util/common.py @@ -0,0 +1,384 @@ +import numpy as np +import torch +from sklearn.feature_extraction.text import TfidfVectorizer +from sklearn.model_selection import train_test_split +from sklearn.preprocessing import normalize + +from util.embeddings_manager import supervised_embeddings_tfidf + + +class TfidfVectorizerMultilingual: + + def __init__(self, **kwargs): + self.kwargs = kwargs + + def fit(self, lX, ly=None): + self.langs = sorted(lX.keys()) + self.vectorizer = {l: TfidfVectorizer(**self.kwargs).fit(lX[l]) for l in self.langs} + return self + + def transform(self, lX): + return {l: self.vectorizer[l].transform(lX[l]) for l in self.langs} + + def fit_transform(self, lX, ly=None): + return self.fit(lX, ly).transform(lX) + + def vocabulary(self, l=None): + if l is None: + return {l: self.vectorizer[l].vocabulary_ for l in self.langs} + else: + return self.vectorizer[l].vocabulary_ + + def get_analyzer(self, l=None): + if l is None: + return {l: self.vectorizer[l].build_analyzer() for l in self.langs} + else: + return self.vectorizer[l].build_analyzer() + + +def _normalize(lX, l2=True): + return {lang: normalize(X) for lang, X in lX.items()} if l2 else lX + + +def none_dict(langs): + return {l: None for l in langs} + + +class MultilingualIndex: + def __init__(self): + """ + Class that contains monolingual Indexes + """ + self.l_index = {} + self.l_vectorizer = TfidfVectorizerMultilingual(sublinear_tf=True, use_idf=True) + + def index(self, l_devel_raw, l_devel_target, l_test_raw, l_test_target, l_pretrained_vocabulary=None): + self.langs = sorted(l_devel_raw.keys()) + self.l_vectorizer.fit(l_devel_raw) + l_vocabulary = self.l_vectorizer.vocabulary() + l_analyzer = self.l_vectorizer.get_analyzer() + if l_pretrained_vocabulary is None: + l_pretrained_vocabulary = none_dict(self.langs) + + for lang in self.langs: + # Init monolingual Index + self.l_index[lang] = Index(l_devel_raw[lang], l_devel_target[lang], l_test_raw[lang], l_test_target[lang], + lang) + # call to index() function of monolingual Index + self.l_index[lang].index(l_pretrained_vocabulary[lang], l_analyzer[lang], 
l_vocabulary[lang]) + + def train_val_split(self, val_prop=0.2, max_val=2000, seed=42): + for l, index in self.l_index.items(): + index.train_val_split(val_prop, max_val, seed=seed) + + def embedding_matrices(self, lpretrained, supervised): + """ + Extract from pretrained embeddings words that are found in the training dataset, then for each language + calls the respective monolingual index and build the embedding matrix (if supervised, WCE are concatenated + to the unsupervised vectors). + :param lpretrained: dict {lang : matrix of word-embeddings } + :param supervised: bool, whether to deploy Word-Class Embeddings or not + :return: self + """ + lXtr = self.get_lXtr() if supervised else none_dict(self.langs) + lYtr = self.l_train_target() if supervised else none_dict(self.langs) + lWordList = self.get_wordlist() + lExtracted = lpretrained.extract(lWordList) + for lang, index in self.l_index.items(): + # if supervised concatenate embedding matrices of pretrained unsupervised + # and supervised word-class embeddings + index.compose_embedding_matrix(lExtracted[lang], supervised, lXtr[lang], lYtr[lang]) + self.sup_range = index.wce_range + return self + + def get_wordlist(self): + wordlist = {} + for lang, index in self.l_index.items(): + wordlist[lang] = index.get_word_list() + return wordlist + + def get_raw_lXtr(self): + lXtr_raw = {k: [] for k in self.langs} + lYtr_raw = {k: [] for k in self.langs} + for lang in self.langs: + lXtr_raw[lang] = self.l_index[lang].train_raw + lYtr_raw[lang] = self.l_index[lang].train_raw + return lXtr_raw + + def get_raw_lXva(self): + lXva_raw = {k: [] for k in self.langs} + for lang in self.langs: + lXva_raw[lang] = self.l_index[lang].val_raw + + return lXva_raw + + def get_raw_lXte(self): + lXte_raw = {k: [] for k in self.langs} + for lang in self.langs: + lXte_raw[lang] = self.l_index[lang].test_raw + + return lXte_raw + + def get_lXtr(self): + if not hasattr(self, 'lXtr'): + self.lXtr = self.l_vectorizer.transform({l: index.train_raw for l, index in self.l_index.items()}) + return self.lXtr + + def get_lXva(self): + if not hasattr(self, 'lXva'): + self.lXva = self.l_vectorizer.transform({l: index.val_raw for l, index in self.l_index.items()}) + return self.lXva + + def get_lXte(self): + if not hasattr(self, 'lXte'): + self.lXte = self.l_vectorizer.transform({l: index.test_raw for l, index in self.l_index.items()}) + return self.lXte + + def get_target_dim(self): + return self.l_index[self.langs[0]].devel_target.shape[1] + + def l_vocabsize(self): + return {l: index.vocabsize for l, index in self.l_index.items()} + + def l_embeddings(self): + return {l: index.embedding_matrix for l, index in self.l_index.items()} + + def l_pad(self): + return {l: index.pad_index for l, index in self.l_index.items()} + + def l_train_index(self): + return {l: index.train_index for l, index in self.l_index.items()} + + def l_train_raw_index(self): + return {l: index.train_raw for l, index in self.l_index.items()} + + def l_train_target(self): + return {l: index.train_target for l, index in self.l_index.items()} + + def l_val_index(self): + return {l: index.val_index for l, index in self.l_index.items()} + + def l_val_raw_index(self): + return {l: index.val_raw for l, index in self.l_index.items()} + + def l_test_raw_index(self): + return {l: index.test_raw for l, index in self.l_index.items()} + + def l_devel_raw_index(self): + return {l: index.devel_raw for l, index in self.l_index.items()} + + def l_val_target(self): + return {l: index.val_target for l, index in 
self.l_index.items()} + + def l_test_target(self): + return {l: index.test_target for l, index in self.l_index.items()} + + def l_test_index(self): + return {l: index.test_index for l, index in self.l_index.items()} + + def l_devel_index(self): + return {l: index.devel_index for l, index in self.l_index.items()} + + def l_devel_target(self): + return {l: index.devel_target for l, index in self.l_index.items()} + + def l_train(self): + return self.l_train_index(), self.l_train_target() + + def l_val(self): + return self.l_val_index(), self.l_val_target() + + def l_test(self): + return self.l_test_index(), self.l_test_target() + + def l_train_raw(self): + return self.l_train_raw_index(), self.l_train_target() + + def l_val_raw(self): + return self.l_val_raw_index(), self.l_val_target() + + def l_test_raw(self): + return self.l_test_raw_index(), self.l_test_target() + + def l_devel_raw(self): + return self.l_devel_raw_index(), self.l_devel_target() + + def get_l_pad_index(self): + return {l: index.get_pad_index() for l, index in self.l_index.items()} + + +class Index: + def __init__(self, devel_raw, devel_target, test_raw, test_target, lang): + """ + Monolingual Index, takes care of tokenizing raw data, converting strings to ids, splitting the data into + training and validation. + :param devel_raw: list of strings, list of raw training texts + :param devel_target: + :param test_raw: list of strings, list of raw test texts + :param lang: list, list of languages contained in the dataset + """ + self.lang = lang + self.devel_raw = devel_raw + self.devel_target = devel_target + self.test_raw = test_raw + self.test_target = test_target + + def index(self, pretrained_vocabulary, analyzer, vocabulary): + self.word2index = dict(vocabulary) + known_words = set(self.word2index.keys()) + if pretrained_vocabulary is not None: + known_words.update(pretrained_vocabulary) + + self.word2index['UNKTOKEN'] = len(self.word2index) + self.word2index['PADTOKEN'] = len(self.word2index) + self.unk_index = self.word2index['UNKTOKEN'] + self.pad_index = self.word2index['PADTOKEN'] + + # index documents and keep track of test terms outside the development vocabulary that are in Muse (if available) + self.out_of_vocabulary = dict() + self.devel_index = index(self.devel_raw, self.word2index, known_words, analyzer, self.unk_index, + self.out_of_vocabulary) + self.test_index = index(self.test_raw, self.word2index, known_words, analyzer, self.unk_index, + self.out_of_vocabulary) + + self.vocabsize = len(self.word2index) + len(self.out_of_vocabulary) + + print(f'[indexing complete for lang {self.lang}] vocabulary-size={self.vocabsize}') + + def get_pad_index(self): + return self.pad_index + + def train_val_split(self, val_prop, max_val, seed): + devel = self.devel_index + target = self.devel_target + devel_raw = self.devel_raw + + val_size = int(min(len(devel) * val_prop, max_val)) + + self.train_index, self.val_index, self.train_target, self.val_target, self.train_raw, self.val_raw = \ + train_test_split( + devel, target, devel_raw, test_size=val_size, random_state=seed, shuffle=True) + + print( + f'split lang {self.lang}: train={len(self.train_index)} val={len(self.val_index)} test={len(self.test_index)}') + + def get_word_list(self): + def extract_word_list(word2index): + return [w for w, i in sorted(word2index.items(), key=lambda x: x[1])] + + word_list = extract_word_list(self.word2index) + word_list += extract_word_list(self.out_of_vocabulary) + return word_list + + def compose_embedding_matrix(self, pretrained, 
supervised, Xtr=None, Ytr=None): + print(f'[generating embedding matrix for lang {self.lang}]') + + self.wce_range = None + embedding_parts = [] + + if pretrained is not None: + print('\t[pretrained-matrix]') + embedding_parts.append(pretrained) + del pretrained + + if supervised: + print('\t[supervised-matrix]') + F = supervised_embeddings_tfidf(Xtr, Ytr) + num_missing_rows = self.vocabsize - F.shape[0] + F = np.vstack((F, np.zeros(shape=(num_missing_rows, F.shape[1])))) + F = torch.from_numpy(F).float() + + offset = 0 + if embedding_parts: + offset = embedding_parts[0].shape[1] + self.wce_range = [offset, offset + F.shape[1]] + embedding_parts.append(F) + + self.embedding_matrix = torch.cat(embedding_parts, dim=1) + + print(f'[embedding matrix for lang {self.lang} has shape {self.embedding_matrix.shape}]') + + +def index(data, vocab, known_words, analyzer, unk_index, out_of_vocabulary): + """ + Index (i.e., replaces word strings with numerical indexes) a list of string documents + :param data: list of string documents + :param vocab: a fixed mapping [str]->[int] of words to indexes + :param known_words: a set of known words (e.g., words that, despite not being included in the vocab, can be retained + because they are anyway contained in a pre-trained embedding set that we know in advance) + :param analyzer: the preprocessor in charge of transforming the document string into a chain of string words + :param unk_index: the index of the 'unknown token', i.e., a symbol that characterizes all words that we cannot keep + :param out_of_vocabulary: an incremental mapping [str]->[int] of words to indexes that will index all those words that + are not in the original vocab but that are in the known_words + :return: + """ + indexes = [] + vocabsize = len(vocab) + unk_count = 0 + knw_count = 0 + out_count = 0 + # pbar = tqdm(data, desc=f'indexing') + for text in data: + words = analyzer(text) + index = [] + for word in words: + if word in vocab: + idx = vocab[word] + else: + if word in known_words: + if word not in out_of_vocabulary: + out_of_vocabulary[word] = vocabsize + len(out_of_vocabulary) + idx = out_of_vocabulary[word] + out_count += 1 + else: + idx = unk_index + unk_count += 1 + index.append(idx) + indexes.append(index) + knw_count += len(index) + # pbar.set_description(f'[unk = {unk_count}/{knw_count}={(100.*unk_count/knw_count):.2f}%]' + # f'[out = {out_count}/{knw_count}={(100.*out_count/knw_count):.2f}%]') + return indexes + + +def is_true(tensor, device): + return torch.where(tensor == 1, torch.Tensor([1]).to(device), torch.Tensor([0]).to(device)) + + +def is_false(tensor, device): + return torch.where(tensor == 0, torch.Tensor([1]).to(device), torch.Tensor([0]).to(device)) + + +def define_pad_length(index_list): + lengths = [len(index) for index in index_list] + return int(np.mean(lengths) + np.std(lengths)) + + +def pad(index_list, pad_index, max_pad_length=None): + pad_length = np.max([len(index) for index in index_list]) + if max_pad_length is not None: + pad_length = min(pad_length, max_pad_length) + for i, indexes in enumerate(index_list): + index_list[i] = [pad_index] * (pad_length - len(indexes)) + indexes[:pad_length] + return index_list + + +def get_params(optimc=False): + if not optimc: + return None + c_range = [1e4, 1e3, 1e2, 1e1, 1, 1e-1] + kernel = 'rbf' + return [{'kernel': [kernel], 'C': c_range, 'gamma':['auto']}] + + +def get_method_name(args): + _id = '' + _id_conf = [args.post_embedder, args.wce_embedder, args.muse_embedder, args.bert_embedder, args.gru_embedder] + 
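+    # one letter per active view generator: X=posteriors, W=WCE, M=MUSE, B=mBERT, G=GRU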
_id_name = ['X', 'W', 'M', 'B', 'G'] + for i, conf in enumerate(_id_conf): + if conf: + _id += _id_name[i] + _id = _id if not args.gru_wce else _id + '_wce' + _dataset_path = args.dataset.split('/')[-1].split('_') + dataset_id = _dataset_path[0] + _dataset_path[-1] + return _id, dataset_id diff --git a/src/util/embeddings_manager.py b/src/util/embeddings_manager.py new file mode 100644 index 0000000..1d708fa --- /dev/null +++ b/src/util/embeddings_manager.py @@ -0,0 +1,104 @@ +from abc import ABC, abstractmethod + +import numpy as np +import torch +from torchtext.vocab import Vectors + +from util.SIF_embed import remove_pc + + +class PretrainedEmbeddings(ABC): + + def __init__(self): + super().__init__() + + @abstractmethod + def vocabulary(self): pass + + @abstractmethod + def dim(self): pass + + @classmethod + def reindex(cls, words, word2index): + if isinstance(words, dict): + words = list(zip(*sorted(words.items(), key=lambda x: x[1])))[0] + + source_idx, target_idx = [], [] + for i, word in enumerate(words): + if word not in word2index: + continue + j = word2index[word] + source_idx.append(i) + target_idx.append(j) + source_idx = np.asarray(source_idx) + target_idx = np.asarray(target_idx) + return source_idx, target_idx + + +class MuseLoader: + def __init__(self, langs, cache): + self.langs = langs + self.lEmbed = {} + self.lExtracted = {} + for lang in self.langs: + print(f'Loading vectors for {lang}...') + self.lEmbed[lang] = Vectors(f'wiki.multi.{lang}.vec', cache) + + def dim(self): + return self.lEmbed[list(self.lEmbed.keys())[0]].dim + + def vocabulary(self): + return {lang: set(self.lEmbed[lang].stoi.keys()) for lang in self.langs} + + def extract(self, lVoc): + """ + Reindex pretrained loaded embedding in order to match indexes assigned by scikit vectorizer. 
Such indexes + are consistent with those used by Word Class Embeddings (since we deploy the same vectorizer) + :param lVoc: dict {lang : {word : id}} + :return: torch embedding matrix of extracted embeddings i.e., words in lVoc + """ + for lang, words in lVoc.items(): + print(f'Extracting words for lang {lang}...') + # words = list(zip(*sorted(lVoc[lang].items(), key=lambda x: x[1])))[0] + source_id, target_id = PretrainedEmbeddings.reindex(words, self.lEmbed[lang].stoi) + extraction = torch.zeros((len(words), self.dim())) + extraction[source_id] = self.lEmbed[lang].vectors[target_id] + self.lExtracted[lang] = extraction + return self.lExtracted + + def get_lEmbeddings(self): + return {lang: self.lEmbed[lang].vectors for lang in self.langs} + + +def XdotM(X, M, sif): + E = X.dot(M) + if sif: + E = remove_pc(E, npc=1) + return E + + +def wce_matrix(X, Y): + wce = supervised_embeddings_tfidf(X, Y) + wce = zscores(wce, axis=0) + return wce + + +def supervised_embeddings_tfidf(X, Y): + tfidf_norm = X.sum(axis=0) + tfidf_norm[tfidf_norm == 0] = 1 + F = (X.T).dot(Y) / tfidf_norm.T + return F + + +def zscores(X, axis=0): + """ + scipy.stats.zscores does not avoid division by 0, which can indeed occur + :param X: + :param axis: + :return: + """ + std = np.clip(np.std(X, ddof=1, axis=axis), 1e-5, None) + mean = np.mean(X, axis=axis) + return (X - mean) / std + + diff --git a/src/util/evaluation.py b/src/util/evaluation.py new file mode 100644 index 0000000..010d0e9 --- /dev/null +++ b/src/util/evaluation.py @@ -0,0 +1,20 @@ +import numpy as np +from joblib import Parallel, delayed + +from util.metrics import * + + +def evaluation_metrics(y, y_): + if len(y.shape) == len(y_.shape) == 1 and len(np.unique(y)) > 2: # single-label + raise NotImplementedError() # return f1_score(y,y_,average='macro'), f1_score(y,y_,average='micro') + else: # the metrics I implemented assume multiclass multilabel classification as binary classifiers + return macroF1(y, y_), microF1(y, y_), macroK(y, y_), microK(y, y_) + + +def evaluate(ly_true, ly_pred, metrics=evaluation_metrics, n_jobs=-1): + if n_jobs == 1: + return {lang: metrics(ly_true[lang], ly_pred[lang]) for lang in ly_true.keys()} + else: + langs = list(ly_true.keys()) + evals = Parallel(n_jobs=n_jobs)(delayed(metrics)(ly_true[lang], ly_pred[lang]) for lang in langs) + return {lang: evals[i] for i, lang in enumerate(langs)} diff --git a/src/util/file.py b/src/util/file.py new file mode 100644 index 0000000..8754f5a --- /dev/null +++ b/src/util/file.py @@ -0,0 +1,50 @@ +import urllib +from os import listdir, makedirs +from os.path import isdir, isfile, join, exists, dirname +from pathlib import Path + + +def download_file(url, archive_filename): + def progress(blocknum, bs, size): + total_sz_mb = '%.2f MB' % (size / 1e6) + current_sz_mb = '%.2f MB' % ((blocknum * bs) / 1e6) + print('\rdownloaded %s / %s' % (current_sz_mb, total_sz_mb), end='') + print("Downloading %s" % url) + urllib.request.urlretrieve(url, filename=archive_filename, reporthook=progress) + print("") + + +def download_file_if_not_exists(url, archive_path): + if exists(archive_path): return + makedirs_if_not_exist(dirname(archive_path)) + download_file(url,archive_path) + + +def ls(dir, typecheck): + el = [f for f in listdir(dir) if typecheck(join(dir, f))] + el.sort() + return el + + +def list_dirs(dir): + return ls(dir, typecheck=isdir) + + +def list_files(dir): + return ls(dir, typecheck=isfile) + + +def makedirs_if_not_exist(path): + if not exists(path): makedirs(path) + + +def 
create_if_not_exist(path): + if not exists(path): makedirs(path) + + +def get_parent_name(path): + return Path(path).parent + + +def get_file_name(path): + return Path(path).name diff --git a/src/util/metrics.py b/src/util/metrics.py new file mode 100644 index 0000000..7a6079e --- /dev/null +++ b/src/util/metrics.py @@ -0,0 +1,152 @@ +import numpy as np + + +class ContTable: + def __init__(self, tp=0, tn=0, fp=0, fn=0): + self.tp = tp + self.tn = tn + self.fp = fp + self.fn = fn + + def get_d(self): return self.tp + self.tn + self.fp + self.fn + + def get_c(self): return self.tp + self.fn + + def get_not_c(self): return self.tn + self.fp + + def get_f(self): return self.tp + self.fp + + def get_not_f(self): return self.tn + self.fn + + def p_c(self): return (1.0*self.get_c())/self.get_d() + + def p_not_c(self): return 1.0-self.p_c() + + def p_f(self): return (1.0*self.get_f())/self.get_d() + + def p_not_f(self): return 1.0-self.p_f() + + def p_tp(self): return (1.0*self.tp) / self.get_d() + + def p_tn(self): return (1.0*self.tn) / self.get_d() + + def p_fp(self): return (1.0*self.fp) / self.get_d() + + def p_fn(self): return (1.0*self.fn) / self.get_d() + + def tpr(self): + c = 1.0*self.get_c() + return self.tp / c if c > 0.0 else 0.0 + + def fpr(self): + _c = 1.0*self.get_not_c() + return self.fp / _c if _c > 0.0 else 0.0 + + def __add__(self, other): + return ContTable(tp=self.tp + other.tp, tn=self.tn + other.tn, fp=self.fp + other.fp, fn=self.fn + other.fn) + + +def accuracy(cell): + return (cell.tp + cell.tn)*1.0 / (cell.tp + cell.fp + cell.fn + cell.tn) + + +def f1(cell): + num = 2.0 * cell.tp + den = 2.0 * cell.tp + cell.fp + cell.fn + if den > 0: + return num / den + # we define f1 to be 1 if den==0 since the classifier has correctly classified all instances as negative + return 1.0 + + +def K(cell): + specificity, recall = 0., 0. + + AN = cell.tn + cell.fp + if AN != 0: + specificity = cell.tn*1. / AN + + AP = cell.tp + cell.fn + if AP != 0: + recall = cell.tp*1. / AP + + if AP == 0: + return 2. * specificity - 1. + elif AN == 0: + return 2. * recall - 1. + else: + return specificity + recall - 1. + + +# if the classifier is single class, then the prediction is a vector of shape=(nD,) which causes issues when compared +# to the true labels (of shape=(nD,1)). This method increases the dimensions of the predictions. +def __check_consistency_and_adapt(true_labels, predictions): + if predictions.ndim == 1: + return __check_consistency_and_adapt(true_labels, np.expand_dims(predictions, axis=1)) + if true_labels.ndim == 1: + return __check_consistency_and_adapt(np.expand_dims(true_labels, axis=1), predictions) + if true_labels.shape != predictions.shape: + raise ValueError("True and predicted label matrices shapes are inconsistent %s %s." + % (true_labels.shape, predictions.shape)) + _, nC = true_labels.shape + return true_labels, predictions, nC + + +# computes the (soft) contingency table where tp, fp, fn, and tn are the cumulative masses for the posterioir +# probabilitiesfron with respect to the true binary labels +# true_labels and posterior_probabilities are two vectors of shape (number_documents,) +def soft_single_metric_statistics(true_labels, posterior_probabilities): + assert len(true_labels) == len(posterior_probabilities), "Format not consistent between true and predicted labels." + tp = np.sum(posterior_probabilities[true_labels == 1]) + fn = np.sum(1. - posterior_probabilities[true_labels == 1]) + fp = np.sum(posterior_probabilities[true_labels == 0]) + tn = np.sum(1. 
- posterior_probabilities[true_labels == 0]) + return ContTable(tp=tp, tn=tn, fp=fp, fn=fn) + + +# computes the (hard) counters tp, fp, fn, and tn fron a true and predicted vectors of hard decisions +# true_labels and predicted_labels are two vectors of shape (number_documents,) +def hard_single_metric_statistics(true_labels, predicted_labels): + assert len(true_labels) == len(predicted_labels), "Format not consistent between true and predicted labels." + nd = len(true_labels) + tp = np.sum(predicted_labels[true_labels == 1]) + fp = np.sum(predicted_labels[true_labels == 0]) + fn = np.sum(true_labels[predicted_labels == 0]) + tn = nd - (tp+fp+fn) + return ContTable(tp=tp, tn=tn, fp=fp, fn=fn) + + +def macro_average(true_labels, predicted_labels, metric, metric_statistics=hard_single_metric_statistics): + true_labels, predicted_labels, nC = __check_consistency_and_adapt(true_labels, predicted_labels) + return np.mean([metric(metric_statistics(true_labels[:, c], predicted_labels[:, c])) for c in range(nC)]) + + +def micro_average(true_labels, predicted_labels, metric, metric_statistics=hard_single_metric_statistics): + true_labels, predicted_labels, nC = __check_consistency_and_adapt(true_labels, predicted_labels) + + accum = ContTable() + for c in range(nC): + other = metric_statistics(true_labels[:, c], predicted_labels[:, c]) + accum = accum + other + + return metric(accum) + + +# true_labels and predicted_labels are two matrices in sklearn.preprocessing.MultiLabelBinarizer format +def macroF1(true_labels, predicted_labels): + return macro_average(true_labels, predicted_labels, f1) + + +# true_labels and predicted_labels are two matrices in sklearn.preprocessing.MultiLabelBinarizer format +def microF1(true_labels, predicted_labels): + return micro_average(true_labels, predicted_labels, f1) + + +# true_labels and predicted_labels are two matrices in sklearn.preprocessing.MultiLabelBinarizer format +def macroK(true_labels, predicted_labels): + return macro_average(true_labels, predicted_labels, K) + + +# true_labels and predicted_labels are two matrices in sklearn.preprocessing.MultiLabelBinarizer format +def microK(true_labels, predicted_labels): + return micro_average(true_labels, predicted_labels, K) diff --git a/src/util/pl_metrics.py b/src/util/pl_metrics.py new file mode 100644 index 0000000..bf8aa99 --- /dev/null +++ b/src/util/pl_metrics.py @@ -0,0 +1,141 @@ +import torch +from pytorch_lightning.metrics import Metric + +from util.common import is_false, is_true + + +def _update(pred, target, device): + assert pred.shape == target.shape + # preparing preds and targets for count + true_pred = is_true(pred, device) + false_pred = is_false(pred, device) + true_target = is_true(target, device) + false_target = is_false(target, device) + + tp = torch.sum(true_pred * true_target, dim=0) + tn = torch.sum(false_pred * false_target, dim=0) + fp = torch.sum(true_pred * false_target, dim=0) + fn = torch.sum(false_pred * target, dim=0) + return tp, tn, fp, fn + + +class CustomF1(Metric): + def __init__(self, num_classes, device, average='micro'): + """ + Custom F1 metric. + Scikit learn provides a full set of evaluation metrics, but they treat special cases differently. + I.e., when the number of true positives, false positives, and false negatives amount to 0, all + affected metrics (precision, recall, and thus f1) output 0 in Scikit learn. + We adhere to the common practice of outputting 1 in this case since the classifier has correctly + classified all examples as negatives. 
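+        F1 = 2*TP / (2*TP + FP + FN); 'micro' pools TP, FP and FN over all classes before applying
+        the formula, whereas 'macro' computes it per class (defaulting to 1 on an empty denominator) and averages.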
+ :param num_classes: + :param device: + :param average: + """ + super().__init__() + self.num_classes = num_classes + self.average = average + self.device = 'cuda' if device else 'cpu' + self.add_state('true_positive', default=torch.zeros(self.num_classes)) + self.add_state('true_negative', default=torch.zeros(self.num_classes)) + self.add_state('false_positive', default=torch.zeros(self.num_classes)) + self.add_state('false_negative', default=torch.zeros(self.num_classes)) + + def update(self, preds, target): + true_positive, true_negative, false_positive, false_negative = _update(preds, target, self.device) + + self.true_positive += true_positive + self.true_negative += true_negative + self.false_positive += false_positive + self.false_negative += false_negative + + def compute(self): + if self.average == 'micro': + num = 2.0 * self.true_positive.sum() + den = 2.0 * self.true_positive.sum() + self.false_positive.sum() + self.false_negative.sum() + if den > 0: + return (num / den).to(self.device) + return torch.FloatTensor([1.]).to(self.device) + if self.average == 'macro': + class_specific = [] + for i in range(self.num_classes): + class_tp = self.true_positive[i] + class_tn = self.true_negative[i] + class_fp = self.false_positive[i] + class_fn = self.false_negative[i] + num = 2.0 * class_tp + den = 2.0 * class_tp + class_fp + class_fn + if den > 0: + class_specific.append(num / den) + else: + class_specific.append(1.) + average = torch.sum(torch.Tensor(class_specific))/self.num_classes + return average.to(self.device) + + +class CustomK(Metric): + def __init__(self, num_classes, device, average='micro'): + """ + K metric. https://dl.acm.org/doi/10.1145/2808194.2809449 + :param num_classes: + :param device: + :param average: + """ + super().__init__() + self.num_classes = num_classes + self.average = average + self.device = 'cuda' if device else 'cpu' + self.add_state('true_positive', default=torch.zeros(self.num_classes)) + self.add_state('true_negative', default=torch.zeros(self.num_classes)) + self.add_state('false_positive', default=torch.zeros(self.num_classes)) + self.add_state('false_negative', default=torch.zeros(self.num_classes)) + + def update(self, preds, target): + true_positive, true_negative, false_positive, false_negative = _update(preds, target, self.device) + + self.true_positive += true_positive + self.true_negative += true_negative + self.false_positive += false_positive + self.false_negative += false_negative + + def compute(self): + if self.average == 'micro': + specificity, recall = 0., 0. + absolute_negatives = self.true_negative.sum() + self.false_positive.sum() + if absolute_negatives != 0: + specificity = self.true_negative.sum()/absolute_negatives + absolute_positives = self.true_positive.sum() + self.false_negative.sum() + if absolute_positives != 0: + recall = self.true_positive.sum()/absolute_positives + + if absolute_positives == 0: + return 2. * specificity - 1 + elif absolute_negatives == 0: + return 2. * recall - 1 + else: + return specificity + recall - 1 + + if self.average == 'macro': + class_specific = [] + for i in range(self.num_classes): + class_tp = self.true_positive[i] + class_tn = self.true_negative[i] + class_fp = self.false_positive[i] + class_fn = self.false_negative[i] + + specificity, recall = 0., 0. 
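+                # K = specificity + recall - 1, degenerating to 2*specificity - 1 when the class has
+                # no positive examples and to 2*recall - 1 when it has no negative examples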
+ absolute_negatives = class_tn + class_fp + if absolute_negatives != 0: + specificity = class_tn / absolute_negatives + absolute_positives = class_tp + class_fn + if absolute_positives != 0: + recall = class_tp / absolute_positives + + if absolute_positives == 0: + class_specific.append(2. * specificity - 1) + elif absolute_negatives == 0: + class_specific.append(2. * recall - 1) + else: + class_specific.append(specificity + recall - 1) + average = torch.sum(torch.Tensor(class_specific)) / self.num_classes + return average.to(self.device) diff --git a/src/util/results_csv.py b/src/util/results_csv.py new file mode 100644 index 0000000..be0ff84 --- /dev/null +++ b/src/util/results_csv.py @@ -0,0 +1,53 @@ +import os + +import numpy as np +import pandas as pd + + +class CSVlog: + def __init__(self, file, autoflush=True, verbose=False): + self.file = file + self.columns = ['method', + 'setting', + 'optimc', + 'sif', + 'zscore', + 'l2', + 'dataset', + 'time_tr', + 'time_te', + 'lang', + 'macrof1', + 'microf1', + 'macrok', + 'microk', + 'notes'] + self.autoflush = autoflush + self.verbose = verbose + if os.path.exists(file): + self.tell('Loading existing file from {}'.format(file)) + self.df = pd.read_csv(file, sep='\t') + else: + self.tell('File {} does not exist. Creating new frame.'.format(file)) + dir = os.path.dirname(self.file) + if dir and not os.path.exists(dir): os.makedirs(dir) + self.df = pd.DataFrame(columns=self.columns) + + def already_calculated(self, id): + return (self.df['id'] == id).any() + + def add_row(self, method, setting, optimc, sif, zscore, l2, dataset, time_tr, time_te, lang, + macrof1, microf1, macrok=np.nan, microk=np.nan, notes=''): + s = pd.Series([method, setting, optimc, sif, zscore, l2, dataset, time_tr, time_te, lang, + macrof1, microf1, macrok, microk, notes], + index=self.columns) + self.df = self.df.append(s, ignore_index=True) + if self.autoflush: self.flush() + self.tell(s.to_string()) + + def flush(self): + self.df.to_csv(self.file, index=False, sep='\t') + + def tell(self, msg): + if self.verbose: + print(msg) diff --git a/src/util/standardizer.py b/src/util/standardizer.py new file mode 100644 index 0000000..429bccd --- /dev/null +++ b/src/util/standardizer.py @@ -0,0 +1,36 @@ +import numpy as np + + +class StandardizeTransformer: + def __init__(self, axis=0, range=None): + """ + + :param axis: + :param range: + """ + assert range is None or isinstance(range, slice), 'wrong format for range, should either be None or a slice' + self.axis = axis + self.yetfit = False + self.range = range + + def fit(self, X): + print('Applying z-score standardization...') + std=np.std(X, axis=self.axis, ddof=1) + self.std = np.clip(std, 1e-5, None) + self.mean = np.mean(X, axis=self.axis) + if self.range is not None: + ones = np.ones_like(self.std) + zeros = np.zeros_like(self.mean) + ones[self.range] = self.std[self.range] + zeros[self.range] = self.mean[self.range] + self.std = ones + self.mean = zeros + self.yetfit=True + return self + + def transform(self, X): + if not self.yetfit: 'transform called before fit' + return (X - self.mean) / self.std + + def fit_transform(self, X): + return self.fit(X).transform(X) \ No newline at end of file diff --git a/src/view_generators.py b/src/view_generators.py new file mode 100644 index 0000000..384ec76 --- /dev/null +++ b/src/view_generators.py @@ -0,0 +1,375 @@ +""" +This module contains the view generators that take care of computing the view specific document embeddings: + +- VanillaFunGen (-x) cast document representations 
encoded via TFIDF into posterior probabilities by means of SVM. + +- WordClassGen (-w): generates document representation via Word-Class-Embeddings. + Document embeddings are obtained via weighted sum of document's constituent embeddings. + +- MuseGen (-m): generates document representation via MUSE embeddings. + Document embeddings are obtained via weighted sum of document's constituent embeddings. + +- RecurrentGen (-g): generates document embedding by means of a Gated Recurrent Units. The model can be + initialized with different (multilingual/aligned) word representations (e.g., MUSE, WCE, ecc.,). + Output dimension is (n_docs, 512). + +- View generator (-b): generates document embedding via mBERT model. +""" +from abc import ABC, abstractmethod +from time import time + +from pytorch_lightning import Trainer +from pytorch_lightning.loggers import TensorBoardLogger + +from data.datamodule import RecurrentDataModule, BertDataModule, tokenize +from models.learners import * +from models.pl_bert import BertModel +from models.pl_gru import RecurrentModel +from util.common import TfidfVectorizerMultilingual, _normalize +from util.embeddings_manager import MuseLoader, XdotM, wce_matrix + + +class ViewGen(ABC): + """ + Abstract class for ViewGenerators implementations. Every ViewGen should implement these three methods in order to + be seamlessly integrated in the overall architecture. + """ + @abstractmethod + def fit(self, lX, ly): + pass + + @abstractmethod + def transform(self, lX): + pass + + @abstractmethod + def fit_transform(self, lX, ly): + pass + + +class VanillaFunGen(ViewGen): + """ + View Generator (x): original funnelling architecture proposed by Moreo, Esuli and + Sebastiani in DOI: https://doi.org/10.1145/3326065 + """ + def __init__(self, base_learner, first_tier_parameters=None, n_jobs=-1): + """ + Init Posterior Probabilities embedder (i.e., VanillaFunGen) + :param base_learner: naive monolingual learners to be deployed as first-tier learners. Should be able to + return posterior probabilities. + :param base_learner: + :param n_jobs: integer, number of concurrent workers + """ + super().__init__() + self.learners = base_learner + self.first_tier_parameters = first_tier_parameters + self.n_jobs = n_jobs + self.doc_projector = NaivePolylingualClassifier(base_learner=self.learners, + parameters=self.first_tier_parameters, n_jobs=self.n_jobs) + self.vectorizer = TfidfVectorizerMultilingual(sublinear_tf=True, use_idf=True) + + def fit(self, lX, lY): + print('# Fitting VanillaFunGen (X)...') + lX = self.vectorizer.fit_transform(lX) + self.doc_projector.fit(lX, lY) + return self + + def transform(self, lX): + """ + (1) Vectorize documents; (2) Project them according to the learners SVMs, finally (3) Apply L2 normalization + to the projection and returns it. + :param lX: dict {lang: indexed documents} + :return: document projection to the common latent space. + """ + lX = self.vectorizer.transform(lX) + lZ = self.doc_projector.predict_proba(lX) + lZ = _normalize(lZ, l2=True) + return lZ + + def fit_transform(self, lX, ly): + return self.fit(lX, ly).transform(lX) + + +class MuseGen(ViewGen): + """ + View Generator (m): generates document representation via MUSE embeddings (Fasttext multilingual word + embeddings). Document embeddings are obtained via weighted sum of document's constituent embeddings. + """ + def __init__(self, muse_dir='../embeddings', n_jobs=-1): + """ + Init the MuseGen. 
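For reference, a minimal standalone sketch of the posterior-probability view that VanillaFunGen builds: TF-IDF, a probabilistic SVM, L2 normalization. It compresses the per-language NaivePolylingualClassifier machinery into a single toy language; the documents, labels and parameters below are illustrative only and not part of the patch.

import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import normalize
from sklearn.svm import SVC

docs = ['stocks fell sharply today', 'markets rallied after the report',
        'the central bank raised rates', 'quarterly earnings beat forecasts',
        'the cat sat on the mat', 'the dog chased the cat',
        'a kitten slept in the sun', 'puppies played in the garden']
labels = np.array([1, 1, 1, 1, 0, 0, 0, 0])            # toy binary targets

X = TfidfVectorizer(sublinear_tf=True, use_idf=True).fit_transform(docs)
svm = SVC(kernel='rbf', gamma='auto', probability=True, random_state=1).fit(X, labels)
Z = normalize(svm.predict_proba(X), norm='l2')          # documents recast as posterior vectors
print(Z.shape)                                          # (8, 2)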
+ :param muse_dir: string, path to folder containing muse embeddings + :param n_jobs: int, number of concurrent workers + """ + super().__init__() + self.muse_dir = muse_dir + self.n_jobs = n_jobs + self.langs = None + self.lMuse = None + self.vectorizer = TfidfVectorizerMultilingual(sublinear_tf=True, use_idf=True) + + def fit(self, lX, ly): + """ + (1) Vectorize documents; (2) Load muse embeddings for words encountered while vectorizing. + :param lX: dict {lang: indexed documents} + :param ly: dict {lang: target vectors} + :return: self. + """ + print('# Fitting MuseGen (M)...') + self.vectorizer.fit(lX) + self.langs = sorted(lX.keys()) + self.lMuse = MuseLoader(langs=self.langs, cache=self.muse_dir) + lVoc = self.vectorizer.vocabulary() + self.lMuse = self.lMuse.extract(lVoc) # overwriting lMuse with dict {lang : embed_matrix} with only known words + # TODO: featureweight.fit + return self + + def transform(self, lX): + """ + (1) Vectorize documents; (2) computes the weighted sum of MUSE embeddings found at document level, + finally (3) Apply L2 normalization embedding and returns it. + :param lX: dict {lang: indexed documents} + :return: document projection to the common latent space. + """ + lX = self.vectorizer.transform(lX) + XdotMUSE = Parallel(n_jobs=self.n_jobs)( + delayed(XdotM)(lX[lang], self.lMuse[lang], sif=True) for lang in self.langs) + lZ = {lang: XdotMUSE[i] for i, lang in enumerate(self.langs)} + lZ = _normalize(lZ, l2=True) + return lZ + + def fit_transform(self, lX, ly): + return self.fit(lX, ly).transform(lX) + + +class WordClassGen(ViewGen): + """ + View Generator (w): generates document representation via Word-Class-Embeddings. + Document embeddings are obtained via weighted sum of document's constituent embeddings. + """ + def __init__(self, n_jobs=-1): + """ + Init WordClassGen. + :param n_jobs: int, number of concurrent workers + """ + super().__init__() + self.n_jobs = n_jobs + self.langs = None + self.lWce = None + self.vectorizer = TfidfVectorizerMultilingual(sublinear_tf=True, use_idf=True) + + def fit(self, lX, ly): + """ + (1) Vectorize documents; (2) Load muse embeddings for words encountered while vectorizing. + :param lX: dict {lang: indexed documents} + :param ly: dict {lang: target vectors} + :return: self. + """ + print('# Fitting WordClassGen (W)...') + lX = self.vectorizer.fit_transform(lX) + self.langs = sorted(lX.keys()) + wce = Parallel(n_jobs=self.n_jobs)( + delayed(wce_matrix)(lX[lang], ly[lang]) for lang in self.langs) + self.lWce = {l: wce[i] for i, l in enumerate(self.langs)} + # TODO: featureweight.fit() + return self + + def transform(self, lX): + """ + (1) Vectorize documents; (2) computes the weighted sum of Word-Class Embeddings found at document level, + finally (3) Apply L2 normalization embedding and returns it. + :param lX: dict {lang: indexed documents} + :return: document projection to the common latent space. + """ + lX = self.vectorizer.transform(lX) + XdotWce = Parallel(n_jobs=self.n_jobs)( + delayed(XdotM)(lX[lang], self.lWce[lang], sif=True) for lang in self.langs) + lWce = {l: XdotWce[i] for i, l in enumerate(self.langs)} + lWce = _normalize(lWce, l2=True) + return lWce + + def fit_transform(self, lX, ly): + return self.fit(lX, ly).transform(lX) + + +class RecurrentGen(ViewGen): + """ + View Generator (G): generates document embedding by means of a Gated Recurrent Units. The model can be + initialized with different (multilingual/aligned) word representations (e.g., MUSE, WCE, ecc.,). + Output dimension is (n_docs, 512). 
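A standalone sketch of the weighted-sum representation used by MuseGen and WordClassGen: the TF-IDF document-term matrix is multiplied by an embedding matrix (MUSE or WCE rows) and the result is L2-normalized. The repository delegates this to XdotM with SIF weighting; the simplified version below, with random toy matrices, shows only the plain product.

import numpy as np
from scipy.sparse import csr_matrix
from sklearn.preprocessing import normalize

n_docs, vocab_size, embed_dim = 4, 10, 5
X = csr_matrix(np.random.rand(n_docs, vocab_size))   # TF-IDF document-term matrix (toy)
E = np.random.rand(vocab_size, embed_dim)            # one embedding row per vocabulary word

doc_embeddings = normalize(X @ E, norm='l2')         # (n_docs, embed_dim)
print(doc_embeddings.shape)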
The training will happen end-to-end. At inference time, the model returns + the network internal state at the second feed-forward layer level. Training metrics are logged via TensorBoard. + """ + def __init__(self, multilingualIndex, pretrained_embeddings, wce, batch_size=512, nepochs=50, + gpus=0, n_jobs=-1, stored_path=None): + """ + Init RecurrentGen. + :param multilingualIndex: MultilingualIndex, it is a dictionary of training and test documents + indexed by language code. + :param pretrained_embeddings: dict {lang: tensor of embeddings}, it contains the pretrained embeddings to use + as embedding layer. + :param wce: Bool, whether to deploy Word-Class Embeddings (as proposed by A. Moreo). If True, supervised + embeddings are concatenated to the deployed supervised embeddings. WCE dimensionality is equal to + the number of target classes. + :param batch_size: int, number of samples in a batch. + :param nepochs: int, number of max epochs to train the model. + :param gpus: int, specifies how many GPUs to use per node. If False computation will take place on cpu. + :param n_jobs: int, number of concurrent workers (i.e., parallelizing data loading). + :param stored_path: str, path to a pretrained model. If None the model will be trained from scratch. + """ + super().__init__() + self.multilingualIndex = multilingualIndex + self.langs = multilingualIndex.langs + self.batch_size = batch_size + self.gpus = gpus + self.n_jobs = n_jobs + self.stored_path = stored_path + self.nepochs = nepochs + + # EMBEDDINGS to be deployed + self.pretrained = pretrained_embeddings + self.wce = wce + + self.multilingualIndex.train_val_split(val_prop=0.2, max_val=2000, seed=1) + self.multilingualIndex.embedding_matrices(self.pretrained, supervised=self.wce) + self.model = self._init_model() + self.logger = TensorBoardLogger(save_dir='tb_logs', name='rnn', default_hp_metric=False) + # self.logger = CSVLogger(save_dir='csv_logs', name='rnn_dev') + + def _init_model(self): + if self.stored_path: + lpretrained = self.multilingualIndex.l_embeddings() + return RecurrentModel.load_from_checkpoint(self.stored_path, lPretrained=lpretrained) + else: + lpretrained = self.multilingualIndex.l_embeddings() + langs = self.multilingualIndex.langs + output_size = self.multilingualIndex.get_target_dim() + hidden_size = 512 + lvocab_size = self.multilingualIndex.l_vocabsize() + learnable_length = 0 + return RecurrentModel( + lPretrained=lpretrained, + langs=langs, + output_size=output_size, + hidden_size=hidden_size, + lVocab_size=lvocab_size, + learnable_length=learnable_length, + drop_embedding_range=self.multilingualIndex.sup_range, + drop_embedding_prop=0.5, + gpus=self.gpus + ) + + def fit(self, lX, ly): + """ + Train the Neural Network end-to-end. + lX and ly are not directly used. We rather get them from the multilingual index used in the instantiation + of the Dataset object (RecurrentDataset) in the GfunDataModule class. + :param lX: dict {lang: indexed documents} + :param ly: dict {lang: target vectors} + :return: self. 
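A small sketch of the embedding-matrix composition described in the __init__ docstring above: when wce=True, a supervised matrix with one column per target class is stacked next to the aligned pretrained vectors before being handed to the embedding layer. The sizes below are toy values; the real matrices come from MultilingualIndex.embedding_matrices.

import numpy as np

vocab_size, muse_dim, n_classes = 1000, 300, 73        # toy sizes
muse = np.random.rand(vocab_size, muse_dim)            # aligned pretrained (MUSE) vectors
wce = np.random.rand(vocab_size, n_classes)            # word-class embeddings
embedding_matrix = np.hstack([muse, wce])              # (1000, 373)
print(embedding_matrix.shape)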
+ """ + print('# Fitting RecurrentGen (G)...') + recurrentDataModule = RecurrentDataModule(self.multilingualIndex, batchsize=self.batch_size, n_jobs=self.n_jobs) + trainer = Trainer(gradient_clip_val=1e-1, gpus=self.gpus, logger=self.logger, max_epochs=self.nepochs, + checkpoint_callback=False) + + # vanilla_torch_model = torch.load( + # '/home/andreapdr/funneling_pdr/checkpoint/gru_viewgen_-jrc_doclist_1958-2005vs2006_all_top300_noparallel_processed_run0.pickle') + # self.model.linear0 = vanilla_torch_model.linear0 + # self.model.linear1 = vanilla_torch_model.linear1 + # self.model.linear2 = vanilla_torch_model.linear2 + # self.model.rnn = vanilla_torch_model.rnn + + trainer.fit(self.model, datamodule=recurrentDataModule) + trainer.test(self.model, datamodule=recurrentDataModule) + return self + + def transform(self, lX): + """ + Project documents to the common latent space. Output dimensionality is 512. + :param lX: dict {lang: indexed documents} + :return: documents projected to the common latent space. + """ + l_pad = self.multilingualIndex.l_pad() + data = self.multilingualIndex.l_devel_index() + self.model.to('cuda' if self.gpus else 'cpu') + self.model.eval() + time_init = time() + l_embeds = self.model.encode(data, l_pad, batch_size=256) + transform_time = round(time() - time_init, 3) + print(f'Executed! Transform took: {transform_time}') + return l_embeds + + def fit_transform(self, lX, ly): + return self.fit(lX, ly).transform(lX) + + +class BertGen(ViewGen): + """ + View Generator (b): generates document embedding via Bert model. The training happens end-to-end. + At inference time, the model returns the network internal state at the last original layer (i.e. 12th). Document + embeddings are the state associated with the "start" token. Training metrics are logged via TensorBoard. + """ + def __init__(self, multilingualIndex, batch_size=128, nepochs=50, gpus=0, n_jobs=-1, stored_path=None): + """ + Init Bert model + :param multilingualIndex: MultilingualIndex, it is a dictionary of training and test documents + indexed by language code. + :param batch_size: int, number of samples per batch. + :param nepochs: int, number of max epochs to train the model. + :param gpus: int, specifies how many GPUs to use per node. If False computation will take place on cpu. + :param n_jobs: int, number of concurrent workers. + :param stored_path: str, path to a pretrained model. If None the model will be trained from scratch. + """ + super().__init__() + self.multilingualIndex = multilingualIndex + self.nepochs = nepochs + self.gpus = gpus + self.batch_size = batch_size + self.n_jobs = n_jobs + self.stored_path = stored_path + self.model = self._init_model() + self.logger = TensorBoardLogger(save_dir='tb_logs', name='bert', default_hp_metric=False) + + def _init_model(self): + output_size = self.multilingualIndex.get_target_dim() + return BertModel(output_size=output_size, stored_path=self.stored_path, gpus=self.gpus) + + def fit(self, lX, ly): + """ + Train the Neural Network end-to-end. + lX and ly are not directly used. We rather get them from the multilingual index used in the instantiation + of the Dataset object (RecurrentDataset) in the GfunDataModule class. + :param lX: dict {lang: indexed documents} + :param ly: dict {lang: target vectors} + :return: self. 
+ """ + print('# Fitting BertGen (M)...') + self.multilingualIndex.train_val_split(val_prop=0.2, max_val=2000, seed=1) + bertDataModule = BertDataModule(self.multilingualIndex, batchsize=self.batch_size, max_len=512) + trainer = Trainer(gradient_clip_val=1e-1, max_epochs=self.nepochs, gpus=self.gpus, + logger=self.logger, checkpoint_callback=False) + trainer.fit(self.model, datamodule=bertDataModule) + trainer.test(self.model, datamodule=bertDataModule) + return self + + def transform(self, lX): + """ + Project documents to the common latent space. Output dimensionality is 768. + :param lX: dict {lang: indexed documents} + :return: documents projected to the common latent space. + """ + data = self.multilingualIndex.l_devel_raw_index() + data = tokenize(data, max_len=512) + self.model.to('cuda' if self.gpus else 'cpu') + self.model.eval() + time_init = time() + l_emebds = self.model.encode(data, batch_size=64) + transform_time = round(time() - time_init, 3) + print(f'Executed! Transform took: {transform_time}') + return l_emebds + + def fit_transform(self, lX, ly): + # we can assume that we have already indexed data for transform() since we are first calling fit() + return self.fit(lX, ly).transform(lX) + + From ca179aca233a0eccfd7f32ce69a045b211a390d7 Mon Sep 17 00:00:00 2001 From: andrea Date: Tue, 26 Jan 2021 15:12:28 +0100 Subject: [PATCH 37/55] merged with refactor --- refactor/data/__init__.py | 0 refactor/data/datamodule.py | 222 ------- refactor/data/dataset_builder.py | 712 ----------------------- refactor/data/languages.py | 42 -- refactor/data/reader/__init__.py | 0 refactor/data/reader/jrcacquis_reader.py | 324 ----------- refactor/data/reader/rcv_reader.py | 222 ------- refactor/data/reader/wikipedia_tools.py | 307 ---------- refactor/data/text_preprocessor.py | 34 -- refactor/data/tsr_function__.py | 271 --------- refactor/funnelling.py | 124 ---- refactor/main.py | 167 ------ refactor/models/helpers.py | 51 -- refactor/models/learners.py | 224 ------- refactor/models/lstm_class.py | 113 ---- refactor/models/pl_bert.py | 183 ------ refactor/models/pl_gru.py | 266 --------- refactor/requirements.txt | 12 - refactor/run.sh | 6 - refactor/util/SIF_embed.py | 59 -- refactor/util/common.py | 384 ------------ refactor/util/embeddings_manager.py | 104 ---- refactor/util/evaluation.py | 20 - refactor/util/file.py | 50 -- refactor/util/metrics.py | 152 ----- refactor/util/pl_metrics.py | 141 ----- refactor/util/results_csv.py | 53 -- refactor/util/standardizer.py | 36 -- refactor/view_generators.py | 375 ------------ 29 files changed, 4654 deletions(-) delete mode 100644 refactor/data/__init__.py delete mode 100644 refactor/data/datamodule.py delete mode 100644 refactor/data/dataset_builder.py delete mode 100644 refactor/data/languages.py delete mode 100644 refactor/data/reader/__init__.py delete mode 100644 refactor/data/reader/jrcacquis_reader.py delete mode 100644 refactor/data/reader/rcv_reader.py delete mode 100644 refactor/data/reader/wikipedia_tools.py delete mode 100644 refactor/data/text_preprocessor.py delete mode 100755 refactor/data/tsr_function__.py delete mode 100644 refactor/funnelling.py delete mode 100644 refactor/main.py delete mode 100755 refactor/models/helpers.py delete mode 100644 refactor/models/learners.py delete mode 100755 refactor/models/lstm_class.py delete mode 100644 refactor/models/pl_bert.py delete mode 100644 refactor/models/pl_gru.py delete mode 100644 refactor/requirements.txt delete mode 100644 refactor/run.sh delete mode 100644 
refactor/util/SIF_embed.py delete mode 100644 refactor/util/common.py delete mode 100644 refactor/util/embeddings_manager.py delete mode 100644 refactor/util/evaluation.py delete mode 100644 refactor/util/file.py delete mode 100644 refactor/util/metrics.py delete mode 100644 refactor/util/pl_metrics.py delete mode 100644 refactor/util/results_csv.py delete mode 100644 refactor/util/standardizer.py delete mode 100644 refactor/view_generators.py diff --git a/refactor/data/__init__.py b/refactor/data/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/refactor/data/datamodule.py b/refactor/data/datamodule.py deleted file mode 100644 index da6ec92..0000000 --- a/refactor/data/datamodule.py +++ /dev/null @@ -1,222 +0,0 @@ -import numpy as np -import pytorch_lightning as pl -import torch -from torch.utils.data import Dataset, DataLoader -from transformers import BertTokenizer - -N_WORKERS = 8 - - -class RecurrentDataset(Dataset): - def __init__(self, lX, ly, lPad_index): - """ - :param lX: dict {lang_id : np.ndarray} - :param ly: - """ - self.lX = [] - self.ly = [] - self.lOffset = {} - self.lPad_index = lPad_index - - for lang, data in lX.items(): - offset = [len(self.lX)] - self.lX.extend(data) - offset.append(len(self.lX)) - self.lOffset[lang] = offset - - for lang, target in ly.items(): - self.ly.extend(target) - - def __len__(self): - return len(self.lX) - - def __getitem__(self, index): - X = self.lX[index] - y = self.ly[index] - return X, y, index, self._get_lang(index) - - def _get_lang(self, index): - for lang, l_range in self.lOffset.items(): - if index in range(l_range[0], l_range[1]): - return lang - - def collate_fn(self, data): - """ - Takes care of padding the batch and also check consistency of batch languages. Groups into dict {lang : lang_batch} - items sampled from the Dataset class. - :param data: - :return: - """ - lX_batch = {} - ly_batch = {} - current_lang = data[0][-1] - for d in data: - if d[-1] == current_lang: - if current_lang not in lX_batch.keys(): - lX_batch[current_lang] = [] - ly_batch[current_lang] = [] - lX_batch[current_lang].append(d[0]) - ly_batch[current_lang].append(d[1]) - else: - current_lang = d[-1] - lX_batch[current_lang] = [] - ly_batch[current_lang] = [] - lX_batch[current_lang].append(d[0]) - ly_batch[current_lang].append(d[1]) - - for lang in lX_batch.keys(): - lX_batch[lang] = self.pad(lX_batch[lang], pad_index=self.lPad_index[lang], - max_pad_length=self.define_pad_length(lX_batch[lang])) - lX_batch[lang] = torch.LongTensor(lX_batch[lang]) - ly_batch[lang] = torch.FloatTensor(ly_batch[lang]) - - return lX_batch, ly_batch - - @staticmethod - def define_pad_length(index_list): - lengths = [len(index) for index in index_list] - return int(np.mean(lengths) + np.std(lengths)) - - @staticmethod - def pad(index_list, pad_index, max_pad_length=None): - pad_length = np.max([len(index) for index in index_list]) - if max_pad_length is not None: - pad_length = min(pad_length, max_pad_length) - for i, indexes in enumerate(index_list): - index_list[i] = [pad_index] * (pad_length - len(indexes)) + indexes[:pad_length] - return index_list - - -class RecurrentDataModule(pl.LightningDataModule): - """ - Pytorch Lightning Datamodule to be deployed with RecurrentGen. - https://pytorch-lightning.readthedocs.io/en/latest/datamodules.html - """ - def __init__(self, multilingualIndex, batchsize=64, n_jobs=-1): - """ - Init RecurrentDataModule. 
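The padding policy implemented by RecurrentDataset above (pad each batch to min(longest, mean + std), left-pad with the language-specific pad index, truncate anything longer) can be summarized by this standalone sketch; the batch values are toy data.

import numpy as np

def pad_batch(index_list, pad_index):
    lengths = [len(x) for x in index_list]
    cap = int(np.mean(lengths) + np.std(lengths))
    pad_length = min(max(lengths), cap)
    return [[pad_index] * (pad_length - len(x)) + x[:pad_length] for x in index_list]

batch = [[4, 8, 15], [16, 23], [42, 4, 8, 15, 16, 23, 42]]
print(pad_batch(batch, pad_index=0))
# [[0, 0, 0, 4, 8, 15], [0, 0, 0, 0, 16, 23], [42, 4, 8, 15, 16, 23]]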
- :param multilingualIndex: MultilingualIndex, it is a dictionary of training and test documents - indexed by language code. - :param batchsize: int, number of sample per batch. - :param n_jobs: int, number of concurrent workers to be deployed (i.e., parallelizing data loading). - """ - self.multilingualIndex = multilingualIndex - self.batchsize = batchsize - self.n_jobs = n_jobs - super().__init__() - - def prepare_data(self, *args, **kwargs): - pass - - def setup(self, stage=None): - if stage == 'fit' or stage is None: - l_train_index, l_train_target = self.multilingualIndex.l_train() - # Debug settings: reducing number of samples - l_train_index = {l: train[:5] for l, train in l_train_index.items()} - l_train_target = {l: target[:5] for l, target in l_train_target.items()} - - self.training_dataset = RecurrentDataset(l_train_index, l_train_target, - lPad_index=self.multilingualIndex.l_pad()) - - l_val_index, l_val_target = self.multilingualIndex.l_val() - # Debug settings: reducing number of samples - l_val_index = {l: train[:5] for l, train in l_val_index.items()} - l_val_target = {l: target[:5] for l, target in l_val_target.items()} - - self.val_dataset = RecurrentDataset(l_val_index, l_val_target, - lPad_index=self.multilingualIndex.l_pad()) - if stage == 'test' or stage is None: - l_test_index, l_test_target = self.multilingualIndex.l_test() - # Debug settings: reducing number of samples - l_test_index = {l: train[:5] for l, train in l_test_index.items()} - l_test_target = {l: target[:5] for l, target in l_test_target.items()} - - self.test_dataset = RecurrentDataset(l_test_index, l_test_target, - lPad_index=self.multilingualIndex.l_pad()) - - def train_dataloader(self): - return DataLoader(self.training_dataset, batch_size=self.batchsize, num_workers=self.n_jobs, - collate_fn=self.training_dataset.collate_fn) - - def val_dataloader(self): - return DataLoader(self.val_dataset, batch_size=self.batchsize, num_workers=self.n_jobs, - collate_fn=self.val_dataset.collate_fn) - - def test_dataloader(self): - return DataLoader(self.test_dataset, batch_size=self.batchsize, num_workers=self.n_jobs, - collate_fn=self.test_dataset.collate_fn) - - -def tokenize(l_raw, max_len): - """ - run Bert tokenization on dict {lang: list of samples}. - :param l_raw: - :param max_len: - :return: - """ - tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased') - l_tokenized = {} - for lang in l_raw.keys(): - output_tokenizer = tokenizer(l_raw[lang], truncation=True, max_length=max_len, padding='max_length') - l_tokenized[lang] = output_tokenizer['input_ids'] - return l_tokenized - - -class BertDataModule(RecurrentDataModule): - """ - Pytorch Lightning Datamodule to be deployed with BertGen. - https://pytorch-lightning.readthedocs.io/en/latest/datamodules.html - """ - def __init__(self, multilingualIndex, batchsize=64, max_len=512): - """ - Init BertDataModule. - :param multilingualIndex: MultilingualIndex, it is a dictionary of training and test documents - indexed by language code. - :param batchsize: int, number of sample per batch. - :param max_len: int, max number of token per document. Absolute cap is 512. 
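The tokenize() helper above boils down to one call into the multilingual BERT tokenizer. A minimal sketch (it requires the transformers package and downloads the pretrained vocabulary on first use; the toy sentences are illustrative):

from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
l_raw = {'en': ['a short english document'], 'it': ['un breve documento italiano']}
l_ids = {lang: tokenizer(docs, truncation=True, max_length=16,
                         padding='max_length')['input_ids']
         for lang, docs in l_raw.items()}
print(len(l_ids['en'][0]))   # 16 token ids per document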
- """ - super().__init__(multilingualIndex, batchsize) - self.max_len = max_len - - def setup(self, stage=None): - if stage == 'fit' or stage is None: - l_train_raw, l_train_target = self.multilingualIndex.l_train_raw() - # Debug settings: reducing number of samples - l_train_raw = {l: train[:5] for l, train in l_train_raw.items()} - l_train_target = {l: target[:5] for l, target in l_train_target.items()} - - l_train_index = tokenize(l_train_raw, max_len=self.max_len) - self.training_dataset = RecurrentDataset(l_train_index, l_train_target, - lPad_index=self.multilingualIndex.l_pad()) - - l_val_raw, l_val_target = self.multilingualIndex.l_val_raw() - # Debug settings: reducing number of samples - l_val_raw = {l: train[:5] for l, train in l_val_raw.items()} - l_val_target = {l: target[:5] for l, target in l_val_target.items()} - - l_val_index = tokenize(l_val_raw, max_len=self.max_len) - self.val_dataset = RecurrentDataset(l_val_index, l_val_target, - lPad_index=self.multilingualIndex.l_pad()) - - if stage == 'test' or stage is None: - l_test_raw, l_test_target = self.multilingualIndex.l_test_raw() - # Debug settings: reducing number of samples - l_test_raw = {l: train[:5] for l, train in l_test_raw.items()} - l_test_target = {l: target[:5] for l, target in l_test_target.items()} - - l_test_index = tokenize(l_test_raw, max_len=self.max_len) - self.test_dataset = RecurrentDataset(l_test_index, l_test_target, - lPad_index=self.multilingualIndex.l_pad()) - - def train_dataloader(self): - """ - NB: Setting n_workers to > 0 will cause "OSError: [Errno 24] Too many open files" - :return: - """ - return DataLoader(self.training_dataset, batch_size=self.batchsize) - - def val_dataloader(self): - return DataLoader(self.val_dataset, batch_size=self.batchsize) - - def test_dataloader(self): - return DataLoader(self.test_dataset, batch_size=self.batchsize) diff --git a/refactor/data/dataset_builder.py b/refactor/data/dataset_builder.py deleted file mode 100644 index 0e91316..0000000 --- a/refactor/data/dataset_builder.py +++ /dev/null @@ -1,712 +0,0 @@ -import itertools -import pickle -import re -from os.path import exists - -import numpy as np -from nltk.corpus import stopwords -from scipy.sparse import csr_matrix -from scipy.sparse import issparse -from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer -from sklearn.model_selection import train_test_split -from sklearn.preprocessing import MultiLabelBinarizer -from tqdm import tqdm - -from data.languages import NLTK_LANGMAP, RCV2_LANGS_WITH_NLTK_STEMMING -from data.reader.jrcacquis_reader import * -from data.reader.rcv_reader import fetch_RCV1, fetch_RCV2 -from data.text_preprocessor import NLTKStemTokenizer, preprocess_documents - - -class MultilingualDataset: - """ - A multilingual dataset is a dictionary of training and test documents indexed by language code. - Train and test sets are represented as tuples of the type (X,Y,ids), where X is a matrix representation of the - documents (e.g., a document-by-term sparse csr_matrix), Y is a document-by-label binary np.array indicating the - labels of each document, and ids is a list of document-identifiers from the original collection. 
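A usage sketch of this class as it is exercised by the build scripts below: matrices are added per language via add(), persisted with save(), re-loaded with load(), and then consumed as language-indexed dictionaries through training()/test(). The random toy matrices and the file name are illustrative only.

import numpy as np
from scipy.sparse import csr_matrix

dataset = MultilingualDataset()
dataset.dataset_name = 'toy'
for lang in ('en', 'it'):
    Xtr, Xte = csr_matrix(np.random.rand(10, 50)), csr_matrix(np.random.rand(5, 50))
    Ytr, Yte = np.random.randint(0, 2, (10, 3)), np.random.randint(0, 2, (5, 3))
    dataset.add(lang, Xtr, Ytr, Xte, Yte)
dataset.save('toy_dataset.pickle')

data = MultilingualDataset.load('toy_dataset.pickle')
lXtr, lYtr = data.training(target_as_csr=True)   # {'en': csr_matrix, 'it': csr_matrix}, ...
print(data.langs(), data.num_categories())       # ['en', 'it'] 3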
- """ - - def __init__(self): - self.dataset_name = "" - self.multiling_dataset = {} - - def add(self, lang, Xtr, Ytr, Xte, Yte, tr_ids=None, te_ids=None): - self.multiling_dataset[lang] = ((Xtr, Ytr, tr_ids), (Xte, Yte, te_ids)) - - def save(self, file): - self.sort_indexes() - pickle.dump(self, open(file, 'wb'), pickle.HIGHEST_PROTOCOL) - return self - - def __getitem__(self, item): - if item in self.langs(): - return self.multiling_dataset[item] - return None - - @classmethod - def load(cls, file): - data = pickle.load(open(file, 'rb')) - data.sort_indexes() - return data - - @classmethod - def load_ids(cls, file): - data = pickle.load(open(file, 'rb')) - tr_ids = {lang:tr_ids for (lang,((_,_,tr_ids), (_,_,_))) in data.multiling_dataset.items()} - te_ids = {lang: te_ids for (lang, ((_, _, _), (_, _, te_ids))) in data.multiling_dataset.items()} - return tr_ids, te_ids - - def sort_indexes(self): - for (lang, ((Xtr,_,_),(Xte,_,_))) in self.multiling_dataset.items(): - if issparse(Xtr): Xtr.sort_indices() - if issparse(Xte): Xte.sort_indices() - - def set_view(self, categories=None, languages=None): - if categories is not None: - if isinstance(categories, int): - categories = np.array([categories]) - elif isinstance(categories, list): - categories = np.array(categories) - self.categories_view = categories - if languages is not None: - self.languages_view = languages - - def training(self, mask_numbers=False, target_as_csr=False): - return self.lXtr(mask_numbers), self.lYtr(as_csr=target_as_csr) - - def test(self, mask_numbers=False, target_as_csr=False): - return self.lXte(mask_numbers), self.lYte(as_csr=target_as_csr) - - def lXtr(self, mask_numbers=False): - proc = lambda x:_mask_numbers(x) if mask_numbers else x - # return {lang: Xtr for (lang, ((Xtr, _, _), _)) in self.multiling_dataset.items() if lang in self.langs()} - return {lang:proc(Xtr) for (lang, ((Xtr,_,_),_)) in self.multiling_dataset.items() if lang in self.langs()} - - def lXte(self, mask_numbers=False): - proc = lambda x: _mask_numbers(x) if mask_numbers else x - # return {lang: Xte for (lang, (_, (Xte, _, _))) in self.multiling_dataset.items() if lang in self.langs()} - return {lang:proc(Xte) for (lang, (_,(Xte,_,_))) in self.multiling_dataset.items() if lang in self.langs()} - - def lYtr(self, as_csr=False): - lY = {lang:self.cat_view(Ytr) for (lang, ((_,Ytr,_),_)) in self.multiling_dataset.items() if lang in self.langs()} - if as_csr: - lY = {l:csr_matrix(Y) for l,Y in lY.items()} - return lY - - def lYte(self, as_csr=False): - lY = {lang:self.cat_view(Yte) for (lang, (_,(_,Yte,_))) in self.multiling_dataset.items() if lang in self.langs()} - if as_csr: - lY = {l:csr_matrix(Y) for l,Y in lY.items()} - return lY - - def cat_view(self, Y): - if hasattr(self, 'categories_view'): - return Y[:,self.categories_view] - else: - return Y - - def langs(self): - if hasattr(self, 'languages_view'): - langs = self.languages_view - else: - langs = sorted(self.multiling_dataset.keys()) - return langs - - def num_categories(self): - return self.lYtr()[self.langs()[0]].shape[1] - - def show_dimensions(self): - def shape(X): - return X.shape if hasattr(X, 'shape') else len(X) - for (lang, ((Xtr, Ytr, IDtr), (Xte, Yte, IDte))) in self.multiling_dataset.items(): - if lang not in self.langs(): continue - print("Lang {}, Xtr={}, ytr={}, Xte={}, yte={}".format(lang, shape(Xtr), self.cat_view(Ytr).shape, shape(Xte), self.cat_view(Yte).shape)) - - def show_category_prevalences(self): - nC = self.num_categories() - accum_tr = np.zeros(nC, 
dtype=np.int) - accum_te = np.zeros(nC, dtype=np.int) - in_langs = np.zeros(nC, dtype=np.int) # count languages with at least one positive example (per category) - for (lang, ((Xtr, Ytr, IDtr), (Xte, Yte, IDte))) in self.multiling_dataset.items(): - if lang not in self.langs(): continue - prev_train = np.sum(self.cat_view(Ytr), axis=0) - prev_test = np.sum(self.cat_view(Yte), axis=0) - accum_tr += prev_train - accum_te += prev_test - in_langs += (prev_train>0)*1 - print(lang+'-train', prev_train) - print(lang+'-test', prev_test) - print('all-train', accum_tr) - print('all-test', accum_te) - - return accum_tr, accum_te, in_langs - - def set_labels(self, labels): - self.labels = labels - -def _mask_numbers(data): - mask_moredigit = re.compile(r'\s[\+-]?\d{5,}([\.,]\d*)*\b') - mask_4digit = re.compile(r'\s[\+-]?\d{4}([\.,]\d*)*\b') - mask_3digit = re.compile(r'\s[\+-]?\d{3}([\.,]\d*)*\b') - mask_2digit = re.compile(r'\s[\+-]?\d{2}([\.,]\d*)*\b') - mask_1digit = re.compile(r'\s[\+-]?\d{1}([\.,]\d*)*\b') - masked = [] - for text in tqdm(data, desc='masking numbers'): - text = ' ' + text - text = mask_moredigit.sub(' MoreDigitMask', text) - text = mask_4digit.sub(' FourDigitMask', text) - text = mask_3digit.sub(' ThreeDigitMask', text) - text = mask_2digit.sub(' TwoDigitMask', text) - text = mask_1digit.sub(' OneDigitMask', text) - masked.append(text.replace('.','').replace(',','').strip()) - return masked - - - - -# ---------------------------------------------------------------------------------------------------------------------- -# Helpers -# ---------------------------------------------------------------------------------------------------------------------- -def get_active_labels(doclist): - cat_list = set() - for d in doclist: - cat_list.update(d.categories) - return list(cat_list) - -def filter_by_categories(doclist, keep_categories): - catset = frozenset(keep_categories) - for d in doclist: - d.categories = list(set(d.categories).intersection(catset)) - -def __years_to_str(years): - if isinstance(years, list): - if len(years) > 1: - return str(years[0])+'-'+str(years[-1]) - return str(years[0]) - return str(years) - - -# ---------------------------------------------------------------------------------------------------------------------- -# Matrix builders -# ---------------------------------------------------------------------------------------------------------------------- -def build_independent_matrices(dataset_name, langs, training_docs, test_docs, label_names, wiki_docs=[], preprocess=True): - """ - Builds the document-by-term weighted matrices for each language. Representations are independent of each other, - i.e., each language-specific matrix lies in a dedicate feature space. - :param dataset_name: the name of the dataset (str) - :param langs: list of languages (str) - :param training_docs: map {lang:doc-list} where each doc is a tuple (text, categories, id) - :param test_docs: map {lang:doc-list} where each doc is a tuple (text, categories, id) - :param label_names: list of names of labels (str) - :param wiki_docs: doc-list (optional), if specified, project all wiki docs in the feature spaces built for the languages - :param preprocess: whether or not to apply language-specific text preprocessing (stopword removal and stemming) - :return: a MultilingualDataset. 
If wiki_docs has been specified, a dictionary lW is also returned, which indexes - by language the processed wikipedia documents in their respective language-specific feature spaces - """ - - mlb = MultiLabelBinarizer() - mlb.fit([label_names]) - - lW = {} - - multilingual_dataset = MultilingualDataset() - multilingual_dataset.dataset_name = dataset_name - multilingual_dataset.set_labels(mlb.classes_) - for lang in langs: - print("\nprocessing %d training, %d test, %d wiki for language <%s>" % - (len(training_docs[lang]), len(test_docs[lang]), len(wiki_docs[lang]) if wiki_docs else 0, lang)) - - tr_data, tr_labels, IDtr = zip(*training_docs[lang]) - te_data, te_labels, IDte = zip(*test_docs[lang]) - - if preprocess: - tfidf = TfidfVectorizer(strip_accents='unicode', min_df=3, sublinear_tf=True, - tokenizer=NLTKStemTokenizer(lang, verbose=True), - stop_words=stopwords.words(NLTK_LANGMAP[lang])) - else: - tfidf = TfidfVectorizer(strip_accents='unicode', min_df=3, sublinear_tf=True) - - Xtr = tfidf.fit_transform(tr_data) - Xte = tfidf.transform(te_data) - if wiki_docs: - lW[lang] = tfidf.transform(wiki_docs[lang]) - - Ytr = mlb.transform(tr_labels) - Yte = mlb.transform(te_labels) - - multilingual_dataset.add(lang, Xtr, Ytr, Xte, Yte, IDtr, IDte) - - multilingual_dataset.show_dimensions() - multilingual_dataset.show_category_prevalences() - - if wiki_docs: - return multilingual_dataset, lW - else: - return multilingual_dataset - - -# creates a MultilingualDataset where matrices shares a single yuxtaposed feature space -def build_juxtaposed_matrices(dataset_name, langs, training_docs, test_docs, label_names, preprocess=True): - """ - Builds the document-by-term weighted matrices for each language. Representations are not independent of each other, - since all of them lie on the same yuxtaposed feature space. - :param dataset_name: the name of the dataset (str) - :param langs: list of languages (str) - :param training_docs: map {lang:doc-list} where each doc is a tuple (text, categories, id) - :param test_docs: map {lang:doc-list} where each doc is a tuple (text, categories, id) - :param label_names: list of names of labels (str) - :param preprocess: whether or not to apply language-specific text preprocessing (stopword removal and stemming) - :return: a MultilingualDataset. 
If wiki_docs has been specified, a dictionary lW is also returned, which indexes - by language the processed wikipedia documents in their respective language-specific feature spaces - """ - - multiling_dataset = MultilingualDataset() - multiling_dataset.dataset_name = dataset_name - - mlb = MultiLabelBinarizer() - mlb.fit([label_names]) - - multiling_dataset.set_labels(mlb.classes_) - - tr_data_stack = [] - for lang in langs: - print("\nprocessing %d training and %d test for language <%s>" % (len(training_docs[lang]), len(test_docs[lang]), lang)) - tr_data, tr_labels, tr_ID = zip(*training_docs[lang]) - te_data, te_labels, te_ID = zip(*test_docs[lang]) - if preprocess: - tr_data = preprocess_documents(tr_data, lang) - te_data = preprocess_documents(te_data, lang) - tr_data_stack.extend(tr_data) - multiling_dataset.add(lang, tr_data, tr_labels, te_data, te_labels, tr_ID, te_ID) - - tfidf = TfidfVectorizer(strip_accents='unicode', min_df=3, sublinear_tf=True) - tfidf.fit(tr_data_stack) - - for lang in langs: - print("\nweighting documents for language <%s>" % (lang)) - (tr_data, tr_labels, tr_ID), (te_data, te_labels, te_ID) = multiling_dataset[lang] - Xtr = tfidf.transform(tr_data) - Xte = tfidf.transform(te_data) - Ytr = mlb.transform(tr_labels) - Yte = mlb.transform(te_labels) - multiling_dataset.add(lang,Xtr,Ytr,Xte,Yte,tr_ID,te_ID) - - multiling_dataset.show_dimensions() - return multiling_dataset - - -# ---------------------------------------------------------------------------------------------------------------------- -# Methods to recover the original documents from the MultilingualDataset's ids -# ---------------------------------------------------------------------------------------------------------------------- -""" -This method has been added a posteriori, to create document embeddings using the polylingual embeddings of the recent -article 'Word Translation without Parallel Data'; basically, it takes one of the splits and retrieves the RCV documents -from the doc ids and then pickles an object (tr_docs, te_docs, label_names) in the outpath -""" -def retrieve_rcv_documents_from_dataset(datasetpath, rcv1_data_home, rcv2_data_home, outpath): - - tr_ids, te_ids = MultilingualDataset.load_ids(datasetpath) - assert tr_ids.keys() == te_ids.keys(), 'inconsistent keys tr vs te' - langs = list(tr_ids.keys()) - - print('fetching the datasets') - rcv1_documents, labels_rcv1 = fetch_RCV1(rcv1_data_home, split='train') - rcv2_documents, labels_rcv2 = fetch_RCV2(rcv2_data_home, [l for l in langs if l != 'en']) - - filter_by_categories(rcv1_documents, labels_rcv2) - filter_by_categories(rcv2_documents, labels_rcv1) - - label_names = get_active_labels(rcv1_documents + rcv2_documents) - print('Active labels in RCV1/2 {}'.format(len(label_names))) - - print('rcv1: {} train, {} test, {} categories'.format(len(rcv1_documents), 0, len(label_names))) - print('rcv2: {} documents'.format(len(rcv2_documents)), Counter([doc.lang for doc in rcv2_documents])) - - all_docs = rcv1_documents + rcv2_documents - mlb = MultiLabelBinarizer() - mlb.fit([label_names]) - - dataset = MultilingualDataset() - for lang in langs: - analyzer = CountVectorizer(strip_accents='unicode', min_df=3, - stop_words=stopwords.words(NLTK_LANGMAP[lang])).build_analyzer() - - Xtr,Ytr,IDtr = zip(*[(d.text,d.categories,d.id) for d in all_docs if d.lang == lang and d.id in tr_ids[lang]]) - Xte,Yte,IDte = zip(*[(d.text,d.categories,d.id) for d in all_docs if d.lang == lang and d.id in te_ids[lang]]) - Xtr = [' '.join(analyzer(d)) for d 
in Xtr] - Xte = [' '.join(analyzer(d)) for d in Xte] - Ytr = mlb.transform(Ytr) - Yte = mlb.transform(Yte) - dataset.add(lang, Xtr, Ytr, Xte, Yte, IDtr, IDte) - - dataset.save(outpath) - -""" -Same thing but for JRC-Acquis -""" -def retrieve_jrc_documents_from_dataset(datasetpath, jrc_data_home, train_years, test_years, cat_policy, most_common_cat, outpath): - - tr_ids, te_ids = MultilingualDataset.load_ids(datasetpath) - assert tr_ids.keys() == te_ids.keys(), 'inconsistent keys tr vs te' - langs = list(tr_ids.keys()) - - print('fetching the datasets') - - cat_list = inspect_eurovoc(jrc_data_home, select=cat_policy) - training_docs, label_names = fetch_jrcacquis(langs=langs, data_path=jrc_data_home, years=train_years, - cat_filter=cat_list, cat_threshold=1, parallel=None, - most_frequent=most_common_cat) - test_docs, _ = fetch_jrcacquis(langs=langs, data_path=jrc_data_home, years=test_years, cat_filter=label_names, - parallel='force') - - def filter_by_id(doclist, ids): - ids_set = frozenset(itertools.chain.from_iterable(ids.values())) - return [x for x in doclist if (x.parallel_id+'__'+x.id) in ids_set] - - training_docs = filter_by_id(training_docs, tr_ids) - test_docs = filter_by_id(test_docs, te_ids) - - print('jrc: {} train, {} test, {} categories'.format(len(training_docs), len(test_docs), len(label_names))) - - mlb = MultiLabelBinarizer() - mlb.fit([label_names]) - - dataset = MultilingualDataset() - for lang in langs: - analyzer = CountVectorizer(strip_accents='unicode', min_df=3, - stop_words=stopwords.words(NLTK_LANGMAP[lang])).build_analyzer() - - Xtr,Ytr,IDtr = zip(*[(d.text,d.categories,d.parallel_id+'__'+d.id) for d in training_docs if d.lang == lang]) - Xte,Yte,IDte = zip(*[(d.text,d.categories,d.parallel_id+'__'+d.id) for d in test_docs if d.lang == lang]) - Xtr = [' '.join(analyzer(d)) for d in Xtr] - Xte = [' '.join(analyzer(d)) for d in Xte] - Ytr = mlb.transform(Ytr) - Yte = mlb.transform(Yte) - dataset.add(lang, Xtr, Ytr, Xte, Yte, IDtr, IDte) - - dataset.save(outpath) - -# ---------------------------------------------------------------------------------------------------------------------- -# Dataset Generators -# ---------------------------------------------------------------------------------------------------------------------- -def prepare_jrc_datasets(jrc_data_home, wiki_data_home, langs, train_years, test_years, cat_policy, most_common_cat=-1, max_wiki=5000, run=0): - from data.reader.wikipedia_tools import fetch_wikipedia_multilingual, random_wiki_sample - - - """ - Prepare all datasets for JRC-Acquis. The datasets include the "feature-independent" version, the - "feature-yuxtaposed" version, the monolingual version for the UpperBound, and the processed wikipedia matrices. - In all cases, training documents are strictly non-parallel, and test documents are strictly parallel - :param jrc_data_home: path to the raw JRC-Acquis documents (it will be downloaded if not found), and the path where - all splits will be generated - :param wiki_data_home: path to the wikipedia dump (see data/readers/wikipedia_tools.py) - :param langs: the list of languages to consider (as defined in data/languages.py) - :param train_years: a list of ints containing the years to be considered as training documents - :param test_years: a list of ints containing the years to be considered as test documents - :param cat_policy: a string indicating which category selection policy to apply. 
Valid policies are, e.g., "all" - (select all categories), "broadest" (select only the broadest concepts in the taxonomy), or "leaves" (select the - leaves concepts in the taxonomy). See inspect_eurovoc from data/reader/jrcacquis_reader.py for more details - :param most_common_cat: the maximum number of most common categories to consider, or -1 to keep them all - :param max_wiki: the maximum number of wikipedia documents to consider (default 5000) - :param run: a numeric label naming the random split (useful to keep track of different runs) - :return: None - """ - - name = 'JRCacquis' - run = '_run' + str(run) - config_name = 'jrc_nltk_' + __years_to_str(train_years) + \ - 'vs' + __years_to_str(test_years) + \ - '_' + cat_policy + \ - ('_top' + str(most_common_cat) if most_common_cat!=-1 else '') + \ - '_noparallel_processed' - - indep_path = join(jrc_data_home, config_name + run + '.pickle') - upper_path = join(jrc_data_home, config_name + run + '_upper.pickle') - yuxta_path = join(jrc_data_home, config_name + run + '_yuxtaposed.pickle') - wiki_path = join(jrc_data_home, config_name + run + '.wiki.pickle') - wiki_docs_path = join(jrc_data_home, config_name + '.wiki.raw.pickle') - - cat_list = inspect_eurovoc(jrc_data_home, select=cat_policy) - training_docs, label_names = fetch_jrcacquis(langs=langs, data_path=jrc_data_home, years=train_years, - cat_filter=cat_list, cat_threshold=1, parallel=None, - most_frequent=most_common_cat) - test_docs, _ = fetch_jrcacquis(langs=langs, data_path=jrc_data_home, years=test_years, cat_filter=label_names, - parallel='force') - - print('Generating feature-independent dataset...') - training_docs_no_parallel = random_sampling_avoiding_parallel(training_docs) - - def _group_by_lang(doc_list, langs): - return {lang: [(d.text, d.categories, d.parallel_id + '__' + d.id) for d in doc_list if d.lang == lang] - for lang in langs} - - training_docs = _group_by_lang(training_docs, langs) - training_docs_no_parallel = _group_by_lang(training_docs_no_parallel, langs) - test_docs = _group_by_lang(test_docs, langs) - if not exists(indep_path): - wiki_docs=None - if max_wiki>0: - if not exists(wiki_docs_path): - wiki_docs = fetch_wikipedia_multilingual(wiki_data_home, langs, min_words=50, deletions=False) - wiki_docs = random_wiki_sample(wiki_docs, max_wiki) - pickle.dump(wiki_docs, open(wiki_docs_path, 'wb'), pickle.HIGHEST_PROTOCOL) - else: - wiki_docs = pickle.load(open(wiki_docs_path, 'rb')) - wiki_docs = random_wiki_sample(wiki_docs, max_wiki) - - if wiki_docs: - lang_data, wiki_docs = build_independent_matrices(name, langs, training_docs_no_parallel, test_docs, label_names, wiki_docs) - pickle.dump(wiki_docs, open(wiki_path, 'wb'), pickle.HIGHEST_PROTOCOL) - else: - lang_data = build_independent_matrices(name, langs, training_docs_no_parallel, test_docs, label_names) - - lang_data.save(indep_path) - - print('Generating upper-bound (English-only) dataset...') - if not exists(upper_path): - training_docs_eng_only = {'en':training_docs['en']} - test_docs_eng_only = {'en':test_docs['en']} - build_independent_matrices(name, ['en'], training_docs_eng_only, test_docs_eng_only, label_names).save(upper_path) - - print('Generating yuxtaposed dataset...') - if not exists(yuxta_path): - build_juxtaposed_matrices(name, langs, training_docs_no_parallel, test_docs, label_names).save(yuxta_path) - - -def prepare_rcv_datasets(outpath, rcv1_data_home, rcv2_data_home, wiki_data_home, langs, - train_for_lang=1000, test_for_lang=1000, max_wiki=5000, preprocess=True, run=0): - from 
data.reader.wikipedia_tools import fetch_wikipedia_multilingual, random_wiki_sample - """ - Prepare all datasets for RCV1/RCV2. The datasets include the "feature-independent" version, the - "feature-yuxtaposed" version, the monolingual version for the UpperBound, and the processed wikipedia matrices. - - :param outpath: path where all splits will be dumped - :param rcv1_data_home: path to the RCV1-v2 dataset (English only) - :param rcv2_data_home: path to the RCV2 dataset (all languages other than English) - :param wiki_data_home: path to the wikipedia dump (see data/readers/wikipedia_tools.py) - :param langs: the list of languages to consider (as defined in data/languages.py) - :param train_for_lang: maximum number of training documents per language - :param test_for_lang: maximum number of test documents per language - :param max_wiki: the maximum number of wikipedia documents to consider (default 5000) - :param preprocess: whether or not to apply language-specific preprocessing (stopwords removal and stemming) - :param run: a numeric label naming the random split (useful to keep track of different runs) - :return: None - """ - - assert 'en' in langs, 'English is not in requested languages, but is needed for some datasets' - assert len(langs)>1, 'the multilingual dataset cannot be built with only one dataset' - assert not preprocess or set(langs).issubset(set(RCV2_LANGS_WITH_NLTK_STEMMING+['en'])), \ - "languages not in RCV1-v2/RCV2 scope or not in valid for NLTK's processing" - - name = 'RCV1/2' - run = '_run' + str(run) - config_name = 'rcv1-2_nltk_trByLang'+str(train_for_lang)+'_teByLang'+str(test_for_lang)+\ - ('_processed' if preprocess else '_raw') - - indep_path = join(outpath, config_name + run + '.pickle') - upper_path = join(outpath, config_name + run +'_upper.pickle') - yuxta_path = join(outpath, config_name + run +'_yuxtaposed.pickle') - wiki_path = join(outpath, config_name + run + '.wiki.pickle') - wiki_docs_path = join(outpath, config_name + '.wiki.raw.pickle') - - print('fetching the datasets') - rcv1_documents, labels_rcv1 = fetch_RCV1(rcv1_data_home, split='train') - rcv2_documents, labels_rcv2 = fetch_RCV2(rcv2_data_home, [l for l in langs if l!='en']) - filter_by_categories(rcv1_documents, labels_rcv2) - filter_by_categories(rcv2_documents, labels_rcv1) - - label_names = get_active_labels(rcv1_documents+rcv2_documents) - print('Active labels in RCV1/2 {}'.format(len(label_names))) - - print('rcv1: {} train, {} test, {} categories'.format(len(rcv1_documents), 0, len(label_names))) - print('rcv2: {} documents'.format(len(rcv2_documents)), Counter([doc.lang for doc in rcv2_documents])) - - lang_docs = {lang: [d for d in rcv1_documents + rcv2_documents if d.lang == lang] for lang in langs} - - # for the upper bound there are no parallel versions, so for the English case, we take as many documents as there - # would be in the multilingual case -- then we will extract from them only train_for_lang for the other cases - print('Generating upper-bound (English-only) dataset...') - train, test = train_test_split(lang_docs['en'], train_size=train_for_lang*len(langs), test_size=test_for_lang, shuffle=True) - train_lang_doc_map = {'en':[(d.text, d.categories, d.id) for d in train]} - test_lang_doc_map = {'en':[(d.text, d.categories, d.id) for d in test]} - build_independent_matrices(name, ['en'], train_lang_doc_map, test_lang_doc_map, label_names).save(upper_path) - - train_lang_doc_map['en'] = train_lang_doc_map['en'][:train_for_lang] - for lang in langs: - if lang=='en': 
continue # already split - test_take = min(test_for_lang, len(lang_docs[lang])-train_for_lang) - train, test = train_test_split(lang_docs[lang], train_size=train_for_lang, test_size=test_take, shuffle=True) - train_lang_doc_map[lang] = [(d.text, d.categories, d.id) for d in train] - test_lang_doc_map[lang] = [(d.text, d.categories, d.id) for d in test] - - print('Generating feature-independent dataset...') - wiki_docs=None - if max_wiki>0: - if not exists(wiki_docs_path): - wiki_docs = fetch_wikipedia_multilingual(wiki_data_home, langs, min_words=50, deletions=False) - wiki_docs = random_wiki_sample(wiki_docs, max_wiki) - pickle.dump(wiki_docs, open(wiki_docs_path, 'wb'), pickle.HIGHEST_PROTOCOL) - else: - wiki_docs = pickle.load(open(wiki_docs_path, 'rb')) - wiki_docs = random_wiki_sample(wiki_docs, max_wiki) - - if wiki_docs: - lang_data, wiki_docs_matrix = build_independent_matrices(name, langs, train_lang_doc_map, test_lang_doc_map, label_names, wiki_docs, preprocess) - pickle.dump(wiki_docs_matrix, open(wiki_path, 'wb'), pickle.HIGHEST_PROTOCOL) - else: - lang_data = build_independent_matrices(name, langs, train_lang_doc_map, test_lang_doc_map, label_names, wiki_docs, preprocess) - - lang_data.save(indep_path) - - print('Generating yuxtaposed dataset...') - build_juxtaposed_matrices(name, langs, train_lang_doc_map, test_lang_doc_map, label_names, preprocess).save(yuxta_path) - - -# ---------------------------------------------------------------------------------------------------------------------- -# Methods to generate full RCV and JRC datasets -# ---------------------------------------------------------------------------------------------------------------------- -def full_rcv_(rcv1_data_home, rcv2_data_home, outpath, langs): - - - print('fetching the datasets') - rcv1_train_documents, labels_rcv1 = fetch_RCV1(rcv1_data_home, split='train') - rcv1_test_documents, labels_rcv1_test = fetch_RCV1(rcv1_data_home, split='test') - rcv2_documents, labels_rcv2 = fetch_RCV2(rcv2_data_home, [l for l in langs if l != 'en']) - - filter_by_categories(rcv1_train_documents, labels_rcv2) - filter_by_categories(rcv1_test_documents, labels_rcv2) - filter_by_categories(rcv2_documents, labels_rcv1) - - label_names = get_active_labels(rcv1_train_documents + rcv2_documents) - print('Active labels in RCV1/2 {}'.format(len(label_names))) - - print('rcv1: {} train, {} test, {} categories'.format(len(rcv1_train_documents), len(rcv1_test_documents), len(label_names))) - print('rcv2: {} documents'.format(len(rcv2_documents)), Counter([doc.lang for doc in rcv2_documents])) - - mlb = MultiLabelBinarizer() - mlb.fit([label_names]) - - all_docs = rcv1_train_documents + rcv1_test_documents + rcv2_documents - lang_docs = {lang: [d for d in all_docs if d.lang == lang] for lang in langs} - - def get_ids(doclist): - return frozenset([d.id for d in doclist]) - - tr_ids = {'en': get_ids(rcv1_train_documents)} - te_ids = {'en': get_ids(rcv1_test_documents)} - for lang in langs: - if lang == 'en': continue - tr_ids[lang], te_ids[lang] = train_test_split([d.id for d in lang_docs[lang]], test_size=.3) - - dataset = MultilingualDataset() - dataset.dataset_name = 'RCV1/2-full' - for lang in langs: - print(f'processing {lang} with {len(tr_ids[lang])} training documents and {len(te_ids[lang])} documents') - analyzer = CountVectorizer( - strip_accents='unicode', min_df=3, stop_words=stopwords.words(NLTK_LANGMAP[lang]) - ).build_analyzer() - - Xtr,Ytr,IDtr = zip(*[(d.text,d.categories,d.id) for d in lang_docs[lang] if d.id in 
tr_ids[lang]]) - Xte,Yte,IDte = zip(*[(d.text,d.categories,d.id) for d in lang_docs[lang] if d.id in te_ids[lang]]) - Xtr = [' '.join(analyzer(d)) for d in Xtr] - Xte = [' '.join(analyzer(d)) for d in Xte] - Ytr = mlb.transform(Ytr) - Yte = mlb.transform(Yte) - dataset.add(lang, _mask_numbers(Xtr), Ytr, _mask_numbers(Xte), Yte, IDtr, IDte) - - dataset.save(outpath) - - -def full_jrc_(jrc_data_home, langs, train_years, test_years, outpath, cat_policy='all', most_common_cat=300): - - print('fetching the datasets') - cat_list = inspect_eurovoc(jrc_data_home, select=cat_policy) - training_docs, label_names = fetch_jrcacquis( - langs=langs, data_path=jrc_data_home, years=train_years, cat_filter=cat_list, cat_threshold=1, parallel=None, most_frequent=most_common_cat - ) - test_docs, _ = fetch_jrcacquis( - langs=langs, data_path=jrc_data_home, years=test_years, cat_filter=label_names, parallel='force' - ) - - def _group_by_lang(doc_list, langs): - return {lang: [d for d in doc_list if d.lang == lang] for lang in langs} - - training_docs = _group_by_lang(training_docs, langs) - test_docs = _group_by_lang(test_docs, langs) - - mlb = MultiLabelBinarizer() - mlb.fit([label_names]) - - dataset = MultilingualDataset() - data.dataset_name = 'JRC-Acquis-full' - for lang in langs: - analyzer = CountVectorizer( - strip_accents='unicode', min_df=3, stop_words=stopwords.words(NLTK_LANGMAP[lang]) - ).build_analyzer() - - Xtr, Ytr, IDtr = zip(*[(d.text, d.categories, d.parallel_id + '__' + d.id) for d in training_docs[lang] if d.lang == lang]) - Xte, Yte, IDte = zip(*[(d.text, d.categories, d.parallel_id + '__' + d.id) for d in test_docs[lang] if d.lang == lang]) - Xtr = [' '.join(analyzer(d)) for d in Xtr] - Xte = [' '.join(analyzer(d)) for d in Xte] - Ytr = mlb.transform(Ytr) - Yte = mlb.transform(Yte) - dataset.add(lang, _mask_numbers(Xtr), Ytr, _mask_numbers(Xte), Yte, IDtr, IDte) - - dataset.save(outpath) - - -#----------------------------------------------------------------------------------------------------------------------- -# MAIN BUILDER -#----------------------------------------------------------------------------------------------------------------------- - -if __name__=='__main__': - import sys - RCV1_PATH = '../Datasets/RCV1-v2/unprocessed_corpus' - RCV2_PATH = '../Datasets/RCV2' - JRC_DATAPATH = "../Datasets/JRC_Acquis_v3" - full_rcv_(RCV1_PATH, RCV2_PATH, outpath='../rcv2/rcv1-2_doclist_full_processed.pickle', langs=RCV2_LANGS_WITH_NLTK_STEMMING + ['en']) - # full_jrc_(JRC_DATAPATH, lang_set['JRC_NLTK'], train_years=list(range(1958, 2006)), test_years=[2006], outpath='../jrc_acquis/jrc_doclist_1958-2005vs2006_all_top300_full_processed.pickle', cat_policy='all', most_common_cat=300) - sys.exit(0) - - # datasetpath = '../jrc_acquis/jrc_doclist_1958-2005vs2006_all_top300_full_processed.pickle' # '../rcv2/rcv1-2_doclist_full_processed.pickle' - # data = MultilingualDataset.load(datasetpath) - # data.dataset_name='JRC-Acquis-full'#'RCV1/2-full' - # for lang in RCV2_LANGS_WITH_NLTK_STEMMING + ['en']: - # (Xtr, ytr, idtr), (Xte, yte, idte) = data.multiling_dataset[lang] - # data.multiling_dataset[lang] = ((_mask_numbers(Xtr), ytr, idtr), (_mask_numbers(Xte), yte, idte)) - # data.save('../jrc_acquis/jrc_doclist_1958-2005vs2006_all_top300_full_processed.pickle')#'../rcv2/rcv1-2_doclist_full_processed_2.pickle') - # sys.exit(0) - - assert len(sys.argv) == 5, "wrong number of arguments; required: " \ - " " - - JRC_DATAPATH = sys.argv[1] # "../Datasets/JRC_Acquis_v3" - RCV1_PATH = sys.argv[2] 
#'../Datasets/RCV1-v2/unprocessed_corpus' - RCV2_PATH = sys.argv[3] #'../Datasets/RCV2' - WIKI_DATAPATH = sys.argv[4] #"../Datasets/Wikipedia/multilingual_docs_JRC_NLTK" - - langs = lang_set['JRC_NLTK'] - max_wiki = 5000 - - for run in range(0,10): - print('Building JRC-Acquis datasets run', run) - prepare_jrc_datasets(JRC_DATAPATH, WIKI_DATAPATH, langs, - train_years=list(range(1958, 2006)), test_years=[2006], max_wiki=max_wiki, - cat_policy='all', most_common_cat=300, run=run) - - print('Building RCV1-v2/2 datasets run', run) - prepare_rcv_datasets(RCV2_PATH, RCV1_PATH, RCV2_PATH, WIKI_DATAPATH, RCV2_LANGS_WITH_NLTK_STEMMING + ['en'], - train_for_lang=1000, test_for_lang=1000, max_wiki=max_wiki, run=run) - - # uncomment this code if you want to retrieve the original documents to generate the data splits for PLE - # (make sure you have not modified the above parameters, or adapt the following paths accordingly...) - # datasetpath = join(RCV2_PATH,'rcv1-2_nltk_trByLang1000_teByLang1000_processed_run{}.pickle'.format(run)) - # outpath = datasetpath.replace('_nltk_','_doclist_') - # retrieve_rcv_documents_from_dataset(datasetpath, RCV1_PATH, RCV2_PATH, outpath) - - # datasetpath = join(JRC_DATAPATH, 'jrc_nltk_1958-2005vs2006_all_top300_noparallel_processed_run{}.pickle'.format(run)) - # outpath = datasetpath.replace('_nltk_', '_doclist_') - # retrieve_jrc_documents_from_dataset(datasetpath, JRC_DATAPATH, train_years=list(range(1958, 2006)), test_years=[2006], cat_policy='all', most_common_cat=300, outpath=outpath) - - - diff --git a/refactor/data/languages.py b/refactor/data/languages.py deleted file mode 100644 index 2d03d8e..0000000 --- a/refactor/data/languages.py +++ /dev/null @@ -1,42 +0,0 @@ -""" -bg = Bulgarian -cs = Czech -da = Danish -de = German -el = Greek -en = English -es = Spanish -et = Estonian -fi = Finnish -fr = French -hu = Hungarian -it = Italian -lt = Lithuanian -lv = Latvian -nl = Dutch -mt = Maltese -pl = Polish -pt = Portuguese -ro = Romanian -sk = Slovak -sl = Slovene -sv = Swedish -""" - -NLTK_LANGMAP = {'da': 'danish', 'nl': 'dutch', 'en': 'english', 'fi': 'finnish', 'fr': 'french', 'de': 'german', - 'hu': 'hungarian', 'it': 'italian', 'pt': 'portuguese', 'ro': 'romanian', 'es': 'spanish', 'sv': 'swedish'} - - -#top 10 languages in wikipedia order by the number of articles -#LANGS_10_MOST_WIKI = ['en','fr','sv','de','es','it','pt','nl','pl','ro'] - -#all languages in JRC-acquis v3 -JRC_LANGS = ['bg','cs','da','de','el','en','es','et','fi','fr','hu','it','lt','lv','mt','nl','pl','pt','ro','sk','sl','sv'] -JRC_LANGS_WITH_NLTK_STEMMING = ['da', 'nl', 'en', 'fi', 'fr', 'de', 'hu', 'it', 'pt', 'es', 'sv'] # 'romanian deleted for incompatibility issues' - -RCV2_LANGS = ['ru', 'de', 'fr', 'sv', 'no', 'da', 'pt', 'it', 'es', 'jp', 'htw', 'nl'] -RCV2_LANGS_WITH_NLTK_STEMMING = ['de', 'fr', 'sv', 'da', 'pt', 'it', 'es', 'nl'] - -lang_set = {'JRC_NLTK':JRC_LANGS_WITH_NLTK_STEMMING, 'JRC':JRC_LANGS, - 'RCV2_NLTK':RCV2_LANGS_WITH_NLTK_STEMMING, 'RCV2':RCV2_LANGS} - diff --git a/refactor/data/reader/__init__.py b/refactor/data/reader/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/refactor/data/reader/jrcacquis_reader.py b/refactor/data/reader/jrcacquis_reader.py deleted file mode 100644 index e911996..0000000 --- a/refactor/data/reader/jrcacquis_reader.py +++ /dev/null @@ -1,324 +0,0 @@ -from __future__ import print_function - -import os -import pickle -import sys -import tarfile -import xml.etree.ElementTree as ET -import zipfile -from collections 
import Counter -from os.path import join -from random import shuffle - -import rdflib -from rdflib.namespace import RDF, SKOS -from sklearn.datasets import get_data_home - -from data.languages import JRC_LANGS -from data.languages import lang_set -from util.file import download_file, list_dirs, list_files - -""" -JRC Acquis' Nomenclature: -bg = Bulgarian -cs = Czech -da = Danish -de = German -el = Greek -en = English -es = Spanish -et = Estonian -fi = Finnish -fr = French -hu = Hungarian -it = Italian -lt = Lithuanian -lv = Latvian -nl = Dutch -mt = Maltese -pl = Polish -pt = Portuguese -ro = Romanian -sk = Slovak -sl = Slovene -sv = Swedish -""" - -class JRCAcquis_Document: - def __init__(self, id, name, lang, year, head, body, categories): - self.id = id - self.parallel_id = name - self.lang = lang - self.year = year - self.text = body if not head else head + "\n" + body - self.categories = categories - -# this is a workaround... for some reason, acutes are codified in a non-standard manner in titles -# however, it seems that the title is often appearing as the first paragraph in the text/body (with -# standard codification), so it might be preferable not to read the header after all (as here by default) -def _proc_acute(text): - for ch in ['a','e','i','o','u']: - text = text.replace('%'+ch+'acute%',ch) - return text - -def parse_document(file, year, head=False): - root = ET.parse(file).getroot() - - doc_name = root.attrib['n'] # e.g., '22006A0211(01)' - doc_lang = root.attrib['lang'] # e.g., 'es' - doc_id = root.attrib['id'] # e.g., 'jrc22006A0211_01-es' - doc_categories = [cat.text for cat in root.findall('.//teiHeader/profileDesc/textClass/classCode[@scheme="eurovoc"]')] - doc_head = _proc_acute(root.find('.//text/body/head').text) if head else '' - doc_body = '\n'.join([p.text for p in root.findall('.//text/body/div[@type="body"]/p')]) - - def raise_if_empty(field, from_file): - if isinstance(field, str): - if not field.strip(): - raise ValueError("Empty field in file %s" % from_file) - - raise_if_empty(doc_name, file) - raise_if_empty(doc_lang, file) - raise_if_empty(doc_id, file) - if head: raise_if_empty(doc_head, file) - raise_if_empty(doc_body, file) - - return JRCAcquis_Document(id=doc_id, name=doc_name, lang=doc_lang, year=year, head=doc_head, body=doc_body, categories=doc_categories) - -# removes documents without a counterpart in all other languages -def _force_parallel(doclist, langs): - n_langs = len(langs) - par_id_count = Counter([d.parallel_id for d in doclist]) - parallel_doc_ids = set([id for id,count in par_id_count.items() if count==n_langs]) - return [doc for doc in doclist if doc.parallel_id in parallel_doc_ids] - -def random_sampling_avoiding_parallel(doclist): - random_order = list(range(len(doclist))) - shuffle(random_order) - sampled_request = [] - parallel_ids = set() - for ind in random_order: - pid = doclist[ind].parallel_id - if pid not in parallel_ids: - sampled_request.append(doclist[ind]) - parallel_ids.add(pid) - print('random_sampling_no_parallel:: from {} documents to {} documents'.format(len(doclist), len(sampled_request))) - return sampled_request - - -#filters out documents which do not contain any category in the cat_filter list, and filter all labels not in cat_filter -def _filter_by_category(doclist, cat_filter): - if not isinstance(cat_filter, frozenset): - cat_filter = frozenset(cat_filter) - filtered = [] - for doc in doclist: - doc.categories = list(cat_filter & set(doc.categories)) - if doc.categories: - doc.categories.sort() - 
filtered.append(doc) - print("filtered %d documents out without categories in the filter list" % (len(doclist) - len(filtered))) - return filtered - -#filters out categories with less than cat_threshold documents (and filters documents containing those categories) -def _filter_by_frequency(doclist, cat_threshold): - cat_count = Counter() - for d in doclist: - cat_count.update(d.categories) - - freq_categories = [cat for cat,count in cat_count.items() if count>cat_threshold] - freq_categories.sort() - return _filter_by_category(doclist, freq_categories), freq_categories - -#select top most_frequent categories (and filters documents containing those categories) -def _most_common(doclist, most_frequent): - cat_count = Counter() - for d in doclist: - cat_count.update(d.categories) - - freq_categories = [cat for cat,count in cat_count.most_common(most_frequent)] - freq_categories.sort() - return _filter_by_category(doclist, freq_categories), freq_categories - -def _get_categories(request): - final_cats = set() - for d in request: - final_cats.update(d.categories) - return list(final_cats) - -def fetch_jrcacquis(langs=None, data_path=None, years=None, ignore_unclassified=True, cat_filter=None, cat_threshold=0, - parallel=None, most_frequent=-1, DOWNLOAD_URL_BASE ='http://optima.jrc.it/Acquis/JRC-Acquis.3.0/corpus/'): - - assert parallel in [None, 'force', 'avoid'], 'parallel mode not supported' - if not langs: - langs = JRC_LANGS - else: - if isinstance(langs, str): langs = [langs] - for l in langs: - if l not in JRC_LANGS: - raise ValueError('Language %s is not among the valid languages in JRC-Acquis v3' % l) - - if not data_path: - data_path = get_data_home() - - if not os.path.exists(data_path): - os.mkdir(data_path) - - request = [] - total_read = 0 - for l in langs: - file_name = 'jrc-'+l+'.tgz' - archive_path = join(data_path, file_name) - - if not os.path.exists(archive_path): - print("downloading language-specific dataset (once and for all) into %s" % data_path) - DOWNLOAD_URL = join(DOWNLOAD_URL_BASE, file_name) - download_file(DOWNLOAD_URL, archive_path) - print("untarring dataset...") - tarfile.open(archive_path, 'r:gz').extractall(data_path) - - documents_dir = join(data_path, l) - - print("Reading documents...") - read = 0 - for dir in list_dirs(documents_dir): - year = int(dir) - if years==None or year in years: - year_dir = join(documents_dir,dir) - pickle_name = join(data_path, 'jrc_' + l + '_' + dir + '.pickle') - if os.path.exists(pickle_name): - print("loading from file %s" % pickle_name) - l_y_documents = pickle.load(open(pickle_name, "rb")) - read += len(l_y_documents) - else: - l_y_documents = [] - all_documents = list_files(year_dir) - empty = 0 - for i,doc_file in enumerate(all_documents): - try: - jrc_doc = parse_document(join(year_dir, doc_file), year) - except ValueError: - jrc_doc = None - - if jrc_doc and (not ignore_unclassified or jrc_doc.categories): - l_y_documents.append(jrc_doc) - else: empty += 1 - if len(all_documents)>50 and ((i+1) % (len(all_documents)/50) == 0): - print('\r\tfrom %s: completed %d%%' % (year_dir, int((i+1)*100.0/len(all_documents))), end='') - read+=1 - print('\r\tfrom %s: completed 100%% read %d documents (discarded %d without categories or empty fields)\n' % (year_dir, i+1, empty), end='') - print("\t\t(Pickling object for future runs in %s)" % pickle_name) - pickle.dump(l_y_documents, open(pickle_name, 'wb'), pickle.HIGHEST_PROTOCOL) - request += l_y_documents - print("Read %d documents for language %s\n" % (read, l)) - total_read += 
read - print("Read %d documents in total" % (total_read)) - - if parallel=='force': - request = _force_parallel(request, langs) - elif parallel == 'avoid': - request = random_sampling_avoiding_parallel(request) - - final_cats = _get_categories(request) - - if cat_filter: - request = _filter_by_category(request, cat_filter) - final_cats = _get_categories(request) - if cat_threshold > 0: - request, final_cats = _filter_by_frequency(request, cat_threshold) - if most_frequent != -1 and len(final_cats) > most_frequent: - request, final_cats = _most_common(request, most_frequent) - - return request, final_cats - -def print_cat_analysis(request): - cat_count = Counter() - for d in request: - cat_count.update(d.categories) - print("Number of active categories: {}".format(len(cat_count))) - print(cat_count.most_common()) - -# inspects the Eurovoc thesaurus in order to select a subset of categories -# currently, only 'broadest' policy (i.e., take all categories with no parent category), and 'all' is implemented -def inspect_eurovoc(data_path, eurovoc_skos_core_concepts_filename='eurovoc_in_skos_core_concepts.rdf', - eurovoc_url="http://publications.europa.eu/mdr/resource/thesaurus/eurovoc-20160630-0/skos/eurovoc_in_skos_core_concepts.zip", - select="broadest"): - - fullpath_pickle = join(data_path, select+'_concepts.pickle') - if os.path.exists(fullpath_pickle): - print("Pickled object found in %s. Loading it." % fullpath_pickle) - return pickle.load(open(fullpath_pickle,'rb')) - - fullpath = join(data_path, eurovoc_skos_core_concepts_filename) - if not os.path.exists(fullpath): - print("Path %s does not exist. Trying to download the skos EuroVoc file from %s" % (data_path, eurovoc_url)) - download_file(eurovoc_url, fullpath) - print("Unzipping file...") - zipped = zipfile.ZipFile(data_path + '.zip', 'r') - zipped.extract("eurovoc_in_skos_core_concepts.rdf", data_path) - zipped.close() - - print("Parsing %s" %fullpath) - g = rdflib.Graph() - g.parse(location=fullpath, format="application/rdf+xml") - - if select == "all": - print("Selecting all concepts") - all_concepts = list(g.subjects(RDF.type, SKOS.Concept)) - all_concepts = [c.toPython().split('/')[-1] for c in all_concepts] - all_concepts.sort() - selected_concepts = all_concepts - elif select=="broadest": - print("Selecting broadest concepts (those without any other broader concept linked to it)") - all_concepts = set(g.subjects(RDF.type, SKOS.Concept)) - narrower_concepts = set(g.subjects(SKOS.broader, None)) - broadest_concepts = [c.toPython().split('/')[-1] for c in (all_concepts - narrower_concepts)] - broadest_concepts.sort() - selected_concepts = broadest_concepts - elif select=="leaves": - print("Selecting leaves concepts (those not linked as broader of any other concept)") - all_concepts = set(g.subjects(RDF.type, SKOS.Concept)) - broad_concepts = set(g.objects(None, SKOS.broader)) - leave_concepts = [c.toPython().split('/')[-1] for c in (all_concepts - broad_concepts)] - leave_concepts.sort() - selected_concepts = leave_concepts - else: - raise ValueError("Selection policy %s is not currently supported" % select) - - print("%d %s concepts found" % (len(selected_concepts), leave_concepts)) - print("Pickling concept list for faster further requests in %s" % fullpath_pickle) - pickle.dump(selected_concepts, open(fullpath_pickle, 'wb'), pickle.HIGHEST_PROTOCOL) - - return selected_concepts - -if __name__ == '__main__': - - def single_label_fragment(doclist): - single = [d for d in doclist if len(d.categories) < 2] - final_categories = 
set([d.categories[0] if d.categories else [] for d in single]) - print('{} single-label documents ({} categories) from the original {} documents'.format(len(single), - len(final_categories), - len(doclist))) - return single, list(final_categories) - - train_years = list(range(1986, 2006)) - test_years = [2006] - cat_policy = 'leaves' - most_common_cat = 300 - # JRC_DATAPATH = "/media/moreo/1TB Volume/Datasets/JRC_Acquis_v3" - JRC_DATAPATH = "/storage/andrea/FUNNELING/data/JRC_Acquis_v3" - langs = lang_set['JRC_NLTK'] - cat_list = inspect_eurovoc(JRC_DATAPATH, select=cat_policy) - sys.exit() - - training_docs, label_names = fetch_jrcacquis(langs=langs, data_path=JRC_DATAPATH, years=train_years,cat_filter=cat_list, cat_threshold=1, parallel=None,most_frequent=most_common_cat) - test_docs, label_namestest = fetch_jrcacquis(langs=langs, data_path=JRC_DATAPATH, years=test_years, cat_filter=label_names,parallel='force') - - print('JRC-train: {} documents, {} labels'.format(len(training_docs), len(label_names))) - print('JRC-test: {} documents, {} labels'.format(len(test_docs), len(label_namestest))) - - training_docs, label_names = single_label_fragment(training_docs) - test_docs, label_namestest = single_label_fragment(test_docs) - - print('JRC-train: {} documents, {} labels'.format(len(training_docs), len(label_names))) - print('JRC-test: {} documents, {} labels'.format(len(test_docs), len(label_namestest))) - - diff --git a/refactor/data/reader/rcv_reader.py b/refactor/data/reader/rcv_reader.py deleted file mode 100644 index b3db098..0000000 --- a/refactor/data/reader/rcv_reader.py +++ /dev/null @@ -1,222 +0,0 @@ -import re -import xml.etree.ElementTree as ET -from os.path import join, exists -from zipfile import ZipFile - -import numpy as np - -from util.file import download_file_if_not_exists -from util.file import list_files - -""" -RCV2's Nomenclature: -ru = Russian -da = Danish -de = German -es = Spanish -lat = Spanish Latin-American (actually is also 'es' in the collection) -fr = French -it = Italian -nl = Dutch -pt = Portuguese -sv = Swedish -ja = Japanese -htw = Chinese -no = Norwegian -""" - -RCV1_TOPICHIER_URL = "http://www.ai.mit.edu/projects/jmlr/papers/volume5/lewis04a/a02-orig-topics-hierarchy/rcv1.topics.hier.orig" -RCV1PROC_BASE_URL= 'http://www.ai.mit.edu/projects/jmlr/papers/volume5/lewis04a/a12-token-files' -RCV1_BASE_URL = "http://www.daviddlewis.com/resources/testcollections/rcv1/" -RCV2_BASE_URL = "http://trec.nist.gov/data/reuters/reuters.html" - -rcv1_test_data_gz = ['lyrl2004_tokens_test_pt0.dat.gz', - 'lyrl2004_tokens_test_pt1.dat.gz', - 'lyrl2004_tokens_test_pt2.dat.gz', - 'lyrl2004_tokens_test_pt3.dat.gz'] - -rcv1_train_data_gz = ['lyrl2004_tokens_train.dat.gz'] - -rcv1_doc_cats_data_gz = 'rcv1-v2.topics.qrels.gz' - -RCV2_LANG_DIR = {'ru':'REUTE000', - 'de':'REUTE00A', - 'fr':'REUTE00B', - 'sv':'REUTE001', - 'no':'REUTE002', - 'da':'REUTE003', - 'pt':'REUTE004', - 'it':'REUTE005', - 'es':'REUTE006', - 'lat':'REUTE007', - 'jp':'REUTE008', - 'htw':'REUTE009', - 'nl':'REUTERS_'} - - -class RCV_Document: - - def __init__(self, id, text, categories, date='', lang=None): - self.id = id - self.date = date - self.lang = lang - self.text = text - self.categories = categories - - -class ExpectedLanguageException(Exception): pass -class IDRangeException(Exception): pass - - -nwords = [] - -def parse_document(xml_content, assert_lang=None, valid_id_range=None): - root = ET.fromstring(xml_content) - if assert_lang: - if assert_lang not in root.attrib.values(): - if assert_lang 
!= 'jp' or 'ja' not in root.attrib.values(): # some documents are attributed to 'ja', others to 'jp' - raise ExpectedLanguageException('error: document of a different language') - - doc_id = root.attrib['itemid'] - if valid_id_range is not None: - if not valid_id_range[0] <= int(doc_id) <= valid_id_range[1]: - raise IDRangeException - - doc_categories = [cat.attrib['code'] for cat in - root.findall('.//metadata/codes[@class="bip:topics:1.0"]/code')] - - doc_date = root.attrib['date'] - doc_title = root.find('.//title').text - doc_headline = root.find('.//headline').text - doc_body = '\n'.join([p.text for p in root.findall('.//text/p')]) - - if not doc_body: - raise ValueError('Empty document') - - if doc_title is None: doc_title = '' - if doc_headline is None or doc_headline in doc_title: doc_headline = '' - text = '\n'.join([doc_title, doc_headline, doc_body]).strip() - - text_length = len(text.split()) - global nwords - nwords.append(text_length) - - return RCV_Document(id=doc_id, text=text, categories=doc_categories, date=doc_date, lang=assert_lang) - - -def fetch_RCV1(data_path, split='all'): - - assert split in ['train', 'test', 'all'], 'split should be "train", "test", or "all"' - - request = [] - labels = set() - read_documents = 0 - lang = 'en' - - training_documents = 23149 - test_documents = 781265 - - if split == 'all': - split_range = (2286, 810596) - expected = training_documents+test_documents - elif split == 'train': - split_range = (2286, 26150) - expected = training_documents - else: - split_range = (26151, 810596) - expected = test_documents - - global nwords - nwords=[] - for part in list_files(data_path): - if not re.match('\d+\.zip', part): continue - target_file = join(data_path, part) - assert exists(target_file), \ - "You don't seem to have the file "+part+" in " + data_path + ", and the RCV1 corpus can not be downloaded"+\ - " w/o a formal permission. Please, refer to " + RCV1_BASE_URL + " for more information." - zipfile = ZipFile(target_file) - for xmlfile in zipfile.namelist(): - xmlcontent = zipfile.open(xmlfile).read() - try: - doc = parse_document(xmlcontent, assert_lang=lang, valid_id_range=split_range) - labels.update(doc.categories) - request.append(doc) - read_documents += 1 - except ValueError: - print('\n\tskipping document {} with inconsistent language label: expected language {}'.format(part+'/'+xmlfile, lang)) - except (IDRangeException, ExpectedLanguageException) as e: - pass - print('\r[{}] read {} documents'.format(part, len(request)), end='') - if read_documents == expected: break - if read_documents == expected: break - print() - print('ave:{} std {} min {} max {}'.format(np.mean(nwords), np.std(nwords), np.min(nwords), np.max(nwords))) - return request, list(labels) - - -def fetch_RCV2(data_path, languages=None): - - if not languages: - languages = list(RCV2_LANG_DIR.keys()) - else: - assert set(languages).issubset(set(RCV2_LANG_DIR.keys())), 'languages not in scope' - - request = [] - labels = set() - global nwords - nwords=[] - for lang in languages: - path = join(data_path, RCV2_LANG_DIR[lang]) - lang_docs_read = 0 - for part in list_files(path): - target_file = join(path, part) - assert exists(target_file), \ - "You don't seem to have the file "+part+" in " + path + ", and the RCV2 corpus can not be downloaded"+\ - " w/o a formal permission. Please, refer to " + RCV2_BASE_URL + " for more information." 
- zipfile = ZipFile(target_file) - for xmlfile in zipfile.namelist(): - xmlcontent = zipfile.open(xmlfile).read() - try: - doc = parse_document(xmlcontent, assert_lang=lang) - labels.update(doc.categories) - request.append(doc) - lang_docs_read += 1 - except ValueError: - print('\n\tskipping document {} with inconsistent language label: expected language {}'.format(RCV2_LANG_DIR[lang]+'/'+part+'/'+xmlfile, lang)) - except (IDRangeException, ExpectedLanguageException) as e: - pass - print('\r[{}] read {} documents, {} for language {}'.format(RCV2_LANG_DIR[lang]+'/'+part, len(request), lang_docs_read, lang), end='') - print() - print('ave:{} std {} min {} max {}'.format(np.mean(nwords), np.std(nwords), np.min(nwords), np.max(nwords))) - return request, list(labels) - - -def fetch_topic_hierarchy(path, topics='all'): - assert topics in ['all', 'leaves'] - - download_file_if_not_exists(RCV1_TOPICHIER_URL, path) - hierarchy = {} - for line in open(path, 'rt'): - parts = line.strip().split() - parent,child = parts[1],parts[3] - if parent not in hierarchy: - hierarchy[parent]=[] - hierarchy[parent].append(child) - - del hierarchy['None'] - del hierarchy['Root'] - print(hierarchy) - - if topics=='all': - topics = set(hierarchy.keys()) - for parent in hierarchy.keys(): - topics.update(hierarchy[parent]) - return list(topics) - elif topics=='leaves': - parents = set(hierarchy.keys()) - childs = set() - for parent in hierarchy.keys(): - childs.update(hierarchy[parent]) - return list(childs.difference(parents)) - - diff --git a/refactor/data/reader/wikipedia_tools.py b/refactor/data/reader/wikipedia_tools.py deleted file mode 100644 index 9558fb6..0000000 --- a/refactor/data/reader/wikipedia_tools.py +++ /dev/null @@ -1,307 +0,0 @@ -from __future__ import print_function - -# import ijson -# from ijson.common import ObjectBuilder -import os -import pickle -import re -from bz2 import BZ2File -from itertools import islice -from os.path import join -from xml.sax.saxutils import escape - -import numpy as np - -from util.file import list_dirs, list_files - -policies = ["IN_ALL_LANGS", "IN_ANY_LANG"] - -""" -This file contains a set of tools for processing the Wikipedia multilingual documents. -In what follows, it is assumed that you have already downloaded a Wikipedia dump (https://dumps.wikimedia.org/) -and have processed each document to clean their texts with one of the tools: - - https://github.com/aesuli/wikipediatools (Python 2) - - https://github.com/aesuli/wikipedia-extractor (Python 3) -It is also assumed you have dowloaded the all-entities json file (e.g., https://dumps.wikimedia.org/wikidatawiki/entities/latest-all.json.bz2) - -This tools help you in: - - Processes the huge json file as a stream, and create a multilingual map of corresponding titles for each language. - Set the policy = "IN_ALL_LANGS" will extract only titles which appear in all (AND) languages, whereas "IN_ANY_LANG" - extracts all titles appearing in at least one (OR) language (warning: this will creates a huge dictionary). - Note: This version is quite slow. Although it is run once for all, you might be prefer to take a look at "Wikidata in BigQuery". - - Processes the huge json file as a stream a creates a simplified file which occupies much less and is far faster to be processed. - - Use the multilingual map to extract, from the clean text versions, individual xml documents containing all - language-specific versions from the document. 
- - Fetch the multilingual documents to create, for each of the specified languages, a list containing all documents, - in a way that the i-th element from any list refers to the same element in the respective language. -""" - -def _doc_generator(text_path, langs): - dotspace = re.compile(r'\.(?!\s)') - for l,lang in enumerate(langs): - print("Processing language <%s> (%d/%d)" % (lang, l, len(langs))) - lang_dir = join(text_path, lang) - split_dirs = list_dirs(lang_dir) - for sd,split_dir in enumerate(split_dirs): - print("\tprocessing split_dir <%s> (%d/%d)" % (split_dir, sd, len(split_dirs))) - split_files = list_files(join(lang_dir, split_dir)) - for sf,split_file in enumerate(split_files): - print("\t\tprocessing split_file <%s> (%d/%d)" % (split_file, sf, len(split_files))) - with BZ2File(join(lang_dir, split_dir, split_file), 'r', buffering=1024*1024) as fi: - while True: - doc_lines = list(islice(fi, 3)) - if doc_lines: - # some sentences are not followed by a space after the dot - doc_lines[1] = dotspace.sub('. ', doc_lines[1]) - # [workaround] I found   html symbol was not treated, and unescaping it now might not help... - doc_lines[1] = escape(doc_lines[1].replace(" ", " ")) - yield doc_lines, lang - else: break - -def _extract_title(doc_lines): - m = re.search('title="(.+?)"', doc_lines[0]) - if m: return m.group(1).decode('utf-8') - else: raise ValueError("Error in xml format: document head is %s" % doc_lines[0]) - -def _create_doc(target_file, id, doc, lang): - doc[0] = doc[0][:-2] + (' lang="%s">\n'%lang) - with open(target_file, 'w') as fo: - fo.write('\n'%id) - [fo.write(line) for line in doc] - fo.write('') - -def _append_doc(target_file, doc, lang): - doc[0] = doc[0][:-2] + (' lang="%s">\n' % lang) - with open(target_file, 'r', buffering=1024*1024) as fi: - lines = fi.readlines() - if doc[0] in lines[1::3]: - return - lines[-1:-1]=doc - with open(target_file, 'w', buffering=1024*1024) as fo: - [fo.write(line) for line in lines] - -def extract_multilingual_documents(inv_dict, langs, text_path, out_path): - if not os.path.exists(out_path): - os.makedirs(out_path) - for lang in langs: - if lang not in inv_dict: - raise ValueError("Lang %s is not in the dictionary" % lang) - - docs_created = len(list_files(out_path)) - print("%d multilingual documents found." % docs_created) - for doc,lang in _doc_generator(text_path, langs): - title = _extract_title(doc) - - if title in inv_dict[lang]: - #pass - ids = inv_dict[lang][title] - for id in ids: - target_file = join(out_path, id) + ".xml" - if os.path.exists(target_file): - _append_doc(target_file, doc, lang) - else: - _create_doc(target_file, id, doc, lang) - docs_created+=1 - else: - if not re.match('[A-Za-z]+', title): - print("Title <%s> for lang <%s> not in dictionary" % (title, lang)) - - - -def extract_multilingual_titles_from_simplefile(data_dir, filename, langs, policy="IN_ALL_LANGS", return_both=True): - simplified_file = join(data_dir,filename) - - if policy not in policies: - raise ValueError("Policy %s not supported." % policy) - print("extracting multilingual titles with policy %s (%s)" % (policy,' '.join(langs))) - - lang_prefix = list(langs) - lang_prefix.sort() - pickle_prefix = "extraction_" + "_".join(lang_prefix) + "." + policy - pickle_dict = join(data_dir, pickle_prefix+".multi_dict.pickle") - pickle_invdict = join(data_dir, pickle_prefix+".multi_invdict.pickle") - if os.path.exists(pickle_invdict): - if return_both and os.path.exists(pickle_dict): - print("Pickled files found in %s. 
Loading both (direct and inverse dictionaries)." % data_dir) - return pickle.load(open(pickle_dict, 'rb')), pickle.load(open(pickle_invdict, 'rb')) - elif return_both==False: - print("Pickled file found in %s. Loading inverse dictionary only." % pickle_invdict) - return pickle.load(open(pickle_invdict, 'rb')) - - multiling_titles = {} - inv_dict = {lang:{} for lang in langs} - - def process_entry(line): - parts = line.strip().split('\t') - id = parts[0] - if id in multiling_titles: - raise ValueError("id <%s> already indexed" % id) - - titles = dict(((lang_title[:lang_title.find(':')],lang_title[lang_title.find(':')+1:].decode('utf-8')) for lang_title in parts[1:])) - for lang in titles.keys(): - if lang not in langs: - del titles[lang] - - if (policy == "IN_ALL_LANGS" and len(titles) == len(langs))\ - or (policy == "IN_ANY_LANG" and len(titles) > 0): - multiling_titles[id] = titles - for lang, title in titles.items(): - if title in inv_dict[lang]: - inv_dict[lang][title].append(id) - inv_dict[lang][title] = [id] - - with BZ2File(simplified_file, 'r', buffering=1024*1024*16) as fi: - completed = 0 - try: - for line in fi: - process_entry(line) - completed += 1 - if completed % 10 == 0: - print("\rCompleted %d\ttitles %d" % (completed,len(multiling_titles)), end="") - print("\rCompleted %d\t\ttitles %d" % (completed, len(multiling_titles)), end="\n") - except EOFError: - print("\nUnexpected file ending... saving anyway") - - print("Pickling dictionaries in %s" % data_dir) - pickle.dump(multiling_titles, open(pickle_dict,'wb'), pickle.HIGHEST_PROTOCOL) - pickle.dump(inv_dict, open(pickle_invdict, 'wb'), pickle.HIGHEST_PROTOCOL) - print("Done") - - return (multiling_titles, inv_dict) if return_both else inv_dict - - -# in https://dumps.wikimedia.org/wikidatawiki/entities/latest-all.json.bz2 -def simplify_json_file(data_dir, langs, policy="IN_ALL_LANGS", json_file = "latest-all.json.bz2"): - latest_all_json_file = join(data_dir,json_file) - - if policy not in policies: - raise ValueError("Policy %s not supported." % policy) - - print("extracting multilingual titles with policy %s (%s)" % (policy,' '.join(langs))) - - lang_prefix = list(langs) - lang_prefix.sort() - simple_titles_path = join(data_dir, "extraction_" + "_".join(lang_prefix) + "." 
+ policy) - - def process_entry(last, fo): - global written - id = last["id"] - titles = None - if policy == "IN_ALL_LANGS" and langs.issubset(last["labels"].keys()): - titles = {lang: last["labels"][lang]["value"] for lang in langs} - elif policy == "IN_ANY_LANG": - titles = {lang: last["labels"][lang]["value"] for lang in langs if lang in last["labels"]} - - if titles: - fo.write((id+'\t'+'\t'.join([lang+':'+titles[lang] for lang in titles.keys()])+'\n').encode('utf-8')) - return True - else: - return False - - written = 0 - with BZ2File(latest_all_json_file, 'r', buffering=1024*1024*16) as fi, \ - BZ2File(join(data_dir,simple_titles_path+".simple.bz2"),'w') as fo: - builder = ObjectBuilder() - completed = 0 - for event, value in ijson.basic_parse(fi, buf_size=1024*1024*16): - builder.event(event, value) - if len(builder.value)>1: - if process_entry(builder.value.pop(0), fo): written += 1 - completed += 1 - print("\rCompleted %d\ttitles %d" % (completed,written), end="") - print("") - - #process the last entry - process_entry(builder.value.pop(0)) - - return simple_titles_path - -""" -Reads all multi-lingual documents in a folder (see wikipedia_tools.py to generate them) and generates, for each of the -specified languages, a list contanining all its documents, so that the i-th element of any list refers to the language- -specific version of the same document. Documents are forced to contain version in all specified languages and to contain -a minimum number of words; otherwise it is discarded. -""" -class MinWordsNotReached(Exception): pass -class WrongDocumentFormat(Exception): pass - -def _load_multilang_doc(path, langs, min_words=100): - import xml.etree.ElementTree as ET - from xml.etree.ElementTree import Element, ParseError - try: - root = ET.parse(path).getroot() - doc = {} - for lang in langs: - doc_body = root.find('.//doc[@lang="' + lang + '"]') - if isinstance(doc_body, Element): - n_words = len(doc_body.text.split(' ')) - if n_words >= min_words: - doc[lang] = doc_body.text - else: - raise MinWordsNotReached - else: - raise WrongDocumentFormat - except ParseError: - raise WrongDocumentFormat - return doc - -#returns the multilingual documents mapped by language, and a counter with the number of documents readed -def fetch_wikipedia_multilingual(wiki_multi_path, langs, min_words=100, deletions=False, max_documents=-1, pickle_name=None): - if pickle_name and os.path.exists(pickle_name): - print("unpickling %s" % pickle_name) - return pickle.load(open(pickle_name, 'rb')) - - multi_docs = list_files(wiki_multi_path) - mling_documents = {l:[] for l in langs} - valid_documents = 0 - minwords_exception = 0 - wrongdoc_exception = 0 - for d,multi_doc in enumerate(multi_docs): - print("\rProcessed %d/%d documents, valid %d/%d, few_words=%d, few_langs=%d" % - (d, len(multi_docs), valid_documents, len(multi_docs), minwords_exception, wrongdoc_exception),end="") - doc_path = join(wiki_multi_path, multi_doc) - try: - m_doc = _load_multilang_doc(doc_path, langs, min_words) - valid_documents += 1 - for l in langs: - mling_documents[l].append(m_doc[l]) - except MinWordsNotReached: - minwords_exception += 1 - if deletions: os.remove(doc_path) - except WrongDocumentFormat: - wrongdoc_exception += 1 - if deletions: os.remove(doc_path) - if max_documents>0 and valid_documents>=max_documents: - break - - if pickle_name: - print("Pickling wikipedia documents object in %s" % pickle_name) - pickle.dump(mling_documents, open(pickle_name, 'wb'), pickle.HIGHEST_PROTOCOL) - - return mling_documents - 
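For context, a minimal usage sketch of the reader above (the path and language list are placeholders, and the import path follows the refactor/ layout): it loads the aligned multilingual Wikipedia articles previously produced by extract_multilingual_documents, keeping only articles with at least 50 words, and then caps the sample with random_wiki_sample (defined just below).

from data.reader.wikipedia_tools import fetch_wikipedia_multilingual, random_wiki_sample

langs = ['en', 'it', 'fr']                                            # placeholder language set
wiki_multi_path = '../Datasets/Wikipedia/multilingual_docs_JRC_NLTK'  # placeholder path

# lang -> list of texts; the i-th element of every list refers to the same article
l_wiki = fetch_wikipedia_multilingual(wiki_multi_path, langs, min_words=50, deletions=False)
l_wiki = random_wiki_sample(l_wiki, max_documents=5000)               # keep at most 5000 aligned articles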
-def random_wiki_sample(l_wiki, max_documents): - if max_documents == 0: return None - langs = list(l_wiki.keys()) - assert len(np.unique([len(l_wiki[l]) for l in langs])) == 1, 'documents across languages do not seem to be aligned' - ndocs_per_lang = len(l_wiki[langs[0]]) - if ndocs_per_lang > max_documents: - sel = set(np.random.choice(list(range(ndocs_per_lang)), max_documents, replace=False)) - for lang in langs: - l_wiki[lang] = [d for i, d in enumerate(l_wiki[lang]) if i in sel] - return l_wiki - - -if __name__ == "__main__": - - wikipedia_home = "../Datasets/Wikipedia" - - from data.languages import JRC_LANGS_WITH_NLTK_STEMMING as langs - langs = frozenset(langs) - - simple_titles_path = simplify_json_file(wikipedia_home, langs, policy="IN_ALL_LANGS", json_file="latest-all.json.bz2") - _, inv_dict = extract_multilingual_titles_from_simplefile(wikipedia_home, simple_titles_path, langs, policy='IN_ALL_LANGS') - extract_multilingual_documents(inv_dict, langs, join(wikipedia_home,'text'), - out_path=join(wikipedia_home, 'multilingual_docs_JRC_NLTK')) - - diff --git a/refactor/data/text_preprocessor.py b/refactor/data/text_preprocessor.py deleted file mode 100644 index fcfddba..0000000 --- a/refactor/data/text_preprocessor.py +++ /dev/null @@ -1,34 +0,0 @@ -from nltk import word_tokenize -from nltk.corpus import stopwords -from nltk.stem import SnowballStemmer - -from data.languages import NLTK_LANGMAP - - -def preprocess_documents(documents, lang): - tokens = NLTKStemTokenizer(lang, verbose=True) - sw = stopwords.words(NLTK_LANGMAP[lang]) - return [' '.join([w for w in tokens(doc) if w not in sw]) for doc in documents] - - -class NLTKStemTokenizer(object): - - def __init__(self, lang, verbose=False): - if lang not in NLTK_LANGMAP: - raise ValueError('Language %s is not supported in NLTK' % lang) - self.verbose=verbose - self.called = 0 - self.wnl = SnowballStemmer(NLTK_LANGMAP[lang]) - self.cache = {} - - def __call__(self, doc): - self.called += 1 - if self.verbose: - print("\r\t\t[documents processed %d]" % (self.called), end="") - tokens = word_tokenize(doc) - stems = [] - for t in tokens: - if t not in self.cache: - self.cache[t] = self.wnl.stem(t) - stems.append(self.cache[t]) - return stems \ No newline at end of file diff --git a/refactor/data/tsr_function__.py b/refactor/data/tsr_function__.py deleted file mode 100755 index c458029..0000000 --- a/refactor/data/tsr_function__.py +++ /dev/null @@ -1,271 +0,0 @@ -import math - -import numpy as np -from joblib import Parallel, delayed -from scipy.sparse import csr_matrix, csc_matrix -from scipy.stats import t - - -def get_probs(tpr, fpr, pc): - # tpr = p(t|c) = p(tp)/p(c) = p(tp)/(p(tp)+p(fn)) - # fpr = p(t|_c) = p(fp)/p(_c) = p(fp)/(p(fp)+p(tn)) - pnc = 1.0 - pc - tp = tpr * pc - fn = pc - tp - fp = fpr * pnc - tn = pnc - fp - return ContTable(tp=tp, fn=fn, fp=fp, tn=tn) - - -def apply_tsr(tpr, fpr, pc, tsr): - cell = get_probs(tpr, fpr, pc) - return tsr(cell) - - -def positive_information_gain(cell): - if cell.tpr() < cell.fpr(): - return 0.0 - else: - return information_gain(cell) - - -def posneg_information_gain(cell): - ig = information_gain(cell) - if cell.tpr() < cell.fpr(): - return -ig - else: - return ig - - -def __ig_factor(p_tc, p_t, p_c): - den = p_t * p_c - if den != 0.0 and p_tc != 0: - return p_tc * math.log(p_tc / den, 2) - else: - return 0.0 - - -def information_gain(cell): - return __ig_factor(cell.p_tp(), cell.p_f(), cell.p_c()) + \ - __ig_factor(cell.p_fp(), cell.p_f(), cell.p_not_c()) +\ - 
__ig_factor(cell.p_fn(), cell.p_not_f(), cell.p_c()) + \ - __ig_factor(cell.p_tn(), cell.p_not_f(), cell.p_not_c()) - - -def information_gain_mod(cell): - return (__ig_factor(cell.p_tp(), cell.p_f(), cell.p_c()) + __ig_factor(cell.p_tn(), cell.p_not_f(), cell.p_not_c())) \ - - (__ig_factor(cell.p_fp(), cell.p_f(), cell.p_not_c()) + __ig_factor(cell.p_fn(), cell.p_not_f(), cell.p_c())) - - -def pointwise_mutual_information(cell): - return __ig_factor(cell.p_tp(), cell.p_f(), cell.p_c()) - - -def gain_ratio(cell): - pc = cell.p_c() - pnc = 1.0 - pc - norm = pc * math.log(pc, 2) + pnc * math.log(pnc, 2) - return information_gain(cell) / (-norm) - - -def chi_square(cell): - den = cell.p_f() * cell.p_not_f() * cell.p_c() * cell.p_not_c() - if den==0.0: return 0.0 - num = gss(cell)**2 - return num / den - - -def relevance_frequency(cell): - a = cell.tp - c = cell.fp - if c == 0: c = 1 - return math.log(2.0 + (a * 1.0 / c), 2) - - -def idf(cell): - if cell.p_f()>0: - return math.log(1.0 / cell.p_f()) - return 0.0 - - -def gss(cell): - return cell.p_tp()*cell.p_tn() - cell.p_fp()*cell.p_fn() - - -def conf_interval(xt, n): - if n>30: - z2 = 3.84145882069 # norm.ppf(0.5+0.95/2.0)**2 - else: - z2 = t.ppf(0.5 + 0.95 / 2.0, df=max(n-1,1)) ** 2 - p = (xt + 0.5 * z2) / (n + z2) - amplitude = 0.5 * z2 * math.sqrt((p * (1.0 - p)) / (n + z2)) - return p, amplitude - -def strength(minPosRelFreq, minPos, maxNeg): - if minPos > maxNeg: - return math.log(2.0 * minPosRelFreq, 2.0) - else: - return 0.0 - - -#set cancel_features=True to allow some features to be weighted as 0 (as in the original article) -#however, for some extremely imbalanced dataset caused all documents to be 0 -def conf_weight(cell, cancel_features=False): - c = cell.get_c() - not_c = cell.get_not_c() - tp = cell.tp - fp = cell.fp - - pos_p, pos_amp = conf_interval(tp, c) - neg_p, neg_amp = conf_interval(fp, not_c) - - min_pos = pos_p-pos_amp - max_neg = neg_p+neg_amp - den = (min_pos + max_neg) - minpos_relfreq = min_pos / (den if den != 0 else 1) - - str_tplus = strength(minpos_relfreq, min_pos, max_neg); - - if str_tplus == 0 and not cancel_features: - return 1e-20 - - return str_tplus; - - -class ContTable: - - def __init__(self, tp=0, tn=0, fp=0, fn=0): - self.tp=tp - self.tn=tn - self.fp=fp - self.fn=fn - - def get_d(self): return self.tp + self.tn + self.fp + self.fn - - def get_c(self): return self.tp + self.fn - - def get_not_c(self): return self.tn + self.fp - - def get_f(self): return self.tp + self.fp - - def get_not_f(self): return self.tn + self.fn - - def p_c(self): return (1.0*self.get_c())/self.get_d() - - def p_not_c(self): return 1.0-self.p_c() - - def p_f(self): return (1.0*self.get_f())/self.get_d() - - def p_not_f(self): return 1.0-self.p_f() - - def p_tp(self): return (1.0*self.tp) / self.get_d() - - def p_tn(self): return (1.0*self.tn) / self.get_d() - - def p_fp(self): return (1.0*self.fp) / self.get_d() - - def p_fn(self): return (1.0*self.fn) / self.get_d() - - def tpr(self): - c = 1.0*self.get_c() - return self.tp / c if c > 0.0 else 0.0 - - def fpr(self): - _c = 1.0*self.get_not_c() - return self.fp / _c if _c > 0.0 else 0.0 - - -def round_robin_selection(X, Y, k, tsr_function=positive_information_gain): - print(f'[selectiong {k} terms]') - nC = Y.shape[1] - FC = get_tsr_matrix(get_supervised_matrix(X, Y), tsr_function).T - best_features_idx = np.argsort(-FC, axis=0).flatten() - tsr_values = FC.flatten() - selected_indexes_set = set() - selected_indexes = list() - selected_value = list() - from_category = list() - 
round_robin = iter(best_features_idx) - values_iter = iter(tsr_values) - round=0 - while len(selected_indexes) < k: - term_idx = next(round_robin) - term_val = next(values_iter) - if term_idx not in selected_indexes_set: - selected_indexes_set.add(term_idx) - selected_indexes.append(term_idx) - selected_value.append(term_val) - from_category.append(round) - round = (round + 1) % nC - return np.asarray(selected_indexes, dtype=int), np.asarray(selected_value, dtype=float), np.asarray(from_category) - - -def feature_label_contingency_table(positive_document_indexes, feature_document_indexes, nD): - tp_ = len(positive_document_indexes & feature_document_indexes) - fp_ = len(feature_document_indexes - positive_document_indexes) - fn_ = len(positive_document_indexes - feature_document_indexes) - tn_ = nD - (tp_ + fp_ + fn_) - return ContTable(tp=tp_, tn=tn_, fp=fp_, fn=fn_) - - -def category_tables(feature_sets, category_sets, c, nD, nF): - return [feature_label_contingency_table(category_sets[c], feature_sets[f], nD) for f in range(nF)] - - -""" -Computes the nC x nF supervised matrix M where Mcf is the 4-cell contingency table for feature f and class c. -Efficiency O(nF x nC x log(S)) where S is the sparse factor -""" -def get_supervised_matrix(coocurrence_matrix, label_matrix, n_jobs=-1): - nD, nF = coocurrence_matrix.shape - nD2, nC = label_matrix.shape - - if nD != nD2: - raise ValueError('Number of rows in coocurrence matrix shape %s and label matrix shape %s is not consistent' % - (coocurrence_matrix.shape,label_matrix.shape)) - - def nonzero_set(matrix, col): - return set(matrix[:, col].nonzero()[0]) - - if isinstance(coocurrence_matrix, csr_matrix): - coocurrence_matrix = csc_matrix(coocurrence_matrix) - feature_sets = [nonzero_set(coocurrence_matrix, f) for f in range(nF)] - category_sets = [nonzero_set(label_matrix, c) for c in range(nC)] - cell_matrix = Parallel(n_jobs=n_jobs, backend="threading")(delayed(category_tables)(feature_sets, category_sets, c, nD, nF) for c in range(nC)) - return np.array(cell_matrix) - -# obtains the matrix T where Tcf=tsr(f,c) is the tsr score for category c and feature f -def get_tsr_matrix(cell_matrix, tsr_score_funtion): - nC,nF = cell_matrix.shape - tsr_matrix = [[tsr_score_funtion(cell_matrix[c,f]) for f in range(nF)] for c in range(nC)] - return np.array(tsr_matrix) - - -""" The Fisher-score [1] is not computed on the 4-cell contingency table, but can -take as input any real-valued feature column (e.g., tf-idf weights). -feat is the feature vector, and c is a binary classification vector. -This implementation covers only the binary case, while the formula is defined for multiclass -single-label scenarios, for which the version [2] might be preferred. -[1] R.O. Duda, P.E. Hart, and D.G. Stork. Pattern classification. Wiley-interscience, 2012. -[2] Gu, Q., Li, Z., & Han, J. (2012). Generalized fisher score for feature selection. arXiv preprint arXiv:1202.3725. 
-""" -def fisher_score_binary(feat, c): - neg = np.ones_like(c) - c - - npos = np.sum(c) - nneg = np.sum(neg) - - mupos = np.mean(feat[c == 1]) - muneg = np.mean(feat[neg == 1]) - mu = np.mean(feat) - - stdpos = np.std(feat[c == 1]) - stdneg = np.std(feat[neg == 1]) - - num = npos * ((mupos - mu) ** 2) + nneg * ((muneg - mu) ** 2) - den = npos * (stdpos ** 2) + nneg * (stdneg ** 2) - - if den>0: - return num / den - else: - return num diff --git a/refactor/funnelling.py b/refactor/funnelling.py deleted file mode 100644 index 812a937..0000000 --- a/refactor/funnelling.py +++ /dev/null @@ -1,124 +0,0 @@ -from models.learners import * -from util.common import _normalize -from view_generators import VanillaFunGen - - -class DocEmbedderList: - """ - Class that takes care of calling fit and transform function for every init embedder. Every ViewGenerator should be - contained by this class in order to seamlessly train the overall architecture. - """ - def __init__(self, embedder_list, probabilistic=True): - """ - Init the DocEmbedderList. - :param embedder_list: list of embedders to be deployed - :param probabilistic: whether to recast view generators output to vectors of posterior probabilities or not - """ - assert len(embedder_list) != 0, 'Embedder list cannot be empty!' - self.embedders = embedder_list - self.probabilistic = probabilistic - if probabilistic: - _tmp = [] - for embedder in self.embedders: - if isinstance(embedder, VanillaFunGen): - _tmp.append(embedder) - else: - _tmp.append(FeatureSet2Posteriors(embedder)) - self.embedders = _tmp - - def fit(self, lX, ly): - """ - Fit all the ViewGenerators contained by DocEmbedderList. - :param lX: - :param ly: - :return: self - """ - for embedder in self.embedders: - embedder.fit(lX, ly) - return self - - def transform(self, lX): - """ - Project documents by means of every ViewGenerators. Projections are then averaged together and returned. - :param lX: - :return: common latent space (averaged). - """ - langs = sorted(lX.keys()) - lZparts = {lang: None for lang in langs} - - for embedder in self.embedders: - lZ = embedder.transform(lX) - for lang in langs: - Z = lZ[lang] - if lZparts[lang] is None: - lZparts[lang] = Z - else: - lZparts[lang] += Z - n_embedders = len(self.embedders) - return {lang: lZparts[lang]/n_embedders for lang in langs} # Averaging feature spaces - - def fit_transform(self, lX, ly): - return self.fit(lX, ly).transform(lX) - - -class FeatureSet2Posteriors: - """ - Takes care of recasting features outputted by the embedders to vecotrs of posterior probabilities by means of - a multiclass SVM. - """ - def __init__(self, embedder, l2=True, n_jobs=-1): - """ - Init the class. - :param embedder: ViewGen, view generators which does not natively outputs posterior probabilities. - :param l2: bool, whether to apply or not L2 normalization to the projection - :param n_jobs: int, number of concurrent workers. 
- """ - self.embedder = embedder - self.l2 = l2 - self.n_jobs = n_jobs - self.prob_classifier = MetaClassifier( - SVC(kernel='rbf', gamma='auto', probability=True, cache_size=1000, random_state=1), n_jobs=n_jobs) - - def fit(self, lX, ly): - lZ = self.embedder.fit_transform(lX, ly) - self.prob_classifier.fit(lZ, ly) - return self - - def transform(self, lX): - lP = self.predict_proba(lX) - lP = _normalize(lP, self.l2) - return lP - - def fit_transform(self, lX, ly): - return self.fit(lX, ly).transform(lX) - - def predict(self, lX): - lZ = self.embedder.transform(lX) - return self.prob_classifier.predict(lZ) - - def predict_proba(self, lX): - lZ = self.embedder.transform(lX) - return self.prob_classifier.predict_proba(lZ) - - -class Funnelling: - """ - Funnelling Architecture. It is composed by two tiers. The first-tier is a set of heterogeneous document embedders. - The second-tier (i.e., the metaclassifier), operates the classification of the common latent space computed by - the first-tier learners. - """ - def __init__(self, first_tier: DocEmbedderList, meta_classifier: MetaClassifier, n_jobs=-1): - self.first_tier = first_tier - self.meta = meta_classifier - self.n_jobs = n_jobs - - def fit(self, lX, ly): - print('## Fitting first-tier learners!') - lZ = self.first_tier.fit_transform(lX, ly) - print('## Fitting meta-learner!') - self.meta.fit(lZ, ly) - - def predict(self, lX): - lZ = self.first_tier.transform(lX) - ly = self.meta.predict(lZ) - return ly diff --git a/refactor/main.py b/refactor/main.py deleted file mode 100644 index ebc43a3..0000000 --- a/refactor/main.py +++ /dev/null @@ -1,167 +0,0 @@ -from argparse import ArgumentParser - -from data.dataset_builder import MultilingualDataset -from funnelling import * -from util.common import MultilingualIndex, get_params, get_method_name -from util.evaluation import evaluate -from util.results_csv import CSVlog -from view_generators import * - - -def main(args): - assert args.post_embedder or args.muse_embedder or args.wce_embedder or args.gru_embedder or args.bert_embedder, \ - 'empty set of document embeddings is not allowed!' - - print('Running generalized funnelling...') - - data = MultilingualDataset.load(args.dataset) - data.set_view(languages=['it', 'fr']) - data.show_dimensions() - lX, ly = data.training() - lXte, lyte = data.test() - - # Init multilingualIndex - mandatory when deploying Neural View Generators... 
- if args.gru_embedder or args.bert_embedder: - multilingualIndex = MultilingualIndex() - lMuse = MuseLoader(langs=sorted(lX.keys()), cache=args.muse_dir) - multilingualIndex.index(lX, ly, lXte, lyte, l_pretrained_vocabulary=lMuse.vocabulary()) - - # Init ViewGenerators and append them to embedder_list - embedder_list = [] - if args.post_embedder: - posteriorEmbedder = VanillaFunGen(base_learner=get_learner(calibrate=True), n_jobs=args.n_jobs) - embedder_list.append(posteriorEmbedder) - - if args.muse_embedder: - museEmbedder = MuseGen(muse_dir=args.muse_dir, n_jobs=args.n_jobs) - embedder_list.append(museEmbedder) - - if args.wce_embedder: - wceEmbedder = WordClassGen(n_jobs=args.n_jobs) - embedder_list.append(wceEmbedder) - - if args.gru_embedder: - rnnEmbedder = RecurrentGen(multilingualIndex, pretrained_embeddings=lMuse, wce=args.gru_wce, batch_size=256, - nepochs=args.nepochs, gpus=args.gpus, n_jobs=args.n_jobs) - embedder_list.append(rnnEmbedder) - - if args.bert_embedder: - bertEmbedder = BertGen(multilingualIndex, batch_size=4, nepochs=10, gpus=args.gpus, n_jobs=args.n_jobs) - embedder_list.append(bertEmbedder) - - # Init DocEmbedderList (i.e., first-tier learners or view generators) and metaclassifier - docEmbedders = DocEmbedderList(embedder_list=embedder_list, probabilistic=True) - meta = MetaClassifier(meta_learner=get_learner(calibrate=False, kernel='rbf'), - meta_parameters=get_params(optimc=args.optimc)) - - # Init Funnelling Architecture - gfun = Funnelling(first_tier=docEmbedders, meta_classifier=meta) - - # Training --------------------------------------- - print('\n[Training Generalized Funnelling]') - time_init = time() - time_tr = time() - gfun.fit(lX, ly) - time_tr = round(time() - time_tr, 3) - print(f'Training completed in {time_tr} seconds!') - - # Testing ---------------------------------------- - print('\n[Testing Generalized Funnelling]') - time_te = time() - ly_ = gfun.predict(lXte) - l_eval = evaluate(ly_true=lyte, ly_pred=ly_) - time_te = round(time() - time_te, 3) - print(f'Testing completed in {time_te} seconds!') - - # Logging --------------------------------------- - print('\n[Results]') - results = CSVlog(args.csv_dir) - metrics = [] - for lang in lXte.keys(): - macrof1, microf1, macrok, microk = l_eval[lang] - metrics.append([macrof1, microf1, macrok, microk]) - print(f'Lang {lang}: macro-F1 = {macrof1:.3f} micro-F1 = {microf1:.3f}') - if results is not None: - _id, _dataset = get_method_name(args) - results.add_row(method='gfun', - setting=_id, - optimc=args.optimc, - sif='True', - zscore='True', - l2='True', - dataset=_dataset, - time_tr=time_tr, - time_te=time_te, - lang=lang, - macrof1=macrof1, - microf1=microf1, - macrok=macrok, - microk=microk, - notes='') - print('Averages: MF1, mF1, MK, mK', np.round(np.mean(np.array(metrics), axis=0), 3)) - - overall_time = round(time() - time_init, 3) - exit(f'\nExecuted in: {overall_time} seconds!') - - -if __name__ == '__main__': - parser = ArgumentParser(description='Run generalized funnelling, A. Moreo, A. Pedrotti and F. 
Sebastiani') - - parser.add_argument('dataset', help='Path to the dataset') - - parser.add_argument('-o', '--output', dest='csv_dir', - help='Result file (default ../csv_log/gfun_results.csv)', type=str, - default='csv_logs/gfun/gfun_results.csv') - - parser.add_argument('-x', '--post_embedder', dest='post_embedder', action='store_true', - help='deploy posterior probabilities embedder to compute document embeddings', - default=False) - - parser.add_argument('-w', '--wce_embedder', dest='wce_embedder', action='store_true', - help='deploy (supervised) Word-Class embedder to the compute document embeddings', - default=False) - - parser.add_argument('-m', '--muse_embedder', dest='muse_embedder', action='store_true', - help='deploy (pretrained) MUSE embedder to compute document embeddings', - default=False) - - parser.add_argument('-b', '--bert_embedder', dest='bert_embedder', action='store_true', - help='deploy multilingual Bert to compute document embeddings', - default=False) - - parser.add_argument('-g', '--gru_embedder', dest='gru_embedder', action='store_true', - help='deploy a GRU in order to compute document embeddings', - default=False) - - parser.add_argument('-c', '--c_optimize', dest='optimc', action='store_true', - help='Optimize SVMs C hyperparameter', - default=False) - - parser.add_argument('-n', '--nepochs', dest='nepochs', type=str, - help='Number of max epochs to train Recurrent embedder (i.e., -g)') - - parser.add_argument('-j', '--n_jobs', dest='n_jobs', type=int, - help='Number of parallel jobs (default is -1, all)', - default=-1) - - parser.add_argument('--muse_dir', dest='muse_dir', type=str, - help='Path to the MUSE polylingual word embeddings (default ../embeddings)', - default='../embeddings') - - parser.add_argument('--gru_wce', dest='gru_wce', action='store_true', - help='Deploy WCE embedding as embedding layer of the GRU View Generator', - default=False) - - parser.add_argument('--gru_dir', dest='gru_dir', type=str, - help='Set the path to a pretrained GRU model (i.e., -g view generator)', - default=None) - - parser.add_argument('--bert_dir', dest='bert_dir', type=str, - help='Set the path to a pretrained mBERT model (i.e., -b view generator)', - default=None) - - parser.add_argument('--gpus', help='specifies how many GPUs to use per node', - default=None) - - args = parser.parse_args() - main(args) diff --git a/refactor/models/helpers.py b/refactor/models/helpers.py deleted file mode 100755 index b466f28..0000000 --- a/refactor/models/helpers.py +++ /dev/null @@ -1,51 +0,0 @@ -import torch -import torch.nn as nn -from torch.nn import functional as F - - -def init_embeddings(pretrained, vocab_size, learnable_length): - """ - Compute the embedding matrix - :param pretrained: - :param vocab_size: - :param learnable_length: - :return: - """ - pretrained_embeddings = None - pretrained_length = 0 - if pretrained is not None: - pretrained_length = pretrained.shape[1] - assert pretrained.shape[0] == vocab_size, 'pre-trained matrix does not match with the vocabulary size' - pretrained_embeddings = nn.Embedding(vocab_size, pretrained_length) - # requires_grad=False sets the embedding layer as NOT trainable - pretrained_embeddings.weight = nn.Parameter(pretrained, requires_grad=False) - - learnable_embeddings = None - if learnable_length > 0: - learnable_embeddings = nn.Embedding(vocab_size, learnable_length) - - embedding_length = learnable_length + pretrained_length - assert embedding_length > 0, '0-size embeddings' - return pretrained_embeddings, learnable_embeddings, 
embedding_length - - -def embed(model, input, lang): - input_list = [] - if model.lpretrained_embeddings[lang]: - input_list.append(model.lpretrained_embeddings[lang](input)) - if model.llearnable_embeddings[lang]: - input_list.append(model.llearnable_embeddings[lang](input)) - return torch.cat(tensors=input_list, dim=2) - - -def embedding_dropout(input, drop_range, p_drop=0.5, training=True): - if p_drop > 0 and training and drop_range is not None: - p = p_drop - drop_from, drop_to = drop_range - m = drop_to - drop_from #length of the supervised embedding - l = input.shape[2] #total embedding length - corr = (1 - p) - input[:, :, drop_from:drop_to] = corr * F.dropout(input[:, :, drop_from:drop_to], p=p) - input /= (1 - (p * m / l)) - - return input diff --git a/refactor/models/learners.py b/refactor/models/learners.py deleted file mode 100644 index 2654109..0000000 --- a/refactor/models/learners.py +++ /dev/null @@ -1,224 +0,0 @@ -import time - -import numpy as np -from joblib import Parallel, delayed -from scipy.sparse import issparse -from sklearn.model_selection import GridSearchCV -from sklearn.multiclass import OneVsRestClassifier -from sklearn.svm import SVC - -from util.standardizer import StandardizeTransformer - - -def get_learner(calibrate=False, kernel='linear', C=1): - """ - instantiate scikit Support Vector Classifier - :param calibrate: boolean, whether to return posterior probabilities or not - :param kernel: string,kernel to be applied to the SVC - :param C: int or dict {'C': list of integer}, Regularization parameter - :return: Support Vector Classifier - """ - return SVC(kernel=kernel, probability=calibrate, cache_size=1000, C=C, random_state=1, gamma='auto', verbose=False) - - -def _sort_if_sparse(X): - if issparse(X) and not X.has_sorted_indices: - X.sort_indices() - - -def _joblib_transform_multiling(transformer, lX, n_jobs=-1): - if n_jobs == 1: - return {lang: transformer(lX[lang]) for lang in lX.keys()} - else: - langs = list(lX.keys()) - transformations = Parallel(n_jobs=n_jobs)(delayed(transformer)(lX[lang]) for lang in langs) - return {lang: transformations[i] for i, lang in enumerate(langs)} - - -class TrivialRejector: - def fit(self, X, y): - self.cats = y.shape[1] - return self - - def decision_function(self, X): return np.zeros((X.shape[0], self.cats)) - - def predict(self, X): return np.zeros((X.shape[0], self.cats)) - - def predict_proba(self, X): return np.zeros((X.shape[0], self.cats)) - - def best_params(self): return {} - - -class NaivePolylingualClassifier: - """ - Is a mere set of independet MonolingualClassifiers - """ - - def __init__(self, base_learner, parameters=None, n_jobs=-1): - self.base_learner = base_learner - self.parameters = parameters - self.model = None - self.n_jobs = n_jobs - - def fit(self, lX, ly): - """ - trains the independent monolingual classifiers - :param lX: a dictionary {language_label: X csr-matrix} - :param ly: a dictionary {language_label: y np.array} - :return: self - """ - tinit = time.time() - assert set(lX.keys()) == set(ly.keys()), 'inconsistent language mappings in fit' - langs = list(lX.keys()) - for lang in langs: - _sort_if_sparse(lX[lang]) - - models = Parallel(n_jobs=self.n_jobs)\ - (delayed(MonolingualClassifier(self.base_learner, parameters=self.parameters).fit)((lX[lang]), ly[lang]) for - lang in langs) - - self.model = {lang: models[i] for i, lang in enumerate(langs)} - self.empty_categories = {lang: self.model[lang].empty_categories for lang in langs} - self.time = time.time() - tinit - return self - - 
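A minimal usage sketch of NaivePolylingualClassifier on synthetic data (shapes and language set are illustrative only; the import path follows the refactor/ layout, and get_learner is the helper defined above in the same module):

import numpy as np
from scipy.sparse import csr_matrix
from models.learners import NaivePolylingualClassifier, get_learner

rng = np.random.RandomState(0)
# toy language-indexed inputs: 20 documents x 10 features and 3 binary labels per language
lX = {lang: csr_matrix(rng.rand(20, 10)) for lang in ('en', 'it')}
ly = {lang: (rng.rand(20, 3) > 0.5).astype(int) for lang in ('en', 'it')}

# one independent (monolingual) classifier is fitted per language
clf = NaivePolylingualClassifier(base_learner=get_learner(calibrate=False))
clf.fit(lX, ly)
lpred = clf.predict(lX)   # dict {lang: predicted binary label matrix for that language}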
def decision_function(self, lX): - """ - :param lX: a dictionary {language_label: X csr-matrix} - :return: a dictionary of classification scores for each class - """ - assert self.model is not None, 'predict called before fit' - assert set(lX.keys()).issubset(set(self.model.keys())), 'unknown languages requested in decision function' - langs = list(lX.keys()) - scores = Parallel(n_jobs=self.n_jobs)(delayed(self.model[lang].decision_function)(lX[lang]) for lang in langs) - return {lang: scores[i] for i, lang in enumerate(langs)} - - def predict_proba(self, lX): - """ - :param lX: a dictionary {language_label: X csr-matrix} - :return: a dictionary of probabilities that each document belongs to each class - """ - assert self.model is not None, 'predict called before fit' - assert set(lX.keys()).issubset(set(self.model.keys())), 'unknown languages requested in decision function' - langs = list(lX.keys()) - scores = Parallel(n_jobs=self.n_jobs, max_nbytes=None)( - delayed(self.model[lang].predict_proba)(lX[lang]) for lang in langs) - return {lang: scores[i] for i, lang in enumerate(langs)} - - def predict(self, lX): - """ - :param lX: a dictionary {language_label: X csr-matrix} - :return: a dictionary of predictions - """ - assert self.model is not None, 'predict called before fit' - assert set(lX.keys()).issubset(set(self.model.keys())), 'unknown languages requested in predict' - if self.n_jobs == 1: - return {lang: self.model[lang].transform(lX[lang]) for lang in lX.keys()} - else: - langs = list(lX.keys()) - scores = Parallel(n_jobs=self.n_jobs)(delayed(self.model[lang].predict)(lX[lang]) for lang in langs) - return {lang: scores[i] for i, lang in enumerate(langs)} - - def best_params(self): - return {lang: model.best_params() for lang, model in self.model.items()} - - -class MonolingualClassifier: - - def __init__(self, base_learner, parameters=None, n_jobs=-1): - self.learner = base_learner - self.parameters = parameters - self.model = None - self.n_jobs = n_jobs - self.best_params_ = None - - def fit(self, X, y): - if X.shape[0] == 0: - print('Warning: X has 0 elements, a trivial rejector will be created') - self.model = TrivialRejector().fit(X, y) - self.empty_categories = np.arange(y.shape[1]) - return self - - tinit = time.time() - _sort_if_sparse(X) - self.empty_categories = np.argwhere(np.sum(y, axis=0) == 0).flatten() - # multi-class format - if len(y.shape) == 2: - if self.parameters is not None: - self.parameters = [{'estimator__' + key: params[key] for key in params.keys()} - for params in self.parameters] - self.model = OneVsRestClassifier(self.learner, n_jobs=self.n_jobs) - else: - self.model = self.learner - raise NotImplementedError('not working as a base-classifier for funneling if there are gaps in ' - 'the labels across languages') - - # parameter optimization? 
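        # If a hyperparameter grid was supplied, the model (possibly already wrapped in a
        # OneVsRestClassifier) is wrapped in a 5-fold GridSearchCV; the fit() call below then
        # runs the search and refits the best configuration on the full training data.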
- if self.parameters: - print('debug: optimizing parameters:', self.parameters) - self.model = GridSearchCV(self.model, param_grid=self.parameters, refit=True, cv=5, n_jobs=self.n_jobs, - error_score=0, verbose=10) - - print(f'fitting: Mono-lingual Classifier on matrices of shape X={X.shape} Y={y.shape}') - self.model.fit(X, y) - if isinstance(self.model, GridSearchCV): - self.best_params_ = self.model.best_params_ - print('best parameters: ', self.best_params_) - self.time = time.time() - tinit - return self - - def decision_function(self, X): - assert self.model is not None, 'predict called before fit' - _sort_if_sparse(X) - return self.model.decision_function(X) - - def predict_proba(self, X): - assert self.model is not None, 'predict called before fit' - assert hasattr(self.model, 'predict_proba'), 'the probability predictions are not enabled in this model' - _sort_if_sparse(X) - return self.model.predict_proba(X) - - def predict(self, X): - assert self.model is not None, 'predict called before fit' - _sort_if_sparse(X) - return self.model.predict(X) - - def best_params(self): - return self.best_params_ - - -class MetaClassifier: - - def __init__(self, meta_learner, meta_parameters=None, n_jobs=-1, standardize_range=None): - self.n_jobs = n_jobs - self.model = MonolingualClassifier(base_learner=meta_learner, parameters=meta_parameters, n_jobs=n_jobs) - self.standardize_range = standardize_range - - def fit(self, lZ, ly): - tinit = time.time() - Z, y = self.stack(lZ, ly) - - self.standardizer = StandardizeTransformer(range=self.standardize_range) - Z = self.standardizer.fit_transform(Z) - - print('fitting the Z-space of shape={}'.format(Z.shape)) - self.model.fit(Z, y) - self.time = time.time() - tinit - - def stack(self, lZ, ly=None): - langs = list(lZ.keys()) - Z = np.vstack([lZ[lang] for lang in langs]) - if ly is not None: - y = np.vstack([ly[lang] for lang in langs]) - return Z, y - else: - return Z - - def predict(self, lZ): - lZ = _joblib_transform_multiling(self.standardizer.transform, lZ, n_jobs=self.n_jobs) - return _joblib_transform_multiling(self.model.predict, lZ, n_jobs=self.n_jobs) - - def predict_proba(self, lZ): - lZ = _joblib_transform_multiling(self.standardizer.transform, lZ, n_jobs=self.n_jobs) - return _joblib_transform_multiling(self.model.predict_proba, lZ, n_jobs=self.n_jobs) - diff --git a/refactor/models/lstm_class.py b/refactor/models/lstm_class.py deleted file mode 100755 index 7f2cf59..0000000 --- a/refactor/models/lstm_class.py +++ /dev/null @@ -1,113 +0,0 @@ -#taken from https://github.com/prakashpandey9/Text-Classification-Pytorch/blob/master/models/LSTM.py -from torch.autograd import Variable - -from models.helpers import * - - -class RNNMultilingualClassifier(nn.Module): - - def __init__(self, output_size, hidden_size, lvocab_size, learnable_length, lpretrained=None, - drop_embedding_range=None, drop_embedding_prop=0, post_probabilities=True, only_post=False, - bert_embeddings=False): - - super(RNNMultilingualClassifier, self).__init__() - self.output_size = output_size - self.hidden_size = hidden_size - self.drop_embedding_range = drop_embedding_range - self.drop_embedding_prop = drop_embedding_prop - self.post_probabilities = post_probabilities - self.bert_embeddings = bert_embeddings - assert 0 <= drop_embedding_prop <= 1, 'drop_embedding_prop: wrong range' - - self.lpretrained_embeddings = nn.ModuleDict() - self.llearnable_embeddings = nn.ModuleDict() - self.embedding_length = None - self.langs = sorted(lvocab_size.keys()) - self.only_post = 
only_post - - self.n_layers = 1 - self.n_directions = 1 - - self.dropout = nn.Dropout(0.6) - - lstm_out = 256 - ff1 = 512 - ff2 = 256 - - lpretrained_embeddings = {} - llearnable_embeddings = {} - if only_post==False: - for l in self.langs: - pretrained = lpretrained[l] if lpretrained else None - pretrained_embeddings, learnable_embeddings, embedding_length = init_embeddings( - pretrained, lvocab_size[l], learnable_length - ) - lpretrained_embeddings[l] = pretrained_embeddings - llearnable_embeddings[l] = learnable_embeddings - self.embedding_length = embedding_length - - # self.lstm = nn.LSTM(self.embedding_length, hidden_size, dropout=0.2 if self.n_layers>1 else 0, num_layers=self.n_layers, bidirectional=(self.n_directions==2)) - self.rnn = nn.GRU(self.embedding_length, hidden_size) - self.linear0 = nn.Linear(hidden_size * self.n_directions, lstm_out) - self.lpretrained_embeddings.update(lpretrained_embeddings) - self.llearnable_embeddings.update(llearnable_embeddings) - - self.linear1 = nn.Linear(lstm_out, ff1) - self.linear2 = nn.Linear(ff1, ff2) - - if only_post: - self.label = nn.Linear(output_size, output_size) - elif post_probabilities and not bert_embeddings: - self.label = nn.Linear(ff2 + output_size, output_size) - elif bert_embeddings and not post_probabilities: - self.label = nn.Linear(ff2 + 768, output_size) - elif post_probabilities and bert_embeddings: - self.label = nn.Linear(ff2 + output_size + 768, output_size) - else: - self.label = nn.Linear(ff2, output_size) - - def forward(self, input, post, bert_embed, lang): - if self.only_post: - doc_embedding = post - else: - doc_embedding = self.transform(input, lang) - if self.post_probabilities: - doc_embedding = torch.cat([doc_embedding, post], dim=1) - if self.bert_embeddings: - doc_embedding = torch.cat([doc_embedding, bert_embed], dim=1) - - logits = self.label(doc_embedding) - return logits - - def transform(self, input, lang): - batch_size = input.shape[0] - input = embed(self, input, lang) - input = embedding_dropout(input, drop_range=self.drop_embedding_range, p_drop=self.drop_embedding_prop, - training=self.training) - input = input.permute(1, 0, 2) - h_0 = Variable(torch.zeros(self.n_layers*self.n_directions, batch_size, self.hidden_size).cuda()) - # c_0 = Variable(torch.zeros(self.n_layers*self.n_directions, batch_size, self.hidden_size).cuda()) - # output, (_, _) = self.lstm(input, (h_0, c_0)) - output, _ = self.rnn(input, h_0) - output = output[-1, :, :] - output = F.relu(self.linear0(output)) - output = self.dropout(F.relu(self.linear1(output))) - output = self.dropout(F.relu(self.linear2(output))) - return output - - def finetune_pretrained(self): - for l in self.langs: - self.lpretrained_embeddings[l].requires_grad = True - self.lpretrained_embeddings[l].weight.requires_grad = True - - def get_embeddings(self, input, lang): - batch_size = input.shape[0] - input = embed(self, input, lang) - input = embedding_dropout(input, drop_range=self.drop_embedding_range, p_drop=self.drop_embedding_prop, - training=self.training) - input = input.permute(1, 0, 2) - h_0 = Variable(torch.zeros(self.n_layers * self.n_directions, batch_size, self.hidden_size).cuda()) - output, _ = self.rnn(input, h_0) - output = output[-1, :, :] - return output.cpu().detach().numpy() - diff --git a/refactor/models/pl_bert.py b/refactor/models/pl_bert.py deleted file mode 100644 index afb28b5..0000000 --- a/refactor/models/pl_bert.py +++ /dev/null @@ -1,183 +0,0 @@ -import pytorch_lightning as pl -import torch -from torch.optim.lr_scheduler 
import StepLR -from transformers import BertForSequenceClassification, AdamW - -from util.common import define_pad_length, pad -from util.pl_metrics import CustomF1, CustomK - - -class BertModel(pl.LightningModule): - - def __init__(self, output_size, stored_path, gpus=None): - """ - Init Bert model. - :param output_size: - :param stored_path: - :param gpus: - """ - super().__init__() - self.loss = torch.nn.BCEWithLogitsLoss() - self.gpus = gpus - self.microF1 = CustomF1(num_classes=output_size, average='micro', device=self.gpus) - self.macroF1 = CustomF1(num_classes=output_size, average='macro', device=self.gpus) - self.microK = CustomK(num_classes=output_size, average='micro', device=self.gpus) - self.macroK = CustomK(num_classes=output_size, average='macro', device=self.gpus) - # Language specific metrics to compute metrics at epoch level - self.lang_macroF1 = CustomF1(num_classes=output_size, average='macro', device=self.gpus) - self.lang_microF1 = CustomF1(num_classes=output_size, average='micro', device=self.gpus) - self.lang_macroK = CustomF1(num_classes=output_size, average='macro', device=self.gpus) - self.lang_microK = CustomF1(num_classes=output_size, average='micro', device=self.gpus) - - if stored_path: - self.bert = BertForSequenceClassification.from_pretrained(stored_path, - num_labels=output_size, - output_hidden_states=True) - else: - self.bert = BertForSequenceClassification.from_pretrained('bert-base-multilingual-cased', - num_labels=output_size, - output_hidden_states=True) - self.save_hyperparameters() - - def forward(self, X): - logits = self.bert(X) - return logits - - def training_step(self, train_batch, batch_idx): - X, y, _, batch_langs = train_batch - X = torch.cat(X).view([X[0].shape[0], len(X)]) - y = y.type(torch.FloatTensor) - y = y.to('cuda' if self.gpus else 'cpu') - logits, _ = self.forward(X) - loss = self.loss(logits, y) - # Squashing logits through Sigmoid in order to get confidence score - predictions = torch.sigmoid(logits) > 0.5 - microF1 = self.microF1(predictions, y) - macroF1 = self.macroF1(predictions, y) - microK = self.microK(predictions, y) - macroK = self.macroK(predictions, y) - self.log('train-loss', loss, on_step=True, on_epoch=True, prog_bar=False, logger=True) - self.log('train-macroF1', macroF1, on_step=True, on_epoch=True, prog_bar=False, logger=True) - self.log('train-microF1', microF1, on_step=True, on_epoch=True, prog_bar=False, logger=True) - self.log('train-macroK', macroK, on_step=True, on_epoch=True, prog_bar=False, logger=True) - self.log('train-microK', microK, on_step=True, on_epoch=True, prog_bar=False, logger=True) - lX, ly = self._reconstruct_dict(predictions, y, batch_langs) - return {'loss': loss, 'pred': lX, 'target': ly} - - def training_epoch_end(self, outputs): - langs = [] - for output in outputs: - langs.extend(list(output['pred'].keys())) - langs = set(langs) - # outputs is a of n dicts of m elements, where n is equal to the number of epoch steps and m is batchsize. 
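        # Each entry of `outputs` is the dict returned by training_step for one batch; its
        # 'pred' and 'target' fields map language codes to that batch's per-language slices.
        # The loop below regroups these step-level values by language and averages them into
        # epoch-level macro/micro F1 and K scores, logged per language to TensorBoard.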
- # here we save epoch level metric values and compute them specifically for each language - res_macroF1 = {lang: [] for lang in langs} - res_microF1 = {lang: [] for lang in langs} - res_macroK = {lang: [] for lang in langs} - res_microK = {lang: [] for lang in langs} - for output in outputs: - lX, ly = output['pred'], output['target'] - for lang in lX.keys(): - X, y = lX[lang], ly[lang] - lang_macroF1 = self.lang_macroF1(X, y) - lang_microF1 = self.lang_microF1(X, y) - lang_macroK = self.lang_macroK(X, y) - lang_microK = self.lang_microK(X, y) - - res_macroF1[lang].append(lang_macroF1) - res_microF1[lang].append(lang_microF1) - res_macroK[lang].append(lang_macroK) - res_microK[lang].append(lang_microK) - for lang in langs: - avg_macroF1 = torch.mean(torch.Tensor(res_macroF1[lang])) - avg_microF1 = torch.mean(torch.Tensor(res_microF1[lang])) - avg_macroK = torch.mean(torch.Tensor(res_macroK[lang])) - avg_microK = torch.mean(torch.Tensor(res_microK[lang])) - self.logger.experiment.add_scalars('train-langs-macroF1', {f'{lang}': avg_macroF1}, self.current_epoch) - self.logger.experiment.add_scalars('train-langs-microF1', {f'{lang}': avg_microF1}, self.current_epoch) - self.logger.experiment.add_scalars('train-langs-macroK', {f'{lang}': avg_macroK}, self.current_epoch) - self.logger.experiment.add_scalars('train-langs-microK', {f'{lang}': avg_microK}, self.current_epoch) - - def validation_step(self, val_batch, batch_idx): - X, y, _, batch_langs = val_batch - X = torch.cat(X).view([X[0].shape[0], len(X)]) - y = y.type(torch.FloatTensor) - y = y.to('cuda' if self.gpus else 'cpu') - logits, _ = self.forward(X) - loss = self.loss(logits, y) - predictions = torch.sigmoid(logits) > 0.5 - microF1 = self.microF1(predictions, y) - macroF1 = self.macroF1(predictions, y) - microK = self.microK(predictions, y) - macroK = self.macroK(predictions, y) - self.log('val-loss', loss, on_step=False, on_epoch=True, prog_bar=False, logger=True) - self.log('val-macroF1', macroF1, on_step=False, on_epoch=True, prog_bar=True, logger=True) - self.log('val-microF1', microF1, on_step=False, on_epoch=True, prog_bar=True, logger=True) - self.log('val-macroK', macroK, on_step=False, on_epoch=True, prog_bar=True, logger=True) - self.log('val-microK', microK, on_step=False, on_epoch=True, prog_bar=True, logger=True) - return {'loss': loss} - - def test_step(self, test_batch, batch_idx): - X, y, _, batch_langs = test_batch - X = torch.cat(X).view([X[0].shape[0], len(X)]) - y = y.type(torch.FloatTensor) - y = y.to('cuda' if self.gpus else 'cpu') - logits, _ = self.forward(X) - loss = self.loss(logits, y) - # Squashing logits through Sigmoid in order to get confidence score - predictions = torch.sigmoid(logits) > 0.5 - microF1 = self.microF1(predictions, y) - macroF1 = self.macroF1(predictions, y) - microK = self.microK(predictions, y) - macroK = self.macroK(predictions, y) - self.log('test-macroF1', macroF1, on_step=False, on_epoch=True, prog_bar=False, logger=True) - self.log('test-microF1', microF1, on_step=False, on_epoch=True, prog_bar=False, logger=True) - self.log('test-macroK', macroK, on_step=False, on_epoch=True, prog_bar=True, logger=True) - self.log('test-microK', microK, on_step=False, on_epoch=True, prog_bar=True, logger=True) - return - - def configure_optimizers(self, lr=3e-5, weight_decay=0.01): - no_decay = ['bias', 'LayerNorm.weight'] - optimizer_grouped_parameters = [ - {'params': [p for n, p in self.bert.named_parameters() - if not any(nd in n for nd in no_decay)], - 'weight_decay': weight_decay}, - 
{'params': [p for n, p in self.bert.named_parameters() - if any(nd in n for nd in no_decay)], - 'weight_decay': weight_decay} - ] - optimizer = AdamW(optimizer_grouped_parameters, lr=lr) - scheduler = StepLR(optimizer, step_size=25, gamma=0.1) - return [optimizer], [scheduler] - - def encode(self, lX, batch_size=64): - with torch.no_grad(): - l_embed = {lang: [] for lang in lX.keys()} - for lang in sorted(lX.keys()): - for i in range(0, len(lX[lang]), batch_size): - if i + batch_size > len(lX[lang]): - batch = lX[lang][i:len(lX[lang])] - else: - batch = lX[lang][i:i + batch_size] - max_pad_len = define_pad_length(batch) - batch = pad(batch, pad_index=self.bert.config.pad_token_id, max_pad_length=max_pad_len) - batch = torch.LongTensor(batch).to('cuda' if self.gpus else 'cpu') - _, output = self.forward(batch) - doc_embeds = output[-1][:, 0, :] - l_embed[lang].append(doc_embeds.cpu()) - for k, v in l_embed.items(): - l_embed[k] = torch.cat(v, dim=0).numpy() - return l_embed - - @staticmethod - def _reconstruct_dict(predictions, y, batch_langs): - reconstructed_x = {lang: [] for lang in set(batch_langs)} - reconstructed_y = {lang: [] for lang in set(batch_langs)} - for i, pred in enumerate(predictions): - reconstructed_x[batch_langs[i]].append(pred) - reconstructed_y[batch_langs[i]].append(y[i]) - for k, v in reconstructed_x.items(): - reconstructed_x[k] = torch.cat(v).view(-1, predictions.shape[1]) - for k, v in reconstructed_y.items(): - reconstructed_y[k] = torch.cat(v).view(-1, predictions.shape[1]) - return reconstructed_x, reconstructed_y diff --git a/refactor/models/pl_gru.py b/refactor/models/pl_gru.py deleted file mode 100644 index afb12e6..0000000 --- a/refactor/models/pl_gru.py +++ /dev/null @@ -1,266 +0,0 @@ -# Lightning modules, see https://pytorch-lightning.readthedocs.io/en/latest/lightning_module.html -import pytorch_lightning as pl -import torch -import torch.nn.functional as F -from torch import nn -from torch.autograd import Variable -from torch.optim.lr_scheduler import StepLR -from transformers import AdamW - -from models.helpers import init_embeddings -from util.common import define_pad_length, pad -from util.pl_metrics import CustomF1, CustomK - - -class RecurrentModel(pl.LightningModule): - def __init__(self, lPretrained, langs, output_size, hidden_size, lVocab_size, learnable_length, - drop_embedding_range, drop_embedding_prop, gpus=None): - """ - Init RNN model. 
- :param lPretrained: - :param langs: - :param output_size: - :param hidden_size: - :param lVocab_size: - :param learnable_length: - :param drop_embedding_range: - :param drop_embedding_prop: - :param gpus: - """ - super().__init__() - self.gpus = gpus - self.langs = langs - self.lVocab_size = lVocab_size - self.learnable_length = learnable_length - self.output_size = output_size - self.hidden_size = hidden_size - self.drop_embedding_range = drop_embedding_range - self.drop_embedding_prop = drop_embedding_prop - self.loss = torch.nn.BCEWithLogitsLoss() - - self.microF1 = CustomF1(num_classes=output_size, average='micro', device=self.gpus) - self.macroF1 = CustomF1(num_classes=output_size, average='macro', device=self.gpus) - self.microK = CustomK(num_classes=output_size, average='micro', device=self.gpus) - self.macroK = CustomK(num_classes=output_size, average='macro', device=self.gpus) - # Language specific metrics to compute metrics at epoch level - self.lang_macroF1 = CustomF1(num_classes=output_size, average='macro', device=self.gpus) - self.lang_microF1 = CustomF1(num_classes=output_size, average='micro', device=self.gpus) - self.lang_macroK = CustomF1(num_classes=output_size, average='macro', device=self.gpus) - self.lang_microK = CustomF1(num_classes=output_size, average='micro', device=self.gpus) - - self.lPretrained_embeddings = nn.ModuleDict() - self.lLearnable_embeddings = nn.ModuleDict() - - self.n_layers = 1 - self.n_directions = 1 - self.dropout = nn.Dropout(0.6) - - lstm_out = 256 - ff1 = 512 - ff2 = 256 - - lpretrained_embeddings = {} - llearnable_embeddings = {} - - for lang in self.langs: - pretrained = lPretrained[lang] if lPretrained else None - pretrained_embeddings, learnable_embeddings, embedding_length = init_embeddings( - pretrained, self.lVocab_size[lang], self.learnable_length) - lpretrained_embeddings[lang] = pretrained_embeddings - llearnable_embeddings[lang] = learnable_embeddings - self.embedding_length = embedding_length - - self.lPretrained_embeddings.update(lpretrained_embeddings) - self.lLearnable_embeddings.update(llearnable_embeddings) - - self.rnn = nn.GRU(self.embedding_length, hidden_size) - self.linear0 = nn.Linear(hidden_size * self.n_directions, lstm_out) - self.linear1 = nn.Linear(lstm_out, ff1) - self.linear2 = nn.Linear(ff1, ff2) - self.label = nn.Linear(ff2, self.output_size) - - # TODO: setting lPretrained to None, letting it to its original value will "bug" first validation - # step (i.e., checkpoint will store also its ++ value, I guess, making the saving process too slow) - lPretrained = None - self.save_hyperparameters() - - def forward(self, lX): - l_embed = [] - for lang in sorted(lX.keys()): - doc_embedding = self.transform(lX[lang], lang) - l_embed.append(doc_embedding) - embed = torch.cat(l_embed, dim=0) - logits = self.label(embed) - return logits - - def transform(self, X, lang): - batch_size = X.shape[0] - X = self.embed(X, lang) - X = self.embedding_dropout(X, drop_range=self.drop_embedding_range, p_drop=self.drop_embedding_prop, - training=self.training) - X = X.permute(1, 0, 2) - h_0 = Variable(torch.zeros(self.n_layers * self.n_directions, batch_size, self.hidden_size).to(self.device)) - output, _ = self.rnn(X, h_0) - output = output[-1, :, :] - output = F.relu(self.linear0(output)) - output = self.dropout(F.relu(self.linear1(output))) - output = self.dropout(F.relu(self.linear2(output))) - return output - - def encode(self, lX, l_pad, batch_size=128): - """ - Returns encoded data (i.e, RNN hidden state at second feed-forward 
layer - linear1). Dimensionality is 512. - :param lX: - :param l_pad: - :param batch_size: - :return: - """ - with torch.no_grad(): - l_embed = {lang: [] for lang in lX.keys()} - for lang in sorted(lX.keys()): - for i in range(0, len(lX[lang]), batch_size): - if i+batch_size > len(lX[lang]): - batch = lX[lang][i:len(lX[lang])] - else: - batch = lX[lang][i:i+batch_size] - max_pad_len = define_pad_length(batch) - batch = pad(batch, pad_index=l_pad[lang], max_pad_length=max_pad_len) - X = torch.LongTensor(batch).to('cuda' if self.gpus else 'cpu') - _batch_size = X.shape[0] - X = self.embed(X, lang) - X = self.embedding_dropout(X, drop_range=self.drop_embedding_range, p_drop=self.drop_embedding_prop, - training=self.training) - X = X.permute(1, 0, 2) - h_0 = Variable(torch.zeros(self.n_layers * self.n_directions, _batch_size, self.hidden_size).to(self.device)) - output, _ = self.rnn(X, h_0) - output = output[-1, :, :] - output = F.relu(self.linear0(output)) - output = self.dropout(F.relu(self.linear1(output))) - l_embed[lang].append(output.cpu()) - for k, v in l_embed.items(): - l_embed[k] = torch.cat(v, dim=0).numpy() - return l_embed - - def training_step(self, train_batch, batch_idx): - lX, ly = train_batch - logits = self.forward(lX) - _ly = [] - for lang in sorted(lX.keys()): - _ly.append(ly[lang]) - y = torch.cat(_ly, dim=0) - loss = self.loss(logits, y) - # Squashing logits through Sigmoid in order to get confidence score - predictions = torch.sigmoid(logits) > 0.5 - microF1 = self.microF1(predictions, y) - macroF1 = self.macroF1(predictions, y) - microK = self.microK(predictions, y) - macroK = self.macroK(predictions, y) - self.log('train-loss', loss, on_step=True, on_epoch=True, prog_bar=False, logger=True) - self.log('train-macroF1', macroF1, on_step=True, on_epoch=True, prog_bar=False, logger=True) - self.log('train-microF1', microF1, on_step=True, on_epoch=True, prog_bar=False, logger=True) - self.log('train-macroK', macroK, on_step=True, on_epoch=True, prog_bar=False, logger=True) - self.log('train-microK', microK, on_step=True, on_epoch=True, prog_bar=False, logger=True) - re_lX = self._reconstruct_dict(predictions, ly) - return {'loss': loss, 'pred': re_lX, 'target': ly} - - def training_epoch_end(self, outputs): - # outputs is a of n dicts of m elements, where n is equal to the number of epoch steps and m is batchsize. 
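        # Same per-language aggregation as in the mBERT module above, except that the language
        # set is known up front (self.langs): step-level predictions and targets are regrouped
        # by language and averaged into epoch-level macro/micro F1 and K values.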
- # here we save epoch level metric values and compute them specifically for each language - res_macroF1 = {lang: [] for lang in self.langs} - res_microF1 = {lang: [] for lang in self.langs} - res_macroK = {lang: [] for lang in self.langs} - res_microK = {lang: [] for lang in self.langs} - for output in outputs: - lX, ly = output['pred'], output['target'] - for lang in lX.keys(): - X, y = lX[lang], ly[lang] - lang_macroF1 = self.lang_macroF1(X, y) - lang_microF1 = self.lang_microF1(X, y) - lang_macroK = self.lang_macroK(X, y) - lang_microK = self.lang_microK(X, y) - - res_macroF1[lang].append(lang_macroF1) - res_microF1[lang].append(lang_microF1) - res_macroK[lang].append(lang_macroK) - res_microK[lang].append(lang_microK) - for lang in self.langs: - avg_macroF1 = torch.mean(torch.Tensor(res_macroF1[lang])) - avg_microF1 = torch.mean(torch.Tensor(res_microF1[lang])) - avg_macroK = torch.mean(torch.Tensor(res_macroK[lang])) - avg_microK = torch.mean(torch.Tensor(res_microK[lang])) - self.logger.experiment.add_scalars('train-langs-macroF1', {f'{lang}': avg_macroF1}, self.current_epoch) - self.logger.experiment.add_scalars('train-langs-microF1', {f'{lang}': avg_microF1}, self.current_epoch) - self.logger.experiment.add_scalars('train-langs-macroK', {f'{lang}': avg_macroK}, self.current_epoch) - self.logger.experiment.add_scalars('train-langs-microK', {f'{lang}': avg_microK}, self.current_epoch) - - def validation_step(self, val_batch, batch_idx): - lX, ly = val_batch - logits = self.forward(lX) - _ly = [] - for lang in sorted(lX.keys()): - _ly.append(ly[lang]) - ly = torch.cat(_ly, dim=0) - loss = self.loss(logits, ly) - predictions = torch.sigmoid(logits) > 0.5 - microF1 = self.microF1(predictions, ly) - macroF1 = self.macroF1(predictions, ly) - microK = self.microK(predictions, ly) - macroK = self.macroK(predictions, ly) - self.log('val-loss', loss, on_step=False, on_epoch=True, prog_bar=False, logger=True) - self.log('val-macroF1', macroF1, on_step=False, on_epoch=True, prog_bar=True, logger=True) - self.log('val-microF1', microF1, on_step=False, on_epoch=True, prog_bar=True, logger=True) - self.log('val-macroK', macroK, on_step=False, on_epoch=True, prog_bar=True, logger=True) - self.log('val-microK', microK, on_step=False, on_epoch=True, prog_bar=True, logger=True) - return {'loss': loss} - - def test_step(self, test_batch, batch_idx): - lX, ly = test_batch - logits = self.forward(lX) - _ly = [] - for lang in sorted(lX.keys()): - _ly.append(ly[lang]) - ly = torch.cat(_ly, dim=0) - predictions = torch.sigmoid(logits) > 0.5 - microF1 = self.microF1(predictions, ly) - macroF1 = self.macroF1(predictions, ly) - microK = self.microK(predictions, ly) - macroK = self.macroK(predictions, ly) - self.log('test-macroF1', macroF1, on_step=False, on_epoch=True, prog_bar=False, logger=True) - self.log('test-microF1', microF1, on_step=False, on_epoch=True, prog_bar=False, logger=True) - self.log('test-macroK', macroK, on_step=False, on_epoch=True, prog_bar=True, logger=True) - self.log('test-microK', microK, on_step=False, on_epoch=True, prog_bar=True, logger=True) - return - - def embed(self, X, lang): - input_list = [] - if self.lPretrained_embeddings[lang]: - input_list.append(self.lPretrained_embeddings[lang](X)) - if self.lLearnable_embeddings[lang]: - input_list.append(self.lLearnable_embeddings[lang](X)) - return torch.cat(tensors=input_list, dim=2) - - def embedding_dropout(self, X, drop_range, p_drop=0.5, training=True): - if p_drop > 0 and training and drop_range is not None: - p = p_drop - 
drop_from, drop_to = drop_range - m = drop_to - drop_from # length of the supervised embedding - l = X.shape[2] # total embedding length - corr = (1 - p) - X[:, :, drop_from:drop_to] = corr * F.dropout(X[:, :, drop_from:drop_to], p=p) - X /= (1 - (p * m / l)) - return X - - def configure_optimizers(self): - optimizer = AdamW(self.parameters(), lr=1e-3) - scheduler = StepLR(optimizer, step_size=25, gamma=0.5) - return [optimizer], [scheduler] - - @staticmethod - def _reconstruct_dict(X, ly): - reconstructed = {} - _start = 0 - for lang in sorted(ly.keys()): - lang_batchsize = len(ly[lang]) - reconstructed[lang] = X[_start:_start+lang_batchsize] - _start += lang_batchsize - return reconstructed diff --git a/refactor/requirements.txt b/refactor/requirements.txt deleted file mode 100644 index 4546a4a..0000000 --- a/refactor/requirements.txt +++ /dev/null @@ -1,12 +0,0 @@ -transformers==2.11.0 -pandas==0.25.3 -numpy==1.17.4 -joblib==0.14.0 -tqdm==4.50.2 -pytorch_lightning==1.1.2 -torch==1.3.1 -nltk==3.4.5 -scipy==1.3.3 -rdflib==4.2.2 -torchtext==0.4.0 -scikit_learn==0.24.1 diff --git a/refactor/run.sh b/refactor/run.sh deleted file mode 100644 index 04365f9..0000000 --- a/refactor/run.sh +++ /dev/null @@ -1,6 +0,0 @@ -#!/usr/bin/env bash - -for i in {0..10..1} -do - python main.py --gpus 0 -done \ No newline at end of file diff --git a/refactor/util/SIF_embed.py b/refactor/util/SIF_embed.py deleted file mode 100644 index 4a3d712..0000000 --- a/refactor/util/SIF_embed.py +++ /dev/null @@ -1,59 +0,0 @@ -import numpy as np -from sklearn.decomposition import TruncatedSVD - - -def get_weighted_average(We, x, w): - """ - Compute the weighted average vectors - :param We: We[i,:] is the vector for word i - :param x: x[i, :] are the indices of the words in sentence i - :param w: w[i, :] are the weights for the words in sentence i - :return: emb[i, :] are the weighted average vector for sentence i - """ - n_samples = x.shape[0] - emb = np.zeros((n_samples, We.shape[1])) - for i in range(n_samples): - emb[i,:] = w[i,:].dot(We[x[i,:],:]) / np.count_nonzero(w[i,:]) - return emb - - -def compute_pc(X,npc=1): - """ - Compute the principal components. 
- :param X: X[i,:] is a data point - :param npc: number of principal components to remove - :return: component_[i,:] is the i-th pc - """ - svd = TruncatedSVD(n_components=npc, n_iter=7, random_state=0) - svd.fit(X) - return svd.components_ - - -def remove_pc(X, npc=1): - """ - Remove the projection on the principal components - :param X: X[i,:] is a data point - :param npc: number of principal components to remove - :return: XX[i, :] is the data point after removing its projection - """ - pc = compute_pc(X, npc) - if npc == 1: - XX = X - X.dot(pc.transpose()) * pc - else: - XX = X - X.dot(pc.transpose()).dot(pc) - return XX - - -def SIF_embedding(We, x, w, params): - """ - Compute the scores between pairs of sentences using weighted average + removing the projection on the first principal component - :param We: We[i,:] is the vector for word i - :param x: x[i, :] are the indices of the words in the i-th sentence - :param w: w[i, :] are the weights for the words in the i-th sentence - :param params.rmpc: if >0, remove the projections of the sentence embeddings to their first principal component - :return: emb, emb[i, :] is the embedding for sentence i - """ - emb = get_weighted_average(We, x, w) - if params.rmpc > 0: - emb = remove_pc(emb, params.rmpc) - return emb \ No newline at end of file diff --git a/refactor/util/common.py b/refactor/util/common.py deleted file mode 100644 index 61ac52f..0000000 --- a/refactor/util/common.py +++ /dev/null @@ -1,384 +0,0 @@ -import numpy as np -import torch -from sklearn.feature_extraction.text import TfidfVectorizer -from sklearn.model_selection import train_test_split -from sklearn.preprocessing import normalize - -from util.embeddings_manager import supervised_embeddings_tfidf - - -class TfidfVectorizerMultilingual: - - def __init__(self, **kwargs): - self.kwargs = kwargs - - def fit(self, lX, ly=None): - self.langs = sorted(lX.keys()) - self.vectorizer = {l: TfidfVectorizer(**self.kwargs).fit(lX[l]) for l in self.langs} - return self - - def transform(self, lX): - return {l: self.vectorizer[l].transform(lX[l]) for l in self.langs} - - def fit_transform(self, lX, ly=None): - return self.fit(lX, ly).transform(lX) - - def vocabulary(self, l=None): - if l is None: - return {l: self.vectorizer[l].vocabulary_ for l in self.langs} - else: - return self.vectorizer[l].vocabulary_ - - def get_analyzer(self, l=None): - if l is None: - return {l: self.vectorizer[l].build_analyzer() for l in self.langs} - else: - return self.vectorizer[l].build_analyzer() - - -def _normalize(lX, l2=True): - return {lang: normalize(X) for lang, X in lX.items()} if l2 else lX - - -def none_dict(langs): - return {l: None for l in langs} - - -class MultilingualIndex: - def __init__(self): - """ - Class that contains monolingual Indexes - """ - self.l_index = {} - self.l_vectorizer = TfidfVectorizerMultilingual(sublinear_tf=True, use_idf=True) - - def index(self, l_devel_raw, l_devel_target, l_test_raw, l_test_target, l_pretrained_vocabulary=None): - self.langs = sorted(l_devel_raw.keys()) - self.l_vectorizer.fit(l_devel_raw) - l_vocabulary = self.l_vectorizer.vocabulary() - l_analyzer = self.l_vectorizer.get_analyzer() - if l_pretrained_vocabulary is None: - l_pretrained_vocabulary = none_dict(self.langs) - - for lang in self.langs: - # Init monolingual Index - self.l_index[lang] = Index(l_devel_raw[lang], l_devel_target[lang], l_test_raw[lang], l_test_target[lang], - lang) - # call to index() function of monolingual Index - self.l_index[lang].index(l_pretrained_vocabulary[lang], 
l_analyzer[lang], l_vocabulary[lang]) - - def train_val_split(self, val_prop=0.2, max_val=2000, seed=42): - for l, index in self.l_index.items(): - index.train_val_split(val_prop, max_val, seed=seed) - - def embedding_matrices(self, lpretrained, supervised): - """ - Extract from pretrained embeddings words that are found in the training dataset, then for each language - calls the respective monolingual index and build the embedding matrix (if supervised, WCE are concatenated - to the unsupervised vectors). - :param lpretrained: dict {lang : matrix of word-embeddings } - :param supervised: bool, whether to deploy Word-Class Embeddings or not - :return: self - """ - lXtr = self.get_lXtr() if supervised else none_dict(self.langs) - lYtr = self.l_train_target() if supervised else none_dict(self.langs) - lWordList = self.get_wordlist() - lExtracted = lpretrained.extract(lWordList) - for lang, index in self.l_index.items(): - # if supervised concatenate embedding matrices of pretrained unsupervised - # and supervised word-class embeddings - index.compose_embedding_matrix(lExtracted[lang], supervised, lXtr[lang], lYtr[lang]) - self.sup_range = index.wce_range - return self - - def get_wordlist(self): - wordlist = {} - for lang, index in self.l_index.items(): - wordlist[lang] = index.get_word_list() - return wordlist - - def get_raw_lXtr(self): - lXtr_raw = {k: [] for k in self.langs} - lYtr_raw = {k: [] for k in self.langs} - for lang in self.langs: - lXtr_raw[lang] = self.l_index[lang].train_raw - lYtr_raw[lang] = self.l_index[lang].train_raw - return lXtr_raw - - def get_raw_lXva(self): - lXva_raw = {k: [] for k in self.langs} - for lang in self.langs: - lXva_raw[lang] = self.l_index[lang].val_raw - - return lXva_raw - - def get_raw_lXte(self): - lXte_raw = {k: [] for k in self.langs} - for lang in self.langs: - lXte_raw[lang] = self.l_index[lang].test_raw - - return lXte_raw - - def get_lXtr(self): - if not hasattr(self, 'lXtr'): - self.lXtr = self.l_vectorizer.transform({l: index.train_raw for l, index in self.l_index.items()}) - return self.lXtr - - def get_lXva(self): - if not hasattr(self, 'lXva'): - self.lXva = self.l_vectorizer.transform({l: index.val_raw for l, index in self.l_index.items()}) - return self.lXva - - def get_lXte(self): - if not hasattr(self, 'lXte'): - self.lXte = self.l_vectorizer.transform({l: index.test_raw for l, index in self.l_index.items()}) - return self.lXte - - def get_target_dim(self): - return self.l_index[self.langs[0]].devel_target.shape[1] - - def l_vocabsize(self): - return {l: index.vocabsize for l, index in self.l_index.items()} - - def l_embeddings(self): - return {l: index.embedding_matrix for l, index in self.l_index.items()} - - def l_pad(self): - return {l: index.pad_index for l, index in self.l_index.items()} - - def l_train_index(self): - return {l: index.train_index for l, index in self.l_index.items()} - - def l_train_raw_index(self): - return {l: index.train_raw for l, index in self.l_index.items()} - - def l_train_target(self): - return {l: index.train_target for l, index in self.l_index.items()} - - def l_val_index(self): - return {l: index.val_index for l, index in self.l_index.items()} - - def l_val_raw_index(self): - return {l: index.val_raw for l, index in self.l_index.items()} - - def l_test_raw_index(self): - return {l: index.test_raw for l, index in self.l_index.items()} - - def l_devel_raw_index(self): - return {l: index.devel_raw for l, index in self.l_index.items()} - - def l_val_target(self): - return {l: index.val_target for l, 
index in self.l_index.items()} - - def l_test_target(self): - return {l: index.test_target for l, index in self.l_index.items()} - - def l_test_index(self): - return {l: index.test_index for l, index in self.l_index.items()} - - def l_devel_index(self): - return {l: index.devel_index for l, index in self.l_index.items()} - - def l_devel_target(self): - return {l: index.devel_target for l, index in self.l_index.items()} - - def l_train(self): - return self.l_train_index(), self.l_train_target() - - def l_val(self): - return self.l_val_index(), self.l_val_target() - - def l_test(self): - return self.l_test_index(), self.l_test_target() - - def l_train_raw(self): - return self.l_train_raw_index(), self.l_train_target() - - def l_val_raw(self): - return self.l_val_raw_index(), self.l_val_target() - - def l_test_raw(self): - return self.l_test_raw_index(), self.l_test_target() - - def l_devel_raw(self): - return self.l_devel_raw_index(), self.l_devel_target() - - def get_l_pad_index(self): - return {l: index.get_pad_index() for l, index in self.l_index.items()} - - -class Index: - def __init__(self, devel_raw, devel_target, test_raw, test_target, lang): - """ - Monolingual Index, takes care of tokenizing raw data, converting strings to ids, splitting the data into - training and validation. - :param devel_raw: list of strings, list of raw training texts - :param devel_target: - :param test_raw: list of strings, list of raw test texts - :param lang: list, list of languages contained in the dataset - """ - self.lang = lang - self.devel_raw = devel_raw - self.devel_target = devel_target - self.test_raw = test_raw - self.test_target = test_target - - def index(self, pretrained_vocabulary, analyzer, vocabulary): - self.word2index = dict(vocabulary) - known_words = set(self.word2index.keys()) - if pretrained_vocabulary is not None: - known_words.update(pretrained_vocabulary) - - self.word2index['UNKTOKEN'] = len(self.word2index) - self.word2index['PADTOKEN'] = len(self.word2index) - self.unk_index = self.word2index['UNKTOKEN'] - self.pad_index = self.word2index['PADTOKEN'] - - # index documents and keep track of test terms outside the development vocabulary that are in Muse (if available) - self.out_of_vocabulary = dict() - self.devel_index = index(self.devel_raw, self.word2index, known_words, analyzer, self.unk_index, - self.out_of_vocabulary) - self.test_index = index(self.test_raw, self.word2index, known_words, analyzer, self.unk_index, - self.out_of_vocabulary) - - self.vocabsize = len(self.word2index) + len(self.out_of_vocabulary) - - print(f'[indexing complete for lang {self.lang}] vocabulary-size={self.vocabsize}') - - def get_pad_index(self): - return self.pad_index - - def train_val_split(self, val_prop, max_val, seed): - devel = self.devel_index - target = self.devel_target - devel_raw = self.devel_raw - - val_size = int(min(len(devel) * val_prop, max_val)) - - self.train_index, self.val_index, self.train_target, self.val_target, self.train_raw, self.val_raw = \ - train_test_split( - devel, target, devel_raw, test_size=val_size, random_state=seed, shuffle=True) - - print( - f'split lang {self.lang}: train={len(self.train_index)} val={len(self.val_index)} test={len(self.test_index)}') - - def get_word_list(self): - def extract_word_list(word2index): - return [w for w, i in sorted(word2index.items(), key=lambda x: x[1])] - - word_list = extract_word_list(self.word2index) - word_list += extract_word_list(self.out_of_vocabulary) - return word_list - - def compose_embedding_matrix(self, 
pretrained, supervised, Xtr=None, Ytr=None): - print(f'[generating embedding matrix for lang {self.lang}]') - - self.wce_range = None - embedding_parts = [] - - if pretrained is not None: - print('\t[pretrained-matrix]') - embedding_parts.append(pretrained) - del pretrained - - if supervised: - print('\t[supervised-matrix]') - F = supervised_embeddings_tfidf(Xtr, Ytr) - num_missing_rows = self.vocabsize - F.shape[0] - F = np.vstack((F, np.zeros(shape=(num_missing_rows, F.shape[1])))) - F = torch.from_numpy(F).float() - - offset = 0 - if embedding_parts: - offset = embedding_parts[0].shape[1] - self.wce_range = [offset, offset + F.shape[1]] - embedding_parts.append(F) - - self.embedding_matrix = torch.cat(embedding_parts, dim=1) - - print(f'[embedding matrix for lang {self.lang} has shape {self.embedding_matrix.shape}]') - - -def index(data, vocab, known_words, analyzer, unk_index, out_of_vocabulary): - """ - Index (i.e., replaces word strings with numerical indexes) a list of string documents - :param data: list of string documents - :param vocab: a fixed mapping [str]->[int] of words to indexes - :param known_words: a set of known words (e.g., words that, despite not being included in the vocab, can be retained - because they are anyway contained in a pre-trained embedding set that we know in advance) - :param analyzer: the preprocessor in charge of transforming the document string into a chain of string words - :param unk_index: the index of the 'unknown token', i.e., a symbol that characterizes all words that we cannot keep - :param out_of_vocabulary: an incremental mapping [str]->[int] of words to indexes that will index all those words that - are not in the original vocab but that are in the known_words - :return: - """ - indexes = [] - vocabsize = len(vocab) - unk_count = 0 - knw_count = 0 - out_count = 0 - # pbar = tqdm(data, desc=f'indexing') - for text in data: - words = analyzer(text) - index = [] - for word in words: - if word in vocab: - idx = vocab[word] - else: - if word in known_words: - if word not in out_of_vocabulary: - out_of_vocabulary[word] = vocabsize + len(out_of_vocabulary) - idx = out_of_vocabulary[word] - out_count += 1 - else: - idx = unk_index - unk_count += 1 - index.append(idx) - indexes.append(index) - knw_count += len(index) - # pbar.set_description(f'[unk = {unk_count}/{knw_count}={(100.*unk_count/knw_count):.2f}%]' - # f'[out = {out_count}/{knw_count}={(100.*out_count/knw_count):.2f}%]') - return indexes - - -def is_true(tensor, device): - return torch.where(tensor == 1, torch.Tensor([1]).to(device), torch.Tensor([0]).to(device)) - - -def is_false(tensor, device): - return torch.where(tensor == 0, torch.Tensor([1]).to(device), torch.Tensor([0]).to(device)) - - -def define_pad_length(index_list): - lengths = [len(index) for index in index_list] - return int(np.mean(lengths) + np.std(lengths)) - - -def pad(index_list, pad_index, max_pad_length=None): - pad_length = np.max([len(index) for index in index_list]) - if max_pad_length is not None: - pad_length = min(pad_length, max_pad_length) - for i, indexes in enumerate(index_list): - index_list[i] = [pad_index] * (pad_length - len(indexes)) + indexes[:pad_length] - return index_list - - -def get_params(optimc=False): - if not optimc: - return None - c_range = [1e4, 1e3, 1e2, 1e1, 1, 1e-1] - kernel = 'rbf' - return [{'kernel': [kernel], 'C': c_range, 'gamma':['auto']}] - - -def get_method_name(args): - _id = '' - _id_conf = [args.post_embedder, args.wce_embedder, args.muse_embedder, args.bert_embedder, 
args.gru_embedder] - _id_name = ['X', 'W', 'M', 'B', 'G'] - for i, conf in enumerate(_id_conf): - if conf: - _id += _id_name[i] - _id = _id if not args.gru_wce else _id + '_wce' - _dataset_path = args.dataset.split('/')[-1].split('_') - dataset_id = _dataset_path[0] + _dataset_path[-1] - return _id, dataset_id diff --git a/refactor/util/embeddings_manager.py b/refactor/util/embeddings_manager.py deleted file mode 100644 index 1d708fa..0000000 --- a/refactor/util/embeddings_manager.py +++ /dev/null @@ -1,104 +0,0 @@ -from abc import ABC, abstractmethod - -import numpy as np -import torch -from torchtext.vocab import Vectors - -from util.SIF_embed import remove_pc - - -class PretrainedEmbeddings(ABC): - - def __init__(self): - super().__init__() - - @abstractmethod - def vocabulary(self): pass - - @abstractmethod - def dim(self): pass - - @classmethod - def reindex(cls, words, word2index): - if isinstance(words, dict): - words = list(zip(*sorted(words.items(), key=lambda x: x[1])))[0] - - source_idx, target_idx = [], [] - for i, word in enumerate(words): - if word not in word2index: - continue - j = word2index[word] - source_idx.append(i) - target_idx.append(j) - source_idx = np.asarray(source_idx) - target_idx = np.asarray(target_idx) - return source_idx, target_idx - - -class MuseLoader: - def __init__(self, langs, cache): - self.langs = langs - self.lEmbed = {} - self.lExtracted = {} - for lang in self.langs: - print(f'Loading vectors for {lang}...') - self.lEmbed[lang] = Vectors(f'wiki.multi.{lang}.vec', cache) - - def dim(self): - return self.lEmbed[list(self.lEmbed.keys())[0]].dim - - def vocabulary(self): - return {lang: set(self.lEmbed[lang].stoi.keys()) for lang in self.langs} - - def extract(self, lVoc): - """ - Reindex pretrained loaded embedding in order to match indexes assigned by scikit vectorizer. 
Such indexes - are consistent with those used by Word Class Embeddings (since we deploy the same vectorizer) - :param lVoc: dict {lang : {word : id}} - :return: torch embedding matrix of extracted embeddings i.e., words in lVoc - """ - for lang, words in lVoc.items(): - print(f'Extracting words for lang {lang}...') - # words = list(zip(*sorted(lVoc[lang].items(), key=lambda x: x[1])))[0] - source_id, target_id = PretrainedEmbeddings.reindex(words, self.lEmbed[lang].stoi) - extraction = torch.zeros((len(words), self.dim())) - extraction[source_id] = self.lEmbed[lang].vectors[target_id] - self.lExtracted[lang] = extraction - return self.lExtracted - - def get_lEmbeddings(self): - return {lang: self.lEmbed[lang].vectors for lang in self.langs} - - -def XdotM(X, M, sif): - E = X.dot(M) - if sif: - E = remove_pc(E, npc=1) - return E - - -def wce_matrix(X, Y): - wce = supervised_embeddings_tfidf(X, Y) - wce = zscores(wce, axis=0) - return wce - - -def supervised_embeddings_tfidf(X, Y): - tfidf_norm = X.sum(axis=0) - tfidf_norm[tfidf_norm == 0] = 1 - F = (X.T).dot(Y) / tfidf_norm.T - return F - - -def zscores(X, axis=0): - """ - scipy.stats.zscores does not avoid division by 0, which can indeed occur - :param X: - :param axis: - :return: - """ - std = np.clip(np.std(X, ddof=1, axis=axis), 1e-5, None) - mean = np.mean(X, axis=axis) - return (X - mean) / std - - diff --git a/refactor/util/evaluation.py b/refactor/util/evaluation.py deleted file mode 100644 index 010d0e9..0000000 --- a/refactor/util/evaluation.py +++ /dev/null @@ -1,20 +0,0 @@ -import numpy as np -from joblib import Parallel, delayed - -from util.metrics import * - - -def evaluation_metrics(y, y_): - if len(y.shape) == len(y_.shape) == 1 and len(np.unique(y)) > 2: # single-label - raise NotImplementedError() # return f1_score(y,y_,average='macro'), f1_score(y,y_,average='micro') - else: # the metrics I implemented assume multiclass multilabel classification as binary classifiers - return macroF1(y, y_), microF1(y, y_), macroK(y, y_), microK(y, y_) - - -def evaluate(ly_true, ly_pred, metrics=evaluation_metrics, n_jobs=-1): - if n_jobs == 1: - return {lang: metrics(ly_true[lang], ly_pred[lang]) for lang in ly_true.keys()} - else: - langs = list(ly_true.keys()) - evals = Parallel(n_jobs=n_jobs)(delayed(metrics)(ly_true[lang], ly_pred[lang]) for lang in langs) - return {lang: evals[i] for i, lang in enumerate(langs)} diff --git a/refactor/util/file.py b/refactor/util/file.py deleted file mode 100644 index 8754f5a..0000000 --- a/refactor/util/file.py +++ /dev/null @@ -1,50 +0,0 @@ -import urllib -from os import listdir, makedirs -from os.path import isdir, isfile, join, exists, dirname -from pathlib import Path - - -def download_file(url, archive_filename): - def progress(blocknum, bs, size): - total_sz_mb = '%.2f MB' % (size / 1e6) - current_sz_mb = '%.2f MB' % ((blocknum * bs) / 1e6) - print('\rdownloaded %s / %s' % (current_sz_mb, total_sz_mb), end='') - print("Downloading %s" % url) - urllib.request.urlretrieve(url, filename=archive_filename, reporthook=progress) - print("") - - -def download_file_if_not_exists(url, archive_path): - if exists(archive_path): return - makedirs_if_not_exist(dirname(archive_path)) - download_file(url,archive_path) - - -def ls(dir, typecheck): - el = [f for f in listdir(dir) if typecheck(join(dir, f))] - el.sort() - return el - - -def list_dirs(dir): - return ls(dir, typecheck=isdir) - - -def list_files(dir): - return ls(dir, typecheck=isfile) - - -def makedirs_if_not_exist(path): - if not exists(path): 
makedirs(path) - - -def create_if_not_exist(path): - if not exists(path): makedirs(path) - - -def get_parent_name(path): - return Path(path).parent - - -def get_file_name(path): - return Path(path).name diff --git a/refactor/util/metrics.py b/refactor/util/metrics.py deleted file mode 100644 index 7a6079e..0000000 --- a/refactor/util/metrics.py +++ /dev/null @@ -1,152 +0,0 @@ -import numpy as np - - -class ContTable: - def __init__(self, tp=0, tn=0, fp=0, fn=0): - self.tp = tp - self.tn = tn - self.fp = fp - self.fn = fn - - def get_d(self): return self.tp + self.tn + self.fp + self.fn - - def get_c(self): return self.tp + self.fn - - def get_not_c(self): return self.tn + self.fp - - def get_f(self): return self.tp + self.fp - - def get_not_f(self): return self.tn + self.fn - - def p_c(self): return (1.0*self.get_c())/self.get_d() - - def p_not_c(self): return 1.0-self.p_c() - - def p_f(self): return (1.0*self.get_f())/self.get_d() - - def p_not_f(self): return 1.0-self.p_f() - - def p_tp(self): return (1.0*self.tp) / self.get_d() - - def p_tn(self): return (1.0*self.tn) / self.get_d() - - def p_fp(self): return (1.0*self.fp) / self.get_d() - - def p_fn(self): return (1.0*self.fn) / self.get_d() - - def tpr(self): - c = 1.0*self.get_c() - return self.tp / c if c > 0.0 else 0.0 - - def fpr(self): - _c = 1.0*self.get_not_c() - return self.fp / _c if _c > 0.0 else 0.0 - - def __add__(self, other): - return ContTable(tp=self.tp + other.tp, tn=self.tn + other.tn, fp=self.fp + other.fp, fn=self.fn + other.fn) - - -def accuracy(cell): - return (cell.tp + cell.tn)*1.0 / (cell.tp + cell.fp + cell.fn + cell.tn) - - -def f1(cell): - num = 2.0 * cell.tp - den = 2.0 * cell.tp + cell.fp + cell.fn - if den > 0: - return num / den - # we define f1 to be 1 if den==0 since the classifier has correctly classified all instances as negative - return 1.0 - - -def K(cell): - specificity, recall = 0., 0. - - AN = cell.tn + cell.fp - if AN != 0: - specificity = cell.tn*1. / AN - - AP = cell.tp + cell.fn - if AP != 0: - recall = cell.tp*1. / AP - - if AP == 0: - return 2. * specificity - 1. - elif AN == 0: - return 2. * recall - 1. - else: - return specificity + recall - 1. - - -# if the classifier is single class, then the prediction is a vector of shape=(nD,) which causes issues when compared -# to the true labels (of shape=(nD,1)). This method increases the dimensions of the predictions. -def __check_consistency_and_adapt(true_labels, predictions): - if predictions.ndim == 1: - return __check_consistency_and_adapt(true_labels, np.expand_dims(predictions, axis=1)) - if true_labels.ndim == 1: - return __check_consistency_and_adapt(np.expand_dims(true_labels, axis=1), predictions) - if true_labels.shape != predictions.shape: - raise ValueError("True and predicted label matrices shapes are inconsistent %s %s." - % (true_labels.shape, predictions.shape)) - _, nC = true_labels.shape - return true_labels, predictions, nC - - -# computes the (soft) contingency table where tp, fp, fn, and tn are the cumulative masses for the posterioir -# probabilitiesfron with respect to the true binary labels -# true_labels and posterior_probabilities are two vectors of shape (number_documents,) -def soft_single_metric_statistics(true_labels, posterior_probabilities): - assert len(true_labels) == len(posterior_probabilities), "Format not consistent between true and predicted labels." - tp = np.sum(posterior_probabilities[true_labels == 1]) - fn = np.sum(1. 
- posterior_probabilities[true_labels == 1]) - fp = np.sum(posterior_probabilities[true_labels == 0]) - tn = np.sum(1. - posterior_probabilities[true_labels == 0]) - return ContTable(tp=tp, tn=tn, fp=fp, fn=fn) - - -# computes the (hard) counters tp, fp, fn, and tn fron a true and predicted vectors of hard decisions -# true_labels and predicted_labels are two vectors of shape (number_documents,) -def hard_single_metric_statistics(true_labels, predicted_labels): - assert len(true_labels) == len(predicted_labels), "Format not consistent between true and predicted labels." - nd = len(true_labels) - tp = np.sum(predicted_labels[true_labels == 1]) - fp = np.sum(predicted_labels[true_labels == 0]) - fn = np.sum(true_labels[predicted_labels == 0]) - tn = nd - (tp+fp+fn) - return ContTable(tp=tp, tn=tn, fp=fp, fn=fn) - - -def macro_average(true_labels, predicted_labels, metric, metric_statistics=hard_single_metric_statistics): - true_labels, predicted_labels, nC = __check_consistency_and_adapt(true_labels, predicted_labels) - return np.mean([metric(metric_statistics(true_labels[:, c], predicted_labels[:, c])) for c in range(nC)]) - - -def micro_average(true_labels, predicted_labels, metric, metric_statistics=hard_single_metric_statistics): - true_labels, predicted_labels, nC = __check_consistency_and_adapt(true_labels, predicted_labels) - - accum = ContTable() - for c in range(nC): - other = metric_statistics(true_labels[:, c], predicted_labels[:, c]) - accum = accum + other - - return metric(accum) - - -# true_labels and predicted_labels are two matrices in sklearn.preprocessing.MultiLabelBinarizer format -def macroF1(true_labels, predicted_labels): - return macro_average(true_labels, predicted_labels, f1) - - -# true_labels and predicted_labels are two matrices in sklearn.preprocessing.MultiLabelBinarizer format -def microF1(true_labels, predicted_labels): - return micro_average(true_labels, predicted_labels, f1) - - -# true_labels and predicted_labels are two matrices in sklearn.preprocessing.MultiLabelBinarizer format -def macroK(true_labels, predicted_labels): - return macro_average(true_labels, predicted_labels, K) - - -# true_labels and predicted_labels are two matrices in sklearn.preprocessing.MultiLabelBinarizer format -def microK(true_labels, predicted_labels): - return micro_average(true_labels, predicted_labels, K) diff --git a/refactor/util/pl_metrics.py b/refactor/util/pl_metrics.py deleted file mode 100644 index bf8aa99..0000000 --- a/refactor/util/pl_metrics.py +++ /dev/null @@ -1,141 +0,0 @@ -import torch -from pytorch_lightning.metrics import Metric - -from util.common import is_false, is_true - - -def _update(pred, target, device): - assert pred.shape == target.shape - # preparing preds and targets for count - true_pred = is_true(pred, device) - false_pred = is_false(pred, device) - true_target = is_true(target, device) - false_target = is_false(target, device) - - tp = torch.sum(true_pred * true_target, dim=0) - tn = torch.sum(false_pred * false_target, dim=0) - fp = torch.sum(true_pred * false_target, dim=0) - fn = torch.sum(false_pred * target, dim=0) - return tp, tn, fp, fn - - -class CustomF1(Metric): - def __init__(self, num_classes, device, average='micro'): - """ - Custom F1 metric. - Scikit learn provides a full set of evaluation metrics, but they treat special cases differently. - I.e., when the number of true positives, false positives, and false negatives amount to 0, all - affected metrics (precision, recall, and thus f1) output 0 in Scikit learn. 
- We adhere to the common practice of outputting 1 in this case since the classifier has correctly - classified all examples as negatives. - :param num_classes: - :param device: - :param average: - """ - super().__init__() - self.num_classes = num_classes - self.average = average - self.device = 'cuda' if device else 'cpu' - self.add_state('true_positive', default=torch.zeros(self.num_classes)) - self.add_state('true_negative', default=torch.zeros(self.num_classes)) - self.add_state('false_positive', default=torch.zeros(self.num_classes)) - self.add_state('false_negative', default=torch.zeros(self.num_classes)) - - def update(self, preds, target): - true_positive, true_negative, false_positive, false_negative = _update(preds, target, self.device) - - self.true_positive += true_positive - self.true_negative += true_negative - self.false_positive += false_positive - self.false_negative += false_negative - - def compute(self): - if self.average == 'micro': - num = 2.0 * self.true_positive.sum() - den = 2.0 * self.true_positive.sum() + self.false_positive.sum() + self.false_negative.sum() - if den > 0: - return (num / den).to(self.device) - return torch.FloatTensor([1.]).to(self.device) - if self.average == 'macro': - class_specific = [] - for i in range(self.num_classes): - class_tp = self.true_positive[i] - class_tn = self.true_negative[i] - class_fp = self.false_positive[i] - class_fn = self.false_negative[i] - num = 2.0 * class_tp - den = 2.0 * class_tp + class_fp + class_fn - if den > 0: - class_specific.append(num / den) - else: - class_specific.append(1.) - average = torch.sum(torch.Tensor(class_specific))/self.num_classes - return average.to(self.device) - - -class CustomK(Metric): - def __init__(self, num_classes, device, average='micro'): - """ - K metric. https://dl.acm.org/doi/10.1145/2808194.2809449 - :param num_classes: - :param device: - :param average: - """ - super().__init__() - self.num_classes = num_classes - self.average = average - self.device = 'cuda' if device else 'cpu' - self.add_state('true_positive', default=torch.zeros(self.num_classes)) - self.add_state('true_negative', default=torch.zeros(self.num_classes)) - self.add_state('false_positive', default=torch.zeros(self.num_classes)) - self.add_state('false_negative', default=torch.zeros(self.num_classes)) - - def update(self, preds, target): - true_positive, true_negative, false_positive, false_negative = _update(preds, target, self.device) - - self.true_positive += true_positive - self.true_negative += true_negative - self.false_positive += false_positive - self.false_negative += false_negative - - def compute(self): - if self.average == 'micro': - specificity, recall = 0., 0. - absolute_negatives = self.true_negative.sum() + self.false_positive.sum() - if absolute_negatives != 0: - specificity = self.true_negative.sum()/absolute_negatives - absolute_positives = self.true_positive.sum() + self.false_negative.sum() - if absolute_positives != 0: - recall = self.true_positive.sum()/absolute_positives - - if absolute_positives == 0: - return 2. * specificity - 1 - elif absolute_negatives == 0: - return 2. * recall - 1 - else: - return specificity + recall - 1 - - if self.average == 'macro': - class_specific = [] - for i in range(self.num_classes): - class_tp = self.true_positive[i] - class_tn = self.true_negative[i] - class_fp = self.false_positive[i] - class_fn = self.false_negative[i] - - specificity, recall = 0., 0. 
- absolute_negatives = class_tn + class_fp - if absolute_negatives != 0: - specificity = class_tn / absolute_negatives - absolute_positives = class_tp + class_fn - if absolute_positives != 0: - recall = class_tp / absolute_positives - - if absolute_positives == 0: - class_specific.append(2. * specificity - 1) - elif absolute_negatives == 0: - class_specific.append(2. * recall - 1) - else: - class_specific.append(specificity + recall - 1) - average = torch.sum(torch.Tensor(class_specific)) / self.num_classes - return average.to(self.device) diff --git a/refactor/util/results_csv.py b/refactor/util/results_csv.py deleted file mode 100644 index be0ff84..0000000 --- a/refactor/util/results_csv.py +++ /dev/null @@ -1,53 +0,0 @@ -import os - -import numpy as np -import pandas as pd - - -class CSVlog: - def __init__(self, file, autoflush=True, verbose=False): - self.file = file - self.columns = ['method', - 'setting', - 'optimc', - 'sif', - 'zscore', - 'l2', - 'dataset', - 'time_tr', - 'time_te', - 'lang', - 'macrof1', - 'microf1', - 'macrok', - 'microk', - 'notes'] - self.autoflush = autoflush - self.verbose = verbose - if os.path.exists(file): - self.tell('Loading existing file from {}'.format(file)) - self.df = pd.read_csv(file, sep='\t') - else: - self.tell('File {} does not exist. Creating new frame.'.format(file)) - dir = os.path.dirname(self.file) - if dir and not os.path.exists(dir): os.makedirs(dir) - self.df = pd.DataFrame(columns=self.columns) - - def already_calculated(self, id): - return (self.df['id'] == id).any() - - def add_row(self, method, setting, optimc, sif, zscore, l2, dataset, time_tr, time_te, lang, - macrof1, microf1, macrok=np.nan, microk=np.nan, notes=''): - s = pd.Series([method, setting, optimc, sif, zscore, l2, dataset, time_tr, time_te, lang, - macrof1, microf1, macrok, microk, notes], - index=self.columns) - self.df = self.df.append(s, ignore_index=True) - if self.autoflush: self.flush() - self.tell(s.to_string()) - - def flush(self): - self.df.to_csv(self.file, index=False, sep='\t') - - def tell(self, msg): - if self.verbose: - print(msg) diff --git a/refactor/util/standardizer.py b/refactor/util/standardizer.py deleted file mode 100644 index 429bccd..0000000 --- a/refactor/util/standardizer.py +++ /dev/null @@ -1,36 +0,0 @@ -import numpy as np - - -class StandardizeTransformer: - def __init__(self, axis=0, range=None): - """ - - :param axis: - :param range: - """ - assert range is None or isinstance(range, slice), 'wrong format for range, should either be None or a slice' - self.axis = axis - self.yetfit = False - self.range = range - - def fit(self, X): - print('Applying z-score standardization...') - std=np.std(X, axis=self.axis, ddof=1) - self.std = np.clip(std, 1e-5, None) - self.mean = np.mean(X, axis=self.axis) - if self.range is not None: - ones = np.ones_like(self.std) - zeros = np.zeros_like(self.mean) - ones[self.range] = self.std[self.range] - zeros[self.range] = self.mean[self.range] - self.std = ones - self.mean = zeros - self.yetfit=True - return self - - def transform(self, X): - if not self.yetfit: 'transform called before fit' - return (X - self.mean) / self.std - - def fit_transform(self, X): - return self.fit(X).transform(X) \ No newline at end of file diff --git a/refactor/view_generators.py b/refactor/view_generators.py deleted file mode 100644 index 384ec76..0000000 --- a/refactor/view_generators.py +++ /dev/null @@ -1,375 +0,0 @@ -""" -This module contains the view generators that take care of computing the view specific document 
embeddings: - -- VanillaFunGen (-x) cast document representations encoded via TFIDF into posterior probabilities by means of SVM. - -- WordClassGen (-w): generates document representation via Word-Class-Embeddings. - Document embeddings are obtained via weighted sum of document's constituent embeddings. - -- MuseGen (-m): generates document representation via MUSE embeddings. - Document embeddings are obtained via weighted sum of document's constituent embeddings. - -- RecurrentGen (-g): generates document embedding by means of a Gated Recurrent Units. The model can be - initialized with different (multilingual/aligned) word representations (e.g., MUSE, WCE, ecc.,). - Output dimension is (n_docs, 512). - -- View generator (-b): generates document embedding via mBERT model. -""" -from abc import ABC, abstractmethod -from time import time - -from pytorch_lightning import Trainer -from pytorch_lightning.loggers import TensorBoardLogger - -from data.datamodule import RecurrentDataModule, BertDataModule, tokenize -from models.learners import * -from models.pl_bert import BertModel -from models.pl_gru import RecurrentModel -from util.common import TfidfVectorizerMultilingual, _normalize -from util.embeddings_manager import MuseLoader, XdotM, wce_matrix - - -class ViewGen(ABC): - """ - Abstract class for ViewGenerators implementations. Every ViewGen should implement these three methods in order to - be seamlessly integrated in the overall architecture. - """ - @abstractmethod - def fit(self, lX, ly): - pass - - @abstractmethod - def transform(self, lX): - pass - - @abstractmethod - def fit_transform(self, lX, ly): - pass - - -class VanillaFunGen(ViewGen): - """ - View Generator (x): original funnelling architecture proposed by Moreo, Esuli and - Sebastiani in DOI: https://doi.org/10.1145/3326065 - """ - def __init__(self, base_learner, first_tier_parameters=None, n_jobs=-1): - """ - Init Posterior Probabilities embedder (i.e., VanillaFunGen) - :param base_learner: naive monolingual learners to be deployed as first-tier learners. Should be able to - return posterior probabilities. - :param base_learner: - :param n_jobs: integer, number of concurrent workers - """ - super().__init__() - self.learners = base_learner - self.first_tier_parameters = first_tier_parameters - self.n_jobs = n_jobs - self.doc_projector = NaivePolylingualClassifier(base_learner=self.learners, - parameters=self.first_tier_parameters, n_jobs=self.n_jobs) - self.vectorizer = TfidfVectorizerMultilingual(sublinear_tf=True, use_idf=True) - - def fit(self, lX, lY): - print('# Fitting VanillaFunGen (X)...') - lX = self.vectorizer.fit_transform(lX) - self.doc_projector.fit(lX, lY) - return self - - def transform(self, lX): - """ - (1) Vectorize documents; (2) Project them according to the learners SVMs, finally (3) Apply L2 normalization - to the projection and returns it. - :param lX: dict {lang: indexed documents} - :return: document projection to the common latent space. - """ - lX = self.vectorizer.transform(lX) - lZ = self.doc_projector.predict_proba(lX) - lZ = _normalize(lZ, l2=True) - return lZ - - def fit_transform(self, lX, ly): - return self.fit(lX, ly).transform(lX) - - -class MuseGen(ViewGen): - """ - View Generator (m): generates document representation via MUSE embeddings (Fasttext multilingual word - embeddings). Document embeddings are obtained via weighted sum of document's constituent embeddings. - """ - def __init__(self, muse_dir='../embeddings', n_jobs=-1): - """ - Init the MuseGen. 
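As a rough illustration of the posterior-probability view computed by VanillaFunGen (TF-IDF vectors recast by an SVM into class posteriors, then L2-normalized), here is a monolingual sketch that uses plain scikit-learn in place of the project's TfidfVectorizerMultilingual and NaivePolylingualClassifier wrappers; the corpus and labels are toy placeholders.

```python
# Monolingual sketch of the VanillaFunGen idea: TF-IDF -> probabilistic SVM -> L2-normalized posteriors.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.preprocessing import normalize

docs_tr = ['stocks fell sharply on monday', 'markets rebounded after the sell-off',
           'the central bank cut interest rates', 'investors fear a new recession',
           'the parliament passed the new bill', 'the senate voted on the reform',
           'the president signed the decree', 'lawmakers debated the election law']
y_tr = [0, 0, 0, 0, 1, 1, 1, 1]                    # 0 = economy, 1 = politics (illustrative labels)

vect = TfidfVectorizer(sublinear_tf=True, use_idf=True)
Xtr = vect.fit_transform(docs_tr)
clf = SVC(kernel='linear', probability=True, random_state=1).fit(Xtr, y_tr)

Xte = vect.transform(['the senate vote on the new bill'])
Z = normalize(clf.predict_proba(Xte), norm='l2')   # one row per document in the shared posterior space
print(Z)
```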
- :param muse_dir: string, path to folder containing muse embeddings - :param n_jobs: int, number of concurrent workers - """ - super().__init__() - self.muse_dir = muse_dir - self.n_jobs = n_jobs - self.langs = None - self.lMuse = None - self.vectorizer = TfidfVectorizerMultilingual(sublinear_tf=True, use_idf=True) - - def fit(self, lX, ly): - """ - (1) Vectorize documents; (2) Load muse embeddings for words encountered while vectorizing. - :param lX: dict {lang: indexed documents} - :param ly: dict {lang: target vectors} - :return: self. - """ - print('# Fitting MuseGen (M)...') - self.vectorizer.fit(lX) - self.langs = sorted(lX.keys()) - self.lMuse = MuseLoader(langs=self.langs, cache=self.muse_dir) - lVoc = self.vectorizer.vocabulary() - self.lMuse = self.lMuse.extract(lVoc) # overwriting lMuse with dict {lang : embed_matrix} with only known words - # TODO: featureweight.fit - return self - - def transform(self, lX): - """ - (1) Vectorize documents; (2) computes the weighted sum of MUSE embeddings found at document level, - finally (3) Apply L2 normalization embedding and returns it. - :param lX: dict {lang: indexed documents} - :return: document projection to the common latent space. - """ - lX = self.vectorizer.transform(lX) - XdotMUSE = Parallel(n_jobs=self.n_jobs)( - delayed(XdotM)(lX[lang], self.lMuse[lang], sif=True) for lang in self.langs) - lZ = {lang: XdotMUSE[i] for i, lang in enumerate(self.langs)} - lZ = _normalize(lZ, l2=True) - return lZ - - def fit_transform(self, lX, ly): - return self.fit(lX, ly).transform(lX) - - -class WordClassGen(ViewGen): - """ - View Generator (w): generates document representation via Word-Class-Embeddings. - Document embeddings are obtained via weighted sum of document's constituent embeddings. - """ - def __init__(self, n_jobs=-1): - """ - Init WordClassGen. - :param n_jobs: int, number of concurrent workers - """ - super().__init__() - self.n_jobs = n_jobs - self.langs = None - self.lWce = None - self.vectorizer = TfidfVectorizerMultilingual(sublinear_tf=True, use_idf=True) - - def fit(self, lX, ly): - """ - (1) Vectorize documents; (2) Load muse embeddings for words encountered while vectorizing. - :param lX: dict {lang: indexed documents} - :param ly: dict {lang: target vectors} - :return: self. - """ - print('# Fitting WordClassGen (W)...') - lX = self.vectorizer.fit_transform(lX) - self.langs = sorted(lX.keys()) - wce = Parallel(n_jobs=self.n_jobs)( - delayed(wce_matrix)(lX[lang], ly[lang]) for lang in self.langs) - self.lWce = {l: wce[i] for i, l in enumerate(self.langs)} - # TODO: featureweight.fit() - return self - - def transform(self, lX): - """ - (1) Vectorize documents; (2) computes the weighted sum of Word-Class Embeddings found at document level, - finally (3) Apply L2 normalization embedding and returns it. - :param lX: dict {lang: indexed documents} - :return: document projection to the common latent space. - """ - lX = self.vectorizer.transform(lX) - XdotWce = Parallel(n_jobs=self.n_jobs)( - delayed(XdotM)(lX[lang], self.lWce[lang], sif=True) for lang in self.langs) - lWce = {l: XdotWce[i] for i, l in enumerate(self.langs)} - lWce = _normalize(lWce, l2=True) - return lWce - - def fit_transform(self, lX, ly): - return self.fit(lX, ly).transform(lX) - - -class RecurrentGen(ViewGen): - """ - View Generator (G): generates document embedding by means of a Gated Recurrent Units. The model can be - initialized with different (multilingual/aligned) word representations (e.g., MUSE, WCE, ecc.,). - Output dimension is (n_docs, 512). 
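The "weighted sum of a document's constituent embeddings" used by the MUSE and WCE generators reduces to a matrix product between the TF-IDF matrix and the embedding matrix, followed by L2 normalization. A minimal sketch with toy numbers follows; the real XdotM additionally applies SIF (removal of the first principal component), which is omitted here.

```python
# Toy sketch of the weighted-sum composition behind MuseGen / WordClassGen (SIF step omitted).
import numpy as np
from scipy.sparse import csr_matrix
from sklearn.preprocessing import normalize

X = csr_matrix(np.array([[0.0, 0.4, 0.6],      # 2 documents over a 3-word vocabulary (TF-IDF weights)
                         [0.7, 0.3, 0.0]]))
M = np.array([[0.1, 0.0, 0.2, 0.5],            # 3 x 4: one 4-dimensional embedding per vocabulary word
              [0.0, 0.3, 0.1, 0.0],
              [0.9, 0.2, 0.0, 0.1]])

doc_embeddings = normalize(X.dot(M), norm='l2')  # (2 x 4) document embeddings in the embedding space
print(doc_embeddings)
```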
The training will happen end-to-end. At inference time, the model returns - the network internal state at the second feed-forward layer level. Training metrics are logged via TensorBoard. - """ - def __init__(self, multilingualIndex, pretrained_embeddings, wce, batch_size=512, nepochs=50, - gpus=0, n_jobs=-1, stored_path=None): - """ - Init RecurrentGen. - :param multilingualIndex: MultilingualIndex, it is a dictionary of training and test documents - indexed by language code. - :param pretrained_embeddings: dict {lang: tensor of embeddings}, it contains the pretrained embeddings to use - as embedding layer. - :param wce: Bool, whether to deploy Word-Class Embeddings (as proposed by A. Moreo). If True, supervised - embeddings are concatenated to the deployed supervised embeddings. WCE dimensionality is equal to - the number of target classes. - :param batch_size: int, number of samples in a batch. - :param nepochs: int, number of max epochs to train the model. - :param gpus: int, specifies how many GPUs to use per node. If False computation will take place on cpu. - :param n_jobs: int, number of concurrent workers (i.e., parallelizing data loading). - :param stored_path: str, path to a pretrained model. If None the model will be trained from scratch. - """ - super().__init__() - self.multilingualIndex = multilingualIndex - self.langs = multilingualIndex.langs - self.batch_size = batch_size - self.gpus = gpus - self.n_jobs = n_jobs - self.stored_path = stored_path - self.nepochs = nepochs - - # EMBEDDINGS to be deployed - self.pretrained = pretrained_embeddings - self.wce = wce - - self.multilingualIndex.train_val_split(val_prop=0.2, max_val=2000, seed=1) - self.multilingualIndex.embedding_matrices(self.pretrained, supervised=self.wce) - self.model = self._init_model() - self.logger = TensorBoardLogger(save_dir='tb_logs', name='rnn', default_hp_metric=False) - # self.logger = CSVLogger(save_dir='csv_logs', name='rnn_dev') - - def _init_model(self): - if self.stored_path: - lpretrained = self.multilingualIndex.l_embeddings() - return RecurrentModel.load_from_checkpoint(self.stored_path, lPretrained=lpretrained) - else: - lpretrained = self.multilingualIndex.l_embeddings() - langs = self.multilingualIndex.langs - output_size = self.multilingualIndex.get_target_dim() - hidden_size = 512 - lvocab_size = self.multilingualIndex.l_vocabsize() - learnable_length = 0 - return RecurrentModel( - lPretrained=lpretrained, - langs=langs, - output_size=output_size, - hidden_size=hidden_size, - lVocab_size=lvocab_size, - learnable_length=learnable_length, - drop_embedding_range=self.multilingualIndex.sup_range, - drop_embedding_prop=0.5, - gpus=self.gpus - ) - - def fit(self, lX, ly): - """ - Train the Neural Network end-to-end. - lX and ly are not directly used. We rather get them from the multilingual index used in the instantiation - of the Dataset object (RecurrentDataset) in the GfunDataModule class. - :param lX: dict {lang: indexed documents} - :param ly: dict {lang: target vectors} - :return: self. 
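Like every generator in this module, RecurrentGen ultimately exposes the ViewGen contract (fit, transform, fit_transform over language-keyed dictionaries). The toy generator below is purely illustrative and not part of gFun: it only shows the plumbing a new view generator would need, projecting TF-IDF vectors onto a random subspace.

```python
# Illustrative-only view generator respecting the ViewGen fit/transform/fit_transform contract.
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import normalize


class RandomProjectionGen:
    """Toy view generator: language-wise TF-IDF followed by a fixed random projection."""
    def __init__(self, dim=64, seed=0):
        self.dim = dim
        self.rng = np.random.default_rng(seed)
        self.vectorizers = {}
        self.projections = {}

    def fit(self, lX, ly):
        for lang, docs in lX.items():
            self.vectorizers[lang] = TfidfVectorizer(sublinear_tf=True).fit(docs)
            vocab_size = len(self.vectorizers[lang].vocabulary_)
            self.projections[lang] = self.rng.normal(size=(vocab_size, self.dim))
        return self

    def transform(self, lX):
        return {lang: normalize(self.vectorizers[lang].transform(docs).dot(self.projections[lang]),
                                norm='l2')
                for lang, docs in lX.items()}

    def fit_transform(self, lX, ly):
        return self.fit(lX, ly).transform(lX)
```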
- """ - print('# Fitting RecurrentGen (G)...') - recurrentDataModule = RecurrentDataModule(self.multilingualIndex, batchsize=self.batch_size, n_jobs=self.n_jobs) - trainer = Trainer(gradient_clip_val=1e-1, gpus=self.gpus, logger=self.logger, max_epochs=self.nepochs, - checkpoint_callback=False) - - # vanilla_torch_model = torch.load( - # '/home/andreapdr/funneling_pdr/checkpoint/gru_viewgen_-jrc_doclist_1958-2005vs2006_all_top300_noparallel_processed_run0.pickle') - # self.model.linear0 = vanilla_torch_model.linear0 - # self.model.linear1 = vanilla_torch_model.linear1 - # self.model.linear2 = vanilla_torch_model.linear2 - # self.model.rnn = vanilla_torch_model.rnn - - trainer.fit(self.model, datamodule=recurrentDataModule) - trainer.test(self.model, datamodule=recurrentDataModule) - return self - - def transform(self, lX): - """ - Project documents to the common latent space. Output dimensionality is 512. - :param lX: dict {lang: indexed documents} - :return: documents projected to the common latent space. - """ - l_pad = self.multilingualIndex.l_pad() - data = self.multilingualIndex.l_devel_index() - self.model.to('cuda' if self.gpus else 'cpu') - self.model.eval() - time_init = time() - l_embeds = self.model.encode(data, l_pad, batch_size=256) - transform_time = round(time() - time_init, 3) - print(f'Executed! Transform took: {transform_time}') - return l_embeds - - def fit_transform(self, lX, ly): - return self.fit(lX, ly).transform(lX) - - -class BertGen(ViewGen): - """ - View Generator (b): generates document embedding via Bert model. The training happens end-to-end. - At inference time, the model returns the network internal state at the last original layer (i.e. 12th). Document - embeddings are the state associated with the "start" token. Training metrics are logged via TensorBoard. - """ - def __init__(self, multilingualIndex, batch_size=128, nepochs=50, gpus=0, n_jobs=-1, stored_path=None): - """ - Init Bert model - :param multilingualIndex: MultilingualIndex, it is a dictionary of training and test documents - indexed by language code. - :param batch_size: int, number of samples per batch. - :param nepochs: int, number of max epochs to train the model. - :param gpus: int, specifies how many GPUs to use per node. If False computation will take place on cpu. - :param n_jobs: int, number of concurrent workers. - :param stored_path: str, path to a pretrained model. If None the model will be trained from scratch. - """ - super().__init__() - self.multilingualIndex = multilingualIndex - self.nepochs = nepochs - self.gpus = gpus - self.batch_size = batch_size - self.n_jobs = n_jobs - self.stored_path = stored_path - self.model = self._init_model() - self.logger = TensorBoardLogger(save_dir='tb_logs', name='bert', default_hp_metric=False) - - def _init_model(self): - output_size = self.multilingualIndex.get_target_dim() - return BertModel(output_size=output_size, stored_path=self.stored_path, gpus=self.gpus) - - def fit(self, lX, ly): - """ - Train the Neural Network end-to-end. - lX and ly are not directly used. We rather get them from the multilingual index used in the instantiation - of the Dataset object (RecurrentDataset) in the GfunDataModule class. - :param lX: dict {lang: indexed documents} - :param ly: dict {lang: target vectors} - :return: self. 
- """ - print('# Fitting BertGen (M)...') - self.multilingualIndex.train_val_split(val_prop=0.2, max_val=2000, seed=1) - bertDataModule = BertDataModule(self.multilingualIndex, batchsize=self.batch_size, max_len=512) - trainer = Trainer(gradient_clip_val=1e-1, max_epochs=self.nepochs, gpus=self.gpus, - logger=self.logger, checkpoint_callback=False) - trainer.fit(self.model, datamodule=bertDataModule) - trainer.test(self.model, datamodule=bertDataModule) - return self - - def transform(self, lX): - """ - Project documents to the common latent space. Output dimensionality is 768. - :param lX: dict {lang: indexed documents} - :return: documents projected to the common latent space. - """ - data = self.multilingualIndex.l_devel_raw_index() - data = tokenize(data, max_len=512) - self.model.to('cuda' if self.gpus else 'cpu') - self.model.eval() - time_init = time() - l_emebds = self.model.encode(data, batch_size=64) - transform_time = round(time() - time_init, 3) - print(f'Executed! Transform took: {transform_time}') - return l_emebds - - def fit_transform(self, lX, ly): - # we can assume that we have already indexed data for transform() since we are first calling fit() - return self.fit(lX, ly).transform(lX) - - From ce4e32aad210fadaf6911956bf80a6244d09881b Mon Sep 17 00:00:00 2001 From: andrea Date: Tue, 26 Jan 2021 15:19:09 +0100 Subject: [PATCH 38/55] fixed imports --- src/main.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/main.py b/src/main.py index ebc43a3..e6b25eb 100644 --- a/src/main.py +++ b/src/main.py @@ -6,6 +6,7 @@ from util.common import MultilingualIndex, get_params, get_method_name from util.evaluation import evaluate from util.results_csv import CSVlog from view_generators import * +from time import time def main(args): From 20c76a210361b5176ed29af397b229d3327486b0 Mon Sep 17 00:00:00 2001 From: andrea Date: Tue, 26 Jan 2021 15:52:09 +0100 Subject: [PATCH 39/55] fixed imports --- src/main.py => main.py | 28 +++++++++++------------- src/requirements.txt => requirements.txt | 0 src/run.sh => run.sh | 0 src/data/datamodule.py | 6 ++--- src/data/dataset_builder.py | 9 ++++---- src/data/reader/jrcacquis_reader.py | 6 ++--- src/data/reader/rcv_reader.py | 4 ++-- src/data/reader/wikipedia_tools.py | 1 - src/data/text_preprocessor.py | 2 +- src/funnelling.py | 6 ++--- src/models/learners.py | 2 +- src/models/lstm_class.py | 3 +-- src/models/pl_bert.py | 4 ++-- src/models/pl_gru.py | 6 ++--- src/util/common.py | 2 +- src/util/embeddings_manager.py | 2 +- src/util/evaluation.py | 3 +-- src/util/pl_metrics.py | 2 +- src/view_generators.py | 26 ++++++++++------------ 19 files changed, 52 insertions(+), 60 deletions(-) rename src/main.py => main.py (92%) rename src/requirements.txt => requirements.txt (100%) rename src/run.sh => run.sh (100%) diff --git a/src/main.py b/main.py similarity index 92% rename from src/main.py rename to main.py index e6b25eb..f6bbeae 100644 --- a/src/main.py +++ b/main.py @@ -1,12 +1,11 @@ from argparse import ArgumentParser -from data.dataset_builder import MultilingualDataset -from funnelling import * -from util.common import MultilingualIndex, get_params, get_method_name -from util.evaluation import evaluate -from util.results_csv import CSVlog -from view_generators import * -from time import time +from src.data.dataset_builder import MultilingualDataset +from src.funnelling import * +from src.util.common import MultilingualIndex, get_params, get_method_name +from src.util.evaluation import evaluate +from src.util.results_csv import CSVlog +from 
src.view_generators import * def main(args): @@ -60,18 +59,17 @@ def main(args): # Training --------------------------------------- print('\n[Training Generalized Funnelling]') - time_init = time() - time_tr = time() + time_init = time.time() gfun.fit(lX, ly) - time_tr = round(time() - time_tr, 3) + time_tr = round(time.time() - time_init, 3) print(f'Training completed in {time_tr} seconds!') # Testing ---------------------------------------- print('\n[Testing Generalized Funnelling]') - time_te = time() + time_te = time.time() ly_ = gfun.predict(lXte) l_eval = evaluate(ly_true=lyte, ly_pred=ly_) - time_te = round(time() - time_te, 3) + time_te = round(time.time() - time_te, 3) print(f'Testing completed in {time_te} seconds!') # Logging --------------------------------------- @@ -101,7 +99,7 @@ def main(args): notes='') print('Averages: MF1, mF1, MK, mK', np.round(np.mean(np.array(metrics), axis=0), 3)) - overall_time = round(time() - time_init, 3) + overall_time = round(time.time() - time_init, 3) exit(f'\nExecuted in: {overall_time} seconds!') @@ -112,7 +110,7 @@ if __name__ == '__main__': parser.add_argument('-o', '--output', dest='csv_dir', help='Result file (default ../csv_log/gfun_results.csv)', type=str, - default='csv_logs/gfun/gfun_results.csv') + default='../csv_logs/gfun/gfun_results.csv') parser.add_argument('-x', '--post_embedder', dest='post_embedder', action='store_true', help='deploy posterior probabilities embedder to compute document embeddings', @@ -138,7 +136,7 @@ if __name__ == '__main__': help='Optimize SVMs C hyperparameter', default=False) - parser.add_argument('-n', '--nepochs', dest='nepochs', type=str, + parser.add_argument('-n', '--nepochs', dest='nepochs', type=int, help='Number of max epochs to train Recurrent embedder (i.e., -g)') parser.add_argument('-j', '--n_jobs', dest='n_jobs', type=int, diff --git a/src/requirements.txt b/requirements.txt similarity index 100% rename from src/requirements.txt rename to requirements.txt diff --git a/src/run.sh b/run.sh similarity index 100% rename from src/run.sh rename to run.sh diff --git a/src/data/datamodule.py b/src/data/datamodule.py index da6ec92..bf874c7 100644 --- a/src/data/datamodule.py +++ b/src/data/datamodule.py @@ -135,15 +135,15 @@ class RecurrentDataModule(pl.LightningDataModule): lPad_index=self.multilingualIndex.l_pad()) def train_dataloader(self): - return DataLoader(self.training_dataset, batch_size=self.batchsize, num_workers=self.n_jobs, + return DataLoader(self.training_dataset, batch_size=self.batchsize, num_workers=N_WORKERS, collate_fn=self.training_dataset.collate_fn) def val_dataloader(self): - return DataLoader(self.val_dataset, batch_size=self.batchsize, num_workers=self.n_jobs, + return DataLoader(self.val_dataset, batch_size=self.batchsize, num_workers=N_WORKERS, collate_fn=self.val_dataset.collate_fn) def test_dataloader(self): - return DataLoader(self.test_dataset, batch_size=self.batchsize, num_workers=self.n_jobs, + return DataLoader(self.test_dataset, batch_size=self.batchsize, num_workers=N_WORKERS, collate_fn=self.test_dataset.collate_fn) diff --git a/src/data/dataset_builder.py b/src/data/dataset_builder.py index 0e91316..90760cb 100644 --- a/src/data/dataset_builder.py +++ b/src/data/dataset_builder.py @@ -1,5 +1,4 @@ import itertools -import pickle import re from os.path import exists @@ -12,10 +11,10 @@ from sklearn.model_selection import train_test_split from sklearn.preprocessing import MultiLabelBinarizer from tqdm import tqdm -from data.languages import NLTK_LANGMAP, 
RCV2_LANGS_WITH_NLTK_STEMMING -from data.reader.jrcacquis_reader import * -from data.reader.rcv_reader import fetch_RCV1, fetch_RCV2 -from data.text_preprocessor import NLTKStemTokenizer, preprocess_documents +from src.data.languages import NLTK_LANGMAP, RCV2_LANGS_WITH_NLTK_STEMMING +from src.data.reader.jrcacquis_reader import * +from src.data.reader.rcv_reader import fetch_RCV1, fetch_RCV2 +from src.data.text_preprocessor import NLTKStemTokenizer, preprocess_documents class MultilingualDataset: diff --git a/src/data/reader/jrcacquis_reader.py b/src/data/reader/jrcacquis_reader.py index e911996..e1e3bc2 100644 --- a/src/data/reader/jrcacquis_reader.py +++ b/src/data/reader/jrcacquis_reader.py @@ -14,9 +14,9 @@ import rdflib from rdflib.namespace import RDF, SKOS from sklearn.datasets import get_data_home -from data.languages import JRC_LANGS -from data.languages import lang_set -from util.file import download_file, list_dirs, list_files +from src.data.languages import JRC_LANGS +from src.data.languages import lang_set +from src.util.file import download_file, list_dirs, list_files """ JRC Acquis' Nomenclature: diff --git a/src/data/reader/rcv_reader.py b/src/data/reader/rcv_reader.py index b3db098..dc2462e 100644 --- a/src/data/reader/rcv_reader.py +++ b/src/data/reader/rcv_reader.py @@ -5,8 +5,8 @@ from zipfile import ZipFile import numpy as np -from util.file import download_file_if_not_exists -from util.file import list_files +from src.util.file import download_file_if_not_exists +from src.util.file import list_files """ RCV2's Nomenclature: diff --git a/src/data/reader/wikipedia_tools.py b/src/data/reader/wikipedia_tools.py index 9558fb6..6ae89ff 100644 --- a/src/data/reader/wikipedia_tools.py +++ b/src/data/reader/wikipedia_tools.py @@ -11,7 +11,6 @@ from os.path import join from xml.sax.saxutils import escape import numpy as np - from util.file import list_dirs, list_files policies = ["IN_ALL_LANGS", "IN_ANY_LANG"] diff --git a/src/data/text_preprocessor.py b/src/data/text_preprocessor.py index fcfddba..183df56 100644 --- a/src/data/text_preprocessor.py +++ b/src/data/text_preprocessor.py @@ -2,7 +2,7 @@ from nltk import word_tokenize from nltk.corpus import stopwords from nltk.stem import SnowballStemmer -from data.languages import NLTK_LANGMAP +from src.data.languages import NLTK_LANGMAP def preprocess_documents(documents, lang): diff --git a/src/funnelling.py b/src/funnelling.py index 812a937..ba2be1b 100644 --- a/src/funnelling.py +++ b/src/funnelling.py @@ -1,6 +1,6 @@ -from models.learners import * -from util.common import _normalize -from view_generators import VanillaFunGen +from src.models.learners import * +from src.util.common import _normalize +from src.view_generators import VanillaFunGen class DocEmbedderList: diff --git a/src/models/learners.py b/src/models/learners.py index 2654109..46737c6 100644 --- a/src/models/learners.py +++ b/src/models/learners.py @@ -7,7 +7,7 @@ from sklearn.model_selection import GridSearchCV from sklearn.multiclass import OneVsRestClassifier from sklearn.svm import SVC -from util.standardizer import StandardizeTransformer +from src.util.standardizer import StandardizeTransformer def get_learner(calibrate=False, kernel='linear', C=1): diff --git a/src/models/lstm_class.py b/src/models/lstm_class.py index 7f2cf59..cd4000b 100755 --- a/src/models/lstm_class.py +++ b/src/models/lstm_class.py @@ -1,7 +1,6 @@ #taken from https://github.com/prakashpandey9/Text-Classification-Pytorch/blob/master/models/LSTM.py -from torch.autograd import Variable 
- from models.helpers import * +from torch.autograd import Variable class RNNMultilingualClassifier(nn.Module): diff --git a/src/models/pl_bert.py b/src/models/pl_bert.py index afb28b5..a9b669f 100644 --- a/src/models/pl_bert.py +++ b/src/models/pl_bert.py @@ -3,8 +3,8 @@ import torch from torch.optim.lr_scheduler import StepLR from transformers import BertForSequenceClassification, AdamW -from util.common import define_pad_length, pad -from util.pl_metrics import CustomF1, CustomK +from src.util.common import define_pad_length, pad +from src.util.pl_metrics import CustomF1, CustomK class BertModel(pl.LightningModule): diff --git a/src/models/pl_gru.py b/src/models/pl_gru.py index afb12e6..4adb148 100644 --- a/src/models/pl_gru.py +++ b/src/models/pl_gru.py @@ -7,9 +7,9 @@ from torch.autograd import Variable from torch.optim.lr_scheduler import StepLR from transformers import AdamW -from models.helpers import init_embeddings -from util.common import define_pad_length, pad -from util.pl_metrics import CustomF1, CustomK +from src.models.helpers import init_embeddings +from src.util.common import define_pad_length, pad +from src.util.pl_metrics import CustomF1, CustomK class RecurrentModel(pl.LightningModule): diff --git a/src/util/common.py b/src/util/common.py index 61ac52f..913014c 100644 --- a/src/util/common.py +++ b/src/util/common.py @@ -4,7 +4,7 @@ from sklearn.feature_extraction.text import TfidfVectorizer from sklearn.model_selection import train_test_split from sklearn.preprocessing import normalize -from util.embeddings_manager import supervised_embeddings_tfidf +from src.util.embeddings_manager import supervised_embeddings_tfidf class TfidfVectorizerMultilingual: diff --git a/src/util/embeddings_manager.py b/src/util/embeddings_manager.py index 1d708fa..0526582 100644 --- a/src/util/embeddings_manager.py +++ b/src/util/embeddings_manager.py @@ -4,7 +4,7 @@ import numpy as np import torch from torchtext.vocab import Vectors -from util.SIF_embed import remove_pc +from src.util.SIF_embed import remove_pc class PretrainedEmbeddings(ABC): diff --git a/src/util/evaluation.py b/src/util/evaluation.py index 010d0e9..45b8b2b 100644 --- a/src/util/evaluation.py +++ b/src/util/evaluation.py @@ -1,7 +1,6 @@ -import numpy as np from joblib import Parallel, delayed -from util.metrics import * +from src.util.metrics import * def evaluation_metrics(y, y_): diff --git a/src/util/pl_metrics.py b/src/util/pl_metrics.py index bf8aa99..765a6a2 100644 --- a/src/util/pl_metrics.py +++ b/src/util/pl_metrics.py @@ -1,7 +1,7 @@ import torch from pytorch_lightning.metrics import Metric -from util.common import is_false, is_true +from src.util.common import is_false, is_true def _update(pred, target, device): diff --git a/src/view_generators.py b/src/view_generators.py index 384ec76..b0f70bf 100644 --- a/src/view_generators.py +++ b/src/view_generators.py @@ -21,12 +21,12 @@ from time import time from pytorch_lightning import Trainer from pytorch_lightning.loggers import TensorBoardLogger -from data.datamodule import RecurrentDataModule, BertDataModule, tokenize -from models.learners import * -from models.pl_bert import BertModel -from models.pl_gru import RecurrentModel -from util.common import TfidfVectorizerMultilingual, _normalize -from util.embeddings_manager import MuseLoader, XdotM, wce_matrix +from src.data.datamodule import RecurrentDataModule, BertDataModule, tokenize +from src.models.learners import * +from src.models.pl_bert import BertModel +from src.models.pl_gru import RecurrentModel +from 
src.util.common import TfidfVectorizerMultilingual, _normalize +from src.util.embeddings_manager import MuseLoader, XdotM, wce_matrix class ViewGen(ABC): @@ -232,7 +232,7 @@ class RecurrentGen(ViewGen): self.multilingualIndex.train_val_split(val_prop=0.2, max_val=2000, seed=1) self.multilingualIndex.embedding_matrices(self.pretrained, supervised=self.wce) self.model = self._init_model() - self.logger = TensorBoardLogger(save_dir='tb_logs', name='rnn', default_hp_metric=False) + self.logger = TensorBoardLogger(save_dir='../tb_logs', name='rnn', default_hp_metric=False) # self.logger = CSVLogger(save_dir='csv_logs', name='rnn_dev') def _init_model(self): @@ -293,9 +293,9 @@ class RecurrentGen(ViewGen): data = self.multilingualIndex.l_devel_index() self.model.to('cuda' if self.gpus else 'cpu') self.model.eval() - time_init = time() + time_init = time.time() l_embeds = self.model.encode(data, l_pad, batch_size=256) - transform_time = round(time() - time_init, 3) + transform_time = round(time.time() - time_init, 3) print(f'Executed! Transform took: {transform_time}') return l_embeds @@ -328,7 +328,7 @@ class BertGen(ViewGen): self.n_jobs = n_jobs self.stored_path = stored_path self.model = self._init_model() - self.logger = TensorBoardLogger(save_dir='tb_logs', name='bert', default_hp_metric=False) + self.logger = TensorBoardLogger(save_dir='../tb_logs', name='bert', default_hp_metric=False) def _init_model(self): output_size = self.multilingualIndex.get_target_dim() @@ -362,14 +362,12 @@ class BertGen(ViewGen): data = tokenize(data, max_len=512) self.model.to('cuda' if self.gpus else 'cpu') self.model.eval() - time_init = time() + time_init = time.time() l_emebds = self.model.encode(data, batch_size=64) - transform_time = round(time() - time_init, 3) + transform_time = round(time.time() - time_init, 3) print(f'Executed! Transform took: {transform_time}') return l_emebds def fit_transform(self, lX, ly): # we can assume that we have already indexed data for transform() since we are first calling fit() return self.fit(lX, ly).transform(lX) - - From bcc22fe44aabdf726d9fdf89773281b34091e827 Mon Sep 17 00:00:00 2001 From: andrea Date: Tue, 26 Jan 2021 16:17:17 +0100 Subject: [PATCH 40/55] fixed imports + readme.md --- main.py | 2 +- readme.md | 40 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 41 insertions(+), 1 deletion(-) create mode 100644 readme.md diff --git a/main.py b/main.py index f6bbeae..aa10a2a 100644 --- a/main.py +++ b/main.py @@ -109,7 +109,7 @@ if __name__ == '__main__': parser.add_argument('dataset', help='Path to the dataset') parser.add_argument('-o', '--output', dest='csv_dir', - help='Result file (default ../csv_log/gfun_results.csv)', type=str, + help='Result file (default ../csv_logs/gfun/gfun_results.csv)', type=str, default='../csv_logs/gfun/gfun_results.csv') parser.add_argument('-x', '--post_embedder', dest='post_embedder', action='store_true', diff --git a/readme.md b/readme.md new file mode 100644 index 0000000..3ed0c75 --- /dev/null +++ b/readme.md @@ -0,0 +1,40 @@ +``` +usage: main.py [-h] [-o CSV_DIR] [-x] [-w] [-m] [-b] [-g] [-c] [-n NEPOCHS] + [-j N_JOBS] [--muse_dir MUSE_DIR] [--gru_wce] + [--gru_dir GRU_DIR] [--bert_dir BERT_DIR] [--gpus GPUS] + dataset + +Run generalized funnelling, A. Moreo, A. Pedrotti and F. Sebastiani (2020). 
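# Example invocation (illustrative only; the dataset path is a placeholder):
#   python main.py path/to/multilingual_dataset.pickle -x -m -g --gru_wce --gpus 1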
+ +positional arguments: + dataset Path to the dataset + +optional arguments: + -h, --help show this help message and exit + -o CSV_DIR, --output CSV_DIR + Result file (default ../csv_logs/gfun/gfun_results.csv) + -x, --post_embedder deploy posterior probabilities embedder to compute + document embeddings + -w, --wce_embedder deploy (supervised) Word-Class embedder to the compute + document embeddings + -m, --muse_embedder deploy (pretrained) MUSE embedder to compute document + embeddings + -b, --bert_embedder deploy multilingual Bert to compute document + embeddings + -g, --gru_embedder deploy a GRU in order to compute document embeddings + -c, --c_optimize Optimize SVMs C hyperparameter + -n NEPOCHS, --nepochs NEPOCHS + Number of max epochs to train Recurrent embedder + (i.e., -g) + -j N_JOBS, --n_jobs N_JOBS + Number of parallel jobs (default is -1, all) + --muse_dir MUSE_DIR Path to the MUSE polylingual word embeddings (default + ../embeddings) + --gru_wce Deploy WCE embedding as embedding layer of the GRU + View Generator + --gru_dir GRU_DIR Set the path to a pretrained GRU model (i.e., -g view + generator) + --bert_dir BERT_DIR Set the path to a pretrained mBERT model (i.e., -b + view generator) + --gpus GPUS specifies how many GPUs to use per node +``` \ No newline at end of file From c769f59d5adb1a673cfe384f4ad6ccc1ef3d3019 Mon Sep 17 00:00:00 2001 From: andrea Date: Tue, 26 Jan 2021 16:20:51 +0100 Subject: [PATCH 41/55] fixed imports + readme.md --- readme.md | 34 +++++++++++----------------------- 1 file changed, 11 insertions(+), 23 deletions(-) diff --git a/readme.md b/readme.md index 3ed0c75..d3f5861 100644 --- a/readme.md +++ b/readme.md @@ -11,30 +11,18 @@ positional arguments: optional arguments: -h, --help show this help message and exit - -o CSV_DIR, --output CSV_DIR - Result file (default ../csv_logs/gfun/gfun_results.csv) - -x, --post_embedder deploy posterior probabilities embedder to compute - document embeddings - -w, --wce_embedder deploy (supervised) Word-Class embedder to the compute - document embeddings - -m, --muse_embedder deploy (pretrained) MUSE embedder to compute document - embeddings - -b, --bert_embedder deploy multilingual Bert to compute document - embeddings + -o, --output Result file (default ../csv_logs/gfun/gfun_results.csv) + -x, --post_embedder deploy posterior probabilities embedder to compute document embeddings + -w, --wce_embedder deploy (supervised) Word-Class embedder to the compute document embeddings + -m, --muse_embedder deploy (pretrained) MUSE embedder to compute document embeddings + -b, --bert_embedder deploy multilingual Bert to compute document embeddings -g, --gru_embedder deploy a GRU in order to compute document embeddings -c, --c_optimize Optimize SVMs C hyperparameter - -n NEPOCHS, --nepochs NEPOCHS - Number of max epochs to train Recurrent embedder - (i.e., -g) - -j N_JOBS, --n_jobs N_JOBS - Number of parallel jobs (default is -1, all) - --muse_dir MUSE_DIR Path to the MUSE polylingual word embeddings (default - ../embeddings) - --gru_wce Deploy WCE embedding as embedding layer of the GRU - View Generator - --gru_dir GRU_DIR Set the path to a pretrained GRU model (i.e., -g view - generator) - --bert_dir BERT_DIR Set the path to a pretrained mBERT model (i.e., -b - view generator) + -n, --nepochs Number of max epochs to train Recurrent embedder (i.e., -g) + -j, --n_jobs Number of parallel jobs (default is -1, all) + --muse_dir MUSE_DIR Path to the MUSE polylingual word embeddings (default ../embeddings) + --gru_wce Deploy WCE 
embedding as embedding layer of the GRU View Generator + --gru_dir GRU_DIR Set the path to a pretrained GRU model (i.e., -g view generator) + --bert_dir BERT_DIR Set the path to a pretrained mBERT model (i.e., -b view generator) --gpus GPUS specifies how many GPUs to use per node ``` \ No newline at end of file From c10a33806691968af48980925619de36ed654d89 Mon Sep 17 00:00:00 2001 From: andrea Date: Tue, 26 Jan 2021 16:25:58 +0100 Subject: [PATCH 42/55] fixed imports + readme.md --- readme.md | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/readme.md b/readme.md index d3f5861..310b500 100644 --- a/readme.md +++ b/readme.md @@ -1,4 +1,22 @@ +# Generalized Funnelling (gFun) + +## Requirements +```commandline +transformers==2.11.0 +pandas==0.25.3 +numpy==1.17.4 +joblib==0.14.0 +tqdm==4.50.2 +pytorch_lightning==1.1.2 +torch==1.3.1 +nltk==3.4.5 +scipy==1.3.3 +rdflib==4.2.2 +torchtext==0.4.0 +scikit_learn==0.24.1 ``` + +```commandline usage: main.py [-h] [-o CSV_DIR] [-x] [-w] [-m] [-b] [-g] [-c] [-n NEPOCHS] [-j N_JOBS] [--muse_dir MUSE_DIR] [--gru_wce] [--gru_dir GRU_DIR] [--bert_dir BERT_DIR] [--gpus GPUS] From ffd870b4b4a8ec364fc1d062f6694c3da24b0073 Mon Sep 17 00:00:00 2001 From: andrea Date: Tue, 26 Jan 2021 16:27:00 +0100 Subject: [PATCH 43/55] readme.md --- readme.md | 1 + 1 file changed, 1 insertion(+) diff --git a/readme.md b/readme.md index 310b500..713dbad 100644 --- a/readme.md +++ b/readme.md @@ -16,6 +16,7 @@ torchtext==0.4.0 scikit_learn==0.24.1 ``` +## Usage ```commandline usage: main.py [-h] [-o CSV_DIR] [-x] [-w] [-m] [-b] [-g] [-c] [-n NEPOCHS] [-j N_JOBS] [--muse_dir MUSE_DIR] [--gru_wce] From 1a501949a16f1575bd5293eac8410ca741b4138f Mon Sep 17 00:00:00 2001 From: andrea Date: Tue, 26 Jan 2021 17:23:54 +0100 Subject: [PATCH 44/55] readme.md --- main.py | 15 ++++++++++----- readme.md | 19 ++++++++++--------- 2 files changed, 20 insertions(+), 14 deletions(-) diff --git a/main.py b/main.py index aa10a2a..42623bd 100644 --- a/main.py +++ b/main.py @@ -42,11 +42,11 @@ def main(args): if args.gru_embedder: rnnEmbedder = RecurrentGen(multilingualIndex, pretrained_embeddings=lMuse, wce=args.gru_wce, batch_size=256, - nepochs=args.nepochs, gpus=args.gpus, n_jobs=args.n_jobs) + nepochs=args.nepochs_rnn, gpus=args.gpus, n_jobs=args.n_jobs) embedder_list.append(rnnEmbedder) if args.bert_embedder: - bertEmbedder = BertGen(multilingualIndex, batch_size=4, nepochs=10, gpus=args.gpus, n_jobs=args.n_jobs) + bertEmbedder = BertGen(multilingualIndex, batch_size=4, nepochs=args.nepochs_bert, gpus=args.gpus, n_jobs=args.n_jobs) embedder_list.append(bertEmbedder) # Init DocEmbedderList (i.e., first-tier learners or view generators) and metaclassifier @@ -136,13 +136,18 @@ if __name__ == '__main__': help='Optimize SVMs C hyperparameter', default=False) - parser.add_argument('-n', '--nepochs', dest='nepochs', type=int, - help='Number of max epochs to train Recurrent embedder (i.e., -g)') - parser.add_argument('-j', '--n_jobs', dest='n_jobs', type=int, help='Number of parallel jobs (default is -1, all)', default=-1) + parser.add_argument('--nepochs_rnn', dest='nepochs_rnn', type=int, + help='Number of max epochs to train Recurrent embedder (i.e., -g), default 150.', + default=150) + + parser.add_argument('--nepochs_bert', dest='nepochs_bert', type=int, + help='Number of max epochs to train Bert model (i.e., -g), default 10', + default=10) + parser.add_argument('--muse_dir', dest='muse_dir', type=str, help='Path to the MUSE polylingual word embeddings (default ../embeddings)', 
default='../embeddings') diff --git a/readme.md b/readme.md index 713dbad..06c8633 100644 --- a/readme.md +++ b/readme.md @@ -30,18 +30,19 @@ positional arguments: optional arguments: -h, --help show this help message and exit - -o, --output Result file (default ../csv_logs/gfun/gfun_results.csv) + -o, --output result file (default ../csv_logs/gfun/gfun_results.csv) -x, --post_embedder deploy posterior probabilities embedder to compute document embeddings -w, --wce_embedder deploy (supervised) Word-Class embedder to the compute document embeddings -m, --muse_embedder deploy (pretrained) MUSE embedder to compute document embeddings -b, --bert_embedder deploy multilingual Bert to compute document embeddings -g, --gru_embedder deploy a GRU in order to compute document embeddings - -c, --c_optimize Optimize SVMs C hyperparameter - -n, --nepochs Number of max epochs to train Recurrent embedder (i.e., -g) - -j, --n_jobs Number of parallel jobs (default is -1, all) - --muse_dir MUSE_DIR Path to the MUSE polylingual word embeddings (default ../embeddings) - --gru_wce Deploy WCE embedding as embedding layer of the GRU View Generator - --gru_dir GRU_DIR Set the path to a pretrained GRU model (i.e., -g view generator) - --bert_dir BERT_DIR Set the path to a pretrained mBERT model (i.e., -b view generator) - --gpus GPUS specifies how many GPUs to use per node + -c, --c_optimize optimize SVMs C hyperparameter + -j, --n_jobs number of parallel jobs (default is -1, all) + --nepochs_rnn number of max epochs to train Recurrent embedder (i.e., -g), default 150. + --nepochs_bert number of max epochs to train Bert model (i.e., -g), default 10 + --muse_dir path to the MUSE polylingual word embeddings (default ../embeddings) + --gru_wce deploy WCE embedding as embedding layer of the GRU View Generator + --gru_dir set the path to a pretrained GRU model (i.e., -g view generator) + --bert_dir set the path to a pretrained mBERT model (i.e., -b view generator) + --gpus specifies how many GPUs to use per node ``` \ No newline at end of file From 5cd36d27fc03d35967525e30cbd2be0f332429fe Mon Sep 17 00:00:00 2001 From: andrea Date: Tue, 26 Jan 2021 18:04:15 +0100 Subject: [PATCH 45/55] fixed cuda oom at inference time --- src/models/pl_bert.py | 5 +++++ src/view_generators.py | 15 ++++++++------- 2 files changed, 13 insertions(+), 7 deletions(-) diff --git a/src/models/pl_bert.py b/src/models/pl_bert.py index a9b669f..129c3b4 100644 --- a/src/models/pl_bert.py +++ b/src/models/pl_bert.py @@ -163,6 +163,11 @@ class BertModel(pl.LightningModule): batch = pad(batch, pad_index=self.bert.config.pad_token_id, max_pad_length=max_pad_len) batch = torch.LongTensor(batch).to('cuda' if self.gpus else 'cpu') _, output = self.forward(batch) + + # deleting batch from gpu to avoid cuda OOM + del batch + torch.cuda.empty_cache() + doc_embeds = output[-1][:, 0, :] l_embed[lang].append(doc_embeds.cpu()) for k, v in l_embed.items(): diff --git a/src/view_generators.py b/src/view_generators.py index b0f70bf..452714c 100644 --- a/src/view_generators.py +++ b/src/view_generators.py @@ -16,7 +16,7 @@ This module contains the view generators that take care of computing the view sp - View generator (-b): generates document embedding via mBERT model. 
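The CUDA OOM fix above frees each input batch and empties the cache as soon as its forward pass is done. A condensed sketch of that pattern follows; encode_batches is a hypothetical helper, the model is assumed to return (logits, hidden_states) as BertForSequenceClassification does with hidden states enabled, and torch.no_grad() is an extra precaution not shown in the patch.

```python
import torch

def encode_batches(model, batches, device='cuda'):
    """Batched document encoding that releases GPU memory as it goes (illustrative sketch)."""
    doc_embeddings = []
    with torch.no_grad():                              # no gradients are needed at inference time
        for batch in batches:                          # each batch: LongTensor of token ids
            batch = batch.to(device)
            _, hidden_states = model(batch)            # assumption: forward returns (logits, hidden states)
            doc_embeddings.append(hidden_states[-1][:, 0, :].cpu())  # keep only the first-token state, on CPU
            del batch                                  # drop the GPU copy of the batch ...
            torch.cuda.empty_cache()                   # ... and release cached memory (the OOM fix above)
    return torch.cat(doc_embeddings, dim=0)
```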
""" from abc import ABC, abstractmethod -from time import time +# from time import time from pytorch_lightning import Trainer from pytorch_lightning.loggers import TensorBoardLogger @@ -27,6 +27,7 @@ from src.models.pl_bert import BertModel from src.models.pl_gru import RecurrentModel from src.util.common import TfidfVectorizerMultilingual, _normalize from src.util.embeddings_manager import MuseLoader, XdotM, wce_matrix +# TODO: add early stop monitoring validation macroF1 + model checkpointing and loading from checkpoint class ViewGen(ABC): @@ -293,10 +294,10 @@ class RecurrentGen(ViewGen): data = self.multilingualIndex.l_devel_index() self.model.to('cuda' if self.gpus else 'cpu') self.model.eval() - time_init = time.time() + # time_init = time.time() l_embeds = self.model.encode(data, l_pad, batch_size=256) - transform_time = round(time.time() - time_init, 3) - print(f'Executed! Transform took: {transform_time}') + # transform_time = round(time.time() - time_init, 3) + # print(f'Executed! Transform took: {transform_time}') return l_embeds def fit_transform(self, lX, ly): @@ -362,10 +363,10 @@ class BertGen(ViewGen): data = tokenize(data, max_len=512) self.model.to('cuda' if self.gpus else 'cpu') self.model.eval() - time_init = time.time() + # time_init = time.time() l_emebds = self.model.encode(data, batch_size=64) - transform_time = round(time.time() - time_init, 3) - print(f'Executed! Transform took: {transform_time}') + # transform_time = round(time.time() - time_init, 3) + # print(f'Executed! Transform took: {transform_time}') return l_emebds def fit_transform(self, lX, ly): From e9a410faa44689438919376c793a0e18dc95667b Mon Sep 17 00:00:00 2001 From: andrea Date: Tue, 26 Jan 2021 18:12:14 +0100 Subject: [PATCH 46/55] early stopping --- src/data/datamodule.py | 24 ++++++++++++------------ src/view_generators.py | 12 +++++++++--- 2 files changed, 21 insertions(+), 15 deletions(-) diff --git a/src/data/datamodule.py b/src/data/datamodule.py index bf874c7..66146b3 100644 --- a/src/data/datamodule.py +++ b/src/data/datamodule.py @@ -112,24 +112,24 @@ class RecurrentDataModule(pl.LightningDataModule): if stage == 'fit' or stage is None: l_train_index, l_train_target = self.multilingualIndex.l_train() # Debug settings: reducing number of samples - l_train_index = {l: train[:5] for l, train in l_train_index.items()} - l_train_target = {l: target[:5] for l, target in l_train_target.items()} + # l_train_index = {l: train[:5] for l, train in l_train_index.items()} + # l_train_target = {l: target[:5] for l, target in l_train_target.items()} self.training_dataset = RecurrentDataset(l_train_index, l_train_target, lPad_index=self.multilingualIndex.l_pad()) l_val_index, l_val_target = self.multilingualIndex.l_val() # Debug settings: reducing number of samples - l_val_index = {l: train[:5] for l, train in l_val_index.items()} - l_val_target = {l: target[:5] for l, target in l_val_target.items()} + # l_val_index = {l: train[:5] for l, train in l_val_index.items()} + # l_val_target = {l: target[:5] for l, target in l_val_target.items()} self.val_dataset = RecurrentDataset(l_val_index, l_val_target, lPad_index=self.multilingualIndex.l_pad()) if stage == 'test' or stage is None: l_test_index, l_test_target = self.multilingualIndex.l_test() # Debug settings: reducing number of samples - l_test_index = {l: train[:5] for l, train in l_test_index.items()} - l_test_target = {l: target[:5] for l, target in l_test_target.items()} + # l_test_index = {l: train[:5] for l, train in l_test_index.items()} + # 
l_test_target = {l: target[:5] for l, target in l_test_target.items()} self.test_dataset = RecurrentDataset(l_test_index, l_test_target, lPad_index=self.multilingualIndex.l_pad()) @@ -182,8 +182,8 @@ class BertDataModule(RecurrentDataModule): if stage == 'fit' or stage is None: l_train_raw, l_train_target = self.multilingualIndex.l_train_raw() # Debug settings: reducing number of samples - l_train_raw = {l: train[:5] for l, train in l_train_raw.items()} - l_train_target = {l: target[:5] for l, target in l_train_target.items()} + # l_train_raw = {l: train[:5] for l, train in l_train_raw.items()} + # l_train_target = {l: target[:5] for l, target in l_train_target.items()} l_train_index = tokenize(l_train_raw, max_len=self.max_len) self.training_dataset = RecurrentDataset(l_train_index, l_train_target, @@ -191,8 +191,8 @@ class BertDataModule(RecurrentDataModule): l_val_raw, l_val_target = self.multilingualIndex.l_val_raw() # Debug settings: reducing number of samples - l_val_raw = {l: train[:5] for l, train in l_val_raw.items()} - l_val_target = {l: target[:5] for l, target in l_val_target.items()} + # l_val_raw = {l: train[:5] for l, train in l_val_raw.items()} + # l_val_target = {l: target[:5] for l, target in l_val_target.items()} l_val_index = tokenize(l_val_raw, max_len=self.max_len) self.val_dataset = RecurrentDataset(l_val_index, l_val_target, @@ -201,8 +201,8 @@ class BertDataModule(RecurrentDataModule): if stage == 'test' or stage is None: l_test_raw, l_test_target = self.multilingualIndex.l_test_raw() # Debug settings: reducing number of samples - l_test_raw = {l: train[:5] for l, train in l_test_raw.items()} - l_test_target = {l: target[:5] for l, target in l_test_target.items()} + # l_test_raw = {l: train[:5] for l, train in l_test_raw.items()} + # l_test_target = {l: target[:5] for l, target in l_test_target.items()} l_test_index = tokenize(l_test_raw, max_len=self.max_len) self.test_dataset = RecurrentDataset(l_test_index, l_test_target, diff --git a/src/view_generators.py b/src/view_generators.py index 452714c..20a8045 100644 --- a/src/view_generators.py +++ b/src/view_generators.py @@ -20,6 +20,8 @@ from abc import ABC, abstractmethod from pytorch_lightning import Trainer from pytorch_lightning.loggers import TensorBoardLogger +from pytorch_lightning.callbacks.early_stopping import EarlyStopping + from src.data.datamodule import RecurrentDataModule, BertDataModule, tokenize from src.models.learners import * @@ -27,7 +29,7 @@ from src.models.pl_bert import BertModel from src.models.pl_gru import RecurrentModel from src.util.common import TfidfVectorizerMultilingual, _normalize from src.util.embeddings_manager import MuseLoader, XdotM, wce_matrix -# TODO: add early stop monitoring validation macroF1 + model checkpointing and loading from checkpoint +# TODO: add model checkpointing and loading from checkpoint + training on validation after convergence is reached class ViewGen(ABC): @@ -235,6 +237,8 @@ class RecurrentGen(ViewGen): self.model = self._init_model() self.logger = TensorBoardLogger(save_dir='../tb_logs', name='rnn', default_hp_metric=False) # self.logger = CSVLogger(save_dir='csv_logs', name='rnn_dev') + self.early_stop_callback = EarlyStopping(monitor='val-macroF1', min_delta=0.00, + patience=5, verbose=False, mode='max') def _init_model(self): if self.stored_path: @@ -271,7 +275,7 @@ class RecurrentGen(ViewGen): print('# Fitting RecurrentGen (G)...') recurrentDataModule = RecurrentDataModule(self.multilingualIndex, batchsize=self.batch_size, n_jobs=self.n_jobs) 
trainer = Trainer(gradient_clip_val=1e-1, gpus=self.gpus, logger=self.logger, max_epochs=self.nepochs, - checkpoint_callback=False) + callbacks=[self.early_stop_callback], checkpoint_callback=False) # vanilla_torch_model = torch.load( # '/home/andreapdr/funneling_pdr/checkpoint/gru_viewgen_-jrc_doclist_1958-2005vs2006_all_top300_noparallel_processed_run0.pickle') @@ -330,6 +334,8 @@ class BertGen(ViewGen): self.stored_path = stored_path self.model = self._init_model() self.logger = TensorBoardLogger(save_dir='../tb_logs', name='bert', default_hp_metric=False) + self.early_stop_callback = EarlyStopping(monitor='val-macroF1', min_delta=0.00, + patience=5, verbose=False, mode='max') def _init_model(self): output_size = self.multilingualIndex.get_target_dim() @@ -348,7 +354,7 @@ class BertGen(ViewGen): self.multilingualIndex.train_val_split(val_prop=0.2, max_val=2000, seed=1) bertDataModule = BertDataModule(self.multilingualIndex, batchsize=self.batch_size, max_len=512) trainer = Trainer(gradient_clip_val=1e-1, max_epochs=self.nepochs, gpus=self.gpus, - logger=self.logger, checkpoint_callback=False) + logger=self.logger, callbacks=[self.early_stop_callback], checkpoint_callback=False) trainer.fit(self.model, datamodule=bertDataModule) trainer.test(self.model, datamodule=bertDataModule) return self From 79cdaa0beb622e6c76957d1743a57d298c05e1a4 Mon Sep 17 00:00:00 2001 From: andrea Date: Tue, 26 Jan 2021 18:13:34 +0100 Subject: [PATCH 47/55] early stopping + typos --- src/view_generators.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/view_generators.py b/src/view_generators.py index 20a8045..d014ef0 100644 --- a/src/view_generators.py +++ b/src/view_generators.py @@ -370,10 +370,10 @@ class BertGen(ViewGen): self.model.to('cuda' if self.gpus else 'cpu') self.model.eval() # time_init = time.time() - l_emebds = self.model.encode(data, batch_size=64) + l_embeds = self.model.encode(data, batch_size=64) # transform_time = round(time.time() - time_init, 3) # print(f'Executed! 
Transform took: {transform_time}') - return l_emebds + return l_embeds def fit_transform(self, lX, ly): # we can assume that we have already indexed data for transform() since we are first calling fit() From bb84422d249fb04980efcefcc64fcfb2ebb70b53 Mon Sep 17 00:00:00 2001 From: andrea Date: Tue, 26 Jan 2021 18:56:24 +0100 Subject: [PATCH 48/55] early stopping + typos --- main.py | 20 ++++++++++---------- readme.md | 2 +- src/view_generators.py | 18 ++++++++++++------ 3 files changed, 23 insertions(+), 17 deletions(-) diff --git a/main.py b/main.py index 42623bd..b99f024 100644 --- a/main.py +++ b/main.py @@ -108,7 +108,7 @@ if __name__ == '__main__': parser.add_argument('dataset', help='Path to the dataset') - parser.add_argument('-o', '--output', dest='csv_dir', + parser.add_argument('-o', '--output', dest='csv_dir', metavar='', help='Result file (default ../csv_logs/gfun/gfun_results.csv)', type=str, default='../csv_logs/gfun/gfun_results.csv') @@ -133,22 +133,22 @@ if __name__ == '__main__': default=False) parser.add_argument('-c', '--c_optimize', dest='optimc', action='store_true', - help='Optimize SVMs C hyperparameter', + help='Optimize SVMs C hyperparameter at metaclassifier level', default=False) - parser.add_argument('-j', '--n_jobs', dest='n_jobs', type=int, + parser.add_argument('-j', '--n_jobs', dest='n_jobs', type=int, metavar='', help='Number of parallel jobs (default is -1, all)', default=-1) - parser.add_argument('--nepochs_rnn', dest='nepochs_rnn', type=int, - help='Number of max epochs to train Recurrent embedder (i.e., -g), default 150.', + parser.add_argument('--nepochs_rnn', dest='nepochs_rnn', type=int, metavar='', + help='Number of max epochs to train Recurrent embedder (i.e., -g), default 150', default=150) - parser.add_argument('--nepochs_bert', dest='nepochs_bert', type=int, + parser.add_argument('--nepochs_bert', dest='nepochs_bert', type=int, metavar='', help='Number of max epochs to train Bert model (i.e., -g), default 10', default=10) - parser.add_argument('--muse_dir', dest='muse_dir', type=str, + parser.add_argument('--muse_dir', dest='muse_dir', type=str, metavar='', help='Path to the MUSE polylingual word embeddings (default ../embeddings)', default='../embeddings') @@ -156,15 +156,15 @@ if __name__ == '__main__': help='Deploy WCE embedding as embedding layer of the GRU View Generator', default=False) - parser.add_argument('--gru_dir', dest='gru_dir', type=str, + parser.add_argument('--gru_dir', dest='gru_dir', type=str, metavar='', help='Set the path to a pretrained GRU model (i.e., -g view generator)', default=None) - parser.add_argument('--bert_dir', dest='bert_dir', type=str, + parser.add_argument('--bert_dir', dest='bert_dir', type=str, metavar='', help='Set the path to a pretrained mBERT model (i.e., -b view generator)', default=None) - parser.add_argument('--gpus', help='specifies how many GPUs to use per node', + parser.add_argument('--gpus', metavar='', help='specifies how many GPUs to use per node', default=None) args = parser.parse_args() diff --git a/readme.md b/readme.md index 06c8633..4569ba8 100644 --- a/readme.md +++ b/readme.md @@ -38,7 +38,7 @@ optional arguments: -g, --gru_embedder deploy a GRU in order to compute document embeddings -c, --c_optimize optimize SVMs C hyperparameter -j, --n_jobs number of parallel jobs (default is -1, all) - --nepochs_rnn number of max epochs to train Recurrent embedder (i.e., -g), default 150. 
+ --nepochs_rnn number of max epochs to train Recurrent embedder (i.e., -g), default 150 --nepochs_bert number of max epochs to train Bert model (i.e., -g), default 10 --muse_dir path to the MUSE polylingual word embeddings (default ../embeddings) --gru_wce deploy WCE embedding as embedding layer of the GRU View Generator diff --git a/src/view_generators.py b/src/view_generators.py index d014ef0..9b352f8 100644 --- a/src/view_generators.py +++ b/src/view_generators.py @@ -22,13 +22,13 @@ from pytorch_lightning import Trainer from pytorch_lightning.loggers import TensorBoardLogger from pytorch_lightning.callbacks.early_stopping import EarlyStopping - from src.data.datamodule import RecurrentDataModule, BertDataModule, tokenize from src.models.learners import * from src.models.pl_bert import BertModel from src.models.pl_gru import RecurrentModel from src.util.common import TfidfVectorizerMultilingual, _normalize from src.util.embeddings_manager import MuseLoader, XdotM, wce_matrix +from src.util.file import create_if_not_exist # TODO: add model checkpointing and loading from checkpoint + training on validation after convergence is reached @@ -203,7 +203,7 @@ class RecurrentGen(ViewGen): the network internal state at the second feed-forward layer level. Training metrics are logged via TensorBoard. """ def __init__(self, multilingualIndex, pretrained_embeddings, wce, batch_size=512, nepochs=50, - gpus=0, n_jobs=-1, stored_path=None): + gpus=0, n_jobs=-1, patience=5, stored_path=None): """ Init RecurrentGen. :param multilingualIndex: MultilingualIndex, it is a dictionary of training and test documents @@ -217,6 +217,7 @@ class RecurrentGen(ViewGen): :param nepochs: int, number of max epochs to train the model. :param gpus: int, specifies how many GPUs to use per node. If False computation will take place on cpu. :param n_jobs: int, number of concurrent workers (i.e., parallelizing data loading). + :param patience: int, number of epochs with no improvements in val-macroF1 before early stopping. :param stored_path: str, path to a pretrained model. If None the model will be trained from scratch. """ super().__init__() @@ -227,6 +228,7 @@ class RecurrentGen(ViewGen): self.n_jobs = n_jobs self.stored_path = stored_path self.nepochs = nepochs + self.patience = patience # EMBEDDINGS to be deployed self.pretrained = pretrained_embeddings @@ -238,7 +240,7 @@ class RecurrentGen(ViewGen): self.logger = TensorBoardLogger(save_dir='../tb_logs', name='rnn', default_hp_metric=False) # self.logger = CSVLogger(save_dir='csv_logs', name='rnn_dev') self.early_stop_callback = EarlyStopping(monitor='val-macroF1', min_delta=0.00, - patience=5, verbose=False, mode='max') + patience=self.patience, verbose=False, mode='max') def _init_model(self): if self.stored_path: @@ -273,12 +275,13 @@ class RecurrentGen(ViewGen): :return: self. 
""" print('# Fitting RecurrentGen (G)...') + create_if_not_exist('../tb_logs') recurrentDataModule = RecurrentDataModule(self.multilingualIndex, batchsize=self.batch_size, n_jobs=self.n_jobs) trainer = Trainer(gradient_clip_val=1e-1, gpus=self.gpus, logger=self.logger, max_epochs=self.nepochs, callbacks=[self.early_stop_callback], checkpoint_callback=False) # vanilla_torch_model = torch.load( - # '/home/andreapdr/funneling_pdr/checkpoint/gru_viewgen_-jrc_doclist_1958-2005vs2006_all_top300_noparallel_processed_run0.pickle') + # '../_old_checkpoint/gru_viewgen_-rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle') # self.model.linear0 = vanilla_torch_model.linear0 # self.model.linear1 = vanilla_torch_model.linear1 # self.model.linear2 = vanilla_torch_model.linear2 @@ -314,7 +317,7 @@ class BertGen(ViewGen): At inference time, the model returns the network internal state at the last original layer (i.e. 12th). Document embeddings are the state associated with the "start" token. Training metrics are logged via TensorBoard. """ - def __init__(self, multilingualIndex, batch_size=128, nepochs=50, gpus=0, n_jobs=-1, stored_path=None): + def __init__(self, multilingualIndex, batch_size=128, nepochs=50, gpus=0, n_jobs=-1, patience=5, stored_path=None): """ Init Bert model :param multilingualIndex: MultilingualIndex, it is a dictionary of training and test documents @@ -322,6 +325,7 @@ class BertGen(ViewGen): :param batch_size: int, number of samples per batch. :param nepochs: int, number of max epochs to train the model. :param gpus: int, specifies how many GPUs to use per node. If False computation will take place on cpu. + :param patience: int, number of epochs with no improvements in val-macroF1 before early stopping. :param n_jobs: int, number of concurrent workers. :param stored_path: str, path to a pretrained model. If None the model will be trained from scratch. """ @@ -333,9 +337,10 @@ class BertGen(ViewGen): self.n_jobs = n_jobs self.stored_path = stored_path self.model = self._init_model() + self.patience = patience self.logger = TensorBoardLogger(save_dir='../tb_logs', name='bert', default_hp_metric=False) self.early_stop_callback = EarlyStopping(monitor='val-macroF1', min_delta=0.00, - patience=5, verbose=False, mode='max') + patience=self.patience, verbose=False, mode='max') def _init_model(self): output_size = self.multilingualIndex.get_target_dim() @@ -351,6 +356,7 @@ class BertGen(ViewGen): :return: self. 
""" print('# Fitting BertGen (M)...') + create_if_not_exist('../tb_logs') self.multilingualIndex.train_val_split(val_prop=0.2, max_val=2000, seed=1) bertDataModule = BertDataModule(self.multilingualIndex, batchsize=self.batch_size, max_len=512) trainer = Trainer(gradient_clip_val=1e-1, max_epochs=self.nepochs, gpus=self.gpus, From e52b153ad4e706b2dccc861ea349831716b58f7c Mon Sep 17 00:00:00 2001 From: andrea Date: Thu, 28 Jan 2021 18:12:20 +0100 Subject: [PATCH 49/55] fixed view generators' transform method --- main.py | 23 +++++++++++++++-------- readme.md | 3 ++- run.sh | 10 ++++++---- src/view_generators.py | 30 +++++++++++++++--------------- 4 files changed, 38 insertions(+), 28 deletions(-) diff --git a/main.py b/main.py index b99f024..e236c50 100644 --- a/main.py +++ b/main.py @@ -15,7 +15,7 @@ def main(args): print('Running generalized funnelling...') data = MultilingualDataset.load(args.dataset) - data.set_view(languages=['it', 'fr']) + # data.set_view(languages=['it', 'da']) data.show_dimensions() lX, ly = data.training() lXte, lyte = data.test() @@ -42,11 +42,14 @@ def main(args): if args.gru_embedder: rnnEmbedder = RecurrentGen(multilingualIndex, pretrained_embeddings=lMuse, wce=args.gru_wce, batch_size=256, - nepochs=args.nepochs_rnn, gpus=args.gpus, n_jobs=args.n_jobs) + nepochs=args.nepochs_rnn, patience=args.patience_rnn, gpus=args.gpus, + n_jobs=args.n_jobs) embedder_list.append(rnnEmbedder) if args.bert_embedder: - bertEmbedder = BertGen(multilingualIndex, batch_size=4, nepochs=args.nepochs_bert, gpus=args.gpus, n_jobs=args.n_jobs) + bertEmbedder = BertGen(multilingualIndex, batch_size=4, nepochs=args.nepochs_bert, gpus=args.gpus, + n_jobs=args.n_jobs) + bertEmbedder.transform(lX) embedder_list.append(bertEmbedder) # Init DocEmbedderList (i.e., first-tier learners or view generators) and metaclassifier @@ -137,20 +140,24 @@ if __name__ == '__main__': default=False) parser.add_argument('-j', '--n_jobs', dest='n_jobs', type=int, metavar='', - help='Number of parallel jobs (default is -1, all)', + help='number of parallel jobs (default is -1, all)', default=-1) parser.add_argument('--nepochs_rnn', dest='nepochs_rnn', type=int, metavar='', - help='Number of max epochs to train Recurrent embedder (i.e., -g), default 150', + help='number of max epochs to train Recurrent embedder (i.e., -g), default 150', default=150) parser.add_argument('--nepochs_bert', dest='nepochs_bert', type=int, metavar='', - help='Number of max epochs to train Bert model (i.e., -g), default 10', + help='number of max epochs to train Bert model (i.e., -g), default 10', default=10) + parser.add_argument('--patience_rnn', dest='patience_rnn', type=int, metavar='', + help='set early stop patience for the RecurrentGen, default 50', + default=50) + parser.add_argument('--muse_dir', dest='muse_dir', type=str, metavar='', - help='Path to the MUSE polylingual word embeddings (default ../embeddings)', - default='../embeddings') + help='Path to the MUSE polylingual word embeddings (default embeddings/)', + default='embeddings/') parser.add_argument('--gru_wce', dest='gru_wce', action='store_true', help='Deploy WCE embedding as embedding layer of the GRU View Generator', diff --git a/readme.md b/readme.md index 4569ba8..401a883 100644 --- a/readme.md +++ b/readme.md @@ -37,11 +37,12 @@ optional arguments: -b, --bert_embedder deploy multilingual Bert to compute document embeddings -g, --gru_embedder deploy a GRU in order to compute document embeddings -c, --c_optimize optimize SVMs C hyperparameter - -j, --n_jobs 
number of parallel jobs (default is -1, all) + -j, --n_jobs number of parallel jobs, default is -1 i.e., all --nepochs_rnn number of max epochs to train Recurrent embedder (i.e., -g), default 150 --nepochs_bert number of max epochs to train Bert model (i.e., -g), default 10 --muse_dir path to the MUSE polylingual word embeddings (default ../embeddings) --gru_wce deploy WCE embedding as embedding layer of the GRU View Generator + --patience_rnn set early stop patience for the RecurrentGen, default 50 --gru_dir set the path to a pretrained GRU model (i.e., -g view generator) --bert_dir set the path to a pretrained mBERT model (i.e., -b view generator) --gpus specifies how many GPUs to use per node diff --git a/run.sh b/run.sh index 04365f9..fd7f4f0 100644 --- a/run.sh +++ b/run.sh @@ -1,6 +1,8 @@ #!/usr/bin/env bash -for i in {0..10..1} -do - python main.py --gpus 0 -done \ No newline at end of file +python main.py /home/moreo/CLESA/rcv2/rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle -g --gpus 0 + +#for i in {0..10..1} +#do +# python main.py --gpus 0 +#done \ No newline at end of file diff --git a/src/view_generators.py b/src/view_generators.py index 9b352f8..fab56c7 100644 --- a/src/view_generators.py +++ b/src/view_generators.py @@ -26,10 +26,10 @@ from src.data.datamodule import RecurrentDataModule, BertDataModule, tokenize from src.models.learners import * from src.models.pl_bert import BertModel from src.models.pl_gru import RecurrentModel -from src.util.common import TfidfVectorizerMultilingual, _normalize +from src.util.common import TfidfVectorizerMultilingual, _normalize, index from src.util.embeddings_manager import MuseLoader, XdotM, wce_matrix from src.util.file import create_if_not_exist -# TODO: add model checkpointing and loading from checkpoint + training on validation after convergence is reached +# TODO: (1) add model checkpointing and loading from checkpoint + training on validation after convergence is reached class ViewGen(ABC): @@ -203,7 +203,7 @@ class RecurrentGen(ViewGen): the network internal state at the second feed-forward layer level. Training metrics are logged via TensorBoard. """ def __init__(self, multilingualIndex, pretrained_embeddings, wce, batch_size=512, nepochs=50, - gpus=0, n_jobs=-1, patience=5, stored_path=None): + gpus=0, n_jobs=-1, patience=20, stored_path=None): """ Init RecurrentGen. :param multilingualIndex: MultilingualIndex, it is a dictionary of training and test documents @@ -237,8 +237,7 @@ class RecurrentGen(ViewGen): self.multilingualIndex.train_val_split(val_prop=0.2, max_val=2000, seed=1) self.multilingualIndex.embedding_matrices(self.pretrained, supervised=self.wce) self.model = self._init_model() - self.logger = TensorBoardLogger(save_dir='../tb_logs', name='rnn', default_hp_metric=False) - # self.logger = CSVLogger(save_dir='csv_logs', name='rnn_dev') + self.logger = TensorBoardLogger(save_dir='tb_logs', name='rnn', default_hp_metric=False) self.early_stop_callback = EarlyStopping(monitor='val-macroF1', min_delta=0.00, patience=self.patience, verbose=False, mode='max') @@ -297,14 +296,19 @@ class RecurrentGen(ViewGen): :param lX: dict {lang: indexed documents} :return: documents projected to the common latent space. 
""" + data = {} + for lang in lX.keys(): + indexed = index(data=lX[lang], + vocab=self.multilingualIndex.l_index[lang].word2index, + known_words=set(self.multilingualIndex.l_index[lang].word2index.keys()), + analyzer=self.multilingualIndex.l_vectorizer.get_analyzer(lang), + unk_index=self.multilingualIndex.l_index[lang].unk_index, + out_of_vocabulary=self.multilingualIndex.l_index[lang].out_of_vocabulary) + data[lang] = indexed l_pad = self.multilingualIndex.l_pad() - data = self.multilingualIndex.l_devel_index() self.model.to('cuda' if self.gpus else 'cpu') self.model.eval() - # time_init = time.time() l_embeds = self.model.encode(data, l_pad, batch_size=256) - # transform_time = round(time.time() - time_init, 3) - # print(f'Executed! Transform took: {transform_time}') return l_embeds def fit_transform(self, lX, ly): @@ -338,7 +342,7 @@ class BertGen(ViewGen): self.stored_path = stored_path self.model = self._init_model() self.patience = patience - self.logger = TensorBoardLogger(save_dir='../tb_logs', name='bert', default_hp_metric=False) + self.logger = TensorBoardLogger(save_dir='tb_logs', name='bert', default_hp_metric=False) self.early_stop_callback = EarlyStopping(monitor='val-macroF1', min_delta=0.00, patience=self.patience, verbose=False, mode='max') @@ -371,14 +375,10 @@ class BertGen(ViewGen): :param lX: dict {lang: indexed documents} :return: documents projected to the common latent space. """ - data = self.multilingualIndex.l_devel_raw_index() - data = tokenize(data, max_len=512) + data = tokenize(lX, max_len=512) self.model.to('cuda' if self.gpus else 'cpu') self.model.eval() - # time_init = time.time() l_embeds = self.model.encode(data, batch_size=64) - # transform_time = round(time.time() - time_init, 3) - # print(f'Executed! Transform took: {transform_time}') return l_embeds def fit_transform(self, lX, ly): From 2c70f378239e522a830fa4b74116b38d545dac7a Mon Sep 17 00:00:00 2001 From: andrea Date: Fri, 29 Jan 2021 10:49:47 +0100 Subject: [PATCH 50/55] logging dirs --- src/view_generators.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/view_generators.py b/src/view_generators.py index fab56c7..27da0fc 100644 --- a/src/view_generators.py +++ b/src/view_generators.py @@ -237,7 +237,7 @@ class RecurrentGen(ViewGen): self.multilingualIndex.train_val_split(val_prop=0.2, max_val=2000, seed=1) self.multilingualIndex.embedding_matrices(self.pretrained, supervised=self.wce) self.model = self._init_model() - self.logger = TensorBoardLogger(save_dir='tb_logs', name='rnn', default_hp_metric=False) + self.logger = TensorBoardLogger(save_dir='../tb_logs', name='rnn', default_hp_metric=False) self.early_stop_callback = EarlyStopping(monitor='val-macroF1', min_delta=0.00, patience=self.patience, verbose=False, mode='max') @@ -274,7 +274,7 @@ class RecurrentGen(ViewGen): :return: self. 
""" print('# Fitting RecurrentGen (G)...') - create_if_not_exist('../tb_logs') + create_if_not_exist(self.logger.save_dir) recurrentDataModule = RecurrentDataModule(self.multilingualIndex, batchsize=self.batch_size, n_jobs=self.n_jobs) trainer = Trainer(gradient_clip_val=1e-1, gpus=self.gpus, logger=self.logger, max_epochs=self.nepochs, callbacks=[self.early_stop_callback], checkpoint_callback=False) @@ -342,7 +342,7 @@ class BertGen(ViewGen): self.stored_path = stored_path self.model = self._init_model() self.patience = patience - self.logger = TensorBoardLogger(save_dir='tb_logs', name='bert', default_hp_metric=False) + self.logger = TensorBoardLogger(save_dir='../tb_logs', name='bert', default_hp_metric=False) self.early_stop_callback = EarlyStopping(monitor='val-macroF1', min_delta=0.00, patience=self.patience, verbose=False, mode='max') @@ -360,7 +360,7 @@ class BertGen(ViewGen): :return: self. """ print('# Fitting BertGen (M)...') - create_if_not_exist('../tb_logs') + create_if_not_exist(self.logger.save_dir) self.multilingualIndex.train_val_split(val_prop=0.2, max_val=2000, seed=1) bertDataModule = BertDataModule(self.multilingualIndex, batchsize=self.batch_size, max_len=512) trainer = Trainer(gradient_clip_val=1e-1, max_epochs=self.nepochs, gpus=self.gpus, From 0e01d654cfc865af0f0a1288fc6f8e64556bec37 Mon Sep 17 00:00:00 2001 From: andrea Date: Fri, 29 Jan 2021 16:57:21 +0100 Subject: [PATCH 51/55] lr monitor --- src/view_generators.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/view_generators.py b/src/view_generators.py index 27da0fc..af4ee8e 100644 --- a/src/view_generators.py +++ b/src/view_generators.py @@ -21,6 +21,7 @@ from abc import ABC, abstractmethod from pytorch_lightning import Trainer from pytorch_lightning.loggers import TensorBoardLogger from pytorch_lightning.callbacks.early_stopping import EarlyStopping +from pytorch_lightning.callbacks.lr_monitor import LearningRateMonitor from src.data.datamodule import RecurrentDataModule, BertDataModule, tokenize from src.models.learners import * @@ -240,6 +241,7 @@ class RecurrentGen(ViewGen): self.logger = TensorBoardLogger(save_dir='../tb_logs', name='rnn', default_hp_metric=False) self.early_stop_callback = EarlyStopping(monitor='val-macroF1', min_delta=0.00, patience=self.patience, verbose=False, mode='max') + self.lr_monitor = LearningRateMonitor(logging_interval='epoch') def _init_model(self): if self.stored_path: @@ -277,7 +279,7 @@ class RecurrentGen(ViewGen): create_if_not_exist(self.logger.save_dir) recurrentDataModule = RecurrentDataModule(self.multilingualIndex, batchsize=self.batch_size, n_jobs=self.n_jobs) trainer = Trainer(gradient_clip_val=1e-1, gpus=self.gpus, logger=self.logger, max_epochs=self.nepochs, - callbacks=[self.early_stop_callback], checkpoint_callback=False) + callbacks=[self.early_stop_callback, self.lr_monitor], checkpoint_callback=False) # vanilla_torch_model = torch.load( # '../_old_checkpoint/gru_viewgen_-rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle') From b3275667bb560e69eba4e237b56e183281af8188 Mon Sep 17 00:00:00 2001 From: andrea Date: Fri, 29 Jan 2021 18:18:47 +0100 Subject: [PATCH 52/55] Set arguments in order to reproduce 'master' performances with Neural setting --- main.py | 30 +++++++++++++++++++++--------- readme.md | 6 +++++- 2 files changed, 26 insertions(+), 10 deletions(-) diff --git a/main.py b/main.py index e236c50..0650310 100644 --- a/main.py +++ b/main.py @@ -41,14 +41,14 @@ def main(args): embedder_list.append(wceEmbedder) if 
args.gru_embedder: - rnnEmbedder = RecurrentGen(multilingualIndex, pretrained_embeddings=lMuse, wce=args.gru_wce, batch_size=256, - nepochs=args.nepochs_rnn, patience=args.patience_rnn, gpus=args.gpus, - n_jobs=args.n_jobs) + rnnEmbedder = RecurrentGen(multilingualIndex, pretrained_embeddings=lMuse, wce=args.gru_wce, + batch_size=args.batch_rnn, nepochs=args.nepochs_rnn, patience=args.patience_rnn, + gpus=args.gpus, n_jobs=args.n_jobs) embedder_list.append(rnnEmbedder) if args.bert_embedder: - bertEmbedder = BertGen(multilingualIndex, batch_size=4, nepochs=args.nepochs_bert, gpus=args.gpus, - n_jobs=args.n_jobs) + bertEmbedder = BertGen(multilingualIndex, batch_size=args.batch_bert, nepochs=args.nepochs_bert, + patience=args.patience_bert, gpus=args.gpus, n_jobs=args.n_jobs) bertEmbedder.transform(lX) embedder_list.append(bertEmbedder) @@ -152,8 +152,20 @@ if __name__ == '__main__': default=10) parser.add_argument('--patience_rnn', dest='patience_rnn', type=int, metavar='', - help='set early stop patience for the RecurrentGen, default 50', - default=50) + help='set early stop patience for the RecurrentGen, default 25', + default=25) + + parser.add_argument('--patience_bert', dest='patience_bert', type=int, metavar='', + help='set early stop patience for the BertGen, default 5', + default=5) + + parser.add_argument('--batch_rnn', dest='batch_rnn', type=int, metavar='', + help='set batchsize for the RecurrentGen, default 64', + default=64) + + parser.add_argument('--batch_bert', dest='batch_bert', type=int, metavar='', + help='set batchsize for the BertGen, default 4', + default=4) parser.add_argument('--muse_dir', dest='muse_dir', type=str, metavar='', help='Path to the MUSE polylingual word embeddings (default embeddings/)', @@ -163,8 +175,8 @@ if __name__ == '__main__': help='Deploy WCE embedding as embedding layer of the GRU View Generator', default=False) - parser.add_argument('--gru_dir', dest='gru_dir', type=str, metavar='', - help='Set the path to a pretrained GRU model (i.e., -g view generator)', + parser.add_argument('--rnn_dir', dest='rnn_dir', type=str, metavar='', + help='Set the path to a pretrained RNN model (i.e., -g view generator)', default=None) parser.add_argument('--bert_dir', dest='bert_dir', type=str, metavar='', diff --git a/readme.md b/readme.md index 401a883..db6275f 100644 --- a/readme.md +++ b/readme.md @@ -40,10 +40,14 @@ optional arguments: -j, --n_jobs number of parallel jobs, default is -1 i.e., all --nepochs_rnn number of max epochs to train Recurrent embedder (i.e., -g), default 150 --nepochs_bert number of max epochs to train Bert model (i.e., -g), default 10 + --patience_rnn set early stop patience for the RecurrentGen, default 25 + --patience_bert set early stop patience for the BertGen, default 5 + --batch_rnn set batchsize for the RecurrentGen, default 64 + --batch_bert set batchsize for the BertGen, default 4 --muse_dir path to the MUSE polylingual word embeddings (default ../embeddings) --gru_wce deploy WCE embedding as embedding layer of the GRU View Generator --patience_rnn set early stop patience for the RecurrentGen, default 50 - --gru_dir set the path to a pretrained GRU model (i.e., -g view generator) + --rnn_dir set the path to a pretrained RNN model (i.e., -g view generator) --bert_dir set the path to a pretrained mBERT model (i.e., -b view generator) --gpus specifies how many GPUs to use per node ``` \ No newline at end of file From a301afde977e6ee3005b051b4a4bedb572e0dc62 Mon Sep 17 00:00:00 2001 From: andrea Date: Fri, 29 Jan 2021 18:20:29 
+0100 Subject: [PATCH 53/55] Set arguments in order to reproduce 'master' performances with Neural setting --- readme.md | 1 - 1 file changed, 1 deletion(-) diff --git a/readme.md b/readme.md index db6275f..d32fb61 100644 --- a/readme.md +++ b/readme.md @@ -46,7 +46,6 @@ optional arguments: --batch_bert set batchsize for the BertGen, default 4 --muse_dir path to the MUSE polylingual word embeddings (default ../embeddings) --gru_wce deploy WCE embedding as embedding layer of the GRU View Generator - --patience_rnn set early stop patience for the RecurrentGen, default 50 --rnn_dir set the path to a pretrained RNN model (i.e., -g view generator) --bert_dir set the path to a pretrained mBERT model (i.e., -b view generator) --gpus specifies how many GPUs to use per node From bca0b9ab7c8612d25d8f814c29ecf4358181b497 Mon Sep 17 00:00:00 2001 From: andrea Date: Fri, 29 Jan 2021 18:25:01 +0100 Subject: [PATCH 54/55] Set arguments in order to reproduce 'master' performances with Neural setting --- main.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/main.py b/main.py index 0650310..cff4887 100644 --- a/main.py +++ b/main.py @@ -41,7 +41,7 @@ def main(args): embedder_list.append(wceEmbedder) if args.gru_embedder: - rnnEmbedder = RecurrentGen(multilingualIndex, pretrained_embeddings=lMuse, wce=args.gru_wce, + rnnEmbedder = RecurrentGen(multilingualIndex, pretrained_embeddings=lMuse, wce=args.rnn_wce, batch_size=args.batch_rnn, nepochs=args.nepochs_rnn, patience=args.patience_rnn, gpus=args.gpus, n_jobs=args.n_jobs) embedder_list.append(rnnEmbedder) @@ -132,7 +132,7 @@ if __name__ == '__main__': default=False) parser.add_argument('-g', '--gru_embedder', dest='gru_embedder', action='store_true', - help='deploy a GRU in order to compute document embeddings', + help='deploy a GRU in order to compute document embeddings (a.k.a., RecurrentGen)', default=False) parser.add_argument('-c', '--c_optimize', dest='optimc', action='store_true', @@ -171,8 +171,8 @@ if __name__ == '__main__': help='Path to the MUSE polylingual word embeddings (default embeddings/)', default='embeddings/') - parser.add_argument('--gru_wce', dest='gru_wce', action='store_true', - help='Deploy WCE embedding as embedding layer of the GRU View Generator', + parser.add_argument('--rnn_wce', dest='rnn_wce', action='store_true', + help='Deploy WCE embedding as embedding layer of the RecurrentGen', default=False) parser.add_argument('--rnn_dir', dest='rnn_dir', type=str, metavar='', From 1bff57ebbbb3e0b79686db13a56fdcd27f7571e2 Mon Sep 17 00:00:00 2001 From: andrea Date: Tue, 2 Feb 2021 11:26:04 +0100 Subject: [PATCH 55/55] Fixed arguments --- main.py | 4 ++-- src/util/common.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/main.py b/main.py index cff4887..49d450d 100644 --- a/main.py +++ b/main.py @@ -112,8 +112,8 @@ if __name__ == '__main__': parser.add_argument('dataset', help='Path to the dataset') parser.add_argument('-o', '--output', dest='csv_dir', metavar='', - help='Result file (default ../csv_logs/gfun/gfun_results.csv)', type=str, - default='../csv_logs/gfun/gfun_results.csv') + help='Result file (default csv_logs/gfun/gfun_results.csv)', type=str, + default='csv_logs/gfun/gfun_results.csv') parser.add_argument('-x', '--post_embedder', dest='post_embedder', action='store_true', help='deploy posterior probabilities embedder to compute document embeddings', diff --git a/src/util/common.py b/src/util/common.py index 913014c..9f44273 100644 --- a/src/util/common.py +++ 
b/src/util/common.py
@@ -378,7 +378,7 @@ def get_method_name(args):
     for i, conf in enumerate(_id_conf):
         if conf:
             _id += _id_name[i]
-    _id = _id if not args.gru_wce else _id + '_wce'
+    _id = _id if not args.rnn_wce else _id + '_wce'
     _dataset_path = args.dataset.split('/')[-1].split('_')
     dataset_id = _dataset_path[0] + _dataset_path[-1]
     return _id, dataset_id
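A quick, hedged illustration of the hunk above: with the renamed `rnn_wce` flag and the dataset path hard-coded in run.sh, the method-id and dataset-id construction resolves as follows (the `_id = 'G'` seed is illustrative, since `_id_name` is not shown in this hunk).

import argparse

# Stand-in for the parsed CLI arguments; only the two fields used by get_method_name are set.
args = argparse.Namespace(
    rnn_wce=True,
    dataset='/home/moreo/CLESA/rcv2/rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle')

_id = 'G'                                          # illustrative: suppose only the GRU view generator (-g) is active
_id = _id if not args.rnn_wce else _id + '_wce'    # -> 'G_wce'
_dataset_path = args.dataset.split('/')[-1].split('_')
dataset_id = _dataset_path[0] + _dataset_path[-1]
print(_id, dataset_id)                             # -> G_wce rcv1-2run0.pickle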