From f2083bf22a734df8390e37bce7df7b782012c5da Mon Sep 17 00:00:00 2001 From: andrea Date: Sat, 30 Nov 2019 19:14:51 +0100 Subject: [PATCH] reworked unsupervised (aligned) embeddings loader method and class (fastText and MUSE). new op.arg -t ['MUSE', 'FastText'] uploaded /results/results.csv (on rcv1 ... run0.pickle) obtained on all available setup. TODO: refactor it also as a standalone class with its own load/weighted sum/extract/reduce methods. --- src/FPEC_andrea.py | 29 +++++++---- src/data/embeddings.py | 32 ++++++++++-- src/learning/learners.py | 14 ++--- src/results/results.csv | 60 +++++++++++++++++++--- src/transformers/StandardizeTransformer.py | 2 +- 5 files changed, 107 insertions(+), 30 deletions(-) diff --git a/src/FPEC_andrea.py b/src/FPEC_andrea.py index 4decdf6..7092d2b 100644 --- a/src/FPEC_andrea.py +++ b/src/FPEC_andrea.py @@ -17,11 +17,14 @@ parser.add_option("-o", "--output", dest="output", help="Result file", type=str, default='./results/results.csv') parser.add_option("-e", "--mode-embed", dest="mode_embed", - help="Set the embedding to be used [none, pretrained, supervised, both]", type=str, default='none') + help="Set the embedding to be used [none, unsupervised, supervised, both]", type=str, default='none') parser.add_option("-w", "--we-path", dest="we_path", help="Path to the polylingual word embeddings", default='../embeddings/') +parser.add_option('-t', "--we-type", dest="we_type", help="Aligned embeddings to use [FastText, MUSE]", type=str, + default='FastText') + parser.add_option("-s", "--set_c", dest="set_c",type=float, help="Set the C parameter", default=1) @@ -36,7 +39,7 @@ def get_learner(calibrate=False, kernel='linear'): return SVC(kernel=kernel, probability=calibrate, cache_size=1000, C=op.set_c, random_state=1, class_weight='balanced') -def get_params(dense=False): # TODO kernel function could be useful for meta-classifier +def get_params(dense=False): if not op.optimc: return None c_range = [1e4, 1e3, 1e2, 1e1, 1, 1e-1] @@ -72,30 +75,36 @@ if __name__ == '__main__': # Embeddings and WCE config _available_mode = ['none', 'unsupervised', 'supervised', 'both'] - assert op.mode_embed in _available_mode , f'{op.mode_embed} not in {_available_mode}' + _available_type = ['MUSE', 'FastText'] + assert op.mode_embed in _available_mode, f'{op.mode_embed} not in {_available_mode}' + assert op.we_type in _available_type, f'{op.we_type} not in {_available_type}' if op.mode_embed == 'none': config = {'unsupervised': False, - 'supervised': False} + 'supervised': False, + 'we_type': None} _config_id = 'None' elif op.mode_embed == 'unsupervised': config = {'unsupervised': True, - 'supervised': False} + 'supervised': False, + 'we_type': op.we_type} _config_id = 'M' elif op.mode_embed == 'supervised': config = {'unsupervised': False, - 'supervised': True} + 'supervised': True, + 'we_type': None} _config_id = 'F' elif op.mode_embed == 'both': config = {'unsupervised': True, - 'supervised': True} + 'supervised': True, + 'we_type': op.we_type} _config_id = 'M_and_F' result_id = dataset_file + 'PolyEmbedd_andrea_' + _config_id + ('_optimC' if op.optimc else '') print(f'### PolyEmbedd_andrea_{_config_id}\n') - classifier = AndreaCLF(op.we_path, - config, + classifier = AndreaCLF(we_path=op.we_path, + config=config, first_tier_learner=get_learner(calibrate=True), meta_learner=get_learner(calibrate=False, kernel='rbf'), first_tier_parameters=get_params(dense=False), @@ -114,5 +123,5 @@ if __name__ == '__main__': metrics.append([macrof1, microf1, macrok, microk]) print('Lang %s: macro-F1=%.3f micro-F1=%.3f' % (lang, macrof1, microf1)) results.add_row(result_id, 'PolyEmbed_andrea', 'svm', _config_id, op.optimc, op.dataset.split('/')[-1], - 'not_binary', 'not_ablation', classifier.time, lang, macrof1, microf1, macrok, microk, 'nope') + 'not_binary', 'not_ablation', classifier.time, lang, macrof1, microf1, macrok, microk, '') print('Averages: MF1, mF1, MK, mK', np.mean(np.array(metrics), axis=0)) diff --git a/src/data/embeddings.py b/src/data/embeddings.py index 0a7aa4c..0598feb 100644 --- a/src/data/embeddings.py +++ b/src/data/embeddings.py @@ -147,7 +147,7 @@ class FastTextWikiNews(Vectors): url_base = 'Cant auto-download MUSE embeddings' path = '/storage/andrea/FUNNELING/embeddings/wiki.multi.{}.vec' - _name = 'wiki.multi.{}.vec' + _name = '/embeddings/wiki.multi.{}.vec' def __init__(self, cache, language="en", **kwargs): url = self.url_base.format(language) @@ -157,6 +157,30 @@ class FastTextWikiNews(Vectors): super(FastTextWikiNews, self).__init__(name, cache=cache, url=url, **kwargs) +class EmbeddingsAligned(Vectors): + + def __init__(self, type, path, lang): + + self.name = '/embeddings/wiki.multi.{}.vec' if type == 'MUSE' else '/embeddings_polyFASTTEXT/wiki.{}.align.vec' + # todo - rewrite as relative path + self.cache_path = '/home/andreapdr/CLESA/embeddings' if type == 'MUSE' else '/home/andreapdr/CLESA/embeddings_polyFASTTEXT' + self.path = path + self.name.format(lang) + assert os.path.exists(path), f'pre-trained vectors not found in {path}' + super(EmbeddingsAligned, self).__init__(self.path, cache=self.cache_path) + + def vocabulary(self): + return set(self.stoi.keys()) + + def dim(self): + return self.dim + + def extract(self, words): + source_idx, target_idx = PretrainedEmbeddings.reindex(words, self.stoi) + extraction = torch.zeros((len(words), self.dim)) + extraction[source_idx] = self.vectors[target_idx] + return extraction + + class FastTextMUSE(PretrainedEmbeddings): def __init__(self, path, lang, limit=None): @@ -179,12 +203,12 @@ class FastTextMUSE(PretrainedEmbeddings): return extraction -def embedding_matrix(path, voc, lang): +def embedding_matrix(type, path, voc, lang): vocabulary = np.asarray(list(zip(*sorted(voc.items(), key=lambda x:x[1])))[0]) print('[embedding matrix]') - print(f'# [pretrained-matrix: FastTextMUSE {lang}]') - pretrained = FastTextMUSE(path, lang) + print(f'# [pretrained-matrix: {type} {lang}]') + pretrained = EmbeddingsAligned(type, path, lang) P = pretrained.extract(vocabulary).numpy() del pretrained print(f'[embedding matrix done] of shape={P.shape}\n') diff --git a/src/learning/learners.py b/src/learning/learners.py index 5a8f07e..d01c734 100644 --- a/src/learning/learners.py +++ b/src/learning/learners.py @@ -7,8 +7,6 @@ from sklearn.model_selection import GridSearchCV from sklearn.model_selection import KFold from joblib import Parallel, delayed from sklearn.feature_extraction.text import TfidfVectorizer - -from data.supervised import zscores from transformers.StandardizeTransformer import StandardizeTransformer @@ -444,7 +442,8 @@ class AndreaCLF(FunnellingPolylingualClassifier): first_tier_parameters=None, meta_parameters=None, folded_projections=1, - calmode='cal', n_jobs=-1): + calmode='cal', + n_jobs=-1): super().__init__(first_tier_learner, meta_learner, @@ -479,9 +478,8 @@ class AndreaCLF(FunnellingPolylingualClassifier): self.languages.append(lang) tfidf_vectorizer.fit(lX[lang]) lX[lang] = tfidf_vectorizer.transform(lX[lang]) - _sort_if_sparse(lX[lang]) self.lang_word2idx[lang] = tfidf_vectorizer.vocabulary_ - self.lang_tfidf[lang] = tfidf_vectorizer # utile in fase di testing + self.lang_tfidf[lang] = tfidf_vectorizer return self # @override std class method @@ -517,15 +515,13 @@ class AndreaCLF(FunnellingPolylingualClassifier): if unsupervised: for lang in languages: - # print('Test building embedding matrix FastTextMuse ...') - _, M = embedding_matrix(self.we_path, self.lang_word2idx[lang], lang) + _, M = embedding_matrix(self.config['we_type'], self.we_path, self.lang_word2idx[lang], lang) self.word_embeddings[lang] = M _r[lang] = lX[lang].dot(M) if supervised: for lang in languages: S = WCE_matrix(lX, ly, lang) - # S = np.squeeze(np.asarray(S)) # casting to ndarray to better visualize S while debugging self.supervised_embeddings[lang] = S if unsupervised: _r[lang] = np.hstack((_r[lang], lX[lang].dot(S))) @@ -562,7 +558,7 @@ class AndreaCLF(FunnellingPolylingualClassifier): _vertical_Zy = np.vstack([zy[lang] for lang in self.languages]) self.standardizer = StandardizeTransformer() - _vertical_Z = self.standardizer.fit_predict(_vertical_Z) + _vertical_Z = self.standardizer.fit_predict(_vertical_Z) print('fitting the Z-space of shape={}'.format(_vertical_Z.shape)) self.model = MonolingualClassifier(base_learner=self.meta_learner, parameters=self.meta_parameters, diff --git a/src/results/results.csv b/src/results/results.csv index 783225c..dbef7b3 100644 --- a/src/results/results.csv +++ b/src/results/results.csv @@ -1,7 +1,55 @@ id method learner embed optimp dataset binary languages time lang macrof1 microf1 macrok microk notes -jrc_doclist_1958-2005vs2006_all_top300_noparallel_processed_run0.picklePolyEmbedd_andrea_M PolyEmbed_andrea svm M False test_datasetname not_binary not_ablation 55.56810355186462 da 0.7933333333333333 0.0 0.7933333333333333 0.0 nope -jrc_doclist_1958-2005vs2006_all_top300_noparallel_processed_run0.picklePolyEmbedd_andrea_M PolyEmbed_andrea svm M False test_datasetname not_binary not_ablation 55.56810355186462 en 0.7866666666666666 0.0 0.7927111111111111 -0.0003376325207643527 nope -jrc_doclist_1958-2005vs2006_all_top300_noparallel_processed_run0.picklePolyEmbedd_andrea_M PolyEmbed_andrea svm M False test_datasetname not_binary not_ablation 55.56810355186462 fr 0.7866666666666666 0.0 0.7930666666666667 -0.0001350530083057411 nope -jrc_doclist_1958-2005vs2006_all_top300_noparallel_processed_run0.picklePolyEmbedd_andrea_None PolyEmbed_andrea svm None False test_datasetname not_binary not_ablation 24.031760931015015 da 0.7933333333333333 0.0 0.7933333333333333 0.0 nope -jrc_doclist_1958-2005vs2006_all_top300_noparallel_processed_run0.picklePolyEmbedd_andrea_None PolyEmbed_andrea svm None False test_datasetname not_binary not_ablation 24.031760931015015 en 0.7933333333333333 0.0 0.7931111111111111 -0.00013505300830574107 nope -jrc_doclist_1958-2005vs2006_all_top300_noparallel_processed_run0.picklePolyEmbedd_andrea_None PolyEmbed_andrea svm None False test_datasetname not_binary not_ablation 24.031760931015015 fr 0.7933333333333333 0.0 0.7933333333333333 0.0 nope +rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.picklePolyEmbedd_andrea_None_optimC PolyEmbed_andrea svm None True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle not_binary not_ablation 161.99278807640076 it 0.5367684112761455 0.7945344129554656 0.5179685773363333 0.7651326488894972 nope +rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.picklePolyEmbedd_andrea_None_optimC PolyEmbed_andrea svm None True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle not_binary not_ablation 161.99278807640076 pt 0.6969974938193201 0.878625134264232 0.6967392557377021 0.8466030321042095 nope +rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.picklePolyEmbedd_andrea_None_optimC PolyEmbed_andrea svm None True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle not_binary not_ablation 161.99278807640076 sv 0.502213941379271 0.7700107543401444 0.4991078326315248 0.7207899075774371 nope +rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.picklePolyEmbedd_andrea_None_optimC PolyEmbed_andrea svm None True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle not_binary not_ablation 161.99278807640076 es 0.5817849682843411 0.8448214916931778 0.5849433134898768 0.8202407220651875 nope +rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.picklePolyEmbedd_andrea_None_optimC PolyEmbed_andrea svm None True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle not_binary not_ablation 161.99278807640076 en 0.5284100314545743 0.7625649913344887 0.4968119038332687 0.7152142337789349 nope +rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.picklePolyEmbedd_andrea_None_optimC PolyEmbed_andrea svm None True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle not_binary not_ablation 161.99278807640076 da 0.4868904596668941 0.7971705872676427 0.4554442856126113 0.741227149968307 nope +rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.picklePolyEmbedd_andrea_None_optimC PolyEmbed_andrea svm None True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle not_binary not_ablation 161.99278807640076 nl 0.5470546398570723 0.8276762402088773 0.5177281560038681 0.7850292121533595 nope +rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.picklePolyEmbedd_andrea_None_optimC PolyEmbed_andrea svm None True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle not_binary not_ablation 161.99278807640076 fr 0.4997574965766772 0.7678434382194935 0.4836027981945328 0.7099957841328215 nope +rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.picklePolyEmbedd_andrea_None_optimC PolyEmbed_andrea svm None True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle not_binary not_ablation 161.99278807640076 de 0.4220457399934653 0.7444316119452236 0.4256936056238835 0.7167749374918141 nope +rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.picklePolyEmbedd_andrea_M_optimC PolyEmbed_andrea svm M True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle not_binary not_ablation 503.81587314605713 it 0.5398437760931379 0.8008933172994331 0.5146465197929204 0.7584451610463148 nope +rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.picklePolyEmbedd_andrea_M_optimC PolyEmbed_andrea svm M True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle not_binary not_ablation 503.81587314605713 pt 0.6975279233747671 0.8779959377115775 0.6911573032014029 0.8392738059784555 nope +rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.picklePolyEmbedd_andrea_M_optimC PolyEmbed_andrea svm M True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle not_binary not_ablation 503.81587314605713 sv 0.5179339368901748 0.7752035065748278 0.4962165022301373 0.7133720895906155 nope +rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.picklePolyEmbedd_andrea_M_optimC PolyEmbed_andrea svm M True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle not_binary not_ablation 503.81587314605713 es 0.5745246656272296 0.8476464247215235 0.5736797442258523 0.8104027280076678 nope +rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.picklePolyEmbedd_andrea_M_optimC PolyEmbed_andrea svm M True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle not_binary not_ablation 503.81587314605713 en 0.5265892627601801 0.761854398025736 0.4868823643967914 0.7032312369952987 nope +rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.picklePolyEmbedd_andrea_M_optimC PolyEmbed_andrea svm M True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle not_binary not_ablation 503.81587314605713 da 0.4857267508065667 0.7955911823647295 0.449682467737542 0.7293013090493592 nope +rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.picklePolyEmbedd_andrea_M_optimC PolyEmbed_andrea svm M True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle not_binary not_ablation 503.81587314605713 nl 0.5461000743929812 0.8304711580801409 0.5139887576564601 0.7790659402231745 nope +rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.picklePolyEmbedd_andrea_M_optimC PolyEmbed_andrea svm M True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle not_binary not_ablation 503.81587314605713 fr 0.5015991524998897 0.7699748500677114 0.4811739320459739 0.7065159928392686 nope +rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.picklePolyEmbedd_andrea_M_optimC PolyEmbed_andrea svm M True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle not_binary not_ablation 503.81587314605713 de 0.4141396160516795 0.743810005053057 0.4126132681585116 0.7023983497130937 nope +rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.picklePolyEmbedd_andrea_F_optimC PolyEmbed_andrea svm F True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle not_binary not_ablation 1095.333437204361 it 0.4810224709403544 0.7617194410047762 0.453310215598049 0.6999032557458222 nope +rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.picklePolyEmbedd_andrea_F_optimC PolyEmbed_andrea svm F True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle not_binary not_ablation 1095.333437204361 pt 0.6693663195289151 0.8619702956806105 0.6657298472047529 0.8182397742327547 nope +rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.picklePolyEmbedd_andrea_F_optimC PolyEmbed_andrea svm F True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle not_binary not_ablation 1095.333437204361 sv 0.43107388787211537 0.7126933954416902 0.4180735239763325 0.6168407376537499 nope +rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.picklePolyEmbedd_andrea_F_optimC PolyEmbed_andrea svm F True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle not_binary not_ablation 1095.333437204361 es 0.5087201120140917 0.8249322493224932 0.5032299168859704 0.7835086748116167 nope +rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.picklePolyEmbedd_andrea_F_optimC PolyEmbed_andrea svm F True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle not_binary not_ablation 1095.333437204361 en 0.3822498549987095 0.6877811094452774 0.3309945723997902 0.5962925522774631 nope +rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.picklePolyEmbedd_andrea_F_optimC PolyEmbed_andrea svm F True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle not_binary not_ablation 1095.333437204361 da 0.4517051377915163 0.7658914728682171 0.4030339299921389 0.6806166833916132 nope +rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.picklePolyEmbedd_andrea_F_optimC PolyEmbed_andrea svm F True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle not_binary not_ablation 1095.333437204361 nl 0.4875303727964308 0.7853962600178095 0.4534046979963794 0.7270844266398626 nope +rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.picklePolyEmbedd_andrea_F_optimC PolyEmbed_andrea svm F True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle not_binary not_ablation 1095.333437204361 fr 0.3750315407356979 0.6999393816932714 0.3628389019101708 0.6136670285424017 nope +rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.picklePolyEmbedd_andrea_F_optimC PolyEmbed_andrea svm F True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle not_binary not_ablation 1095.333437204361 de 0.355059356514748 0.7046466085098807 0.33834564366266284 0.6299245108196094 nope +rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.picklePolyEmbedd_andrea_M_and_F_optimC PolyEmbed_andrea svm M_and_F True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle not_binary not_ablation 1251.0414910316467 it 0.4755443069888554 0.7675079985780305 0.4501140447119437 0.7023435117413848 nope +rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.picklePolyEmbedd_andrea_M_and_F_optimC PolyEmbed_andrea svm M_and_F True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle not_binary not_ablation 1251.0414910316467 pt 0.673303227450142 0.8655002733734279 0.6702445967772233 0.8193963705153853 nope +rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.picklePolyEmbedd_andrea_M_and_F_optimC PolyEmbed_andrea svm M_and_F True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle not_binary not_ablation 1251.0414910316467 sv 0.4189470089118392 0.7236711786068009 0.4198491651634073 0.6314272037990425 nope +rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.picklePolyEmbedd_andrea_M_and_F_optimC PolyEmbed_andrea svm M_and_F True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle not_binary not_ablation 1251.0414910316467 es 0.5178080058189616 0.8268359020852222 0.5104336022388637 0.782714898784318 nope +rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.picklePolyEmbedd_andrea_M_and_F_optimC PolyEmbed_andrea svm M_and_F True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle not_binary not_ablation 1251.0414910316467 en 0.4115752894185112 0.7001869158878504 0.35164720517285003 0.6091191993104883 nope +rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.picklePolyEmbedd_andrea_M_and_F_optimC PolyEmbed_andrea svm M_and_F True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle not_binary not_ablation 1251.0414910316467 da 0.4437869429842064 0.7626499739175796 0.39704879178312197 0.6717100410826179 nope +rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.picklePolyEmbedd_andrea_M_and_F_optimC PolyEmbed_andrea svm M_and_F True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle not_binary not_ablation 1251.0414910316467 nl 0.47635948919429705 0.7874471399955486 0.4589309165206792 0.7292337019755739 nope +rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.picklePolyEmbedd_andrea_M_and_F_optimC PolyEmbed_andrea svm M_and_F True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle not_binary not_ablation 1251.0414910316467 fr 0.39374621795002507 0.7063947733122155 0.3850407928528449 0.6315594797194366 nope +rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.picklePolyEmbedd_andrea_M_and_F_optimC PolyEmbed_andrea svm M_and_F True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle not_binary not_ablation 1251.0414910316467 de 0.3539890425069821 0.7095981751184418 0.3512802070446796 0.6432196317592322 nope +rcv1-2_doclist_trByLang1000_teByLang1000_processed_run1.picklePolyEmbedd_andrea_None_optimC PolyEmbed_andrea svm None True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run1.pickle not_binary not_ablation 161.2168996334076 it 0.5791455159341481 0.8060849214309596 0.6034752340075125 0.7869853576681214 nope +rcv1-2_doclist_trByLang1000_teByLang1000_processed_run1.picklePolyEmbedd_andrea_None_optimC PolyEmbed_andrea svm None True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run1.pickle not_binary not_ablation 161.2168996334076 pt 0.6403974389994276 0.8803876562101505 0.6565213830246649 0.8497743924811387 nope +rcv1-2_doclist_trByLang1000_teByLang1000_processed_run1.picklePolyEmbedd_andrea_None_optimC PolyEmbed_andrea svm None True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run1.pickle not_binary not_ablation 161.2168996334076 sv 0.5032337014290953 0.7768595041322314 0.4719549200388494 0.7364733997369779 nope +rcv1-2_doclist_trByLang1000_teByLang1000_processed_run1.picklePolyEmbedd_andrea_None_optimC PolyEmbed_andrea svm None True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run1.pickle not_binary not_ablation 161.2168996334076 es 0.5200567247634353 0.8529964145466963 0.4908726477090496 0.8285929531854332 nope +rcv1-2_doclist_trByLang1000_teByLang1000_processed_run1.picklePolyEmbedd_andrea_None_optimC PolyEmbed_andrea svm None True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run1.pickle not_binary not_ablation 161.2168996334076 en 0.512424485488998 0.7533647963642719 0.4719843960571978 0.7044441169169227 nope +rcv1-2_doclist_trByLang1000_teByLang1000_processed_run1.picklePolyEmbedd_andrea_None_optimC PolyEmbed_andrea svm None True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run1.pickle not_binary not_ablation 161.2168996334076 da 0.5861231569852233 0.8040595842200032 0.5393761149602847 0.7381233055764151 nope +rcv1-2_doclist_trByLang1000_teByLang1000_processed_run1.picklePolyEmbedd_andrea_None_optimC PolyEmbed_andrea svm None True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run1.pickle not_binary not_ablation 161.2168996334076 nl 0.6072184716496147 0.8335123523093448 0.5845309357041368 0.8020267337813639 nope +rcv1-2_doclist_trByLang1000_teByLang1000_processed_run1.picklePolyEmbedd_andrea_None_optimC PolyEmbed_andrea svm None True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run1.pickle not_binary not_ablation 161.2168996334076 fr 0.4923294612439038 0.7854697603651578 0.4713782273939219 0.7329001302478475 nope +rcv1-2_doclist_trByLang1000_teByLang1000_processed_run1.picklePolyEmbedd_andrea_None_optimC PolyEmbed_andrea svm None True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run1.pickle not_binary not_ablation 161.2168996334076 de 0.4709904181031267 0.7457793804294378 0.4465581491449931 0.7046844416244138 nope +rcv1-2_doclist_trByLang1000_teByLang1000_processed_run1.picklePolyEmbedd_andrea_M_optimC PolyEmbed_andrea svm M True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run1.pickle not_binary not_ablation 497.6823613643646 it 0.575387626645539 0.8064243448858833 0.5958411838194531 0.7790018114269683 nope +rcv1-2_doclist_trByLang1000_teByLang1000_processed_run1.picklePolyEmbedd_andrea_M_optimC PolyEmbed_andrea svm M True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run1.pickle not_binary not_ablation 497.6823613643646 pt 0.653004040098633 0.8791937747161628 0.6559210761775208 0.8482450061614855 nope +rcv1-2_doclist_trByLang1000_teByLang1000_processed_run1.picklePolyEmbedd_andrea_M_optimC PolyEmbed_andrea svm M True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run1.pickle not_binary not_ablation 497.6823613643646 sv 0.49944915222086167 0.7789179104477612 0.4604673876743342 0.727778938054739 nope +rcv1-2_doclist_trByLang1000_teByLang1000_processed_run1.picklePolyEmbedd_andrea_M_optimC PolyEmbed_andrea svm M True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run1.pickle not_binary not_ablation 497.6823613643646 es 0.5144474487169811 0.8559087767795439 0.48397711649967695 0.8222692824953204 nope +rcv1-2_doclist_trByLang1000_teByLang1000_processed_run1.picklePolyEmbedd_andrea_M_optimC PolyEmbed_andrea svm M True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run1.pickle not_binary not_ablation 497.6823613643646 en 0.5160737755179508 0.755674709562109 0.45961112517260677 0.6921096138985132 nope +rcv1-2_doclist_trByLang1000_teByLang1000_processed_run1.picklePolyEmbedd_andrea_M_optimC PolyEmbed_andrea svm M True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run1.pickle not_binary not_ablation 497.6823613643646 da 0.5875776383868945 0.8015873015873016 0.5367286265015276 0.7288571047461061 nope +rcv1-2_doclist_trByLang1000_teByLang1000_processed_run1.picklePolyEmbedd_andrea_M_optimC PolyEmbed_andrea svm M True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run1.pickle not_binary not_ablation 497.6823613643646 nl 0.6079883230969934 0.8363004776378636 0.5828217771858487 0.7968282071156207 nope +rcv1-2_doclist_trByLang1000_teByLang1000_processed_run1.picklePolyEmbedd_andrea_M_optimC PolyEmbed_andrea svm M True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run1.pickle not_binary not_ablation 497.6823613643646 fr 0.4966338770370634 0.7860696517412935 0.46250527724325174 0.7292650668002159 nope +rcv1-2_doclist_trByLang1000_teByLang1000_processed_run1.picklePolyEmbedd_andrea_M_optimC PolyEmbed_andrea svm M True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run1.pickle not_binary not_ablation 497.6823613643646 de 0.4675732669000923 0.7479187479187479 0.43767984457683634 0.69653035770654 nope diff --git a/src/transformers/StandardizeTransformer.py b/src/transformers/StandardizeTransformer.py index 381d6c1..45921b7 100644 --- a/src/transformers/StandardizeTransformer.py +++ b/src/transformers/StandardizeTransformer.py @@ -20,4 +20,4 @@ class StandardizeTransformer: return (X - self.mean) / self.std def fit_predict(self, X): - return self.fit(X).predict(X) \ No newline at end of file + return self.fit(X).predict(X)