from vgfs.viewGen import ViewGen
from vgfs.learners.svms import NaivePolylingualClassifier
from vgfs.commons import _normalize


class VanillaFunGen(ViewGen):
    """
    View Generator (x): original funnelling architecture proposed by Moreo, Esuli and
    Sebastiani in DOI: https://doi.org/10.1145/3326065
    """

    def __init__(self, base_learner, first_tier_parameters=None, n_jobs=-1):
        """
        Init Posterior Probabilities embedder (i.e., VanillaFunGen)
        :param base_learner: naive monolingual learners to be deployed as first-tier
            learners. Should be able to return posterior probabilities.
        :param first_tier_parameters: forwarded unchanged to NaivePolylingualClassifier
            as its `parameters` argument (presumably a hyper-parameter grid for the
            first-tier learners -- confirm against NaivePolylingualClassifier).
        :param n_jobs: integer, number of concurrent workers
        """
        print("- init VanillaFun View Generating Function")
        self.learners = base_learner
        self.first_tier_parameters = first_tier_parameters
        self.n_jobs = n_jobs
        self.doc_projector = NaivePolylingualClassifier(
            base_learner=self.learners,
            parameters=self.first_tier_parameters,
            n_jobs=self.n_jobs,
        )
        # Must be assigned by the caller before fit/transform are used.
        self.vectorizer = None
        # Falsy: fit() trains normally. May be set to a callable (a loader whose
        # result fit() returns) or to a plain truthy flag (fit() skips training).
        self.load_trained = False

    def _vectorize_documents(self, lX):
        """
        Run `self.vectorizer.transform` on lX, failing with a clear message when no
        vectorizer has been assigned yet.
        :param lX: dict {lang: indexed documents}
        :return: whatever the vectorizer's `transform` returns for lX.
        :raises AttributeError: if `self.vectorizer` is still None.
        """
        if self.vectorizer is None:
            # Same exception type as the original `None.transform` failure, but
            # with a message that tells the caller what is missing.
            raise AttributeError(
                "VanillaFunGen.vectorizer is not set: assign an object exposing "
                "`transform` before calling fit() or transform()"
            )
        return self.vectorizer.transform(lX)

    def fit(self, lX, lY):
        """
        Fit the first-tier learners on the vectorized documents, unless
        `load_trained` requests otherwise.
        :param lX: dict {lang: indexed documents}
        :param lY: labels passed to the first-tier learners together with lX
        :return: self (or the result of `load_trained()` when it is a callable)
        """
        if self.load_trained:
            if callable(self.load_trained):
                return self.load_trained()
            # `load_trained` is initialised to a bool, so a plain True used to crash
            # with "'bool' object is not callable".
            # NOTE(review): a truthy non-callable flag is treated as "learners are
            # already trained", so fitting is skipped -- confirm intended semantics.
            return self

        print("- fitting VanillaFun View Generating Function")
        lX = self._vectorize_documents(lX)
        self.doc_projector.fit(lX, lY)
        return self

    def transform(self, lX):
        """
        (1) Vectorize documents; (2) Project them with the first-tier learners'
        `predict_proba`; (3) Apply L2 normalization to the projection and return it.
        :param lX: dict {lang: indexed documents}
        :return: document projection to the common latent space.
        """
        lX = self._vectorize_documents(lX)
        lZ = self.doc_projector.predict_proba(lX)
        lZ = _normalize(lZ, l2=True)
        return lZ

    def fit_transform(self, lX, lY):
        """
        Call fit(lX, lY) and then transform(lX) on the object fit() returned.
        :param lX: dict {lang: indexed documents}
        :param lY: labels passed to fit()
        :return: the output of transform(lX)
        """
        return self.fit(lX, lY).transform(lX)

    def get_config(self):
        """
        :return: dict holding the VGF display name and the constructor arguments.
        """
        return {
            "name": "VanillaFunnelling VGF",
            "base_learner": self.learners,
            "first_tier_parameters": self.first_tier_parameters,
            "n_jobs": self.n_jobs,
        }

    def save_vgf(self):
        """
        Pickle this object to models/vgfs/posteriors/vanillaFunGen_todo.pkl, creating
        the directory if needed. The file name is a fixed placeholder, so every call
        overwrites the same file.
        :return: self
        """
        import pickle
        from os.path import join
        from os import makedirs

        vgf_name = "vanillaFunGen_todo"
        _basedir = join("models", "vgfs", "posteriors")
        makedirs(_basedir, exist_ok=True)
        _path = join(_basedir, f"{vgf_name}.pkl")
        # Pickle files can execute code when loaded: only unpickle trusted files.
        with open(_path, "wb") as f:
            pickle.dump(self, f)
        return self