gfun_multimodal/gfun/vgfs/wceGen.py

80 lines
2.3 KiB
Python

import numpy as np
from joblib import Parallel, delayed
from gfun.vgfs.commons import XdotM, _normalize
from gfun.vgfs.viewGen import ViewGen
class WceGen(ViewGen):
    """View Generating Function built on Word-Class Embeddings (WCE).

    For every language a WCE matrix is computed from the vectorized training
    documents and their labels (see ``wce_matrix``); documents are then
    projected through that matrix by ``XdotM``.

    NOTE(review): ``self.vectorizer`` is used by ``fit``/``transform`` but is
    never assigned in this class -- presumably it is provided by ``ViewGen``
    or attached by the caller before fitting; confirm.
    """

    def __init__(self, n_jobs=-1):
        """
        :param n_jobs: number of parallel jobs handed to ``joblib.Parallel``
            when processing the languages (default -1).
        """
        print("- init Word-Class-Embeddings View Generating Function")
        # Honor the caller's choice; previously this was hard-coded to -1 and
        # the constructor argument was silently ignored.
        self.n_jobs = n_jobs
        self.sif = True

    def fit(self, lX, lY):
        """Compute one WCE matrix per language.

        :param lX: per-language raw inputs accepted by
            ``self.vectorizer.transform``; the transformed result must expose
            ``keys()`` and be indexable by language.
        :param lY: per-language label matrices, indexable by language.
        :return: self
        """
        print("- fitting Word-Class-Embeddings View Generating Function")
        lX = self.vectorizer.transform(lX)
        # Sorted so that the language order is deterministic and shared with
        # transform().
        self.langs = sorted(lX.keys())
        wce = Parallel(n_jobs=self.n_jobs)(
            delayed(wce_matrix)(lX[lang], lY[lang]) for lang in self.langs
        )
        # Parallel returns results in submission order, i.e. aligned with
        # self.langs.
        self.l_wce = dict(zip(self.langs, wce))
        return self

    def transform(self, lX):
        """Project the documents of every fitted language through its WCE matrix.

        Only the languages seen in ``fit`` are processed, so ``lX`` must
        contain each of them.

        :param lX: per-language raw inputs accepted by
            ``self.vectorizer.transform``.
        :return: dict mapping language -> projected documents, after
            ``_normalize(..., l2=True)``.
        """
        lX = self.vectorizer.transform(lX)
        XdotWce = Parallel(n_jobs=self.n_jobs)(
            delayed(XdotM)(lX[lang], self.l_wce[lang], sif=self.sif)
            for lang in self.langs
        )
        lZ = dict(zip(self.langs, XdotWce))
        lZ = _normalize(lZ, l2=True)
        return lZ

    def fit_transform(self, lX, lY):
        """Fit on ``(lX, lY)`` and return the transformation of ``lX``."""
        return self.fit(lX, lY).transform(lX)

    def get_config(self):
        """Return the configuration of this VGF as a plain dict."""
        return {
            "name": "Word-Class Embeddings VGF",
            "n_jobs": self.n_jobs,
            "sif": self.sif,
        }

    def save_vgf(self, model_id):
        """Pickle this object to ``models/vgfs/wordclass/wordClassGen_<model_id>.pkl``.

        The target directory is created if it does not exist.

        :param model_id: identifier embedded in the file name.
        :return: self
        """
        import pickle
        from os.path import join
        from os import makedirs

        vgf_name = "wordClassGen"
        _basedir = join("models", "vgfs", "wordclass")
        makedirs(_basedir, exist_ok=True)
        _path = join(_basedir, f"{vgf_name}_{model_id}.pkl")
        with open(_path, "wb") as f:
            pickle.dump(self, f)
        return self
def wce_matrix(X, Y):
    """Build the standardized Word-Class Embedding matrix for one language.

    The raw embeddings returned by ``supervised_embeddings_tfidf(X, Y)`` are
    z-scored along axis 0.

    :param X: document matrix of one language, as expected by
        ``supervised_embeddings_tfidf``.
    :param Y: label matrix for the same documents.
    :return: the z-scored embedding matrix.
    """
    raw_embeddings = supervised_embeddings_tfidf(X, Y)
    return zscores(raw_embeddings, axis=0)
def supervised_embeddings_tfidf(X, Y):
    """Compute weight-normalized feature-by-class association scores.

    Each row of ``X.T.dot(Y)`` is divided by the total weight of the
    corresponding column of ``X``; columns whose total is zero are divided by
    1 instead, so no division by zero takes place.

    :param X: document-by-feature matrix. Assumes a type whose
        ``sum(axis=0)`` keeps two dimensions (e.g. a scipy sparse matrix or
        ``np.matrix``) -- TODO confirm against callers.
    :param Y: document-by-class matrix with the same number of rows as ``X``.
    :return: ``numpy.ndarray`` with one row per feature of ``X`` and one
        column per column of ``Y``.
    """
    column_totals = X.sum(axis=0)
    # Never-occurring features would otherwise cause a division by zero.
    column_totals[column_totals == 0] = 1
    feature_class = X.T.dot(Y)
    return np.asarray(feature_class / column_totals.T)
def zscores(X, axis=0):
    """Standardize ``X`` along ``axis`` without ever dividing by zero.

    ``scipy.stats.zscore`` does not avoid division by 0, which can indeed
    occur (e.g. for constant columns). Here the sample standard deviation
    (``ddof=1``) is clipped from below at 1e-5. When only a single
    observation is available along ``axis`` the sample standard deviation is
    undefined (NaN), so the population one (``ddof=0``) is used instead and
    the result is 0 rather than NaN.

    :param X: array-like of values to standardize; it is converted with
        ``np.asarray``.
    :param axis: axis (or axes, or None) along which mean and standard
        deviation are computed.
    :return: ``numpy.ndarray`` with the same shape as ``X``.
    """
    X = np.asarray(X)
    # keepdims keeps the reduced axis so the statistics broadcast against X
    # for any axis, not just axis=0.
    mean = np.mean(X, axis=axis, keepdims=True)
    # Observations per output cell; works for int, tuple, or None axis.
    n_obs = X.size // max(mean.size, 1)
    ddof = 1 if n_obs > 1 else 0
    std = np.clip(np.std(X, ddof=ddof, axis=axis, keepdims=True), 1e-5, None)
    return (X - mean) / std