import numpy as np
from joblib import Parallel, delayed

from gfun.vgfs.commons import XdotM, _normalize
from gfun.vgfs.viewGen import ViewGen


class WceGen(ViewGen):
    """View Generating Function based on Word-Class Embeddings (WCE): learns one
    word-class embedding matrix per language at fit time and projects the tf-idf
    document vectors onto it at transform time."""

    def __init__(self, n_jobs=-1):
        print("- init Word-Class-Embeddings View Generating Function")
        self.n_jobs = n_jobs
        self.sif = True

    def fit(self, lX, lY):
        print("- fitting Word-Class-Embeddings View Generating Function")
        lX = self.vectorizer.transform(lX)
        self.langs = sorted(lX.keys())
        # compute one word-class embedding matrix per language in parallel
        wce = Parallel(n_jobs=self.n_jobs)(
            delayed(wce_matrix)(lX[lang], lY[lang]) for lang in self.langs
        )
        self.l_wce = {lang: wce[i] for i, lang in enumerate(self.langs)}
        return self

    def transform(self, lX):
        lX = self.vectorizer.transform(lX)
        # project each language's tf-idf vectors onto its word-class embeddings
        XdotWce = Parallel(n_jobs=self.n_jobs)(
            delayed(XdotM)(lX[lang], self.l_wce[lang], sif=self.sif)
            for lang in self.langs
        )
        lZ = {lang: XdotWce[i] for i, lang in enumerate(self.langs)}
        lZ = _normalize(lZ, l2=True)
        return lZ

    def fit_transform(self, lX, lY):
        return self.fit(lX, lY).transform(lX)

    def get_config(self):
        return {
            "name": "Word-Class Embeddings VGF",
            "n_jobs": self.n_jobs,
            "sif": self.sif,
        }

    def save_vgf(self, model_id):
        import pickle
        from os import makedirs
        from os.path import join

        vgf_name = "wordClassGen"
        _basedir = join("models", "vgfs", "wordclass")
        makedirs(_basedir, exist_ok=True)
        _path = join(_basedir, f"{vgf_name}_{model_id}.pkl")
        with open(_path, "wb") as f:
            pickle.dump(self, f)
        return self
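

# Illustrative usage sketch, not part of the pipeline above. It assumes the
# surrounding gFun code attaches a fitted multilingual tf-idf vectorizer to the
# VGF before fit() is called (self.vectorizer is never set in this class), that
# lX maps language codes to raw documents, and that lY maps them to binary
# label matrices; `multilingual_tfidf` is a hypothetical name:
#
#   vgf = WceGen(n_jobs=-1)
#   vgf.vectorizer = multilingual_tfidf
#   lZ = vgf.fit_transform(lX, lY)  # {lang: l2-normalized document projections}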


def wce_matrix(X, Y):
    # term-class correlation matrix, standardized column-wise
    wce = supervised_embeddings_tfidf(X, Y)
    wce = zscores(wce, axis=0)
    return wce


def supervised_embeddings_tfidf(X, Y):
    # F[t, c] is the tf-idf mass that term t receives from documents labelled
    # with class c, normalized by the total tf-idf mass of term t
    tfidf_norm = X.sum(axis=0)
    tfidf_norm[tfidf_norm == 0] = 1  # guard against terms that never occur
    F = (X.T).dot(Y) / tfidf_norm.T
    return np.asarray(F)


def zscores(X, axis=0):
    """
    Standardizes X along the given axis. Unlike scipy.stats.zscore, this
    implementation guards against division by zero, which can indeed occur.

    :param X: array-like to standardize
    :param axis: axis along which mean and standard deviation are computed
    :return: the z-scored array
    """
    std = np.clip(np.std(X, ddof=1, axis=axis), 1e-5, None)
    mean = np.mean(X, axis=axis)
    return (X - mean) / std
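

if __name__ == "__main__":
    # Minimal sketch of what wce_matrix computes, run on toy data; it is not
    # part of the original module. X_toy stands in for the sparse tf-idf
    # document-term matrix produced upstream by the vectorizer, and Y_toy for a
    # binary document-class matrix. Assumes scipy is available (the sparse
    # tf-idf input already presupposes it).
    from scipy.sparse import csr_matrix

    X_toy = csr_matrix(
        np.array(
            [
                [1.0, 0.0, 0.0],  # doc 1 contains term 1
                [0.0, 2.0, 0.0],  # doc 2 contains term 2 twice
                [1.0, 1.0, 0.0],  # doc 3 contains terms 1 and 2; term 3 never occurs
            ]
        )
    )
    Y_toy = np.array(
        [
            [1, 0],  # doc 1 -> class A
            [0, 1],  # doc 2 -> class B
            [1, 0],  # doc 3 -> class A
        ]
    )

    wce = wce_matrix(X_toy, Y_toy)
    # one embedding per term, one dimension per class
    print(wce.shape)  # (3, 2) == (n_terms, n_classes)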