bulk upload after refactoring
commit 6b75483b55

@@ -0,0 +1,179 @@
.vscode/*
!.vscode/settings.json
!.vscode/tasks.json
!.vscode/launch.json
!.vscode/extensions.json
!.vscode/*.code-snippets

# Local History for Visual Studio Code
.history/

# Built Visual Studio Code Extensions
*.vsix

# Byte-compiled / optimized / DLL files
__pycache__/
__pycache__
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
.pybuilder/
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock

# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/#use-with-ide
.pdm.toml

# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# pytype static type analyzer
.pytype/

# Cython debug symbols
cython_debug/

# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/

# user defined
out/*
amazon_cateogories.bu.txt
models/*

@@ -0,0 +1,21 @@
Appliances
Arts Crafts and Sewing
Automotive
CDs and Vinyl
Cell Phones and Accessories
Electronics
Grocery and Gourmet Food
Home and Kitchen
Industrial and Scientific
Luxury Beauty
Magazine Subscriptions
Movies and TV
Musical Instruments
Office Products
Patio Lawn and Garden
Pet Supplies
Software
Sports and Outdoors
Tools and Home Improvement
Toys and Games
Video Games

@@ -0,0 +1,370 @@
import gzip
import os
import re
import warnings
from argparse import ArgumentParser
from collections import Counter

import numpy as np
from bs4 import BeautifulSoup
from sklearn.preprocessing import MultiLabelBinarizer

from plotters.distributions import plot_distribution

# TODO: AmazonDataset should be an instance of MultimodalDataset
warnings.filterwarnings("ignore", category=UserWarning, module="bs4")
warnings.filterwarnings("ignore", category=UserWarning, module="sklearn")

BASEPATH = "/home/moreo/Datasets/raw"
with open("dataManager/excluded.csv", "r") as f:
    EXCLUDED = f.read().splitlines()
REGEX = re.compile(r"\s{2,}", re.MULTILINE)


def parse(dataset_name, ext="json.gz", nrows=0):
    dataset_name = dataset_name.replace(" ", "_")
    meta_path = os.path.join(BASEPATH, f"meta_{dataset_name}.{ext}")
    path = os.path.join(BASEPATH, f"{dataset_name}.{ext}")

    mapper = {"false": False, "true": True}
    data = []
    metadata = []

    _data = gzip.open(path, "r")
    _metadata = gzip.open(meta_path, "r")
    for i, (d, m) in enumerate(zip(_data, _metadata)):
        data.append(eval(d.replace(b"&amp;", b"&"), mapper))
        metadata.append(eval(m.replace(b"&amp;", b"&"), mapper))
        if i + 1 == nrows:
            break

    return data, metadata


def get_categories(data, min_count=0):
    if data[0].get("category", None) is None:
        return [], set()

    categories = []
    for item in data:
        if item["category"] != "":
            categories.extend(item["category"])
    categories = list(filter(lambda x: x not in EXCLUDED, categories))
    # return categories, sorted(set(categories))
    return categories, _filter_counter(Counter(categories), min_count)


def _filter_counter(counter, min_count):
    return {k: v for k, v in counter.items() if v >= min_count}


def get_main_cat(data, min_count=0):
    if data[0].get("main_cat", None) is None:
        return [], set()

    main_cats = [item["main_cat"] for item in data if item["main_cat"] != ""]
    main_cats = list(filter(lambda x: x not in EXCLUDED, main_cats))
    # return main_cats, sorted(set(main_cats))
    return main_cats, _filter_counter(Counter(main_cats), min_count)


def filter_sample_with_images(metadata):
    # TODO: check whether images are really available and store them locally
    # print(f"(Pre-filter) Total items: {len(metadata)}")
    data = []
    for i, m in enumerate(metadata):
        if "imageURL" not in m.keys():
            continue
        if len(m["imageURL"]) != 0 or len(m["imageURLHighRes"]) != 0:
            data.append(m)
    # print(f"(Post-filter) Total items: {len(data)}")
    return data


def select_description(descriptions):
    """
    Some items have multiple descriptions (len(item["description"]) > 1).
    Most of these descriptions are just empty strings; some items instead actually have
    multiple strings describing them.
    At the moment, we rely on a simple heuristic: select the longest string and use it
    as the only description.
    """
    if len(descriptions) == 0:
        return [""]
    return [max(descriptions, key=len)]


def build_product_json(metadata, binarizer):
    data = []
    for item in metadata:
        if len(item["description"]) != 1:
            item["description"] = select_description(item["description"])

        product = {
            "asin": item["asin"],
            "title": item["title"],
            "description": item["description"],
            # TODO: some items have multiple descriptions (len(item["description"]) > 1)
            "cleaned_description": clean_description(
                BeautifulSoup(
                    item["title"] + ". " + item["description"][0],
                    features="html.parser",
                ).text
            ),
            # TODO: is it faster to call transform on the whole dataset?
            "main_category": item["main_cat"],
            "categories": item["category"],
            "all_categories": _get_cats(item["main_cat"], item["category"]),
            "vect_categories": binarizer.transform(
                [_get_cats(item["main_cat"], item["category"])]
            )[0],
        }
        data.append(product)
    return data


def _get_cats(main_cat, cats):
    return [main_cat] + cats


def get_label_binarizer(cats):
    mlb = MultiLabelBinarizer()
    mlb.fit([cats])
    return mlb


def clean_description(description):
    description = re.sub(REGEX, " ", description)
    description = description.rstrip()
    description = description.replace("\t", "")
    description = description.replace("\n", " ")
    return description


def construct_target_matrix(data):
    return np.stack([d["vect_categories"] for d in data], axis=0)


def get_all_classes(counter_cats, counter_sub_cats):
    if len(counter_cats) == 0:
        return counter_sub_cats.keys()
    elif len(counter_sub_cats) == 0:
        return counter_cats.keys()
    else:
        return list(counter_cats.keys()) + list(counter_sub_cats.keys())


class AmazonDataset:
    def __init__(
        self,
        domains=["Appliances", "Automotive", "Movies and TV"],
        basepath="/home/moreo/Datasets/raw",
        min_count=10,
        max_labels=50,
        nrows=1000,
    ):
        print(f"[Init AmazonDataset]")
        print(f"- Domains: {domains}")
        self.REGEX = re.compile(r"\s{2,}", re.MULTILINE)
        with open("dataManager/excluded.csv", "r") as f:
            self.EXCLUDED = f.read().splitlines()
        self.basepath = basepath
        self.domains = self.parse_domains(domains)
        self.nrows = nrows
        self.min_count = min_count
        self.max_labels = max_labels
        self.len_data = 0
        self.domain_data = self.load_data()
        self.labels, self.domain_labels = self.get_all_cats()
        self.label_binarizer = get_label_binarizer(self.labels)
        self.vectorized_labels = self.vecorize_labels()
        self.dX = self.construct_data_matrix()
        self.dY = self.construct_target_matrix()
        self.langs = ["en"]

    def parse_domains(self, domains):
        with open("amazon_categories.txt", "r") as f:
            all_domains = f.read().splitlines()
        if domains == "all":
            return all_domains
        else:
            assert all([d in all_domains for d in domains]), "Invalid domain name"
            return domains

    def parse(self, dataset_name, nrows, ext="json.gz"):
        dataset_name = dataset_name.replace(" ", "_")
        meta_path = os.path.join(self.basepath, f"meta_{dataset_name}.{ext}")
        path = os.path.join(self.basepath, f"{dataset_name}.{ext}")

        mapper = {"false": False, "true": True}
        data = []
        metadata = []

        _data = gzip.open(path, "r")
        _metadata = gzip.open(meta_path, "r")
        for i, (d, m) in enumerate(zip(_data, _metadata)):
            data.append(eval(d.replace(b"&amp;", b"&"), mapper))
            metadata.append(eval(m.replace(b"&amp;", b"&"), mapper))
            if i + 1 == nrows:
                break

        return data, metadata

    def load_data(self):
        print(f"- Loading up to {self.nrows} items per domain")
        domain_data = {}
        for domain in self.domains:
            _, metadata = self.parse(domain, nrows=self.nrows)
            metadata = filter_sample_with_images(metadata)
            domain_data[domain] = self.build_product_scheme(metadata)
            self.len_data += len(metadata)
        print(f"- Loaded {self.len_data} items")
        return domain_data

    def get_all_cats(self):
        assert len(self.domain_data) != 0, "Load data first"
        labels = set()
        domain_labels = {}
        for domain, data in self.domain_data.items():
            _, counter_cats = self._get_counter_cats(data, self.min_count)
            labels.update(counter_cats.keys())
            domain_labels[domain] = counter_cats
        print(f"- Found {len(labels)} labels")
        return labels, domain_labels

    def export_to_torch(self):
        pass

    def get_label_binarizer(self):
        mlb = MultiLabelBinarizer()
        mlb.fit([self.labels])
        return mlb

    def vecorize_labels(self):
        for domain, data in self.domain_data.items():
            for item in data:
                item["vect_categories"] = self.label_binarizer.transform(
                    [item["all_categories"]]
                )[0]

    def build_product_scheme(self, metadata):
        data = []
        for item in metadata:
            if len(item["description"]) != 1:
                _desc = self._select_description(item["description"])
            else:
                _desc = item["description"][0]

            product = {
                "asin": item["asin"],
                "title": item["title"],
                "description": _desc,
                # TODO: some items have multiple descriptions (len(item["description"]) > 1)
                "cleaned_text": self._clean_description(
                    BeautifulSoup(
                        item["title"] + ". " + _desc,
                        features="html.parser",
                    ).text
                ),
                # TODO: is it faster to call transform on the whole dataset?
                "main_category": item["main_cat"],
                "categories": item["category"],
                "all_categories": self._get_cats(item["main_cat"], item["category"]),
                # "vect_categories": binarizer.transform(
                #     [_get_cats(item["main_cat"], item["category"])]
                # )[0],
            }
            data.append(product)
        return data

    def construct_data_matrix(self):
        dX = {}
        for domain, data in self.domain_data.items():
            dX[domain] = [d["cleaned_text"] for d in data]
        return dX

    def construct_target_matrix(self):
        dY = {}
        for domain, data in self.domain_data.items():
            dY[domain] = np.stack([d["vect_categories"] for d in data], axis=0)
        return dY

    def get_overall_label_matrix(self):
        assert hasattr(self, "dY"), "Init label matrices first"
        return np.vstack([x for x in self.dY.values()])

    def _get_counter_cats(self, data, min_count):
        cats = []
        for item in data:
            cats.extend(item["all_categories"])
        cats = list(filter(lambda x: x not in self.EXCLUDED, cats))
        return cats, self._filter_counter(Counter(cats), min_count)

    def _filter_counter(self, counter, min_count):
        return {k: v for k, v in counter.items() if v >= min_count}

    def _clean_description(self, description):
        description = re.sub(self.REGEX, " ", description)
        description = description.rstrip()
        description = description.replace("\t", "")
        description = description.replace("\n", " ")
        return description

    def _get_cats(self, main_cat, cats):
        return [main_cat] + cats

    def _select_description(self, descriptions) -> str:
        """
        Some items have multiple descriptions (len(item["description"]) > 1).
        Most of these descriptions are just empty strings; some items instead actually have
        multiple strings describing them.
        At the moment, we rely on a simple heuristic: select the longest string and use it
        as the only description.
        """
        if len(descriptions) == 0:
            return ""
        return max(descriptions, key=len)

    def plot_label_distribution(self):
        overall_mat = self.get_overall_label_matrix()
        plot_distribution(
            np.arange(len(self.labels)),
            np.sum(overall_mat, axis=0),
            title="Amazon Dataset",
            labels=self.labels,
            notes=overall_mat.shape,
            max_labels=self.max_labels,
            figsize=(10, 10),
            save=True,
            path="out",
        )

    def plot_per_domain_label_distribution(self):
        for domain, matrix in self.vecorize_labels:
            pass


def main(args):
    dataset = AmazonDataset(
        domains=args.domains,
        nrows=args.nrows,
        min_count=args.min_count,
        max_labels=args.max_labels,
    )

    dataset.plot_label_distribution()
    exit()


if __name__ == "__main__":
    import sys

    sys.path.append("/home/andreapdr/devel/gFunMultiModal/")

    parser = ArgumentParser()
    parser.add_argument("--domains", type=str, default="all")
    parser.add_argument("--nrows", type=int, default=10000)
    parser.add_argument("--min_count", type=int, default=10)
    parser.add_argument("--max_labels", type=int, default=50)
    args = parser.parse_args()
    main(args)
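For reference, the label encoding used above (a `MultiLabelBinarizer` fitted once on the full label vocabulary, then applied per item to produce `vect_categories`) behaves as in the following minimal, self-contained sketch; the category names are made up for illustration and are not taken from the actual corpus.

```python
from sklearn.preprocessing import MultiLabelBinarizer

# Fit the binarizer once on the whole label vocabulary (main categories + sub-categories),
# exactly as get_label_binarizer() does with mlb.fit([cats]).
all_labels = ["Appliances", "Automotive", "Electronics", "Software"]  # toy vocabulary
mlb = MultiLabelBinarizer()
mlb.fit([all_labels])

# Each item is then transformed with its own list of categories, producing one binary row
# per item (the "vect_categories" field in the product dictionaries built above).
item_categories = ["Electronics", "Software"]
row = mlb.transform([item_categories])[0]
print(mlb.classes_)  # ['Appliances' 'Automotive' 'Electronics' 'Software']
print(row)           # [0 0 1 1]
```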

@@ -0,0 +1,27 @@
</span></span></span>
</span></span></span>
<img src="https://m.media-amazon.com/images/G/01/nav2/images/gui/amazon-fashion-store-new._CB520838675_.png" class="nav-categ-image" alt="AMAZON FASHION"/>
<img src="https://images-na.ssl-images-amazon.com/images/G/01/nav2/images/gui/amazon-fashion-store-new._CB520838675_.png" class="nav-categ-image" alt="AMAZON FASHION"/>
<img src="https://m.media-amazon.com/images/G/01/digital/music/logos/amzn_music_logo_subnav._CB471835632_.png" class="nav-categ-image" alt="Digital Music"/>
<img src="https://images-na.ssl-images-amazon.com/images/G/01/digital/music/logos/amzn_music_logo_subnav._CB471835632_.png" class="nav-categ-image" alt="Digital Music"/>
<img src="https://images-na.ssl-images-amazon.com/images/G/01/digital/music/logos/amzn_music_logo_subnav._CB471835632_.png" class="nav-categ-image" alt="Digital Music" />
<img src="https://m.media-amazon.com/images/G/01/digital/music/logos/amzn_music_logo_subnav._CB471835632_.png" class="nav-categ-image" alt="Digital Music" />
<img src="https://images-na.ssl-images-amazon.com/images/G/01/pantry/subnav/pantry-subnav-logo._CB474181323_.png" class="nav-categ-image" alt="Prime Pantry"/>
<img src="https://m.media-amazon.com/images/G/01/pantry/subnav/pantry-subnav-logo._CB474181323_.png" class="nav-categ-image" alt="Prime Pantry" />
<img src="https://images-na.ssl-images-amazon.com/images/G/01/pantry/subnav/pantry-subnav-logo._CB474181323_.png" class="nav-categ-image" alt="Prime Pantry" />
<img src="https://m.media-amazon.com/images/G/01/pantry/subnav/pantry-subnav-logo._CB474181323_.png" class="nav-categ-image" alt="Prime Pantry"/>
<img src="https://images-na.ssl-images-amazon.com/images/G/01/handmade/brand/logos/2018/subnav_logo._CB502360610_.png" class="nav-categ-image" alt="Handmade"/>
<img src="https://images-na.ssl-images-amazon.com/images/G/01/handmade/brand/logos/2018/subnav_logo._CB502360610_.png" class="nav-categ-image" alt="Handmade"/>
<img src="https://images-na.ssl-images-amazon.com/images/G/01/nav2/images/gui/amazon-fashion-store-new._CB520838675_.png" class="nav-categ-image" alt="AMAZON FASHION" />
<img src="https://m.media-amazon.com/images/G/01/nav2/images/gui/amazon-fashion-store-new._CB520838675_.png" class="nav-categ-image" alt="AMAZON FASHION" />
<img src="https://m.media-amazon.com/images/G/01/nav2/images/gui/amazon-fashion-store-new._CB520838675_.png" class="nav-categ-image" alt="AMAZON FASHION" />
<img src="https://m.media-amazon.com/images/G/01/nav2/images/gui/amazon-fashion-store-new._CB520838675_.png" class="nav-categ-image" alt="AMAZON FASHION" />
<img src="https://images-na.ssl-images-amazon.com/images/G/01/nav2/images/gui/amazon-fashion-store-new._CB520838675_.png" class="nav-categ-image" alt="AMAZON FASHION"/>
<img src="https://images-na.ssl-images-amazon.com/images/G/01/digital/music/logos/amzn_music_logo_subnav._CB471835632_.png" class="nav-categ-image" alt="Digital Music"/>
<img src="https://images-na.ssl-images-amazon.com/images/G/01/digital/music/logos/amzn_music_logo_subnav._CB471835632_.png" class="nav-categ-image" alt="Digital Music" />
<img src="https://m.media-amazon.com/images/G/01/digital/music/logos/amzn_music_logo_subnav._CB471835632_.png" class="nav-categ-image" alt="Digital Music"/>
<img src="https://m.media-amazon.com/images/G/01/digital/music/logos/amzn_music_logo_subnav._CB471835632_.png" class="nav-categ-image" alt="Digital Music" />
<img src="https://m.media-amazon.com/images/G/01/pantry/subnav/pantry-subnav-logo._CB474181323_.png" class="nav-categ-image" alt="Prime Pantry"/>
<img src="https://images-na.ssl-images-amazon.com/images/G/01/pantry/subnav/pantry-subnav-logo._CB474181323_.png" class="nav-categ-image" alt="Prime Pantry"/>
<img src="https://images-na.ssl-images-amazon.com/images/G/01/pantry/subnav/pantry-subnav-logo._CB474181323_.png" class="nav-categ-image" alt="Prime Pantry" />
<img src="https://m.media-amazon.com/images/G/01/pantry/subnav/pantry-subnav-logo._CB474181323_.png" class="nav-categ-image" alt="Prime Pantry" />

@@ -0,0 +1,142 @@
import re
from os import listdir
from os.path import isdir, join

from dataManager.torchDataset import TorchMultiNewsDataset

# TODO: labels must be aligned between languages
# TODO: remove copyright and also tags (doc.split("More about:")[0])
# TODO: define fn to represent the dataset as a torch Dataset
# TODO: this should be an instance of an abstract MultimodalMultilingualDataset


class MultiNewsDataset:
    def __init__(self, data_dir, excluded_langs=[], debug=False):
        self.debug = debug
        self.data_dir = data_dir
        self.langs = self.get_langs()
        self.excluded_langs = excluded_langs
        self.lang_multiModalDataset = {}
        print(
            f"[{'DEBUG MODE: ' if debug else ''}Loaded MultiNewsDataset - langs: {self.langs}]"
        )
        self.load_data()
        self.print_stats()

    def load_data(self):
        for lang in self.langs:
            if lang not in self.excluded_langs:
                self.lang_multiModalDataset[lang] = MultiModalDataset(
                    lang, join(self.data_dir, lang)
                )

    def get_langs(self):
        from os import listdir

        if self.debug:
            return ["it", "en"]

        return tuple(sorted([folder for folder in listdir(self.data_dir)]))

    def print_stats(self):
        print(f"[MultiNewsDataset stats]")
        # print(f" - langs: {self.langs}")
        total_docs = 0
        for lang in self.langs:
            _len = len(self.lang_multiModalDataset[lang].data)
            total_docs += _len
            print(
                f" - {lang} docs: {_len}\t- labels: {self._count_lang_labels(self.lang_multiModalDataset[lang].data)}"
            )
        print(f" - total docs: {total_docs}")

    def _count_lang_labels(self, data):
        lang_labels = set()
        for sample in data:
            lang_labels.update(sample[-1])
        return len(lang_labels)

    def export_to_torch_dataset(self, tokenizer_id):
        raise NotImplementedError
        # torch_datasets = []
        # for lang, multimodal_dataset in self.lang_multiModalDataset.keys():
        #     dataset = TorchMultiNewsDataset(
        #         lang=lang,
        #         data=multimodal_dataset.get_docs(),
        #         ids=multimodal_dataset.get_ids(),
        #         imgs=multimodal_dataset.get_imgs(),
        #         labels=multimodal_dataset.get_labels(),
        #         tokenizer_id=tokenizer_id,
        #     )
        #     torch_datasets.append(dataset)

        # raise NotImplementedError

    def save_to_disk(self):
        raise NotImplementedError


class MultiModalDataset:
    def __init__(self, lang, data_dir):
        self.lang = lang
        self.data_dir = data_dir
        self.re_labels = re.compile(r"<a rel=\"tag\" href=\"\/tag\/.+?\/\">(.+?)<\/a>")
        self.re_cleaner = re.compile(r"<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});")
        self.re_white = re.compile(r" +")
        self.data = self.get_docs()

    def get_docs(self):
        raise NotImplementedError

    def get_imgs(self):
        raise NotImplementedError

    def get_labels(self):
        raise NotImplementedError

    def get_ids(self):
        raise NotImplementedError

    def get_docs(self):
        data = []
        news_folders = [doc_folder for doc_folder in listdir(self.data_dir)]
        for news_folder in news_folders:
            if isdir(join(self.data_dir, news_folder)):
                fname_doc = f"text.{news_folder.split('.')[-1]}"
                with open(join(self.data_dir, news_folder, fname_doc)) as f:
                    html_doc = f.read()
                img = self.get_image()
                clean_doc, labels = self.preprocess_html(html_doc)
                data.append((fname_doc, clean_doc, html_doc, img, labels))
        return data

    def preprocess_html(self, html_doc):
        labels = self._extract_labels(html_doc)
        cleaned = self._clean_up_str(self._remove_html_tags(html_doc))
        return cleaned, labels

    def _extract_labels(self, data):
        return re.findall(self.re_labels, data)

    def _remove_html_tags(self, data):
        cleaned = re.sub(self.re_cleaner, "", data)
        return cleaned

    def _clean_up_str(self, doc):
        doc = re.sub(self.re_white, " ", doc)
        doc = doc.lstrip()
        doc = doc.rstrip()
        doc = doc.replace("\n", " ")
        doc = doc.replace("\t", " ")
        return doc

    def get_image(self):
        # TODO: implement
        pass


if __name__ == "__main__":
    from os.path import expanduser

    _dataset_path_hardcoded = "~/datasets/MultiNews/20110730/"
    dataset = MultiNewsDataset(expanduser(_dataset_path_hardcoded), debug=True)
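As a quick illustration of the HTML pre-processing above (label extraction via `re_labels`, tag and entity stripping via `re_cleaner`, whitespace collapsing via `re_white`), here is a self-contained sketch that reuses the same regular expressions on a toy document; it is illustrative only and does not reflect the actual MultiNews data layout.

```python
import re

# Same patterns as MultiModalDataset above.
re_labels = re.compile(r"<a rel=\"tag\" href=\"\/tag\/.+?\/\">(.+?)<\/a>")
re_cleaner = re.compile(r"<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});")
re_white = re.compile(r" +")

html_doc = (
    '<p>Solar &amp; wind power   expand in Italy.</p> '
    '<a rel="tag" href="/tag/energy/">energy</a> '
    '<a rel="tag" href="/tag/italy/">italy</a>'
)

labels = re.findall(re_labels, html_doc)          # ['energy', 'italy']
cleaned = re.sub(re_cleaner, "", html_doc)        # tags and entities removed
cleaned = re.sub(re_white, " ", cleaned).strip()  # collapse repeated spaces

print(labels)
print(cleaned)  # 'Solar wind power expand in Italy. energy italy'
```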

@@ -0,0 +1,270 @@
# TODO: this should be an instance of an abstract MultilingualDataset

from abc import ABC, abstractmethod
from scipy.sparse import issparse, csr_matrix
from os.path import join, expanduser
import pickle
import re
import numpy as np
from tqdm import tqdm


class NewMultilingualDataset(ABC):
    @abstractmethod
    def get_training(self):
        pass

    @abstractmethod
    def get_validation(self):
        pass

    @abstractmethod
    def get_test(self):
        pass

    @abstractmethod
    def mask_numbers(self):
        pass

    @abstractmethod
    def save(self):
        pass

    @abstractmethod
    def load(self):
        pass


# class RcvMultilingualDataset(MultilingualDataset):
class RcvMultilingualDataset:
    def __init__(
        self,
        run="0",
    ):
        self.dataset_name = "rcv1-2"
        self.dataset_path = expanduser(
            f"~/datasets/rcv1-2/rcv1-2_doclist_trByLang1000_teByLang1000_processed_run{run}.pickle"
        )

    def load(self):
        import pickle

        data = pickle.load(open(self.dataset_path, "rb"))
        return self


class MultilingualDataset:
    """
    A multilingual dataset is a dictionary of training and test documents indexed by language code.
    Train and test sets are represented as tuples of the type (X, Y, ids), where X is a matrix representation of the
    documents (e.g., a document-by-term sparse csr_matrix), Y is a document-by-label binary np.array indicating the
    labels of each document, and ids is a list of document-identifiers from the original collection.
    """

    def __init__(self, dataset_name):
        self.dataset_name = dataset_name
        self.multiling_dataset = {}
        print(f"[Init Multilingual Dataset: {self.dataset_name}]")

    def add(self, lang, Xtr, Ytr, Xte, Yte, tr_ids=None, te_ids=None):
        self.multiling_dataset[lang] = ((Xtr, Ytr, tr_ids), (Xte, Yte, te_ids))

    def save(self, file):
        self.sort_indexes()
        pickle.dump(self, open(file, "wb"), pickle.HIGHEST_PROTOCOL)
        return self

    def __getitem__(self, item):
        if item in self.langs():
            return self.multiling_dataset[item]
        return None

    @classmethod
    def load(cls, file):
        data = pickle.load(open(file, "rb"))
        data.sort_indexes()
        return data

    @classmethod
    def load_ids(cls, file):
        data = pickle.load(open(file, "rb"))
        tr_ids = {
            lang: tr_ids
            for (lang, ((_, _, tr_ids), (_, _, _))) in data.multiling_dataset.items()
        }
        te_ids = {
            lang: te_ids
            for (lang, ((_, _, _), (_, _, te_ids))) in data.multiling_dataset.items()
        }
        return tr_ids, te_ids

    def sort_indexes(self):
        for lang, ((Xtr, _, _), (Xte, _, _)) in self.multiling_dataset.items():
            if issparse(Xtr):
                Xtr.sort_indices()
            if issparse(Xte):
                Xte.sort_indices()

    def set_view(self, categories=None, languages=None):
        if categories is not None:
            if isinstance(categories, int):
                categories = np.array([categories])
            elif isinstance(categories, list):
                categories = np.array(categories)
            self.categories_view = categories
        if languages is not None:
            self.languages_view = languages

    def training(self, mask_numbers=False, target_as_csr=False):
        return self.lXtr(mask_numbers), self.lYtr(as_csr=target_as_csr)

    def test(self, mask_numbers=False, target_as_csr=False):
        return self.lXte(mask_numbers), self.lYte(as_csr=target_as_csr)

    def lXtr(self, mask_numbers=False):
        proc = lambda x: _mask_numbers(x) if mask_numbers else x
        # return {lang: Xtr for (lang, ((Xtr, _, _), _)) in self.multiling_dataset.items() if lang in self.langs()}
        return {
            lang: proc(Xtr)
            for (lang, ((Xtr, _, _), _)) in self.multiling_dataset.items()
            if lang in self.langs()
        }

    def lXte(self, mask_numbers=False):
        proc = lambda x: _mask_numbers(x) if mask_numbers else x
        # return {lang: Xte for (lang, (_, (Xte, _, _))) in self.multiling_dataset.items() if lang in self.langs()}
        return {
            lang: proc(Xte)
            for (lang, (_, (Xte, _, _))) in self.multiling_dataset.items()
            if lang in self.langs()
        }

    def lYtr(self, as_csr=False):
        lY = {
            lang: self.cat_view(Ytr)
            for (lang, ((_, Ytr, _), _)) in self.multiling_dataset.items()
            if lang in self.langs()
        }
        if as_csr:
            lY = {l: csr_matrix(Y) for l, Y in lY.items()}
        return lY

    def lYte(self, as_csr=False):
        lY = {
            lang: self.cat_view(Yte)
            for (lang, (_, (_, Yte, _))) in self.multiling_dataset.items()
            if lang in self.langs()
        }
        if as_csr:
            lY = {l: csr_matrix(Y) for l, Y in lY.items()}
        return lY

    def cat_view(self, Y):
        if hasattr(self, "categories_view"):
            return Y[:, self.categories_view]
        else:
            return Y

    def langs(self):
        if hasattr(self, "languages_view"):
            langs = self.languages_view
        else:
            langs = sorted(self.multiling_dataset.keys())
        return langs

    def num_categories(self):
        return self.lYtr()[self.langs()[0]].shape[1]

    def show_dimensions(self):
        def shape(X):
            return X.shape if hasattr(X, "shape") else len(X)

        for lang, (
            (Xtr, Ytr, IDtr),
            (Xte, Yte, IDte),
        ) in self.multiling_dataset.items():
            if lang not in self.langs():
                continue
            print(
                "Lang {}, Xtr={}, ytr={}, Xte={}, yte={}".format(
                    lang,
                    shape(Xtr),
                    self.cat_view(Ytr).shape,
                    shape(Xte),
                    self.cat_view(Yte).shape,
                )
            )

    def show_category_prevalences(self):
        nC = self.num_categories()
        accum_tr = np.zeros(nC, dtype=int)
        accum_te = np.zeros(nC, dtype=int)
        in_langs = np.zeros(
            nC, dtype=int
        )  # count languages with at least one positive example (per category)
        for lang, (
            (Xtr, Ytr, IDtr),
            (Xte, Yte, IDte),
        ) in self.multiling_dataset.items():
            if lang not in self.langs():
                continue
            prev_train = np.sum(self.cat_view(Ytr), axis=0)
            prev_test = np.sum(self.cat_view(Yte), axis=0)
            accum_tr += prev_train
            accum_te += prev_test
            in_langs += (prev_train > 0) * 1
            print(lang + "-train", prev_train)
            print(lang + "-test", prev_test)
        print("all-train", accum_tr)
        print("all-test", accum_te)

        return accum_tr, accum_te, in_langs

    def set_labels(self, labels):
        self.labels = labels

    def reduce_data(self, langs=["it", "en"], maxn=50):
        print(f"- Reducing data: {langs} with max {maxn} documents...")
        self.set_view(languages=langs)

        data = {
            lang: self._reduce(data, maxn)
            for lang, data in self.multiling_dataset.items()
            if lang in langs
        }
        self.multiling_dataset = data
        return self

    def _reduce(self, multilingual_dataset, maxn):
        new_data = []
        for split in multilingual_dataset:
            docs, labels, ids = split
            new_data.append((docs[:maxn], labels[:maxn], ids[:maxn]))
        return new_data


def _mask_numbers(data):
    mask_moredigit = re.compile(r"\s[\+-]?\d{5,}([\.,]\d*)*\b")
    mask_4digit = re.compile(r"\s[\+-]?\d{4}([\.,]\d*)*\b")
    mask_3digit = re.compile(r"\s[\+-]?\d{3}([\.,]\d*)*\b")
    mask_2digit = re.compile(r"\s[\+-]?\d{2}([\.,]\d*)*\b")
    mask_1digit = re.compile(r"\s[\+-]?\d{1}([\.,]\d*)*\b")
    masked = []
    for text in tqdm(data, desc="masking numbers"):
        text = " " + text
        text = mask_moredigit.sub(" MoreDigitMask", text)
        text = mask_4digit.sub(" FourDigitMask", text)
        text = mask_3digit.sub(" ThreeDigitMask", text)
        text = mask_2digit.sub(" TwoDigitMask", text)
        text = mask_1digit.sub(" OneDigitMask", text)
        masked.append(text.replace(".", "").replace(",", "").strip())
    return masked


if __name__ == "__main__":
    DATAPATH = expanduser(
        "~/datasets/rcv1-2/rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle"
    )
    print(DATAPATH)
    dataset = MultilingualDataset.load(DATAPATH)
    print(dataset.show_dimensions())
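To make the number-masking step concrete, the sketch below applies the same regular expressions used by `_mask_numbers` to a couple of toy strings; it is purely illustrative (the real function additionally wraps the loop in `tqdm`).

```python
import re

masks = [
    (re.compile(r"\s[\+-]?\d{5,}([\.,]\d*)*\b"), " MoreDigitMask"),
    (re.compile(r"\s[\+-]?\d{4}([\.,]\d*)*\b"), " FourDigitMask"),
    (re.compile(r"\s[\+-]?\d{3}([\.,]\d*)*\b"), " ThreeDigitMask"),
    (re.compile(r"\s[\+-]?\d{2}([\.,]\d*)*\b"), " TwoDigitMask"),
    (re.compile(r"\s[\+-]?\d{1}([\.,]\d*)*\b"), " OneDigitMask"),
]

def mask_numbers(texts):
    # Same cascade as _mask_numbers above: longest digit runs are replaced first,
    # then the leftover dots/commas are stripped.
    masked = []
    for text in texts:
        text = " " + text
        for pattern, token in masks:
            text = pattern.sub(token, text)
        masked.append(text.replace(".", "").replace(",", "").strip())
    return masked

print(mask_numbers(["released in 1995 with 12 tracks", "price 3.50 euros"]))
# ['released in FourDigitMask with TwoDigitMask tracks', 'price OneDigitMask euros']
```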

@@ -0,0 +1,2 @@
class TorchMultiNewsDataset:
    pass
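`TorchMultiNewsDataset` is still an empty placeholder (see the TODO in `MultiNewsDataset` and the commented-out `export_to_torch_dataset` above). Purely as an illustration of the standard `torch.utils.data.Dataset` interface such a wrapper would eventually implement — this is not the author's planned implementation, and the field names are assumptions — it could look roughly like this:

```python
import torch
from torch.utils.data import Dataset


class ToyMultiNewsTorchDataset(Dataset):
    """Hypothetical wrapper: pairs pre-tokenized texts with binary label vectors."""

    def __init__(self, encodings, labels):
        # encodings: dict of tensors, e.g. as returned by a HuggingFace tokenizer
        # labels: float tensor of shape (n_docs, n_classes)
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return self.labels.shape[0]

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        item["labels"] = self.labels[idx]
        return item


# Example with dummy tensors (no tokenizer involved):
enc = {"input_ids": torch.randint(0, 100, (4, 8)), "attention_mask": torch.ones(4, 8)}
ys = torch.zeros(4, 3)
ds = ToyMultiNewsTorchDataset(enc, ys)
print(len(ds), ds[0]["input_ids"].shape)  # 4 torch.Size([8])
```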

@@ -0,0 +1,40 @@
from joblib import Parallel, delayed

from evaluation.metrics import *


def evaluation_metrics(y, y_):
    if len(y.shape) == len(y_.shape) == 1 and len(np.unique(y)) > 2:  # single-label
        raise NotImplementedError()  # return f1_score(y,y_,average='macro'), f1_score(y,y_,average='micro')
    else:  # the metrics I implemented assume multiclass multilabel classification as binary classifiers
        # return macroF1(y, y_), microF1(y, y_), macroK(y, y_), microK(y, y_), macroP(y, y_), microP(y, y_), macroR(y, y_), microR(y, y_)
        # return macroF1(y, y_), microF1(y, y_), macroAcc(y, y_), microAcc(y, y_), macroP(y, y_), microP(y, y_), macroR(y, y_), microR(y, y_), macroAcc(y, y_)
        return macroF1(y, y_), microF1(y, y_), macroK(y, y_), microK(y, y_)


def evaluate(ly_true, ly_pred, metrics=evaluation_metrics, n_jobs=-1):
    if n_jobs == 1:
        return {lang: metrics(ly_true[lang], ly_pred[lang]) for lang in ly_true.keys()}
    else:
        langs = list(ly_true.keys())
        evals = Parallel(n_jobs=n_jobs)(
            delayed(metrics)(ly_true[lang], ly_pred[lang]) for lang in langs
        )
        return {lang: evals[i] for i, lang in enumerate(langs)}


def log_eval(l_eval, phase="training"):
    print(f"\n[Results {phase}]")
    metrics = []
    for lang in l_eval.keys():
        macrof1, microf1, macrok, microk = l_eval[lang]
        metrics.append([macrof1, microf1, macrok, microk])
        if phase != "validation":
            print(f"Lang {lang}: macro-F1 = {macrof1:.3f} micro-F1 = {microf1:.3f}")
    averages = np.mean(np.array(metrics), axis=0)
    print(
        "Averages: MF1, mF1, MK, mK",
        np.round(averages, 3),
        "\n",
    )
    return averages
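The `evaluate` helper simply maps a per-language metric function over two dictionaries keyed by language code, optionally in parallel with joblib. Below is a minimal self-contained sketch of the same pattern, using scikit-learn's `f1_score` as a stand-in metric (the real code uses the metrics module shown further down); the language codes and arrays are dummy data.

```python
import numpy as np
from joblib import Parallel, delayed
from sklearn.metrics import f1_score

# Dummy per-language label matrices in MultiLabelBinarizer format.
ly_true = {
    "en": np.array([[1, 0, 1], [0, 1, 0]]),
    "it": np.array([[1, 1, 0], [0, 0, 1]]),
}
ly_pred = {
    "en": np.array([[1, 0, 0], [0, 1, 0]]),
    "it": np.array([[1, 0, 0], [0, 0, 1]]),
}

def metric(y, y_):
    return f1_score(y, y_, average="macro", zero_division=1)

# Same fan-out as evaluate(): one metric call per language, executed in parallel.
langs = list(ly_true.keys())
evals = Parallel(n_jobs=2)(
    delayed(metric)(ly_true[lang], ly_pred[lang]) for lang in langs
)
print({lang: round(evals[i], 3) for i, lang in enumerate(langs)})
```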

@@ -0,0 +1,237 @@
import numpy as np


class ContTable:
    def __init__(self, tp=0, tn=0, fp=0, fn=0):
        self.tp = tp
        self.tn = tn
        self.fp = fp
        self.fn = fn

    def get_d(self):
        return self.tp + self.tn + self.fp + self.fn

    def get_c(self):
        return self.tp + self.fn

    def get_not_c(self):
        return self.tn + self.fp

    def get_f(self):
        return self.tp + self.fp

    def get_not_f(self):
        return self.tn + self.fn

    def p_c(self):
        return (1.0 * self.get_c()) / self.get_d()

    def p_not_c(self):
        return 1.0 - self.p_c()

    def p_f(self):
        return (1.0 * self.get_f()) / self.get_d()

    def p_not_f(self):
        return 1.0 - self.p_f()

    def p_tp(self):
        return (1.0 * self.tp) / self.get_d()

    def p_tn(self):
        return (1.0 * self.tn) / self.get_d()

    def p_fp(self):
        return (1.0 * self.fp) / self.get_d()

    def p_fn(self):
        return (1.0 * self.fn) / self.get_d()

    def tpr(self):
        c = 1.0 * self.get_c()
        return self.tp / c if c > 0.0 else 0.0

    def fpr(self):
        _c = 1.0 * self.get_not_c()
        return self.fp / _c if _c > 0.0 else 0.0

    def __add__(self, other):
        return ContTable(
            tp=self.tp + other.tp,
            tn=self.tn + other.tn,
            fp=self.fp + other.fp,
            fn=self.fn + other.fn,
        )


def accuracy(cell):
    return (cell.tp + cell.tn) * 1.0 / (cell.tp + cell.fp + cell.fn + cell.tn)


def precision(cell):
    num = cell.tp
    den = cell.tp + cell.fp
    if den > 0:
        return num / den
    return 1.0
    num = cell.tn
    den = cell.tn + cell.fn
    return num / den


def recall(cell):
    num = cell.tp
    den = cell.tp + cell.fn
    if den > 0:
        return num / den
    return 1.0
    num = cell.tn
    den = cell.tn + cell.fp
    return num / den


def f1(cell):
    num = 2.0 * cell.tp
    den = 2.0 * cell.tp + cell.fp + cell.fn
    if den > 0:
        return num / den
    # we define f1 to be 1 if den==0 since the classifier has correctly classified all instances as negative
    return 1.0


def K(cell):
    specificity, recall = 0.0, 0.0

    AN = cell.tn + cell.fp
    if AN != 0:
        specificity = cell.tn * 1.0 / AN

    AP = cell.tp + cell.fn
    if AP != 0:
        recall = cell.tp * 1.0 / AP

    if AP == 0:
        return 2.0 * specificity - 1.0
    elif AN == 0:
        return 2.0 * recall - 1.0
    else:
        return specificity + recall - 1.0


# if the classifier is single class, then the prediction is a vector of shape=(nD,) which causes issues when compared
# to the true labels (of shape=(nD,1)). This method increases the dimensions of the predictions.
def __check_consistency_and_adapt(true_labels, predictions):
    if predictions.ndim == 1:
        return __check_consistency_and_adapt(
            true_labels, np.expand_dims(predictions, axis=1)
        )
    if true_labels.ndim == 1:
        return __check_consistency_and_adapt(
            np.expand_dims(true_labels, axis=1), predictions
        )
    if true_labels.shape != predictions.shape:
        raise ValueError(
            "True and predicted label matrices shapes are inconsistent %s %s."
            % (true_labels.shape, predictions.shape)
        )
    _, nC = true_labels.shape
    return true_labels, predictions, nC


# computes the (soft) contingency table where tp, fp, fn, and tn are the cumulative masses of the posterior
# probabilities with respect to the true binary labels
# true_labels and posterior_probabilities are two vectors of shape (number_documents,)
def soft_single_metric_statistics(true_labels, posterior_probabilities):
    assert len(true_labels) == len(
        posterior_probabilities
    ), "Format not consistent between true and predicted labels."
    tp = np.sum(posterior_probabilities[true_labels == 1])
    fn = np.sum(1.0 - posterior_probabilities[true_labels == 1])
    fp = np.sum(posterior_probabilities[true_labels == 0])
    tn = np.sum(1.0 - posterior_probabilities[true_labels == 0])
    return ContTable(tp=tp, tn=tn, fp=fp, fn=fn)


# computes the (hard) counters tp, fp, fn, and tn from a true and a predicted vector of hard decisions
# true_labels and predicted_labels are two vectors of shape (number_documents,)
def hard_single_metric_statistics(true_labels, predicted_labels):
    assert len(true_labels) == len(
        predicted_labels
    ), "Format not consistent between true and predicted labels."
    nd = len(true_labels)
    tp = np.sum(predicted_labels[true_labels == 1])
    fp = np.sum(predicted_labels[true_labels == 0])
    fn = np.sum(true_labels[predicted_labels == 0])
    tn = nd - (tp + fp + fn)
    return ContTable(tp=tp, tn=tn, fp=fp, fn=fn)


def macro_average(
    true_labels,
    predicted_labels,
    metric,
    metric_statistics=hard_single_metric_statistics,
):
    true_labels, predicted_labels, nC = __check_consistency_and_adapt(
        true_labels, predicted_labels
    )
    return np.mean(
        [
            metric(metric_statistics(true_labels[:, c], predicted_labels[:, c]))
            for c in range(nC)
        ]
    )


def micro_average(
    true_labels,
    predicted_labels,
    metric,
    metric_statistics=hard_single_metric_statistics,
):
    true_labels, predicted_labels, nC = __check_consistency_and_adapt(
        true_labels, predicted_labels
    )

    accum = ContTable()
    for c in range(nC):
        other = metric_statistics(true_labels[:, c], predicted_labels[:, c])
        accum = accum + other

    return metric(accum)


def macroP(true_labels, predicted_labels):
    return macro_average(true_labels, predicted_labels, precision)


def microP(true_labels, predicted_labels):
    return micro_average(true_labels, predicted_labels, precision)


def macroR(true_labels, predicted_labels):
    return macro_average(true_labels, predicted_labels, recall)


def microR(true_labels, predicted_labels):
    return micro_average(true_labels, predicted_labels, recall)


# true_labels and predicted_labels are two matrices in sklearn.preprocessing.MultiLabelBinarizer format
def macroF1(true_labels, predicted_labels):
    return macro_average(true_labels, predicted_labels, f1)


# true_labels and predicted_labels are two matrices in sklearn.preprocessing.MultiLabelBinarizer format
def microF1(true_labels, predicted_labels):
    return micro_average(true_labels, predicted_labels, f1)


# true_labels and predicted_labels are two matrices in sklearn.preprocessing.MultiLabelBinarizer format
def macroK(true_labels, predicted_labels):
    return macro_average(true_labels, predicted_labels, K)


# true_labels and predicted_labels are two matrices in sklearn.preprocessing.MultiLabelBinarizer format
def microK(true_labels, predicted_labels):
    return micro_average(true_labels, predicted_labels, K)
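As a worked example of how the macro and micro averages above differ, the following self-contained snippet computes per-class contingency counts and F1 by hand for a tiny two-class multilabel case; it mirrors the arithmetic of `hard_single_metric_statistics`, `f1`, `macro_average` and `micro_average` without importing the module, and the label matrices are made up.

```python
import numpy as np

y_true = np.array([[1, 0], [1, 0], [0, 1], [1, 1]])
y_pred = np.array([[1, 0], [0, 0], [0, 1], [1, 0]])

def counts(t, p):
    # tp, fp, fn, tn for a single class column, as in hard_single_metric_statistics
    tp = int(np.sum(p[t == 1]))
    fp = int(np.sum(p[t == 0]))
    fn = int(np.sum(t[p == 0]))
    tn = len(t) - (tp + fp + fn)
    return tp, fp, fn, tn

def f1_from(tp, fp, fn):
    den = 2.0 * tp + fp + fn
    return 2.0 * tp / den if den > 0 else 1.0

per_class = [counts(y_true[:, c], y_pred[:, c]) for c in range(y_true.shape[1])]
# class 0: tp=2, fp=0, fn=1, tn=1  -> F1 = 0.80
# class 1: tp=1, fp=0, fn=1, tn=2  -> F1 ~= 0.67
macro_f1 = np.mean([f1_from(tp, fp, fn) for tp, fp, fn, tn in per_class])  # ~0.733
tp, fp, fn, tn = np.sum(per_class, axis=0)                                 # pooled counts
micro_f1 = f1_from(tp, fp, fn)                                             # 0.75
print(round(macro_f1, 3), round(micro_f1, 3))
```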

@@ -0,0 +1,182 @@
import os
import sys

sys.path.append(os.path.join(os.getcwd(), "gfun"))

import pickle

import numpy as np
from vgfs.commons import TfidfVectorizerMultilingual
from vgfs.learners.svms import MetaClassifier, get_learner
from vgfs.multilingualGen import MultilingualGen
from vgfs.transformerGen import TransformerGen
from vgfs.vanillaFun import VanillaFunGen
from vgfs.wceGen import WceGen

# TODO: save and load gfun model


class GeneralizedFunnelling:
    def __init__(
        self,
        posterior,
        wce,
        multilingual,
        transformer,
        langs,
        embed_dir,
        n_jobs,
        batch_size,
        max_length,
        lr,
        epochs,
        patience,
        evaluate_step,
        transformer_name,
    ):
        # Forcing VGFs -----------
        self.posteriors_vgf = posterior
        self.wce_vgf = wce
        self.multilingual_vgf = multilingual
        self.trasformer_vgf = transformer
        # ------------------------
        self.langs = langs
        self.embed_dir = embed_dir
        self.cached = True
        # Transformer VGF params
        self.transformer_name = transformer_name
        self.epochs = epochs
        self.lr_transformer = lr
        self.batch_size_transformer = batch_size
        self.max_length = max_length
        self.early_stopping = True
        self.patience = patience
        self.evaluate_step = evaluate_step
        # -------------------
        self.vectorizer = TfidfVectorizerMultilingual(sublinear_tf=True, use_idf=True)
        self.n_jobs = n_jobs
        self.first_tier_learners = []
        self.metaclassifier = None
        self.aggfunc = "mean"
        self.init()

    def init(self):
        print("[Init GeneralizedFunnelling]")
        if self.posteriors_vgf:
            fun = VanillaFunGen(
                base_learner=get_learner(calibrate=True),
                first_tier_parameters=None,
                n_jobs=self.n_jobs,
            )
            self.first_tier_learners.append(fun)

        if self.multilingual_vgf:
            multilingual_vgf = MultilingualGen(
                embed_dir=self.embed_dir,
                langs=self.langs,
                n_jobs=self.n_jobs,
                cached=self.cached,
                probabilistic=True,
            )
            self.first_tier_learners.append(multilingual_vgf)

        if self.wce_vgf:
            wce_vgf = WceGen(n_jobs=self.n_jobs)
            self.first_tier_learners.append(wce_vgf)

        if self.trasformer_vgf:
            transformer_vgf = TransformerGen(
                model_name=self.transformer_name,
                lr=self.lr_transformer,
                epochs=self.epochs,
                batch_size=self.batch_size_transformer,
                max_length=self.max_length,
                device="cuda",
                print_steps=50,
                probabilistic=True,
                evaluate_step=self.evaluate_step,
                verbose=True,
                patience=self.patience,
            )
            self.first_tier_learners.append(transformer_vgf)

        self.metaclassifier = MetaClassifier(
            meta_learner=get_learner(calibrate=True, kernel="rbf"),
            meta_parameters=get_params(),
            n_jobs=self.n_jobs,
        )

    def init_vgfs_vectorizers(self):
        for vgf in self.first_tier_learners:
            if isinstance(vgf, (VanillaFunGen, MultilingualGen, WceGen)):
                vgf.vectorizer = self.vectorizer

    def fit(self, lX, lY):
        print("[Fitting GeneralizedFunnelling]")
        self.vectorizer.fit(lX)
        self.init_vgfs_vectorizers()

        projections = []
        print("- fitting first tier learners")
        for vgf in self.first_tier_learners:
            l_posteriors = vgf.fit_transform(lX, lY)
            projections.append(l_posteriors)

        agg = self.aggregate(projections)
        self.metaclassifier.fit(agg, lY)

        return self

    def transform(self, lX):
        projections = []
        for vgf in self.first_tier_learners:
            l_posteriors = vgf.transform(lX)
            projections.append(l_posteriors)
        agg = self.aggregate(projections)
        l_out = self.metaclassifier.predict_proba(agg)
        return l_out

    def fit_transform(self, lX, lY):
        return self.fit(lX, lY).transform(lX)

    def aggregate(self, first_tier_projections):
        if self.aggfunc == "mean":
            aggregated = self._aggregate_mean(first_tier_projections)
        else:
            raise NotImplementedError
        return aggregated

    def _aggregate_mean(self, first_tier_projections):
        # TODO: default dict for one-liner?
        aggregated = {
            lang: np.zeros(data.shape)
            for lang, data in first_tier_projections[0].items()
        }
        for lang_projections in first_tier_projections:
            for lang, projection in lang_projections.items():
                aggregated[lang] += projection

        # Computing mean
        for lang, projection in aggregated.items():
            aggregated[lang] /= len(first_tier_projections)

        return aggregated

    def get_config(self):
        from pprint import pprint

        # TODO
        print("[GeneralizedFunnelling config]")
        print(f"- langs: {self.langs}")
        print("-- vgfs:")

        for vgf in self.first_tier_learners:
            pprint(vgf.get_config())


def get_params(optimc=False):
    if not optimc:
        return None
    c_range = [1e4, 1e3, 1e2, 1e1, 1, 1e-1]
    kernel = "rbf"
    return [{"kernel": [kernel], "C": c_range, "gamma": ["auto"]}]
@@ -0,0 +1,74 @@
|
||||||
|
from sklearn.preprocessing import normalize
|
||||||
|
from sklearn.feature_extraction.text import TfidfVectorizer
|
||||||
|
from sklearn.decomposition import TruncatedSVD
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
|
|
||||||
|
def _normalize(lX, l2=True):
|
||||||
|
return {lang: normalize(np.asarray(X)) for lang, X in lX.items()} if l2 else lX
|
||||||
|
|
||||||
|
|
||||||
|
def XdotM(X, M, sif):
|
||||||
|
E = X.dot(M)
|
||||||
|
if sif:
|
||||||
|
E = remove_pc(E, npc=1)
|
||||||
|
return E
|
||||||
|
|
||||||
|
|
||||||
|
def remove_pc(X, npc=1):
|
||||||
|
"""
|
||||||
|
Remove the projection on the principal components
|
||||||
|
:param X: X[i,:] is a data point
|
||||||
|
:param npc: number of principal components to remove
|
||||||
|
:return: XX[i, :] is the data point after removing its projection
|
||||||
|
"""
|
||||||
|
pc = compute_pc(X, npc)
|
||||||
|
if npc == 1:
|
||||||
|
XX = X - X.dot(pc.transpose()) * pc
|
||||||
|
else:
|
||||||
|
XX = X - X.dot(pc.transpose()).dot(pc)
|
||||||
|
return XX
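# Illustrative usage sketch (not part of the original commit): how XdotM and
# remove_pc are typically combined to obtain SIF-style document embeddings.
# Shapes and data below are made up for demonstration purposes only.
def _demo_xdotm_sif():
    rng = np.random.default_rng(0)
    X = rng.random((8, 100))   # 8 documents over a 100-term (tf-idf) vocabulary
    M = rng.random((100, 50))  # a 50-dimensional embedding for each term
    E = XdotM(X, M, sif=True)  # doc embeddings with the 1st principal component removed
    assert E.shape == (8, 50)
    return E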
|
||||||
|
|
||||||
|
|
||||||
|
class TfidfVectorizerMultilingual:
|
||||||
|
def __init__(self, **kwargs):
|
||||||
|
self.kwargs = kwargs
|
||||||
|
|
||||||
|
def fit(self, lX, ly=None):
|
||||||
|
self.langs = sorted(lX.keys())
|
||||||
|
self.vectorizer = {
|
||||||
|
l: TfidfVectorizer(**self.kwargs).fit(lX[l]) for l in self.langs
|
||||||
|
}
|
||||||
|
return self
|
||||||
|
|
||||||
|
def transform(self, lX):
|
||||||
|
return {l: self.vectorizer[l].transform(lX[l]) for l in self.langs}
|
||||||
|
|
||||||
|
def fit_transform(self, lX, ly=None):
|
||||||
|
return self.fit(lX, ly).transform(lX)
|
||||||
|
|
||||||
|
def vocabulary(self, l=None):
|
||||||
|
if l is None:
|
||||||
|
return {l: self.vectorizer[l].vocabulary_ for l in self.langs}
|
||||||
|
else:
|
||||||
|
return self.vectorizer[l].vocabulary_
|
||||||
|
|
||||||
|
def get_analyzer(self, l=None):
|
||||||
|
if l is None:
|
||||||
|
return {l: self.vectorizer[l].build_analyzer() for l in self.langs}
|
||||||
|
else:
|
||||||
|
return self.vectorizer[l].build_analyzer()
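# Illustrative usage sketch (not part of the original commit): the multilingual
# vectorizer is fitted on a dictionary {language: list of raw documents} and
# returns one tf-idf matrix per language. The toy corpus below is made up.
def _demo_multilingual_tfidf():
    lX = {
        "en": ["the cat sat on the mat", "dogs and cats"],
        "it": ["il gatto è sul tappeto", "cani e gatti"],
    }
    vectorizer = TfidfVectorizerMultilingual(sublinear_tf=True)
    lV = vectorizer.fit_transform(lX)
    return {lang: X.shape for lang, X in lV.items()}  # one (n_docs, |V_lang|) matrix per language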
|
||||||
|
|
||||||
|
|
||||||
|
def compute_pc(X, npc=1):
|
||||||
|
"""
|
||||||
|
Compute the principal components.
|
||||||
|
:param X: X[i,:] is a data point
|
||||||
|
:param npc: number of principal components to remove
|
||||||
|
:return: component_[i,:] is the i-th pc
|
||||||
|
"""
|
||||||
|
if isinstance(X, np.matrix):
|
||||||
|
X = np.asarray(X)
|
||||||
|
svd = TruncatedSVD(n_components=npc, n_iter=7, random_state=0)
|
||||||
|
svd.fit(X)
|
||||||
|
return svd.components_
|
|
@@ -0,0 +1,354 @@
|
||||||
|
import time
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
from joblib import Parallel, delayed
|
||||||
|
from scipy.sparse import issparse
|
||||||
|
from sklearn.model_selection import GridSearchCV
|
||||||
|
from sklearn.multiclass import OneVsRestClassifier
|
||||||
|
from sklearn.preprocessing import normalize
|
||||||
|
from sklearn.svm import SVC
|
||||||
|
|
||||||
|
|
||||||
|
def _sort_if_sparse(X):
|
||||||
|
if issparse(X) and not X.has_sorted_indices:
|
||||||
|
X.sort_indices()
|
||||||
|
|
||||||
|
|
||||||
|
def get_learner(calibrate=False, kernel="linear", C=1):
|
||||||
|
"""
|
||||||
|
Instantiate a scikit-learn Support Vector Classifier.
|
||||||
|
:param calibrate: boolean, whether to return posterior probabilities or not
|
||||||
|
:param kernel: string, kernel to be applied to the SVC
|
||||||
|
:param C: int or dict {'C': list of integer}, Regularization parameter
|
||||||
|
:return: Support Vector Classifier
|
||||||
|
"""
|
||||||
|
return SVC(
|
||||||
|
kernel=kernel,
|
||||||
|
probability=calibrate,
|
||||||
|
cache_size=1000,
|
||||||
|
C=C,
|
||||||
|
random_state=1,
|
||||||
|
gamma="auto",
|
||||||
|
verbose=False,
|
||||||
|
)
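# Illustrative usage sketch (not part of the original commit): with
# calibrate=True the returned SVC exposes predict_proba, which is what both the
# first-tier learners and the meta-classifier rely on. Toy data below is made up.
def _demo_get_learner():
    rng = np.random.default_rng(0)
    X = np.vstack([rng.random((20, 5)) + 1.0, rng.random((20, 5)) - 1.0])
    y = np.array([1] * 20 + [0] * 20)
    clf = get_learner(calibrate=True, kernel="rbf", C=1).fit(X, y)
    return clf.predict_proba(X[:3])  # one row per document, one column per class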
|
||||||
|
|
||||||
|
|
||||||
|
def _joblib_transform_multiling(transformer, lX, n_jobs=-1):
|
||||||
|
if n_jobs == 1:
|
||||||
|
return {lang: transformer(lX[lang]) for lang in lX.keys()}
|
||||||
|
else:
|
||||||
|
langs = list(lX.keys())
|
||||||
|
transformations = Parallel(n_jobs=n_jobs)(
|
||||||
|
delayed(transformer)(lX[lang]) for lang in langs
|
||||||
|
)
|
||||||
|
return {lang: transformations[i] for i, lang in enumerate(langs)}
|
||||||
|
|
||||||
|
|
||||||
|
class MonolingualClassifier:
|
||||||
|
def __init__(self, base_learner, parameters=None, n_jobs=-1):
|
||||||
|
self.learner = base_learner
|
||||||
|
self.parameters = parameters
|
||||||
|
self.model = None
|
||||||
|
self.best_params_ = None
|
||||||
|
self.n_jobs = n_jobs
|
||||||
|
|
||||||
|
def fit(self, X, y):
|
||||||
|
tinit = time.time()
|
||||||
|
_sort_if_sparse(X)
|
||||||
|
self.empty_categories = np.argwhere(np.sum(y, axis=0) == 0).flatten()
|
||||||
|
# multi-class format
|
||||||
|
if len(y.shape) == 2:
|
||||||
|
if self.parameters is not None:
|
||||||
|
self.parameters = [
|
||||||
|
{"estimator__" + key: params[key] for key in params.keys()}
|
||||||
|
for params in self.parameters
|
||||||
|
]
|
||||||
|
self.model = OneVsRestClassifier(self.learner, n_jobs=self.n_jobs)
|
||||||
|
else:
|
||||||
|
self.model = self.learner
|
||||||
|
raise NotImplementedError(
|
||||||
|
"not working as a base-classifier for funneling if there are gaps in "
|
||||||
|
"the labels across languages"
|
||||||
|
)
|
||||||
|
|
||||||
|
# parameter optimization?
|
||||||
|
if self.parameters:
|
||||||
|
print("debug: optimizing parameters:", self.parameters)
|
||||||
|
self.model = GridSearchCV(
|
||||||
|
self.model,
|
||||||
|
param_grid=self.parameters,
|
||||||
|
refit=True,
|
||||||
|
cv=5,
|
||||||
|
n_jobs=self.n_jobs,
|
||||||
|
error_score=0,
|
||||||
|
verbose=10,
|
||||||
|
)
|
||||||
|
|
||||||
|
# print(f"-- Fitting learner on matrices X={X.shape} Y={y.shape}")
|
||||||
|
|
||||||
|
self.model.fit(X, y)
|
||||||
|
if isinstance(self.model, GridSearchCV):
|
||||||
|
self.best_params_ = self.model.best_params_
|
||||||
|
print("best parameters: ", self.best_params_)
|
||||||
|
self.time = time.time() - tinit
|
||||||
|
return self
|
||||||
|
|
||||||
|
def decision_function(self, X):
|
||||||
|
assert self.model is not None, "predict called before fit"
|
||||||
|
_sort_if_sparse(X)
|
||||||
|
return self.model.decision_function(X)
|
||||||
|
|
||||||
|
def predict_proba(self, X):
|
||||||
|
assert self.model is not None, "predict called before fit"
|
||||||
|
assert hasattr(
|
||||||
|
self.model, "predict_proba"
|
||||||
|
), "the probability predictions are not enabled in this model"
|
||||||
|
_sort_if_sparse(X)
|
||||||
|
return self.model.predict_proba(X)
|
||||||
|
|
||||||
|
def predict(self, X):
|
||||||
|
assert self.model is not None, "predict called before fit"
|
||||||
|
_sort_if_sparse(X)
|
||||||
|
return self.model.predict(X)
|
||||||
|
|
||||||
|
def best_params(self):
|
||||||
|
raise NotImplementedError
|
||||||
|
|
||||||
|
|
||||||
|
class NaivePolylingualClassifier:
|
||||||
|
"""
|
||||||
|
A mere set of independent MonolingualClassifiers, one per language.
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(self, base_learner, parameters=None, n_jobs=-1):
|
||||||
|
self.base_learner = base_learner
|
||||||
|
self.parameters = parameters
|
||||||
|
self.model = None
|
||||||
|
self.n_jobs = n_jobs
|
||||||
|
|
||||||
|
def fit(self, lX, ly):
|
||||||
|
"""
|
||||||
|
trains the independent monolingual classifiers
|
||||||
|
:param lX: a dictionary {language_label: X csr-matrix}
|
||||||
|
:param ly: a dictionary {language_label: y np.array}
|
||||||
|
:return: self
|
||||||
|
"""
|
||||||
|
tinit = time.time()
|
||||||
|
assert set(lX.keys()) == set(ly.keys()), "inconsistent language mappings in fit"
|
||||||
|
langs = list(lX.keys())
|
||||||
|
for lang in langs:
|
||||||
|
_sort_if_sparse(lX[lang])
|
||||||
|
|
||||||
|
models = Parallel(n_jobs=self.n_jobs)(
|
||||||
|
delayed(
|
||||||
|
MonolingualClassifier(self.base_learner, parameters=self.parameters).fit
|
||||||
|
)((lX[lang]), ly[lang])
|
||||||
|
for lang in langs
|
||||||
|
)
|
||||||
|
|
||||||
|
self.model = {lang: models[i] for i, lang in enumerate(langs)}
|
||||||
|
self.empty_categories = {
|
||||||
|
lang: self.model[lang].empty_categories for lang in langs
|
||||||
|
}
|
||||||
|
self.time = time.time() - tinit
|
||||||
|
return self
|
||||||
|
|
||||||
|
def decision_function(self, lX):
|
||||||
|
"""
|
||||||
|
:param lX: a dictionary {language_label: X csr-matrix}
|
||||||
|
:return: a dictionary of classification scores for each class
|
||||||
|
"""
|
||||||
|
assert self.model is not None, "predict called before fit"
|
||||||
|
assert set(lX.keys()).issubset(
|
||||||
|
set(self.model.keys())
|
||||||
|
), "unknown languages requested in decision function"
|
||||||
|
langs = list(lX.keys())
|
||||||
|
scores = Parallel(n_jobs=self.n_jobs)(
|
||||||
|
delayed(self.model[lang].decision_function)(lX[lang]) for lang in langs
|
||||||
|
)
|
||||||
|
return {lang: scores[i] for i, lang in enumerate(langs)}
|
||||||
|
|
||||||
|
def predict_proba(self, lX):
|
||||||
|
"""
|
||||||
|
:param lX: a dictionary {language_label: X csr-matrix}
|
||||||
|
:return: a dictionary of probabilities that each document belongs to each class
|
||||||
|
"""
|
||||||
|
assert self.model is not None, "predict called before fit"
|
||||||
|
assert set(lX.keys()).issubset(
|
||||||
|
set(self.model.keys())
|
||||||
|
), "unknown languages requested in decision function"
|
||||||
|
langs = list(lX.keys())
|
||||||
|
scores = Parallel(n_jobs=self.n_jobs, max_nbytes=None)(
|
||||||
|
delayed(self.model[lang].predict_proba)(lX[lang]) for lang in langs
|
||||||
|
)
|
||||||
|
return {lang: scores[i] for i, lang in enumerate(langs)}
|
||||||
|
|
||||||
|
def predict(self, lX):
|
||||||
|
"""
|
||||||
|
:param lX: a dictionary {language_label: X csr-matrix}
|
||||||
|
:return: a dictionary of predictions
|
||||||
|
"""
|
||||||
|
assert self.model is not None, "predict called before fit"
|
||||||
|
assert set(lX.keys()).issubset(
|
||||||
|
set(self.model.keys())
|
||||||
|
), "unknown languages requested in predict"
|
||||||
|
if self.n_jobs == 1:
|
||||||
|
return {lang: self.model[lang].predict(lX[lang]) for lang in lX.keys()}
|
||||||
|
else:
|
||||||
|
langs = list(lX.keys())
|
||||||
|
scores = Parallel(n_jobs=self.n_jobs)(
|
||||||
|
delayed(self.model[lang].predict)(lX[lang]) for lang in langs
|
||||||
|
)
|
||||||
|
return {lang: scores[i] for i, lang in enumerate(langs)}
|
||||||
|
|
||||||
|
def best_params(self):
|
||||||
|
return {lang: model.best_params() for lang, model in self.model.items()}
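# Illustrative usage sketch (not part of the original commit): the classifier is
# fit on dictionaries {language: X} / {language: Y}, with Y a binary indicator
# matrix (one column per class), as required by the OneVsRest wrapping inside
# MonolingualClassifier. Toy data below is made up.
def _demo_naive_polylingual():
    rng = np.random.default_rng(0)
    lX = {lang: rng.random((30, 10)) for lang in ("en", "it")}
    lY = {lang: np.tile(np.eye(3, dtype=int), (10, 1)) for lang in ("en", "it")}
    clf = NaivePolylingualClassifier(base_learner=get_learner(calibrate=True), n_jobs=1)
    clf.fit(lX, lY)
    lP = clf.predict_proba(lX)
    assert lP["en"].shape == (30, 3)  # per-language posterior probabilities
    return lP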
|
||||||
|
|
||||||
|
|
||||||
|
class MetaClassifier:
|
||||||
|
def __init__(
|
||||||
|
self,
|
||||||
|
meta_learner,
|
||||||
|
meta_parameters=None,
|
||||||
|
n_jobs=-1,
|
||||||
|
standardize_range=None,
|
||||||
|
verbose=True,
|
||||||
|
):
|
||||||
|
self.n_jobs = n_jobs
|
||||||
|
self.model = MonolingualClassifier(
|
||||||
|
base_learner=meta_learner, parameters=meta_parameters, n_jobs=self.n_jobs
|
||||||
|
)
|
||||||
|
self.standardize_range = standardize_range
|
||||||
|
self.verbose = verbose
|
||||||
|
|
||||||
|
def fit(self, lZ, lY):
|
||||||
|
tinit = time.time()
|
||||||
|
Z, y = self.stack(lZ, lY)
|
||||||
|
|
||||||
|
self.standardizer = StandardizeTransformer(range=self.standardize_range)
|
||||||
|
Z = self.standardizer.fit_transform(Z)
|
||||||
|
|
||||||
|
if self.verbose:
|
||||||
|
print(f"- fitting the metaclassifier on data shape: {Z.shape}")
|
||||||
|
self.model.fit(Z, y)
|
||||||
|
self.time = time.time() - tinit
|
||||||
|
|
||||||
|
def stack(self, lZ, lY=None):
|
||||||
|
langs = list(lZ.keys())
|
||||||
|
Z = np.vstack([lZ[lang] for lang in langs])
|
||||||
|
if lY is not None:
|
||||||
|
y = np.vstack([lY[lang] for lang in langs])
|
||||||
|
return Z, y
|
||||||
|
else:
|
||||||
|
return Z
|
||||||
|
|
||||||
|
# def stack(self, lZ, lY=None):
|
||||||
|
# X_stacked = np.vstack(list(lZ.values()))
|
||||||
|
# if lY is not None:
|
||||||
|
# Y_stacked = np.vstack(list(lY.values()))
|
||||||
|
# return X_stacked, Y_stacked
|
||||||
|
# else:
|
||||||
|
# return X_stacked
|
||||||
|
|
||||||
|
def predict(self, lZ):
|
||||||
|
lZ = _joblib_transform_multiling(
|
||||||
|
self.standardizer.transform, lZ, n_jobs=self.n_jobs
|
||||||
|
)
|
||||||
|
return _joblib_transform_multiling(self.model.predict, lZ, n_jobs=self.n_jobs)
|
||||||
|
|
||||||
|
def predict_proba(self, lZ):
|
||||||
|
lZ = _joblib_transform_multiling(
|
||||||
|
self.standardizer.transform, lZ, n_jobs=self.n_jobs
|
||||||
|
)
|
||||||
|
return _joblib_transform_multiling(
|
||||||
|
self.model.predict_proba, lZ, n_jobs=self.n_jobs
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
class StandardizeTransformer:
|
||||||
|
def __init__(self, axis=0, range=None):
|
||||||
|
"""
|
||||||
|
|
||||||
|
:param axis: int, axis along which mean and standard deviation are computed (0, i.e., per column/feature)
|
||||||
|
:param range: None or slice; if a slice is given, only the selected block of columns is standardized, the others are left untouched
|
||||||
|
"""
|
||||||
|
assert range is None or isinstance(
|
||||||
|
range, slice
|
||||||
|
), "wrong format for range, should either be None or a slice"
|
||||||
|
self.axis = axis
|
||||||
|
self.yetfit = False
|
||||||
|
self.range = range
|
||||||
|
|
||||||
|
def fit(self, X):
|
||||||
|
# print("Applying z-score standardization...")
|
||||||
|
std = np.std(X, axis=self.axis, ddof=1)
|
||||||
|
self.std = np.clip(std, 1e-5, None)
|
||||||
|
self.mean = np.mean(X, axis=self.axis)
|
||||||
|
if self.range is not None:
|
||||||
|
ones = np.ones_like(self.std)
|
||||||
|
zeros = np.zeros_like(self.mean)
|
||||||
|
ones[self.range] = self.std[self.range]
|
||||||
|
zeros[self.range] = self.mean[self.range]
|
||||||
|
self.std = ones
|
||||||
|
self.mean = zeros
|
||||||
|
self.yetfit = True
|
||||||
|
return self
|
||||||
|
|
||||||
|
def transform(self, X):
|
||||||
|
if not self.yetfit:
|
||||||
|
"transform called before fit"
|
||||||
|
return (X - self.mean) / self.std
|
||||||
|
|
||||||
|
def fit_transform(self, X):
|
||||||
|
return self.fit(X).transform(X)
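# Illustrative usage sketch (not part of the original commit): when a slice is
# passed as `range`, only that block of columns is z-scored; the remaining
# columns are passed through unchanged. Toy data below is made up.
def _demo_standardize_range():
    rng = np.random.default_rng(0)
    Z = rng.random((20, 6))
    Zt = StandardizeTransformer(range=slice(0, 3)).fit_transform(Z)
    assert np.allclose(Zt[:, 3:], Z[:, 3:])  # columns outside the slice are untouched
    return Zt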
|
||||||
|
|
||||||
|
|
||||||
|
class FeatureSet2Posteriors:
|
||||||
|
"""
|
||||||
|
Recasts the features produced by the embedders into vectors of posterior probabilities by means of
|
||||||
|
a multiclass SVM.
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(self, verbose=True, l2=True, n_jobs=-1):
|
||||||
|
"""
|
||||||
|
Init the class.
|
||||||
|
:param verbose: bool, whether to print status information while fitting.
|
||||||
|
:param l2: bool, whether to apply or not L2 normalization to the projection
|
||||||
|
:param n_jobs: int, number of concurrent workers.
|
||||||
|
"""
|
||||||
|
# self.embedder = embedder
|
||||||
|
self.l2 = l2
|
||||||
|
self.n_jobs = n_jobs
|
||||||
|
self.prob_classifier = MetaClassifier(
|
||||||
|
SVC(
|
||||||
|
kernel="rbf",
|
||||||
|
gamma="auto",
|
||||||
|
probability=True,
|
||||||
|
cache_size=1000,
|
||||||
|
random_state=1,
|
||||||
|
),
|
||||||
|
n_jobs=n_jobs,
|
||||||
|
verbose=verbose,
|
||||||
|
)
|
||||||
|
|
||||||
|
def fit(self, lX, lY):
|
||||||
|
self.prob_classifier.fit(lX, lY)
|
||||||
|
return self
|
||||||
|
|
||||||
|
def transform(self, lX):
|
||||||
|
lP = self.predict_proba(lX)
|
||||||
|
lP = _normalize(lP, self.l2)
|
||||||
|
return lP
|
||||||
|
|
||||||
|
def fit_transform(self, lX, lY):
|
||||||
|
return self.fit(lX, lY).transform(lX)
|
||||||
|
|
||||||
|
def predict(self, lX):
|
||||||
|
return self.prob_classifier.predict(lX)
|
||||||
|
|
||||||
|
def predict_proba(self, lX):
|
||||||
|
return self.prob_classifier.predict_proba(lX)
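# Illustrative usage sketch (not part of the original commit): dense language-wise
# feature matrices are recast into L2-normalized posterior probabilities. The
# toy features and the indicator label matrices below are made up.
def _demo_featureset2posteriors():
    rng = np.random.default_rng(0)
    lX = {lang: rng.random((30, 8)) for lang in ("en", "it")}
    lY = {lang: np.tile(np.eye(3, dtype=int), (10, 1)) for lang in ("en", "it")}
    f2p = FeatureSet2Posteriors(verbose=False, n_jobs=1).fit(lX, lY)
    lP = f2p.transform(lX)
    assert lP["en"].shape == (30, 3)  # one posterior per class and document
    return lP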
|
||||||
|
|
||||||
|
|
||||||
|
def _normalize(lX, l2=True):
|
||||||
|
return {lang: normalize(np.asarray(X)) for lang, X in lX.items()} if l2 else lX
|
|
@@ -0,0 +1,176 @@
|
||||||
|
from os.path import expanduser, join
|
||||||
|
|
||||||
|
import torch
|
||||||
|
import numpy as np
|
||||||
|
from torchtext.vocab import Vectors
|
||||||
|
from joblib import Parallel, delayed
|
||||||
|
from vgfs.viewGen import ViewGen
|
||||||
|
from vgfs.commons import _normalize, XdotM
|
||||||
|
from vgfs.learners.svms import FeatureSet2Posteriors
|
||||||
|
|
||||||
|
|
||||||
|
class MultilingualGen(ViewGen):
|
||||||
|
def __init__(
|
||||||
|
self,
|
||||||
|
cached=False,
|
||||||
|
langs=["en", "it"],
|
||||||
|
embed_dir="~/embeddings",
|
||||||
|
n_jobs=-1,
|
||||||
|
probabilistic=False,
|
||||||
|
):
|
||||||
|
print("- init Multilingual View Generating Function")
|
||||||
|
self.embed_dir = embed_dir
|
||||||
|
self.langs = langs
|
||||||
|
self.n_jobs = n_jobs
|
||||||
|
self.cached = cached
|
||||||
|
self.vectorizer = None
|
||||||
|
self.sif = True
|
||||||
|
self.probabilistic = probabilistic
|
||||||
|
self.fitted = False
|
||||||
|
self._init()
|
||||||
|
|
||||||
|
def _init(self):
|
||||||
|
if self.probabilistic:
|
||||||
|
self.feature2posterior_projector = FeatureSet2Posteriors(
|
||||||
|
n_jobs=self.n_jobs, verbose=False
|
||||||
|
)
|
||||||
|
|
||||||
|
def fit(self, lX, lY):
|
||||||
|
"""
|
||||||
|
Fitting the Multilingual View Generating Function consists of
|
||||||
|
building/extracting the word embedding matrix for
|
||||||
|
each language;
|
||||||
|
"""
|
||||||
|
print("- fitting Multilingual View Generating Function")
|
||||||
|
self.l_vocab = self.vectorizer.vocabulary()
|
||||||
|
self.multi_embeddings, self.langs = self._load_embeddings(
|
||||||
|
self.embed_dir, self.cached
|
||||||
|
)
|
||||||
|
|
||||||
|
if self.probabilistic:
|
||||||
|
self.feature2posterior_projector.fit(self.transform(lX), lY)
|
||||||
|
|
||||||
|
self.fitted = True
|
||||||
|
|
||||||
|
return self
|
||||||
|
|
||||||
|
def transform(self, lX):
|
||||||
|
lX = self.vectorizer.transform(lX)
|
||||||
|
|
||||||
|
XdotMulti = Parallel(n_jobs=self.n_jobs)(
|
||||||
|
delayed(XdotM)(lX[lang], self.multi_embeddings[lang], sif=self.sif)
|
||||||
|
for lang in self.langs
|
||||||
|
)
|
||||||
|
lZ = {lang: XdotMulti[i] for i, lang in enumerate(self.langs)}
|
||||||
|
lZ = _normalize(lZ, l2=True)
|
||||||
|
if self.probabilistic and self.fitted:
|
||||||
|
lZ = self.feature2posterior_projector.transform(lZ)
|
||||||
|
return lZ
|
||||||
|
|
||||||
|
def fit_transform(self, lX, lY):
|
||||||
|
return self.fit(lX, lY).transform(lX)
|
||||||
|
|
||||||
|
def _load_embeddings(self, embed_dir, cached):
|
||||||
|
if "muse" in self.embed_dir.lower():
|
||||||
|
multi_embeddings = load_MUSEs(
|
||||||
|
langs=self.langs,
|
||||||
|
l_vocab=self.vectorizer.vocabulary(),
|
||||||
|
dir_path=embed_dir,
|
||||||
|
cached=cached,
|
||||||
|
)
|
||||||
|
else:
    raise NotImplementedError(
        "only MUSE embeddings are currently supported (embed_dir must contain 'muse')"
    )
return multi_embeddings, sorted(multi_embeddings.keys())
|
||||||
|
|
||||||
|
def get_config(self):
|
||||||
|
return {
|
||||||
|
"name": "Multilingual VGF",
|
||||||
|
"embed_dir": self.embed_dir,
|
||||||
|
"langs": self.langs,
|
||||||
|
"n_jobs": self.n_jobs,
|
||||||
|
"cached": self.cached,
|
||||||
|
"sif": self.sif,
|
||||||
|
"probabilistic": self.probabilistic,
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def load_MUSEs(langs, l_vocab, dir_path, cached=False):
|
||||||
|
dir_path = expanduser(dir_path)
|
||||||
|
cached_dir = join(dir_path, "cached")
|
||||||
|
nmax = 50000
|
||||||
|
|
||||||
|
l_embeddings = {}
|
||||||
|
|
||||||
|
for lang in langs:
|
||||||
|
embed_path = f"wiki.multi.{lang}.vec"
|
||||||
|
if cached:
|
||||||
|
l_embeddings[lang] = Vectors(embed_path, cache=cached_dir)
|
||||||
|
print(f"-- Loaded cached {lang} embeddings")
|
||||||
|
else:
|
||||||
|
(
|
||||||
|
_embed_matrix,
|
||||||
|
_,
|
||||||
|
_,
|
||||||
|
) = _load_vec(join(dir_path, embed_path), nmax)
|
||||||
|
l_embeddings[lang] = _embed_matrix
|
||||||
|
print(f"-- Loaded {nmax} {lang} embeddings")
|
||||||
|
|
||||||
|
# print("-- Extracting embeddings")
|
||||||
|
l_embeddings = extract(l_vocab, l_embeddings)
|
||||||
|
|
||||||
|
return l_embeddings
|
||||||
|
|
||||||
|
|
||||||
|
def _load_vec(emb_path, nmax=50000):
|
||||||
|
import io
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
|
vectors = []
|
||||||
|
word2id = {}
|
||||||
|
with io.open(emb_path, "r", encoding="utf-8", newline="\n", errors="ignore") as f:
|
||||||
|
next(f)
|
||||||
|
for i, line in enumerate(f):
|
||||||
|
word, vect = line.rstrip().split(" ", 1)
|
||||||
|
vect = np.fromstring(vect, sep=" ")
|
||||||
|
assert word not in word2id, "word found twice"
|
||||||
|
vectors.append(vect)
|
||||||
|
word2id[word] = len(word2id)
|
||||||
|
if len(word2id) == nmax:
|
||||||
|
break
|
||||||
|
id2word = {v: k for k, v in word2id.items()}
|
||||||
|
embeddings = np.vstack(vectors)
|
||||||
|
return embeddings, id2word, word2id
|
||||||
|
|
||||||
|
|
||||||
|
def extract(l_voc, l_embeddings):
|
||||||
|
"""
|
||||||
|
Reindex the loaded pretrained embeddings in order to match the indexes
|
||||||
|
assigned by scikit vectorizer. Such indexes are consistent with
|
||||||
|
those used by Word Class Embeddings (since we deploy the same vectorizer)
|
||||||
|
:param l_voc: dict {lang : {word : id}}
|
||||||
|
:return: torch embedding matrix of extracted embeddings i.e., words in lVoc
|
||||||
|
"""
|
||||||
|
l_extracted = {}
|
||||||
|
for lang, words in l_voc.items():
|
||||||
|
source_id, target_id = reindex(words, l_embeddings[lang].stoi)
|
||||||
|
extraction = torch.zeros((len(words), l_embeddings[lang].vectors.shape[-1]))
|
||||||
|
extraction[source_id] = l_embeddings[lang].vectors[target_id]
|
||||||
|
l_extracted[lang] = extraction
|
||||||
|
return l_extracted
|
||||||
|
|
||||||
|
|
||||||
|
def reindex(vectorizer_words, pretrained_word2index):
|
||||||
|
if isinstance(vectorizer_words, dict):
|
||||||
|
vectorizer_words = list(
|
||||||
|
zip(*sorted(vectorizer_words.items(), key=lambda x: x[1]))
|
||||||
|
)[0]
|
||||||
|
|
||||||
|
source_idx, target_idx = [], []
|
||||||
|
for i, word in enumerate(vectorizer_words):
|
||||||
|
if word not in pretrained_word2index:
|
||||||
|
continue
|
||||||
|
j = pretrained_word2index[word]
|
||||||
|
source_idx.append(i)
|
||||||
|
target_idx.append(j)
|
||||||
|
source_idx = np.asarray(source_idx)
|
||||||
|
target_idx = np.asarray(target_idx)
|
||||||
|
return source_idx, target_idx
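# Illustrative usage sketch (not part of the original commit): reindex maps the
# vectorizer's vocabulary indices onto the row indices of a pretrained embedding
# vocabulary, silently skipping out-of-vocabulary words. Toy vocabularies below
# are made up.
def _demo_reindex():
    vectorizer_vocab = {"cat": 0, "dog": 1, "unicorn": 2}
    pretrained_word2index = {"dog": 10, "cat": 42}
    source_idx, target_idx = reindex(vectorizer_vocab, pretrained_word2index)
    # "unicorn" has no pretrained vector, hence it is skipped
    assert source_idx.tolist() == [0, 1] and target_idx.tolist() == [42, 10]
    return source_idx, target_idx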
|
|
@@ -0,0 +1,390 @@
|
||||||
|
import os
|
||||||
|
|
||||||
|
os.environ["TOKENIZERS_PARALLELISM"] = "true"
|
||||||
|
|
||||||
|
from collections import defaultdict
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
import torch
|
||||||
|
import transformers
|
||||||
|
from sklearn.model_selection import train_test_split
|
||||||
|
from torch.optim import AdamW
|
||||||
|
from torch.utils.data import DataLoader, Dataset
|
||||||
|
from transformers import AutoModelForSequenceClassification, AutoTokenizer
|
||||||
|
from vgfs.learners.svms import FeatureSet2Posteriors
|
||||||
|
|
||||||
|
from evaluation.evaluate import evaluate, log_eval
|
||||||
|
|
||||||
|
transformers.logging.set_verbosity_error()
|
||||||
|
|
||||||
|
|
||||||
|
# TODO: early stopping, checkpointing, logging, model loading
|
||||||
|
# TODO: experiment name
|
||||||
|
|
||||||
|
|
||||||
|
class TransformerGen:
|
||||||
|
def __init__(
|
||||||
|
self,
|
||||||
|
model_name,
|
||||||
|
epochs=10,
|
||||||
|
lr=1e-5,
|
||||||
|
batch_size=4,
|
||||||
|
max_length=512,
|
||||||
|
print_steps=50,
|
||||||
|
device="cpu",
|
||||||
|
probabilistic=False,
|
||||||
|
n_jobs=-1,
|
||||||
|
evaluate_step=10,
|
||||||
|
verbose=False,
|
||||||
|
patience=5,
|
||||||
|
):
|
||||||
|
self.model_name = model_name
|
||||||
|
self.device = device
|
||||||
|
self.model = None
|
||||||
|
self.lr = lr
|
||||||
|
self.epochs = epochs
|
||||||
|
self.tokenizer = None
|
||||||
|
self.max_length = max_length
|
||||||
|
self.batch_size = batch_size
|
||||||
|
self.print_steps = print_steps
|
||||||
|
self.probabilistic = probabilistic
|
||||||
|
self.n_jobs = n_jobs
|
||||||
|
self.fitted = False
|
||||||
|
self.datasets = {}
|
||||||
|
self.evaluate_step = evaluate_step
|
||||||
|
self.verbose = verbose
|
||||||
|
self.patience = patience
|
||||||
|
self._init()
|
||||||
|
|
||||||
|
def _init(self):
|
||||||
|
if self.probabilistic:
|
||||||
|
self.feature2posterior_projector = FeatureSet2Posteriors(
|
||||||
|
n_jobs=self.n_jobs, verbose=False
|
||||||
|
)
|
||||||
|
self.model_name = self._get_model_name(self.model_name)
|
||||||
|
print(
|
||||||
|
f"- init TransformerModel model_name: {self.model_name}, device: {self.device}]"
|
||||||
|
)
|
||||||
|
|
||||||
|
def _get_model_name(self, name):
|
||||||
|
if "bert" == name:
|
||||||
|
name_model = "bert-base-uncased"
|
||||||
|
elif "mbert" == name:
|
||||||
|
name_model = "bert-base-multilingual-uncased"
|
||||||
|
elif "xlm" == name:
|
||||||
|
name_model = "xlm-roberta-base"
|
||||||
|
else:
|
||||||
|
raise NotImplementedError
|
||||||
|
return name_model
|
||||||
|
|
||||||
|
def load_pretrained_model(self, model_name, num_labels):
|
||||||
|
return AutoModelForSequenceClassification.from_pretrained(
|
||||||
|
model_name, num_labels=num_labels, output_hidden_states=True
|
||||||
|
)
|
||||||
|
|
||||||
|
def load_tokenizer(self, model_name):
|
||||||
|
return AutoTokenizer.from_pretrained(model_name)
|
||||||
|
|
||||||
|
def init_model(self, model_name, num_labels):
|
||||||
|
return self.load_pretrained_model(model_name, num_labels), self.load_tokenizer(
|
||||||
|
model_name
|
||||||
|
)
|
||||||
|
|
||||||
|
def get_train_val_data(self, lX, lY, split=0.2, seed=42):
|
||||||
|
tr_lX, tr_lY, val_lX, val_lY = {}, {}, {}, {}
|
||||||
|
|
||||||
|
for lang in lX.keys():
|
||||||
|
tr_X, val_X, tr_Y, val_Y = train_test_split(
|
||||||
|
lX[lang], lY[lang], test_size=split, random_state=seed, shuffle=False
|
||||||
|
)
|
||||||
|
tr_lX[lang] = tr_X
|
||||||
|
tr_lY[lang] = tr_Y
|
||||||
|
val_lX[lang] = val_X
|
||||||
|
val_lY[lang] = val_Y
|
||||||
|
|
||||||
|
return tr_lX, tr_lY, val_lX, val_lY
|
||||||
|
|
||||||
|
def build_dataloader(self, lX, lY, batch_size, split="train", shuffle=True):
|
||||||
|
l_tokenized = {lang: self._tokenize(data) for lang, data in lX.items()}
|
||||||
|
self.datasets[split] = MultilingualDatasetTorch(l_tokenized, lY, split=split)
|
||||||
|
return DataLoader(self.datasets[split], batch_size=batch_size, shuffle=shuffle)
|
||||||
|
|
||||||
|
def _tokenize(self, X):
|
||||||
|
return self.tokenizer(
|
||||||
|
X,
|
||||||
|
return_tensors="pt",
|
||||||
|
padding="max_length",
|
||||||
|
truncation=True,
|
||||||
|
max_length=self.max_length,
|
||||||
|
)
|
||||||
|
|
||||||
|
def fit(self, lX, lY):
|
||||||
|
if self.fitted:
|
||||||
|
return self
|
||||||
|
print("- fitting Transformer View Generating Function")
|
||||||
|
_l = list(lX.keys())[0]
|
||||||
|
self.num_labels = lY[_l].shape[-1]
|
||||||
|
self.model, self.tokenizer = self.init_model(
|
||||||
|
self.model_name, num_labels=self.num_labels
|
||||||
|
)
|
||||||
|
|
||||||
|
tr_lX, tr_lY, val_lX, val_lY = self.get_train_val_data(
|
||||||
|
lX, lY, split=0.2, seed=42
|
||||||
|
)
|
||||||
|
|
||||||
|
tra_dataloader = self.build_dataloader(
|
||||||
|
tr_lX, tr_lY, self.batch_size, split="train", shuffle=True
|
||||||
|
)
|
||||||
|
|
||||||
|
val_dataloader = self.build_dataloader(
|
||||||
|
val_lX, val_lY, self.batch_size, split="val", shuffle=False
|
||||||
|
)
|
||||||
|
|
||||||
|
experiment_name = f"{self.model_name}-{self.epochs}-{self.batch_size}" # TODO: add more params
|
||||||
|
trainer = Trainer(
|
||||||
|
model=self.model,
|
||||||
|
optimizer_name="adamW",
|
||||||
|
lr=self.lr,
|
||||||
|
device=self.device,
|
||||||
|
loss_fn=torch.nn.CrossEntropyLoss(),
|
||||||
|
print_steps=self.print_steps,
|
||||||
|
evaluate_step=self.evaluate_step,
|
||||||
|
patience=self.patience,
|
||||||
|
experiment_name=experiment_name,
|
||||||
|
)
|
||||||
|
trainer.train(
|
||||||
|
train_dataloader=tra_dataloader,
|
||||||
|
eval_dataloader=val_dataloader,
|
||||||
|
epochs=self.epochs,
|
||||||
|
)
|
||||||
|
|
||||||
|
if self.probabilistic:
|
||||||
|
self.feature2posterior_projector.fit(self.transform(lX), lY)
|
||||||
|
|
||||||
|
self.fitted = True
|
||||||
|
|
||||||
|
# self.save_vgf(path="models/vgf/transformers/")
|
||||||
|
|
||||||
|
return self
|
||||||
|
|
||||||
|
def transform(self, lX):
|
||||||
|
_embeds = []
|
||||||
|
l_embeds = defaultdict(list)
|
||||||
|
|
||||||
|
dataloader = self.build_dataloader(
|
||||||
|
lX, lY=None, batch_size=self.batch_size, split="whole", shuffle=False
|
||||||
|
)
|
||||||
|
|
||||||
|
self.model.eval()
|
||||||
|
with torch.no_grad():
|
||||||
|
for input_ids, lang in dataloader:
|
||||||
|
input_ids = input_ids.to(self.device)
|
||||||
|
out = self.model(input_ids).hidden_states[-1]
|
||||||
|
batch_embeddings = out[:, 0, :].cpu().numpy()
|
||||||
|
_embeds.append((batch_embeddings, lang))
|
||||||
|
|
||||||
|
for embed, lang in _embeds:
|
||||||
|
for sample_embed, sample_lang in zip(embed, lang):
|
||||||
|
l_embeds[sample_lang].append(sample_embed)
|
||||||
|
|
||||||
|
if self.probabilistic and self.fitted:
|
||||||
|
l_embeds = self.feature2posterior_projector.transform(l_embeds)
|
||||||
|
|
||||||
|
return l_embeds
|
||||||
|
|
||||||
|
def fit_transform(self, lX, lY):
|
||||||
|
return self.fit(lX, lY).transform(lX)
|
||||||
|
|
||||||
|
def save_vgf(self, path):
|
||||||
|
print(f"- saving Transformer View Generating Function to {path}")
|
||||||
|
return
|
||||||
|
|
||||||
|
def get_config(self):
|
||||||
|
return {
|
||||||
|
"name": "Transformer VGF",
|
||||||
|
"model_name": self.model_name,
|
||||||
|
"max_length": self.max_length,
|
||||||
|
"batch_size": self.batch_size,
|
||||||
|
"lr": self.lr,
|
||||||
|
"epochs": self.epochs,
|
||||||
|
"device": self.device,
|
||||||
|
"print_steps": self.print_steps,
|
||||||
|
"evaluate_step": self.evaluate_step,
|
||||||
|
"patience": self.patience,
|
||||||
|
"probabilistic": self.probabilistic,
|
||||||
|
}
|
||||||
|
|
||||||
|
class MultilingualDatasetTorch(Dataset):
|
||||||
|
def __init__(self, lX, lY, split="train"):
|
||||||
|
self.lX = lX
|
||||||
|
self.lY = lY
|
||||||
|
self.split = split
|
||||||
|
self.langs = []
|
||||||
|
self.init()
|
||||||
|
|
||||||
|
def init(self):
|
||||||
|
self.X = torch.vstack([data.input_ids for data in self.lX.values()])
|
||||||
|
if self.split != "whole":
|
||||||
|
self.Y = torch.vstack([torch.Tensor(data) for data in self.lY.values()])
|
||||||
|
self.langs = sum(
|
||||||
|
[
|
||||||
|
v
|
||||||
|
for v in {
|
||||||
|
lang: [lang] * len(data.input_ids) for lang, data in self.lX.items()
|
||||||
|
}.values()
|
||||||
|
],
|
||||||
|
[],
|
||||||
|
)
|
||||||
|
|
||||||
|
return self
|
||||||
|
|
||||||
|
def __len__(self):
|
||||||
|
return len(self.X)
|
||||||
|
|
||||||
|
def __getitem__(self, index):
|
||||||
|
if self.split == "whole":
|
||||||
|
return self.X[index], self.langs[index]
|
||||||
|
return self.X[index], self.Y[index], self.langs[index]
|
||||||
|
|
||||||
|
|
||||||
|
class Trainer:
|
||||||
|
def __init__(
|
||||||
|
self,
|
||||||
|
model,
|
||||||
|
optimizer_name,
|
||||||
|
device,
|
||||||
|
loss_fn,
|
||||||
|
lr,
|
||||||
|
print_steps,
|
||||||
|
evaluate_step,
|
||||||
|
patience,
|
||||||
|
experiment_name,
|
||||||
|
):
|
||||||
|
self.device = device
|
||||||
|
self.model = model.to(device)
|
||||||
|
self.optimizer = self.init_optimizer(optimizer_name, lr)
|
||||||
|
self.evaluate_steps = evaluate_step
|
||||||
|
self.loss_fn = loss_fn.to(device)
|
||||||
|
self.print_steps = print_steps
|
||||||
|
self.earlystopping = EarlyStopping(
|
||||||
|
patience=patience,
|
||||||
|
checkpoint_path="models/vgfs/transformers/",
|
||||||
|
verbose=True,
|
||||||
|
experiment_name=experiment_name,
|
||||||
|
)
|
||||||
|
|
||||||
|
def init_optimizer(self, optimizer_name, lr):
|
||||||
|
if optimizer_name.lower() == "adamw":
|
||||||
|
return AdamW(self.model.parameters(), lr=lr)
|
||||||
|
else:
|
||||||
|
raise ValueError(f"Optimizer {optimizer_name} not supported")
|
||||||
|
|
||||||
|
def train(self, train_dataloader, eval_dataloader, epochs=10):
|
||||||
|
print(
|
||||||
|
f"""- Training params:
|
||||||
|
- epochs: {epochs}
|
||||||
|
- learning rate: {self.optimizer.defaults['lr']}
|
||||||
|
- train batch size: {train_dataloader.batch_size}
|
||||||
|
- eval batch size: {'TODO'}
|
||||||
|
- max len: {train_dataloader.dataset.X.shape[-1]}\n""",
|
||||||
|
)
|
||||||
|
for epoch in range(epochs):
|
||||||
|
self.train_epoch(train_dataloader, epoch)
|
||||||
|
if (epoch + 1) % self.evaluate_steps == 0:
|
||||||
|
metric_watcher = self.evaluate(eval_dataloader)
|
||||||
|
stop = self.earlystopping(metric_watcher, self.model, epoch + 1)
|
||||||
|
if stop:
|
||||||
|
break
|
||||||
|
return self.model
|
||||||
|
|
||||||
|
def train_epoch(self, dataloader, epoch):
|
||||||
|
self.model.train()
|
||||||
|
for b_idx, (x, y, lang) in enumerate(dataloader):
|
||||||
|
self.optimizer.zero_grad()
|
||||||
|
y_hat = self.model(x.to(self.device))
|
||||||
|
loss = self.loss_fn(y_hat.logits, y.to(self.device))
|
||||||
|
loss.backward()
|
||||||
|
self.optimizer.step()
|
||||||
|
if b_idx % self.print_steps == 0:
|
||||||
|
print(f"Epoch: {epoch+1} Step: {b_idx+1} Loss: {loss:.4f}")
|
||||||
|
return self
|
||||||
|
|
||||||
|
def evaluate(self, dataloader):
|
||||||
|
self.model.eval()
|
||||||
|
|
||||||
|
lY = defaultdict(list)
|
||||||
|
lY_hat = defaultdict(list)
|
||||||
|
|
||||||
|
for b_idx, (x, y, lang) in enumerate(dataloader):
|
||||||
|
y_hat = self.model(x.to(self.device))
|
||||||
|
loss = self.loss_fn(y_hat.logits, y.to(self.device))
|
||||||
|
predictions = predict(y_hat.logits, classification_type="multilabel")
|
||||||
|
|
||||||
|
for l, _true, _pred in zip(lang, y, predictions):
|
||||||
|
lY[l].append(_true.detach().cpu().numpy())
|
||||||
|
lY_hat[l].append(_pred)
|
||||||
|
|
||||||
|
for lang in lY:
|
||||||
|
lY[lang] = np.vstack(lY[lang])
|
||||||
|
lY_hat[lang] = np.vstack(lY_hat[lang])
|
||||||
|
|
||||||
|
l_eval = evaluate(lY, lY_hat)
|
||||||
|
average_metrics = log_eval(l_eval, phase="validation")
|
||||||
|
return average_metrics[0] # macro-F1
|
||||||
|
|
||||||
|
|
||||||
|
class EarlyStopping:
|
||||||
|
def __init__(
|
||||||
|
self,
|
||||||
|
patience=5,
|
||||||
|
min_delta=0,
|
||||||
|
verbose=True,
|
||||||
|
checkpoint_path="checkpoint.pt",
|
||||||
|
experiment_name="experiment",
|
||||||
|
):
|
||||||
|
self.patience = patience
|
||||||
|
self.min_delta = min_delta
|
||||||
|
self.counter = 0
|
||||||
|
self.best_score = 0
|
||||||
|
self.best_epoch = None
|
||||||
|
self.verbose = verbose
|
||||||
|
self.checkpoint_path = checkpoint_path
|
||||||
|
self.experiment_name = experiment_name
|
||||||
|
|
||||||
|
def __call__(self, validation, model, epoch):
|
||||||
|
if validation > self.best_score:
|
||||||
|
print(
|
||||||
|
f"- earlystopping: Validation score improved from {self.best_score:.3f} to {validation:.3f}"
|
||||||
|
)
|
||||||
|
self.best_score = validation
|
||||||
|
self.counter = 0
|
||||||
|
# self.save_model(model)
|
||||||
|
elif validation < (self.best_score + self.min_delta):
|
||||||
|
self.counter += 1
|
||||||
|
print(
|
||||||
|
f"- earlystopping: Validation score decreased from {self.best_score:.3f} to {validation:.3f}, current patience: {self.patience - self.counter}"
|
||||||
|
)
|
||||||
|
if self.counter >= self.patience:
|
||||||
|
if self.verbose:
|
||||||
|
print(f"- earlystopping: Early stopping at epoch {epoch}")
|
||||||
|
return True
|
||||||
|
|
||||||
|
def save_model(self, model):
|
||||||
|
_checkpoint_dir = os.path.join(self.checkpoint_path, self.experiment_name)
|
||||||
|
print(f"- saving model to {_checkpoint_dir}")
|
||||||
|
os.makedirs(_checkpoint_dir, exist_ok=True)
|
||||||
|
model.save_pretrained(_checkpoint_dir)
|
||||||
|
|
||||||
|
|
||||||
|
def predict(logits, classification_type="multilabel"):
|
||||||
|
"""
|
||||||
|
Converts soft predictions (logits) to hard predictions in {0, 1}
|
||||||
|
"""
|
||||||
|
if classification_type == "multilabel":
|
||||||
|
prediction = torch.sigmoid(logits) > 0.5
|
||||||
|
elif classification_type == "singlelabel":
|
||||||
|
prediction = torch.argmax(logits, dim=1).view(-1, 1)
|
||||||
|
else:
|
||||||
|
print("unknown classification type")
|
||||||
|
|
||||||
|
return prediction.detach().cpu().numpy()
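# Illustrative usage sketch (not part of the original commit): in the multilabel
# case hard predictions are obtained by thresholding the sigmoid of the logits
# at 0.5.
def _demo_predict():
    logits = torch.tensor([[2.0, -1.0, 0.3], [-0.5, 1.5, -2.0]])
    hard = predict(logits, classification_type="multilabel")
    assert hard.tolist() == [[True, False, True], [False, True, False]]
    return hard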
|
|
@@ -0,0 +1,59 @@
|
||||||
|
from vgfs.viewGen import ViewGen
|
||||||
|
from vgfs.learners.svms import NaivePolylingualClassifier
|
||||||
|
from vgfs.commons import _normalize
|
||||||
|
|
||||||
|
|
||||||
|
class VanillaFunGen(ViewGen):
|
||||||
|
"""
|
||||||
|
View Generator (x): original funnelling architecture proposed by Moreo, Esuli and
|
||||||
|
Sebastiani in DOI: https://doi.org/10.1145/3326065
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(self, base_learner, first_tier_parameters=None, n_jobs=-1):
|
||||||
|
"""
|
||||||
|
Init Posterior Probabilities embedder (i.e., VanillaFunGen)
|
||||||
|
:param base_learner: naive monolingual learners to be deployed as first-tier
|
||||||
|
learners. Should be able to return posterior probabilities.
|
||||||
|
:param first_tier_parameters: parameters for the first-tier learners (e.g., a grid for model selection);
|
||||||
|
:param n_jobs: integer, number of concurrent workers
|
||||||
|
"""
|
||||||
|
print("- init VanillaFun View Generating Function")
|
||||||
|
self.learners = base_learner
|
||||||
|
self.first_tier_parameters = first_tier_parameters
|
||||||
|
self.n_jobs = n_jobs
|
||||||
|
self.doc_projector = NaivePolylingualClassifier(
|
||||||
|
base_learner=self.learners,
|
||||||
|
parameters=self.first_tier_parameters,
|
||||||
|
n_jobs=self.n_jobs,
|
||||||
|
)
|
||||||
|
self.vectorizer = None
|
||||||
|
|
||||||
|
def fit(self, lX, lY):
|
||||||
|
print("- fitting VanillaFun View Generating Function")
|
||||||
|
lX = self.vectorizer.transform(lX)
|
||||||
|
self.doc_projector.fit(lX, lY)
|
||||||
|
return self
|
||||||
|
|
||||||
|
def transform(self, lX):
|
||||||
|
"""
|
||||||
|
(1) Vectorize documents;
|
||||||
|
(2) Project them according to the learners SVMs;
|
||||||
|
(3) Apply L2 normalization to the projection and returns it.
|
||||||
|
:param lX: dict {lang: indexed documents}
|
||||||
|
:return: document projection to the common latent space.
|
||||||
|
"""
|
||||||
|
lX = self.vectorizer.transform(lX)
|
||||||
|
lZ = self.doc_projector.predict_proba(lX)
|
||||||
|
lZ = _normalize(lZ, l2=True)
|
||||||
|
return lZ
|
||||||
|
|
||||||
|
def fit_transform(self, lX, lY):
|
||||||
|
return self.fit(lX, lY).transform(lX)
|
||||||
|
|
||||||
|
def get_config(self):
|
||||||
|
return {
|
||||||
|
"name": "VanillaFunnelling VGF",
|
||||||
|
"base_learner": self.learners,
|
||||||
|
"first_tier_parameters": self.first_tier_parameters,
|
||||||
|
"n_jobs": self.n_jobs,
|
||||||
|
}
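# Illustrative usage sketch (not part of the original commit; names below are
# placeholders). A VanillaFunGen does not vectorize on its own: GeneralizedFunnelling
# assigns it the shared TfidfVectorizerMultilingual via init_vgfs_vectorizers()
# before fitting, e.g.
#
#   vgf = VanillaFunGen(base_learner=get_learner(calibrate=True), n_jobs=1)
#   vgf.vectorizer = shared_tfidf_vectorizer   # a fitted TfidfVectorizerMultilingual
#   lZ = vgf.fit_transform(lX, lY)             # {lang: (n_docs, n_classes) posteriors}
#
# where lX / lY are dictionaries {language: raw documents} / {language: label matrix},
# and get_learner would come from vgfs.learners.svms.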
|
|
@@ -0,0 +1,20 @@
|
||||||
|
from abc import ABC, abstractmethod
|
||||||
|
|
||||||
|
|
||||||
|
class ViewGen(ABC):
|
||||||
|
"""
|
||||||
|
Abstract class for View Generating Functions (VGFs) implementations. Every ViewGen should implement these three methods in order to
|
||||||
|
be seamlessly integrated into the overall architecture.
|
||||||
|
"""
|
||||||
|
|
||||||
|
@abstractmethod
|
||||||
|
def fit(self, lX, lY):
|
||||||
|
pass
|
||||||
|
|
||||||
|
@abstractmethod
|
||||||
|
def transform(self, lX):
|
||||||
|
pass
|
||||||
|
|
||||||
|
@abstractmethod
|
||||||
|
def fit_transform(self, lX, lY):
|
||||||
|
pass
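# Illustrative sketch (not part of the original commit): the smallest possible
# concrete ViewGen, which simply returns its input unchanged. It only serves to
# show the contract every view generating function has to honour.
class IdentityViewGen(ViewGen):
    def fit(self, lX, lY):
        return self

    def transform(self, lX):
        return lX

    def fit_transform(self, lX, lY):
        return self.fit(lX, lY).transform(lX)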
|
|
@@ -0,0 +1,66 @@
|
||||||
|
import numpy as np
|
||||||
|
from joblib import Parallel, delayed
|
||||||
|
from vgfs.commons import XdotM, _normalize
|
||||||
|
from vgfs.viewGen import ViewGen
|
||||||
|
|
||||||
|
|
||||||
|
class WceGen(ViewGen):
|
||||||
|
def __init__(self, n_jobs=-1):
|
||||||
|
print("- init Word-Class-Embeddings View Generating Function")
|
||||||
|
self.n_jobs = n_jobs
|
||||||
|
self.sif = True
|
||||||
|
|
||||||
|
def fit(self, lX, lY):
|
||||||
|
print("- fitting Word-Class-Embeddings View Generating Function")
|
||||||
|
lX = self.vectorizer.transform(lX)
|
||||||
|
self.langs = sorted(lX.keys())
|
||||||
|
wce = Parallel(n_jobs=self.n_jobs)(
|
||||||
|
delayed(wce_matrix)(lX[lang], lY[lang]) for lang in self.langs
|
||||||
|
)
|
||||||
|
self.l_wce = {lang: wce[i] for i, lang in enumerate(self.langs)}
|
||||||
|
return self
|
||||||
|
|
||||||
|
def transform(self, lX):
|
||||||
|
lX = self.vectorizer.transform(lX)
|
||||||
|
XdotWce = Parallel(n_jobs=self.n_jobs)(
|
||||||
|
delayed(XdotM)(lX[lang], self.l_wce[lang], sif=self.sif)
|
||||||
|
for lang in self.langs
|
||||||
|
)
|
||||||
|
lZ = {l: XdotWce[i] for i, l in enumerate(self.langs)}
|
||||||
|
lZ = _normalize(lZ, l2=True)
|
||||||
|
return lZ
|
||||||
|
|
||||||
|
def fit_transform(self, lX, lY):
|
||||||
|
return self.fit(lX, lY).transform(lX)
|
||||||
|
|
||||||
|
def get_config(self):
|
||||||
|
return {
|
||||||
|
"name": "Word-Class Embeddings VGF",
|
||||||
|
"n_jobs": self.n_jobs,
|
||||||
|
"sif": self.sif,
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def wce_matrix(X, Y):
|
||||||
|
wce = supervised_embeddings_tfidf(X, Y)
|
||||||
|
wce = zscores(wce, axis=0)
|
||||||
|
return wce
|
||||||
|
|
||||||
|
|
||||||
|
def supervised_embeddings_tfidf(X, Y):
|
||||||
|
tfidf_norm = X.sum(axis=0)
|
||||||
|
tfidf_norm[tfidf_norm == 0] = 1
|
||||||
|
F = (X.T).dot(Y) / tfidf_norm.T
|
||||||
|
return np.asarray(F)
|
||||||
|
|
||||||
|
|
||||||
|
def zscores(X, axis=0):
|
||||||
|
"""
|
||||||
|
scipy.stats.zscore does not avoid division by 0, which can indeed occur
|
||||||
|
:param X:
|
||||||
|
:param axis:
|
||||||
|
:return:
|
||||||
|
"""
|
||||||
|
std = np.clip(np.std(X, ddof=1, axis=axis), 1e-5, None)
|
||||||
|
mean = np.mean(X, axis=axis)
|
||||||
|
return (X - mean) / std
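# Illustrative usage sketch (not part of the original commit): a word-class
# embedding matrix has one row per vocabulary term and one column per class.
# The toy tf-idf matrix and label matrix below are made up.
def _demo_wce_matrix():
    from scipy.sparse import csr_matrix

    rng = np.random.default_rng(0)
    X = csr_matrix(rng.random((6, 10)))        # 6 documents, 10 tf-idf features
    Y = np.tile(np.eye(3, dtype=int), (2, 1))  # 6 documents, 3 classes
    W = wce_matrix(X, Y)                       # z-scored, shape (n_features, n_classes)
    assert W.shape == (10, 3)
    return W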
|
|
@@ -0,0 +1,128 @@
|
||||||
|
from os.path import expanduser
|
||||||
|
from argparse import ArgumentParser
|
||||||
|
|
||||||
|
from dataManager.multiNewsDataset import MultiNewsDataset
|
||||||
|
from dataManager.amazonDataset import AmazonDataset
|
||||||
|
from dataManager.multilingualDatset import MultilingualDataset
|
||||||
|
|
||||||
|
from gfun.generalizedFunnelling import GeneralizedFunnelling
|
||||||
|
|
||||||
|
from evaluation.evaluate import evaluate, log_eval
|
||||||
|
|
||||||
|
from time import time
|
||||||
|
import pickle
|
||||||
|
|
||||||
|
|
||||||
|
# TODO: a cleaner way to save the model?
|
||||||
|
|
||||||
|
|
||||||
|
def main(args):
|
||||||
|
# Loading dataset ------------------------
|
||||||
|
RCV_DATAPATH = expanduser(
|
||||||
|
"~/datasets/rcv1-2/rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle"
|
||||||
|
)
|
||||||
|
# dataset = MultiNewsDataset(expanduser(args.dataset_path))
|
||||||
|
# dataset = AmazonDataset(domains=args.domains,nrows=args.nrows,min_count=args.min_count,max_labels=args.max_labels)
|
||||||
|
dataset = (
|
||||||
|
MultilingualDataset(dataset_name="rcv1-2")
|
||||||
|
.load(RCV_DATAPATH)
|
||||||
|
.reduce_data(langs=["en", "it", "fr"], maxn=250)
|
||||||
|
)
|
||||||
|
|
||||||
|
if isinstance(dataset, MultilingualDataset):
|
||||||
|
lX, lY = dataset.training()
|
||||||
|
lX_te, lY_te = dataset.test()
|
||||||
|
else:
|
||||||
|
_lX = dataset.dX
|
||||||
|
_lY = dataset.dY
|
||||||
|
# ----------------------------------------
|
||||||
|
|
||||||
|
tinit = time()
|
||||||
|
|
||||||
|
if args.load_pretrained is None:
|
||||||
|
assert any(
|
||||||
|
[
|
||||||
|
args.posteriors,
|
||||||
|
args.wce,
|
||||||
|
args.multilingual,
|
||||||
|
|
||||||
|
args.transformer,
|
||||||
|
]
|
||||||
|
), "At least one of VGF must be True"
|
||||||
|
|
||||||
|
gfun = GeneralizedFunnelling(
|
||||||
|
posterior=args.posteriors,
|
||||||
|
multilingual=args.multilingual,
|
||||||
|
wce=args.wce,
|
||||||
|
transformer=args.transformer,
|
||||||
|
langs=dataset.langs(),
|
||||||
|
embed_dir="~/resources/muse_embeddings",
|
||||||
|
n_jobs=args.n_jobs,
|
||||||
|
max_length=args.max_length,
|
||||||
|
batch_size=args.batch_size,
|
||||||
|
epochs=args.epochs,
|
||||||
|
lr=args.lr,
|
||||||
|
patience=args.patience,
|
||||||
|
evaluate_step=args.evaluate_step,
|
||||||
|
transformer_name=args.transformer_name,
|
||||||
|
)
|
||||||
|
|
||||||
|
gfun.get_config()
|
||||||
|
|
||||||
|
gfun.fit(lX, lY)
|
||||||
|
|
||||||
|
# Saving Model ------------------------
|
||||||
|
with open("models/gfun/gfun_model.pkl", "wb") as f:
|
||||||
|
print(f"- saving model to {f.name}")
|
||||||
|
pickle.dump(gfun, f)
|
||||||
|
# -------------------------------------
|
||||||
|
|
||||||
|
preds = gfun.transform(lX)
|
||||||
|
|
||||||
|
train_eval = evaluate(lY, preds)
|
||||||
|
log_eval(train_eval, phase="train")
|
||||||
|
|
||||||
|
timetr = time()
|
||||||
|
print(f"- training completed in {timetr - tinit:.2f} seconds")
|
||||||
|
|
||||||
|
# Loading Model ------------------------
|
||||||
|
if args.load_pretrained is not None:
|
||||||
|
with open("models/gfun/gfun_model.pkl", "rb") as f:
|
||||||
|
print(f"- loading model from {f.name}")
|
||||||
|
gfun = pickle.load(f)
|
||||||
|
timetr = time()
|
||||||
|
# --------------------------------------
|
||||||
|
|
||||||
|
test_eval = evaluate(lY_te, gfun.transform(lX_te))
|
||||||
|
log_eval(test_eval, phase="test")
|
||||||
|
|
||||||
|
timeval = time()
|
||||||
|
print(f"- testing completed in {timeval - timetr:.2f} seconds")
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
parser = ArgumentParser()
|
||||||
|
parser.add_argument("--load_pretrained", type=str, default=None)
|
||||||
|
# Dataset parameters -------------------
|
||||||
|
parser.add_argument("--domains", type=str, default="all")
|
||||||
|
parser.add_argument("--nrows", type=int, default=10000)
|
||||||
|
parser.add_argument("--min_count", type=int, default=10)
|
||||||
|
parser.add_argument("--max_labels", type=int, default=50)
|
||||||
|
# gFUN parameters ----------------------
|
||||||
|
parser.add_argument("-p", "--posteriors", action="store_true")
|
||||||
|
parser.add_argument("-m", "--multilingual", action="store_true")
|
||||||
|
parser.add_argument("-w", "--wce", action="store_true")
|
||||||
|
parser.add_argument("-t", "--transformer", action="store_true")
|
||||||
|
parser.add_argument("--n_jobs", type=int, default=1)
|
||||||
|
# transformer parameters ---------------
|
||||||
|
parser.add_argument("--transformer_name", type=str, default="mbert")
|
||||||
|
parser.add_argument("--batch_size", type=int, default=32)
|
||||||
|
parser.add_argument("--epochs", type=int, default=10)
|
||||||
|
parser.add_argument("--lr", type=float, default=1e-5)
|
||||||
|
parser.add_argument("--max_length", type=int, default=512)
|
||||||
|
parser.add_argument("--patience", type=int, default=5)
|
||||||
|
parser.add_argument("--evaluate_step", type=int, default=10)
|
||||||
|
|
||||||
|
args = parser.parse_args()
|
||||||
|
|
||||||
|
main(args)
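# Illustrative invocation (not part of the original commit); the script file name
# is an assumption, adapt it to the actual module name:
#
#   python main.py -p -m -w --n_jobs 4 --transformer_name mbert
#
# i.e., run gFUN with the posterior, multilingual and WCE view generating
# functions enabled; add -t to also include the transformer-based VGF.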
|
|
@@ -0,0 +1,60 @@
|
||||||
|
import matplotlib.pyplot as plt
|
||||||
|
import datetime
|
||||||
|
|
||||||
|
|
||||||
|
def plot_distribution(
|
||||||
|
x,
|
||||||
|
y,
|
||||||
|
labels,
|
||||||
|
title,
|
||||||
|
figsize=(10, 5),
|
||||||
|
logscale=False,
|
||||||
|
notes="",
|
||||||
|
max_labels=-1,
|
||||||
|
save=False,
|
||||||
|
path=None,
|
||||||
|
):
|
||||||
|
# sort values and labels accordingly
|
||||||
|
y, labels = zip(*sorted(zip(y, labels), reverse=True))
|
||||||
|
|
||||||
|
if max_labels != -1:
|
||||||
|
x = x[:max_labels]
|
||||||
|
y = y[:max_labels]
|
||||||
|
labels = labels[:max_labels]
|
||||||
|
|
||||||
|
plt.figure(figsize=figsize)
|
||||||
|
plt.bar(x, y)
|
||||||
|
plt.xticks(x, labels, rotation=90)
|
||||||
|
|
||||||
|
if len(notes) != 0:
|
||||||
|
_title = f"{title} - {notes}"
|
||||||
|
if max_labels != -1:
|
||||||
|
_title += f" - Showing {max_labels} top labels"
|
||||||
|
|
||||||
|
plt.title(_title)
|
||||||
|
|
||||||
|
if logscale:
|
||||||
|
plt.yscale("symlog")
|
||||||
|
|
||||||
|
plt.tight_layout()
|
||||||
|
|
||||||
|
# plt.show()
|
||||||
|
if save:
|
||||||
|
now = datetime.datetime.now()
|
||||||
|
path = f"{path}/{title}_{now.strftime('%m%d_%H%M')}.png"
|
||||||
|
plt.savefig(path)
|
||||||
|
plt.close()
|
||||||
|
|
||||||
|
|
||||||
|
def plot_histogram(x, title, figsize=(10, 5), save=False, path=None):
|
||||||
|
plt.figure(figsize=figsize)
|
||||||
|
plt.hist(x)
|
||||||
|
# plt.xticks(x, lables, rotation=90)
|
||||||
|
plt.yscale("symlog")
|
||||||
|
plt.title(title)
|
||||||
|
# plt.show()
|
||||||
|
if save:
|
||||||
|
now = datetime.datetime.now()
|
||||||
|
path = f"{path}/{title}_{now.strftime('%m%d_%H%M')}.png"
|
||||||
|
plt.savefig(path)
|
||||||
|
plt.close()
|