bulk upload after refactoring
commit 6b75483b55
@@ -0,0 +1,179 @@
.vscode/*
!.vscode/settings.json
!.vscode/tasks.json
!.vscode/launch.json
!.vscode/extensions.json
!.vscode/*.code-snippets

# Local History for Visual Studio Code
.history/

# Built Visual Studio Code Extensions
*.vsix

# Byte-compiled / optimized / DLL files
__pycache__/
__pycache__
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
.pybuilder/
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock

# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/#use-with-ide
.pdm.toml

# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# pytype static type analyzer
.pytype/

# Cython debug symbols
cython_debug/

# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/

# user defined
out/*
amazon_cateogories.bu.txt
models/*
@@ -0,0 +1,21 @@
Appliances
Arts Crafts and Sewing
Automotive
CDs and Vinyl
Cell Phones and Accessories
Electronics
Grocery and Gourmet Food
Home and Kitchen
Industrial and Scientific
Luxury Beauty
Magazine Subscriptions
Movies and TV
Musical Instruments
Office Products
Patio Lawn and Garden
Pet Supplies
Software
Sports and Outdoors
Tools and Home Improvement
Toys and Games
Video Games
@@ -0,0 +1,370 @@
import gzip
import os
import re
import warnings
from argparse import ArgumentParser
from collections import Counter

import numpy as np
from bs4 import BeautifulSoup
from sklearn.preprocessing import MultiLabelBinarizer

from plotters.distributions import plot_distribution

# TODO: AmazonDataset should be an instance of MultimodalDataset
warnings.filterwarnings("ignore", category=UserWarning, module="bs4")
warnings.filterwarnings("ignore", category=UserWarning, module="sklearn")

BASEPATH = "/home/moreo/Datasets/raw"
with open("dataManager/excluded.csv", "r") as f:
    EXCLUDED = f.read().splitlines()
REGEX = re.compile(r"\s{2,}", re.MULTILINE)


def parse(dataset_name, ext="json.gz", nrows=0):
    dataset_name = dataset_name.replace(" ", "_")
    meta_path = os.path.join(BASEPATH, f"meta_{dataset_name}.{ext}")
    path = os.path.join(BASEPATH, f"{dataset_name}.{ext}")

    mapper = {"false": False, "true": True}
    data = []
    metadata = []

    _data = gzip.open(path, "r")
    _metadata = gzip.open(meta_path, "r")
    for i, (d, m) in enumerate(zip(_data, _metadata)):
        data.append(eval(d.replace(b"&amp;", b"&"), mapper))
        metadata.append(eval(m.replace(b"&amp;", b"&"), mapper))
        if i + 1 == nrows:
            break

    return data, metadata


def get_categories(data, min_count=0):
    if data[0].get("category", None) is None:
        return [], set()

    categories = []
    for item in data:
        if item["category"] != "":
            categories.extend(item["category"])
    categories = list(filter(lambda x: x not in EXCLUDED, categories))
    # return categories, sorted(set(categories))
    return categories, _filter_counter(Counter(categories), min_count)


def _filter_counter(counter, min_count):
    return {k: v for k, v in counter.items() if v >= min_count}


def get_main_cat(data, min_count=0):
    if data[0].get("main_cat", None) is None:
        return [], set()

    main_cats = [item["main_cat"] for item in data if item["main_cat"] != ""]
    main_cats = list(filter(lambda x: x not in EXCLUDED, main_cats))
    # return main_cats, sorted(set(main_cats))
    return main_cats, _filter_counter(Counter(main_cats), min_count)


def filter_sample_with_images(metadata):
    # TODO: check whether images are really available and store them locally
    # print(f"(Pre-filter) Total items: {len(metadata)}")
    data = []
    for i, m in enumerate(metadata):
        if "imageURL" not in m.keys():
            continue
        if len(m["imageURL"]) != 0 or len(m["imageURLHighRes"]) != 0:
            data.append(m)
    # print(f"(Post-filter) Total items: {len(data)}")
    return data


def select_description(descriptions):
    """
    Some items have multiple descriptions (len(item["description"]) > 1).
    Most of these descriptions are just empty strings; some items, however, actually
    have multiple strings describing them.
    At the moment, we rely on a simple heuristic: select the longest string and use it
    as the only description.
    """
    if len(descriptions) == 0:
        return [""]
    return [max(descriptions, key=len)]


def build_product_json(metadata, binarizer):
    data = []
    for item in metadata:
        if len(item["description"]) != 1:
            item["description"] = select_description(item["description"])

        product = {
            "asin": item["asin"],
            "title": item["title"],
            "description": item["description"],
            # TODO: some items have multiple descriptions (len(item["description"]) > 1)
            "cleaned_description": clean_description(
                BeautifulSoup(
                    item["title"] + ". " + item["description"][0],
                    features="html.parser",
                ).text
            ),
            # TODO: is it faster to call transform on the whole dataset?
            "main_category": item["main_cat"],
            "categories": item["category"],
            "all_categories": _get_cats(item["main_cat"], item["category"]),
            "vect_categories": binarizer.transform(
                [_get_cats(item["main_cat"], item["category"])]
            )[0],
        }
        data.append(product)
    return data


def _get_cats(main_cat, cats):
    return [main_cat] + cats


def get_label_binarizer(cats):
    mlb = MultiLabelBinarizer()
    mlb.fit([cats])
    return mlb


def clean_description(description):
    description = re.sub(REGEX, " ", description)
    description = description.rstrip()
    description = description.replace("\t", "")
    description = description.replace("\n", " ")
    return description


def construct_target_matrix(data):
    return np.stack([d["vect_categories"] for d in data], axis=0)


def get_all_classes(counter_cats, counter_sub_cats):
    if len(counter_cats) == 0:
        return counter_sub_cats.keys()
    elif len(counter_sub_cats) == 0:
        return counter_cats.keys()
    else:
        return list(counter_cats.keys()) + list(counter_sub_cats.keys())


class AmazonDataset:
    def __init__(
        self,
        domains=["Appliances", "Automotive", "Movies and TV"],
        basepath="/home/moreo/Datasets/raw",
        min_count=10,
        max_labels=50,
        nrows=1000,
    ):
        print("[Init AmazonDataset]")
        print(f"- Domains: {domains}")
        self.REGEX = re.compile(r"\s{2,}", re.MULTILINE)
        with open("dataManager/excluded.csv", "r") as f:
            self.EXCLUDED = f.read().splitlines()
        self.basepath = basepath
        self.domains = self.parse_domains(domains)
        self.nrows = nrows
        self.min_count = min_count
        self.max_labels = max_labels
        self.len_data = 0
        self.domain_data = self.load_data()
        self.labels, self.domain_labels = self.get_all_cats()
        self.label_binarizer = get_label_binarizer(self.labels)
        self.vectorized_labels = self.vecorize_labels()
        self.dX = self.construct_data_matrix()
        self.dY = self.construct_target_matrix()
        self.langs = ["en"]

    def parse_domains(self, domains):
        with open("amazon_categories.txt", "r") as f:
            all_domains = f.read().splitlines()
        if domains == "all":
            return all_domains
        else:
            assert all([d in all_domains for d in domains]), "Invalid domain name"
            return domains

    def parse(self, dataset_name, nrows, ext="json.gz"):
        dataset_name = dataset_name.replace(" ", "_")
        meta_path = os.path.join(self.basepath, f"meta_{dataset_name}.{ext}")
        path = os.path.join(self.basepath, f"{dataset_name}.{ext}")

        mapper = {"false": False, "true": True}
        data = []
        metadata = []

        _data = gzip.open(path, "r")
        _metadata = gzip.open(meta_path, "r")
        for i, (d, m) in enumerate(zip(_data, _metadata)):
            data.append(eval(d.replace(b"&amp;", b"&"), mapper))
            metadata.append(eval(m.replace(b"&amp;", b"&"), mapper))
            if i + 1 == nrows:
                break

        return data, metadata

    def load_data(self):
        print(f"- Loading up to {self.nrows} items per domain")
        domain_data = {}
        for domain in self.domains:
            _, metadata = self.parse(domain, nrows=self.nrows)
            metadata = filter_sample_with_images(metadata)
            domain_data[domain] = self.build_product_scheme(metadata)
            self.len_data += len(metadata)
        print(f"- Loaded {self.len_data} items")
        return domain_data

    def get_all_cats(self):
        assert len(self.domain_data) != 0, "Load data first"
        labels = set()
        domain_labels = {}
        for domain, data in self.domain_data.items():
            _, counter_cats = self._get_counter_cats(data, self.min_count)
            labels.update(counter_cats.keys())
            domain_labels[domain] = counter_cats
        print(f"- Found {len(labels)} labels")
        return labels, domain_labels

    def export_to_torch(self):
        pass

    def get_label_binarizer(self):
        mlb = MultiLabelBinarizer()
        mlb.fit([self.labels])
        return mlb

    def vecorize_labels(self):
        for domain, data in self.domain_data.items():
            for item in data:
                item["vect_categories"] = self.label_binarizer.transform(
                    [item["all_categories"]]
                )[0]

    def build_product_scheme(self, metadata):
        data = []
        for item in metadata:
            if len(item["description"]) != 1:
                _desc = self._select_description(item["description"])
            else:
                _desc = item["description"][0]

            product = {
                "asin": item["asin"],
                "title": item["title"],
                "description": _desc,
                # TODO: some items have multiple descriptions (len(item["description"]) > 1)
                "cleaned_text": self._clean_description(
                    BeautifulSoup(
                        item["title"] + ". " + _desc,
                        features="html.parser",
                    ).text
                ),
                # TODO: is it faster to call transform on the whole dataset?
                "main_category": item["main_cat"],
                "categories": item["category"],
                "all_categories": self._get_cats(item["main_cat"], item["category"]),
                # "vect_categories": binarizer.transform(
                #     [_get_cats(item["main_cat"], item["category"])]
                # )[0],
            }
            data.append(product)
        return data

    def construct_data_matrix(self):
        dX = {}
        for domain, data in self.domain_data.items():
            dX[domain] = [d["cleaned_text"] for d in data]
        return dX

    def construct_target_matrix(self):
        dY = {}
        for domain, data in self.domain_data.items():
            dY[domain] = np.stack([d["vect_categories"] for d in data], axis=0)
        return dY

    def get_overall_label_matrix(self):
        assert hasattr(self, "dY"), "Init label matrices first"
        return np.vstack([x for x in self.dY.values()])

    def _get_counter_cats(self, data, min_count):
        cats = []
        for item in data:
            cats.extend(item["all_categories"])
        cats = list(filter(lambda x: x not in self.EXCLUDED, cats))
        return cats, self._filter_counter(Counter(cats), min_count)

    def _filter_counter(self, counter, min_count):
        return {k: v for k, v in counter.items() if v >= min_count}

    def _clean_description(self, description):
        description = re.sub(self.REGEX, " ", description)
        description = description.rstrip()
        description = description.replace("\t", "")
        description = description.replace("\n", " ")
        return description

    def _get_cats(self, main_cat, cats):
        return [main_cat] + cats

    def _select_description(self, descriptions) -> str:
        """
        Some items have multiple descriptions (len(item["description"]) > 1).
        Most of these descriptions are just empty strings; some items, however, actually
        have multiple strings describing them.
        At the moment, we rely on a simple heuristic: select the longest string and use it
        as the only description.
        """
        if len(descriptions) == 0:
            return ""
        return max(descriptions, key=len)

    def plot_label_distribution(self):
        overall_mat = self.get_overall_label_matrix()
        plot_distribution(
            np.arange(len(self.labels)),
            np.sum(overall_mat, axis=0),
            title="Amazon Dataset",
            labels=self.labels,
            notes=overall_mat.shape,
            max_labels=self.max_labels,
            figsize=(10, 10),
            save=True,
            path="out",
        )

    def plot_per_domain_label_distribution(self):
        for domain, matrix in self.vecorize_labels:
            pass


def main(args):
    dataset = AmazonDataset(
        domains=args.domains,
        nrows=args.nrows,
        min_count=args.min_count,
        max_labels=args.max_labels,
    )

    dataset.plot_label_distribution()
    exit()


if __name__ == "__main__":
    import sys

    sys.path.append("/home/andreapdr/devel/gFunMultiModal/")

    parser = ArgumentParser()
    parser.add_argument("--domains", type=str, default="all")
    parser.add_argument("--nrows", type=int, default=10000)
    parser.add_argument("--min_count", type=int, default=10)
    parser.add_argument("--max_labels", type=int, default=50)
    args = parser.parse_args()
    main(args)
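Note (not part of the commit): parse() above eval()s each gzipped line with a {"false": False, "true": True} mapping because the raw Amazon dumps are Python-literal-like. A minimal alternative sketch, under the assumption that the *.json.gz files are valid JSON-lines (which is not guaranteed for older dumps), would use json.loads instead of eval:

# Hedged sketch: json-based counterpart of parse(); paths and nrows semantics
# mirror the function above. Assumes one JSON object per line.
import gzip
import json
import os


def parse_json_lines(basepath, dataset_name, ext="json.gz", nrows=0):
    dataset_name = dataset_name.replace(" ", "_")
    meta_path = os.path.join(basepath, f"meta_{dataset_name}.{ext}")
    path = os.path.join(basepath, f"{dataset_name}.{ext}")

    data, metadata = [], []
    with gzip.open(path, "rt") as fd, gzip.open(meta_path, "rt") as fm:
        for i, (d, m) in enumerate(zip(fd, fm)):
            data.append(json.loads(d))
            metadata.append(json.loads(m))
            if i + 1 == nrows:
                break
    return data, metadata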
@@ -0,0 +1,27 @@
</span></span></span>
</span></span></span>
<img src="https://m.media-amazon.com/images/G/01/nav2/images/gui/amazon-fashion-store-new._CB520838675_.png" class="nav-categ-image" alt="AMAZON FASHION"/>
<img src="https://images-na.ssl-images-amazon.com/images/G/01/nav2/images/gui/amazon-fashion-store-new._CB520838675_.png" class="nav-categ-image" alt="AMAZON FASHION"/>
<img src="https://m.media-amazon.com/images/G/01/digital/music/logos/amzn_music_logo_subnav._CB471835632_.png" class="nav-categ-image" alt="Digital Music"/>
<img src="https://images-na.ssl-images-amazon.com/images/G/01/digital/music/logos/amzn_music_logo_subnav._CB471835632_.png" class="nav-categ-image" alt="Digital Music"/>
<img src="https://images-na.ssl-images-amazon.com/images/G/01/digital/music/logos/amzn_music_logo_subnav._CB471835632_.png" class="nav-categ-image" alt="Digital Music" />
<img src="https://m.media-amazon.com/images/G/01/digital/music/logos/amzn_music_logo_subnav._CB471835632_.png" class="nav-categ-image" alt="Digital Music" />
<img src="https://images-na.ssl-images-amazon.com/images/G/01/pantry/subnav/pantry-subnav-logo._CB474181323_.png" class="nav-categ-image" alt="Prime Pantry"/>
<img src="https://m.media-amazon.com/images/G/01/pantry/subnav/pantry-subnav-logo._CB474181323_.png" class="nav-categ-image" alt="Prime Pantry" />
<img src="https://images-na.ssl-images-amazon.com/images/G/01/pantry/subnav/pantry-subnav-logo._CB474181323_.png" class="nav-categ-image" alt="Prime Pantry" />
<img src="https://m.media-amazon.com/images/G/01/pantry/subnav/pantry-subnav-logo._CB474181323_.png" class="nav-categ-image" alt="Prime Pantry"/>
<img src="https://images-na.ssl-images-amazon.com/images/G/01/handmade/brand/logos/2018/subnav_logo._CB502360610_.png" class="nav-categ-image" alt="Handmade"/>
<img src="https://images-na.ssl-images-amazon.com/images/G/01/handmade/brand/logos/2018/subnav_logo._CB502360610_.png" class="nav-categ-image" alt="Handmade"/>
<img src="https://images-na.ssl-images-amazon.com/images/G/01/nav2/images/gui/amazon-fashion-store-new._CB520838675_.png" class="nav-categ-image" alt="AMAZON FASHION" />
<img src="https://m.media-amazon.com/images/G/01/nav2/images/gui/amazon-fashion-store-new._CB520838675_.png" class="nav-categ-image" alt="AMAZON FASHION" />
<img src="https://m.media-amazon.com/images/G/01/nav2/images/gui/amazon-fashion-store-new._CB520838675_.png" class="nav-categ-image" alt="AMAZON FASHION" />
<img src="https://m.media-amazon.com/images/G/01/nav2/images/gui/amazon-fashion-store-new._CB520838675_.png" class="nav-categ-image" alt="AMAZON FASHION" />
<img src="https://images-na.ssl-images-amazon.com/images/G/01/nav2/images/gui/amazon-fashion-store-new._CB520838675_.png" class="nav-categ-image" alt="AMAZON FASHION"/>
<img src="https://images-na.ssl-images-amazon.com/images/G/01/digital/music/logos/amzn_music_logo_subnav._CB471835632_.png" class="nav-categ-image" alt="Digital Music"/>
<img src="https://images-na.ssl-images-amazon.com/images/G/01/digital/music/logos/amzn_music_logo_subnav._CB471835632_.png" class="nav-categ-image" alt="Digital Music" />
<img src="https://m.media-amazon.com/images/G/01/digital/music/logos/amzn_music_logo_subnav._CB471835632_.png" class="nav-categ-image" alt="Digital Music"/>
<img src="https://m.media-amazon.com/images/G/01/digital/music/logos/amzn_music_logo_subnav._CB471835632_.png" class="nav-categ-image" alt="Digital Music" />
<img src="https://m.media-amazon.com/images/G/01/pantry/subnav/pantry-subnav-logo._CB474181323_.png" class="nav-categ-image" alt="Prime Pantry"/>
<img src="https://images-na.ssl-images-amazon.com/images/G/01/pantry/subnav/pantry-subnav-logo._CB474181323_.png" class="nav-categ-image" alt="Prime Pantry"/>
<img src="https://images-na.ssl-images-amazon.com/images/G/01/pantry/subnav/pantry-subnav-logo._CB474181323_.png" class="nav-categ-image" alt="Prime Pantry" />
<img src="https://m.media-amazon.com/images/G/01/pantry/subnav/pantry-subnav-logo._CB474181323_.png" class="nav-categ-image" alt="Prime Pantry" />
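Note (not part of the commit): the entries above are HTML fragments that occasionally appear as "category" values in the Amazon metadata; the dataset module loads this file into EXCLUDED and drops them before counting labels. A toy illustration with invented values, mirroring the filtering in get_categories()/_filter_counter():

# Hedged illustration: applying an exclusion list of junk category strings
# before counting labels. Values below are invented.
from collections import Counter

excluded = {"</span></span></span>"}
categories = ["Electronics", "</span></span></span>", "Electronics", "Software"]

kept = [c for c in categories if c not in excluded]
counts = {k: v for k, v in Counter(kept).items() if v >= 2}
print(counts)  # {'Electronics': 2}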
@@ -0,0 +1,142 @@
import re
from os import listdir
from os.path import isdir, join

from dataManager.torchDataset import TorchMultiNewsDataset

# TODO: labels must be aligned between languages
# TODO: remove copyright and also tags (doc.split("More about:")[0])
# TODO: define fn to represent the dataset as a torch Dataset
# TODO: this should be an instance of an abstract MultimodalMultilingualDataset


class MultiNewsDataset:
    def __init__(self, data_dir, excluded_langs=[], debug=False):
        self.debug = debug
        self.data_dir = data_dir
        self.langs = self.get_langs()
        self.excluded_langs = excluded_langs
        self.lang_multiModalDataset = {}
        print(
            f"[{'DEBUG MODE: ' if debug else ''}Loaded MultiNewsDataset - langs: {self.langs}]"
        )
        self.load_data()
        self.print_stats()

    def load_data(self):
        for lang in self.langs:
            if lang not in self.excluded_langs:
                self.lang_multiModalDataset[lang] = MultiModalDataset(
                    lang, join(self.data_dir, lang)
                )

    def get_langs(self):
        if self.debug:
            return ["it", "en"]

        return tuple(sorted([folder for folder in listdir(self.data_dir)]))

    def print_stats(self):
        print("[MultiNewsDataset stats]")
        # print(f" - langs: {self.langs}")
        total_docs = 0
        for lang in self.langs:
            _len = len(self.lang_multiModalDataset[lang].data)
            total_docs += _len
            print(
                f" - {lang} docs: {_len}\t- labels: {self._count_lang_labels(self.lang_multiModalDataset[lang].data)}"
            )
        print(f" - total docs: {total_docs}")

    def _count_lang_labels(self, data):
        lang_labels = set()
        for sample in data:
            lang_labels.update(sample[-1])
        return len(lang_labels)

    def export_to_torch_dataset(self, tokenizer_id):
        raise NotImplementedError
        # torch_datasets = []
        # for lang, multimodal_dataset in self.lang_multiModalDataset.keys():
        #     dataset = TorchMultiNewsDataset(
        #         lang=lang,
        #         data=multimodal_dataset.get_docs(),
        #         ids=multimodal_dataset.get_ids(),
        #         imgs=multimodal_dataset.get_imgs(),
        #         labels=multimodal_dataset.get_labels(),
        #         tokenizer_id=tokenizer_id,
        #     )
        #     torch_datasets.append(dataset)

        # raise NotImplementedError

    def save_to_disk(self):
        raise NotImplementedError


class MultiModalDataset:
    def __init__(self, lang, data_dir):
        self.lang = lang
        self.data_dir = data_dir
        self.re_labels = re.compile(r"<a rel=\"tag\" href=\"\/tag\/.+?\/\">(.+?)<\/a>")
        self.re_cleaner = re.compile(r"<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});")
        self.re_white = re.compile(r" +")
        self.data = self.get_docs()

    def get_imgs(self):
        raise NotImplementedError

    def get_labels(self):
        raise NotImplementedError

    def get_ids(self):
        raise NotImplementedError

    def get_docs(self):
        data = []
        news_folders = [doc_folder for doc_folder in listdir(self.data_dir)]
        for news_folder in news_folders:
            if isdir(join(self.data_dir, news_folder)):
                fname_doc = f"text.{news_folder.split('.')[-1]}"
                with open(join(self.data_dir, news_folder, fname_doc)) as f:
                    html_doc = f.read()
                img = self.get_image()
                clean_doc, labels = self.preprocess_html(html_doc)
                data.append((fname_doc, clean_doc, html_doc, img, labels))
        return data

    def preprocess_html(self, html_doc):
        labels = self._extract_labels(html_doc)
        cleaned = self._clean_up_str(self._remove_html_tags(html_doc))
        return cleaned, labels

    def _extract_labels(self, data):
        return re.findall(self.re_labels, data)

    def _remove_html_tags(self, data):
        cleaned = re.sub(self.re_cleaner, "", data)
        return cleaned

    def _clean_up_str(self, doc):
        doc = re.sub(self.re_white, " ", doc)
        doc = doc.lstrip()
        doc = doc.rstrip()
        doc = doc.replace("\n", " ")
        doc = doc.replace("\t", " ")
        return doc

    def get_image(self):
        # TODO: implement
        pass


if __name__ == "__main__":
    from os.path import expanduser

    _dataset_path_hardcoded = "~/datasets/MultiNews/20110730/"
    dataset = MultiNewsDataset(expanduser(_dataset_path_hardcoded), debug=True)
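Note (not part of the commit): a toy illustration of what MultiModalDataset's regexes do; the HTML snippet below is invented, only the patterns come from the class above.

# Hedged illustration: extracting tag labels and stripping markup/entities.
import re

re_labels = re.compile(r"<a rel=\"tag\" href=\"\/tag\/.+?\/\">(.+?)<\/a>")
re_cleaner = re.compile(r"<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});")
re_white = re.compile(r" +")

html_doc = (
    '<p>Floods hit the region.</p> '
    '<a rel="tag" href="/tag/floods/">floods</a> '
    '<a rel="tag" href="/tag/europe/">europe</a>'
)

labels = re_labels.findall(html_doc)               # ['floods', 'europe']
text = re_white.sub(" ", re_cleaner.sub("", html_doc)).strip()
print(labels, "|", text)                           # plain text without tags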
@@ -0,0 +1,270 @@
# TODO: this should be an instance of an abstract MultilingualDataset

from abc import ABC, abstractmethod
from scipy.sparse import issparse, csr_matrix
from os.path import join, expanduser
import pickle
import re
import numpy as np
from tqdm import tqdm


class NewMultilingualDataset(ABC):
    @abstractmethod
    def get_training(self):
        pass

    @abstractmethod
    def get_validation(self):
        pass

    @abstractmethod
    def get_test(self):
        pass

    @abstractmethod
    def mask_numbers(self):
        pass

    @abstractmethod
    def save(self):
        pass

    @abstractmethod
    def load(self):
        pass


# class RcvMultilingualDataset(MultilingualDataset):
class RcvMultilingualDataset:
    def __init__(
        self,
        run="0",
    ):
        self.dataset_name = "rcv1-2"
        self.dataset_path = expanduser(
            f"~/datasets/rcv1-2/rcv1-2_doclist_trByLang1000_teByLang1000_processed_run{run}.pickle"
        )

    def load(self):
        self.data = pickle.load(open(self.dataset_path, "rb"))
        return self


class MultilingualDataset:
    """
    A multilingual dataset is a dictionary of training and test documents indexed by language code.
    Train and test sets are represented as tuples of the type (X,Y,ids), where X is a matrix representation of the
    documents (e.g., a document-by-term sparse csr_matrix), Y is a document-by-label binary np.array indicating the
    labels of each document, and ids is a list of document-identifiers from the original collection.
    """

    def __init__(self, dataset_name):
        self.dataset_name = dataset_name
        self.multiling_dataset = {}
        print(f"[Init Multilingual Dataset: {self.dataset_name}]")

    def add(self, lang, Xtr, Ytr, Xte, Yte, tr_ids=None, te_ids=None):
        self.multiling_dataset[lang] = ((Xtr, Ytr, tr_ids), (Xte, Yte, te_ids))

    def save(self, file):
        self.sort_indexes()
        pickle.dump(self, open(file, "wb"), pickle.HIGHEST_PROTOCOL)
        return self

    def __getitem__(self, item):
        if item in self.langs():
            return self.multiling_dataset[item]
        return None

    @classmethod
    def load(cls, file):
        data = pickle.load(open(file, "rb"))
        data.sort_indexes()
        return data

    @classmethod
    def load_ids(cls, file):
        data = pickle.load(open(file, "rb"))
        tr_ids = {
            lang: tr_ids
            for (lang, ((_, _, tr_ids), (_, _, _))) in data.multiling_dataset.items()
        }
        te_ids = {
            lang: te_ids
            for (lang, ((_, _, _), (_, _, te_ids))) in data.multiling_dataset.items()
        }
        return tr_ids, te_ids

    def sort_indexes(self):
        for lang, ((Xtr, _, _), (Xte, _, _)) in self.multiling_dataset.items():
            if issparse(Xtr):
                Xtr.sort_indices()
            if issparse(Xte):
                Xte.sort_indices()

    def set_view(self, categories=None, languages=None):
        if categories is not None:
            if isinstance(categories, int):
                categories = np.array([categories])
            elif isinstance(categories, list):
                categories = np.array(categories)
            self.categories_view = categories
        if languages is not None:
            self.languages_view = languages

    def training(self, mask_numbers=False, target_as_csr=False):
        return self.lXtr(mask_numbers), self.lYtr(as_csr=target_as_csr)

    def test(self, mask_numbers=False, target_as_csr=False):
        return self.lXte(mask_numbers), self.lYte(as_csr=target_as_csr)

    def lXtr(self, mask_numbers=False):
        proc = lambda x: _mask_numbers(x) if mask_numbers else x
        # return {lang: Xtr for (lang, ((Xtr, _, _), _)) in self.multiling_dataset.items() if lang in self.langs()}
        return {
            lang: proc(Xtr)
            for (lang, ((Xtr, _, _), _)) in self.multiling_dataset.items()
            if lang in self.langs()
        }

    def lXte(self, mask_numbers=False):
        proc = lambda x: _mask_numbers(x) if mask_numbers else x
        # return {lang: Xte for (lang, (_, (Xte, _, _))) in self.multiling_dataset.items() if lang in self.langs()}
        return {
            lang: proc(Xte)
            for (lang, (_, (Xte, _, _))) in self.multiling_dataset.items()
            if lang in self.langs()
        }

    def lYtr(self, as_csr=False):
        lY = {
            lang: self.cat_view(Ytr)
            for (lang, ((_, Ytr, _), _)) in self.multiling_dataset.items()
            if lang in self.langs()
        }
        if as_csr:
            lY = {l: csr_matrix(Y) for l, Y in lY.items()}
        return lY

    def lYte(self, as_csr=False):
        lY = {
            lang: self.cat_view(Yte)
            for (lang, (_, (_, Yte, _))) in self.multiling_dataset.items()
            if lang in self.langs()
        }
        if as_csr:
            lY = {l: csr_matrix(Y) for l, Y in lY.items()}
        return lY

    def cat_view(self, Y):
        if hasattr(self, "categories_view"):
            return Y[:, self.categories_view]
        else:
            return Y

    def langs(self):
        if hasattr(self, "languages_view"):
            langs = self.languages_view
        else:
            langs = sorted(self.multiling_dataset.keys())
        return langs

    def num_categories(self):
        return self.lYtr()[self.langs()[0]].shape[1]

    def show_dimensions(self):
        def shape(X):
            return X.shape if hasattr(X, "shape") else len(X)

        for lang, (
            (Xtr, Ytr, IDtr),
            (Xte, Yte, IDte),
        ) in self.multiling_dataset.items():
            if lang not in self.langs():
                continue
            print(
                "Lang {}, Xtr={}, ytr={}, Xte={}, yte={}".format(
                    lang,
                    shape(Xtr),
                    self.cat_view(Ytr).shape,
                    shape(Xte),
                    self.cat_view(Yte).shape,
                )
            )

    def show_category_prevalences(self):
        nC = self.num_categories()
        accum_tr = np.zeros(nC, dtype=int)
        accum_te = np.zeros(nC, dtype=int)
        in_langs = np.zeros(
            nC, dtype=int
        )  # count languages with at least one positive example (per category)
        for lang, (
            (Xtr, Ytr, IDtr),
            (Xte, Yte, IDte),
        ) in self.multiling_dataset.items():
            if lang not in self.langs():
                continue
            prev_train = np.sum(self.cat_view(Ytr), axis=0)
            prev_test = np.sum(self.cat_view(Yte), axis=0)
            accum_tr += prev_train
            accum_te += prev_test
            in_langs += (prev_train > 0) * 1
            print(lang + "-train", prev_train)
            print(lang + "-test", prev_test)
        print("all-train", accum_tr)
        print("all-test", accum_te)

        return accum_tr, accum_te, in_langs

    def set_labels(self, labels):
        self.labels = labels

    def reduce_data(self, langs=["it", "en"], maxn=50):
        print(f"- Reducing data: {langs} with max {maxn} documents...")
        self.set_view(languages=langs)

        data = {
            lang: self._reduce(data, maxn)
            for lang, data in self.multiling_dataset.items()
            if lang in langs
        }
        self.multiling_dataset = data
        return self

    def _reduce(self, multilingual_dataset, maxn):
        new_data = []
        for split in multilingual_dataset:
            docs, labels, ids = split
            new_data.append((docs[:maxn], labels[:maxn], ids[:maxn]))
        return new_data


def _mask_numbers(data):
    mask_moredigit = re.compile(r"\s[\+-]?\d{5,}([\.,]\d*)*\b")
    mask_4digit = re.compile(r"\s[\+-]?\d{4}([\.,]\d*)*\b")
    mask_3digit = re.compile(r"\s[\+-]?\d{3}([\.,]\d*)*\b")
    mask_2digit = re.compile(r"\s[\+-]?\d{2}([\.,]\d*)*\b")
    mask_1digit = re.compile(r"\s[\+-]?\d{1}([\.,]\d*)*\b")
    masked = []
    for text in tqdm(data, desc="masking numbers"):
        text = " " + text
        text = mask_moredigit.sub(" MoreDigitMask", text)
        text = mask_4digit.sub(" FourDigitMask", text)
        text = mask_3digit.sub(" ThreeDigitMask", text)
        text = mask_2digit.sub(" TwoDigitMask", text)
        text = mask_1digit.sub(" OneDigitMask", text)
        masked.append(text.replace(".", "").replace(",", "").strip())
    return masked


if __name__ == "__main__":
    DATAPATH = expanduser(
        "~/datasets/rcv1-2/rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle"
    )
    print(DATAPATH)
    dataset = MultilingualDataset.load(DATAPATH)
    print(dataset.show_dimensions())
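Note (not part of the commit): a short usage sketch of the MultilingualDataset container above with toy in-memory data; it assumes the class and _mask_numbers are in scope (same module or imported).

# Hedged usage sketch: two languages, two labels, toy documents.
import numpy as np

docs_en = ["first test doc 2023", "another test doc"]
docs_it = ["un documento di prova", "un altro documento"]
Y_en = np.array([[1, 0], [0, 1]])
Y_it = np.array([[1, 1], [0, 1]])

dataset = MultilingualDataset("toy")
dataset.add("en", Xtr=docs_en, Ytr=Y_en, Xte=docs_en, Yte=Y_en)
dataset.add("it", Xtr=docs_it, Ytr=Y_it, Xte=docs_it, Yte=Y_it)

lX, lY = dataset.training(mask_numbers=True)   # dicts indexed by language code
print(sorted(lX.keys()))                       # ['en', 'it']
print(lX["en"][0])                             # 'first test doc FourDigitMask'
print(lY["en"].shape)                          # (2, 2)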
@@ -0,0 +1,2 @@
class TorchMultiNewsDataset:
    pass
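Note (not part of the commit): the class above is still an empty stub; the TODO in the MultiNews module asks for a torch Dataset wrapper. A hypothetical sketch of one way it could be fleshed out (field names are illustrative, not the author's design):

# Hedged sketch of a torch.utils.data.Dataset wrapper.
import torch
from torch.utils.data import Dataset


class TorchMultiNewsDatasetSketch(Dataset):
    def __init__(self, docs, labels):
        assert len(docs) == len(labels)
        self.docs = docs        # e.g. list of cleaned document strings
        self.labels = labels    # e.g. a (n_docs, n_labels) binary matrix

    def __len__(self):
        return len(self.docs)

    def __getitem__(self, idx):
        return self.docs[idx], torch.as_tensor(self.labels[idx])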
@@ -0,0 +1,40 @@
from joblib import Parallel, delayed

from evaluation.metrics import *


def evaluation_metrics(y, y_):
    if len(y.shape) == len(y_.shape) == 1 and len(np.unique(y)) > 2:  # single-label
        raise NotImplementedError()  # return f1_score(y,y_,average='macro'), f1_score(y,y_,average='micro')
    else:  # the implemented metrics assume multilabel classification, treating each class as a binary problem
        # return macroF1(y, y_), microF1(y, y_), macroK(y, y_), microK(y, y_), macroP(y, y_), microP(y, y_), macroR(y, y_), microR(y, y_)
        # return macroF1(y, y_), microF1(y, y_), macroAcc(y, y_), microAcc(y, y_), macroP(y, y_), microP(y, y_), macroR(y, y_), microR(y, y_), macroAcc(y, y_)
        return macroF1(y, y_), microF1(y, y_), macroK(y, y_), microK(y, y_)


def evaluate(ly_true, ly_pred, metrics=evaluation_metrics, n_jobs=-1):
    if n_jobs == 1:
        return {lang: metrics(ly_true[lang], ly_pred[lang]) for lang in ly_true.keys()}
    else:
        langs = list(ly_true.keys())
        evals = Parallel(n_jobs=n_jobs)(
            delayed(metrics)(ly_true[lang], ly_pred[lang]) for lang in langs
        )
        return {lang: evals[i] for i, lang in enumerate(langs)}


def log_eval(l_eval, phase="training"):
    print(f"\n[Results {phase}]")
    metrics = []
    for lang in l_eval.keys():
        macrof1, microf1, macrok, microk = l_eval[lang]
        metrics.append([macrof1, microf1, macrok, microk])
        if phase != "validation":
            print(f"Lang {lang}: macro-F1 = {macrof1:.3f} micro-F1 = {microf1:.3f}")
    averages = np.mean(np.array(metrics), axis=0)
    print(
        "Averages: MF1, mF1, MK, mK",
        np.round(averages, 3),
        "\n",
    )
    return averages
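Note (not part of the commit): a small usage sketch of evaluate()/log_eval() above on toy per-language label matrices in MultiLabelBinarizer format; it assumes the module's imports (evaluation.metrics) are available.

# Hedged usage sketch with invented predictions.
import numpy as np

ly_true = {
    "en": np.array([[1, 0, 1], [0, 1, 0]]),
    "it": np.array([[1, 1, 0], [0, 0, 1]]),
}
ly_pred = {
    "en": np.array([[1, 0, 0], [0, 1, 0]]),
    "it": np.array([[1, 1, 0], [1, 0, 1]]),
}

l_eval = evaluate(ly_true, ly_pred, n_jobs=1)   # {lang: (MF1, mF1, MK, mK)}
log_eval(l_eval, phase="test")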
@@ -0,0 +1,237 @@
import numpy as np


class ContTable:
    def __init__(self, tp=0, tn=0, fp=0, fn=0):
        self.tp = tp
        self.tn = tn
        self.fp = fp
        self.fn = fn

    def get_d(self):
        return self.tp + self.tn + self.fp + self.fn

    def get_c(self):
        return self.tp + self.fn

    def get_not_c(self):
        return self.tn + self.fp

    def get_f(self):
        return self.tp + self.fp

    def get_not_f(self):
        return self.tn + self.fn

    def p_c(self):
        return (1.0 * self.get_c()) / self.get_d()

    def p_not_c(self):
        return 1.0 - self.p_c()

    def p_f(self):
        return (1.0 * self.get_f()) / self.get_d()

    def p_not_f(self):
        return 1.0 - self.p_f()

    def p_tp(self):
        return (1.0 * self.tp) / self.get_d()

    def p_tn(self):
        return (1.0 * self.tn) / self.get_d()

    def p_fp(self):
        return (1.0 * self.fp) / self.get_d()

    def p_fn(self):
        return (1.0 * self.fn) / self.get_d()

    def tpr(self):
        c = 1.0 * self.get_c()
        return self.tp / c if c > 0.0 else 0.0

    def fpr(self):
        _c = 1.0 * self.get_not_c()
        return self.fp / _c if _c > 0.0 else 0.0

    def __add__(self, other):
        return ContTable(
            tp=self.tp + other.tp,
            tn=self.tn + other.tn,
            fp=self.fp + other.fp,
            fn=self.fn + other.fn,
        )


def accuracy(cell):
    return (cell.tp + cell.tn) * 1.0 / (cell.tp + cell.fp + cell.fn + cell.tn)


def precision(cell):
    num = cell.tp
    den = cell.tp + cell.fp
    if den > 0:
        return num / den
    return 1.0
    num = cell.tn
    den = cell.tn + cell.fn
    return num / den


def recall(cell):
    num = cell.tp
    den = cell.tp + cell.fn
    if den > 0:
        return num / den
    return 1.0
    num = cell.tn
    den = cell.tn + cell.fp
    return num / den


def f1(cell):
    num = 2.0 * cell.tp
    den = 2.0 * cell.tp + cell.fp + cell.fn
    if den > 0:
        return num / den
    # we define f1 to be 1 if den==0 since the classifier has correctly classified all instances as negative
    return 1.0


def K(cell):
    specificity, recall = 0.0, 0.0

    AN = cell.tn + cell.fp
    if AN != 0:
        specificity = cell.tn * 1.0 / AN

    AP = cell.tp + cell.fn
    if AP != 0:
        recall = cell.tp * 1.0 / AP

    if AP == 0:
        return 2.0 * specificity - 1.0
    elif AN == 0:
        return 2.0 * recall - 1.0
    else:
        return specificity + recall - 1.0


# if the classifier is single class, then the prediction is a vector of shape=(nD,) which causes issues when compared
# to the true labels (of shape=(nD,1)). This method increases the dimensions of the predictions.
def __check_consistency_and_adapt(true_labels, predictions):
    if predictions.ndim == 1:
        return __check_consistency_and_adapt(
            true_labels, np.expand_dims(predictions, axis=1)
        )
    if true_labels.ndim == 1:
        return __check_consistency_and_adapt(
            np.expand_dims(true_labels, axis=1), predictions
        )
    if true_labels.shape != predictions.shape:
        raise ValueError(
            "True and predicted label matrices shapes are inconsistent %s %s."
            % (true_labels.shape, predictions.shape)
        )
    _, nC = true_labels.shape
    return true_labels, predictions, nC


# computes the (soft) contingency table where tp, fp, fn, and tn are the cumulative masses for the posterior
# probabilities with respect to the true binary labels
# true_labels and posterior_probabilities are two vectors of shape (number_documents,)
def soft_single_metric_statistics(true_labels, posterior_probabilities):
    assert len(true_labels) == len(
        posterior_probabilities
    ), "Format not consistent between true and predicted labels."
    tp = np.sum(posterior_probabilities[true_labels == 1])
    fn = np.sum(1.0 - posterior_probabilities[true_labels == 1])
    fp = np.sum(posterior_probabilities[true_labels == 0])
    tn = np.sum(1.0 - posterior_probabilities[true_labels == 0])
    return ContTable(tp=tp, tn=tn, fp=fp, fn=fn)


# computes the (hard) counters tp, fp, fn, and tn from true and predicted vectors of hard decisions
# true_labels and predicted_labels are two vectors of shape (number_documents,)
def hard_single_metric_statistics(true_labels, predicted_labels):
    assert len(true_labels) == len(
        predicted_labels
    ), "Format not consistent between true and predicted labels."
    nd = len(true_labels)
    tp = np.sum(predicted_labels[true_labels == 1])
    fp = np.sum(predicted_labels[true_labels == 0])
    fn = np.sum(true_labels[predicted_labels == 0])
    tn = nd - (tp + fp + fn)
    return ContTable(tp=tp, tn=tn, fp=fp, fn=fn)


def macro_average(
    true_labels,
    predicted_labels,
    metric,
    metric_statistics=hard_single_metric_statistics,
):
    true_labels, predicted_labels, nC = __check_consistency_and_adapt(
        true_labels, predicted_labels
    )
    return np.mean(
        [
            metric(metric_statistics(true_labels[:, c], predicted_labels[:, c]))
            for c in range(nC)
        ]
    )


def micro_average(
    true_labels,
    predicted_labels,
    metric,
    metric_statistics=hard_single_metric_statistics,
):
    true_labels, predicted_labels, nC = __check_consistency_and_adapt(
        true_labels, predicted_labels
    )

    accum = ContTable()
    for c in range(nC):
        other = metric_statistics(true_labels[:, c], predicted_labels[:, c])
        accum = accum + other

    return metric(accum)


def macroP(true_labels, predicted_labels):
    return macro_average(true_labels, predicted_labels, precision)


def microP(true_labels, predicted_labels):
    return micro_average(true_labels, predicted_labels, precision)


def macroR(true_labels, predicted_labels):
    return macro_average(true_labels, predicted_labels, recall)


def microR(true_labels, predicted_labels):
    return micro_average(true_labels, predicted_labels, recall)


# true_labels and predicted_labels are two matrices in sklearn.preprocessing.MultiLabelBinarizer format
def macroF1(true_labels, predicted_labels):
    return macro_average(true_labels, predicted_labels, f1)


# true_labels and predicted_labels are two matrices in sklearn.preprocessing.MultiLabelBinarizer format
def microF1(true_labels, predicted_labels):
    return micro_average(true_labels, predicted_labels, f1)


# true_labels and predicted_labels are two matrices in sklearn.preprocessing.MultiLabelBinarizer format
def macroK(true_labels, predicted_labels):
    return macro_average(true_labels, predicted_labels, K)


# true_labels and predicted_labels are two matrices in sklearn.preprocessing.MultiLabelBinarizer format
def microK(true_labels, predicted_labels):
    return micro_average(true_labels, predicted_labels, K)
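Note (not part of the commit): a quick worked check of the functions above. When both positive and negative examples are present, K computes specificity + recall - 1.

# For tp=8, fn=2, tn=5, fp=5: recall = 0.8, specificity = 0.5, so K = 0.3.
cell = ContTable(tp=8, tn=5, fp=5, fn=2)
print(round(K(cell), 3))          # 0.3
print(round(f1(cell), 3))         # 2*8 / (2*8 + 5 + 2) = 0.696
print(round(precision(cell), 3))  # 8 / 13 = 0.615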
@@ -0,0 +1,182 @@
import os
import sys

sys.path.append(os.path.join(os.getcwd(), "gfun"))

import pickle

import numpy as np
from vgfs.commons import TfidfVectorizerMultilingual
from vgfs.learners.svms import MetaClassifier, get_learner
from vgfs.multilingualGen import MultilingualGen
from vgfs.transformerGen import TransformerGen
from vgfs.vanillaFun import VanillaFunGen
from vgfs.wceGen import WceGen

# TODO: save and load gfun model


class GeneralizedFunnelling:
    def __init__(
        self,
        posterior,
        wce,
        multilingual,
        transformer,
        langs,
        embed_dir,
        n_jobs,
        batch_size,
        max_length,
        lr,
        epochs,
        patience,
        evaluate_step,
        transformer_name,
    ):
        # Forcing VGFs -----------
        self.posteriors_vgf = posterior
        self.wce_vgf = wce
        self.multilingual_vgf = multilingual
        self.trasformer_vgf = transformer
        # ------------------------
        self.langs = langs
        self.embed_dir = embed_dir
        self.cached = True
        # Transformer VGF params
        self.transformer_name = transformer_name
        self.epochs = epochs
        self.lr_transformer = lr
        self.batch_size_transformer = batch_size
        self.max_length = max_length
        self.early_stopping = True
        self.patience = patience
        self.evaluate_step = evaluate_step
        # -------------------
        self.vectorizer = TfidfVectorizerMultilingual(sublinear_tf=True, use_idf=True)
        self.n_jobs = n_jobs
        self.first_tier_learners = []
        self.metaclassifier = None
        self.aggfunc = "mean"
        self.init()

    def init(self):
        print("[Init GeneralizedFunnelling]")
        if self.posteriors_vgf:
            fun = VanillaFunGen(
                base_learner=get_learner(calibrate=True),
                first_tier_parameters=None,
                n_jobs=self.n_jobs,
            )
            self.first_tier_learners.append(fun)

        if self.multilingual_vgf:
            multilingual_vgf = MultilingualGen(
                embed_dir=self.embed_dir,
                langs=self.langs,
                n_jobs=self.n_jobs,
                cached=self.cached,
                probabilistic=True,
            )
            self.first_tier_learners.append(multilingual_vgf)

        if self.wce_vgf:
            wce_vgf = WceGen(n_jobs=self.n_jobs)
            self.first_tier_learners.append(wce_vgf)

        if self.trasformer_vgf:
            transformer_vgf = TransformerGen(
                model_name=self.transformer_name,
                lr=self.lr_transformer,
                epochs=self.epochs,
                batch_size=self.batch_size_transformer,
                max_length=self.max_length,
                device="cuda",
                print_steps=50,
                probabilistic=True,
                evaluate_step=self.evaluate_step,
                verbose=True,
                patience=self.patience,
            )
            self.first_tier_learners.append(transformer_vgf)

        self.metaclassifier = MetaClassifier(
            meta_learner=get_learner(calibrate=True, kernel="rbf"),
            meta_parameters=get_params(),
            n_jobs=self.n_jobs,
        )

    def init_vgfs_vectorizers(self):
        for vgf in self.first_tier_learners:
            if isinstance(vgf, (VanillaFunGen, MultilingualGen, WceGen)):
                vgf.vectorizer = self.vectorizer

    def fit(self, lX, lY):
        print("[Fitting GeneralizedFunnelling]")
        self.vectorizer.fit(lX)
        self.init_vgfs_vectorizers()

        projections = []
        print("- fitting first tier learners")
        for vgf in self.first_tier_learners:
            l_posteriors = vgf.fit_transform(lX, lY)
            projections.append(l_posteriors)

        agg = self.aggregate(projections)
        self.metaclassifier.fit(agg, lY)

        return self

    def transform(self, lX):
        projections = []
        for vgf in self.first_tier_learners:
            l_posteriors = vgf.transform(lX)
            projections.append(l_posteriors)
        agg = self.aggregate(projections)
        l_out = self.metaclassifier.predict_proba(agg)
        return l_out

    def fit_transform(self, lX, lY):
        return self.fit(lX, lY).transform(lX)

    def aggregate(self, first_tier_projections):
        if self.aggfunc == "mean":
            aggregated = self._aggregate_mean(first_tier_projections)
        else:
            raise NotImplementedError
        return aggregated

    def _aggregate_mean(self, first_tier_projections):
        # TODO: default dict for a one-liner?
        aggregated = {
            lang: np.zeros(data.shape)
            for lang, data in first_tier_projections[0].items()
        }
        for lang_projections in first_tier_projections:
            for lang, projection in lang_projections.items():
                aggregated[lang] += projection

        # Computing mean
        for lang, projection in aggregated.items():
            aggregated[lang] /= len(first_tier_projections)

        return aggregated

    def get_config(self):
        from pprint import pprint

        # TODO
        print("[GeneralizedFunnelling config]")
        print(f"- langs: {self.langs}")
        print("-- vgfs:")

        for vgf in self.first_tier_learners:
            pprint(vgf.get_config())


def get_params(optimc=False):
    if not optimc:
        return None
    c_range = [1e4, 1e3, 1e2, 1e1, 1, 1e-1]
    kernel = "rbf"
    return [{"kernel": [kernel], "C": c_range, "gamma": ["auto"]}]
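Note (not part of the commit): an illustration of what the mean aggregation in _aggregate_mean() above does to the per-language posterior matrices returned by the first-tier VGFs. The matrices below are invented.

# Hedged illustration: element-wise mean of two views for one language.
import numpy as np

view_a = {"en": np.array([[0.9, 0.1], [0.2, 0.8]])}   # e.g. posteriors VGF
view_b = {"en": np.array([[0.7, 0.3], [0.4, 0.6]])}   # e.g. another VGF
projections = [view_a, view_b]

agg = {lang: np.zeros_like(m) for lang, m in projections[0].items()}
for proj in projections:
    for lang, m in proj.items():
        agg[lang] += m
agg = {lang: m / len(projections) for lang, m in agg.items()}

print(agg["en"])   # [[0.8, 0.2], [0.3, 0.7]]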
@@ -0,0 +1,74 @@
from sklearn.preprocessing import normalize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
import numpy as np


def _normalize(lX, l2=True):
    return {lang: normalize(np.asarray(X)) for lang, X in lX.items()} if l2 else lX


def XdotM(X, M, sif):
    E = X.dot(M)
    if sif:
        E = remove_pc(E, npc=1)
    return E


def remove_pc(X, npc=1):
    """
    Remove the projection on the principal components
    :param X: X[i,:] is a data point
    :param npc: number of principal components to remove
    :return: XX[i, :] is the data point after removing its projection
    """
    pc = compute_pc(X, npc)
    if npc == 1:
        XX = X - X.dot(pc.transpose()) * pc
    else:
        XX = X - X.dot(pc.transpose()).dot(pc)
    return XX


class TfidfVectorizerMultilingual:
    def __init__(self, **kwargs):
        self.kwargs = kwargs

    def fit(self, lX, ly=None):
        self.langs = sorted(lX.keys())
        self.vectorizer = {
            l: TfidfVectorizer(**self.kwargs).fit(lX[l]) for l in self.langs
        }
        return self

    def transform(self, lX):
        return {l: self.vectorizer[l].transform(lX[l]) for l in self.langs}

    def fit_transform(self, lX, ly=None):
        return self.fit(lX, ly).transform(lX)

    def vocabulary(self, l=None):
        if l is None:
            return {l: self.vectorizer[l].vocabulary_ for l in self.langs}
        else:
            return self.vectorizer[l].vocabulary_

    def get_analyzer(self, l=None):
        if l is None:
            return {l: self.vectorizer[l].build_analyzer() for l in self.langs}
        else:
            return self.vectorizer[l].build_analyzer()


def compute_pc(X, npc=1):
    """
    Compute the principal components.
    :param X: X[i,:] is a data point
    :param npc: number of principal components to remove
    :return: component_[i,:] is the i-th pc
    """
    if isinstance(X, np.matrix):
        X = np.asarray(X)
    svd = TruncatedSVD(n_components=npc, n_iter=7, random_state=0)
    svd.fit(X)
    return svd.components_
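Note (not part of the commit): a quick numeric check of remove_pc()/compute_pc() above, assuming both are in scope. After removing the first principal component, every row has (numerically) zero projection on it.

# Hedged check on random data.
import numpy as np

rng = np.random.default_rng(0)
X = rng.normal(size=(100, 20))

pc = compute_pc(X, npc=1)          # shape (1, 20), unit-norm right singular vector
X_clean = remove_pc(X, npc=1)

print(np.abs(X_clean.dot(pc.T)).max() < 1e-8)   # True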
@ -0,0 +1,354 @@
|
|||
import time
|
||||
|
||||
import numpy as np
|
||||
from joblib import Parallel, delayed
|
||||
from scipy.sparse import issparse
|
||||
from sklearn.model_selection import GridSearchCV
|
||||
from sklearn.multiclass import OneVsRestClassifier
|
||||
from sklearn.preprocessing import normalize
|
||||
from sklearn.svm import SVC
|
||||
|
||||
|
||||
def _sort_if_sparse(X):
|
||||
if issparse(X) and not X.has_sorted_indices:
|
||||
X.sort_indices()
|
||||
|
||||
|
||||
def get_learner(calibrate=False, kernel="linear", C=1):
|
||||
"""
|
||||
instantiate scikit Support Vector Classifier
|
||||
:param calibrate: boolean, whether to return posterior probabilities or not
|
||||
:param kernel: string,kernel to be applied to the SVC
|
||||
:param C: int or dict {'C': list of integer}, Regularization parameter
|
||||
:return: Support Vector Classifier
|
||||
"""
|
||||
return SVC(
|
||||
kernel=kernel,
|
||||
probability=calibrate,
|
||||
cache_size=1000,
|
||||
C=C,
|
||||
random_state=1,
|
||||
gamma="auto",
|
||||
verbose=False,
|
||||
)
|
||||
|
||||
|
||||
def _joblib_transform_multiling(transformer, lX, n_jobs=-1):
|
||||
if n_jobs == 1:
|
||||
return {lang: transformer(lX[lang]) for lang in lX.keys()}
|
||||
else:
|
||||
langs = list(lX.keys())
|
||||
transformations = Parallel(n_jobs=n_jobs)(
|
||||
delayed(transformer)(lX[lang]) for lang in langs
|
||||
)
|
||||
return {lang: transformations[i] for i, lang in enumerate(langs)}
|
||||
|
||||
|
||||
class MonolingualClassifier:
|
||||
def __init__(self, base_learner, parameters=None, n_jobs=-1):
|
||||
self.learner = base_learner
|
||||
self.parameters = parameters
|
||||
self.model = None
|
||||
self.best_params_ = None
|
||||
self.n_jobs = n_jobs
|
||||
|
||||
def fit(self, X, y):
|
||||
tinit = time.time()
|
||||
_sort_if_sparse(X)
|
||||
self.empty_categories = np.argwhere(np.sum(y, axis=0) == 0).flatten()
|
||||
# multi-class format
|
||||
if len(y.shape) == 2:
|
||||
if self.parameters is not None:
|
||||
self.parameters = [
|
||||
{"estimator__" + key: params[key] for key in params.keys()}
|
||||
for params in self.parameters
|
||||
]
|
||||
self.model = OneVsRestClassifier(self.learner, n_jobs=self.n_jobs)
|
||||
else:
|
||||
self.model = self.learner
|
||||
raise NotImplementedError(
|
||||
"not working as a base-classifier for funneling if there are gaps in "
|
||||
"the labels across languages"
|
||||
)
|
||||
|
||||
# parameter optimization?
|
||||
if self.parameters:
|
||||
print("debug: optimizing parameters:", self.parameters)
|
||||
self.model = GridSearchCV(
|
||||
self.model,
|
||||
param_grid=self.parameters,
|
||||
refit=True,
|
||||
cv=5,
|
||||
n_jobs=self.n_jobs,
|
||||
error_score=0,
|
||||
verbose=10,
|
||||
)
|
||||
|
||||
# print(f"-- Fitting learner on matrices X={X.shape} Y={y.shape}")
|
||||
|
||||
self.model.fit(X, y)
|
||||
if isinstance(self.model, GridSearchCV):
|
||||
self.best_params_ = self.model.best_params_
|
||||
print("best parameters: ", self.best_params_)
|
||||
self.time = time.time() - tinit
|
||||
return self
|
||||
|
||||
def decision_function(self, X):
|
||||
assert self.model is not None, "predict called before fit"
|
||||
_sort_if_sparse(X)
|
||||
return self.model.decision_function(X)
|
||||
|
||||
def predict_proba(self, X):
|
||||
assert self.model is not None, "predict called before fit"
|
||||
assert hasattr(
|
||||
self.model, "predict_proba"
|
||||
), "the probability predictions are not enabled in this model"
|
||||
_sort_if_sparse(X)
|
||||
return self.model.predict_proba(X)
|
||||
|
||||
def predict(self, X):
|
||||
assert self.model is not None, "predict called before fit"
|
||||
_sort_if_sparse(X)
|
||||
return self.model.predict(X)
|
||||
|
||||
def best_params(self):
|
||||
raise NotImplementedError
|
||||
|
||||
|
||||
class NaivePolylingualClassifier:
|
||||
"""
|
||||
A set of independent MonolingualClassifiers, one per language.
|
||||
"""
|
||||
|
||||
def __init__(self, base_learner, parameters=None, n_jobs=-1):
|
||||
self.base_learner = base_learner
|
||||
self.parameters = parameters
|
||||
self.model = None
|
||||
self.n_jobs = n_jobs
|
||||
|
||||
def fit(self, lX, ly):
|
||||
"""
|
||||
trains the independent monolingual classifiers
|
||||
:param lX: a dictionary {language_label: X csr-matrix}
|
||||
:param ly: a dictionary {language_label: y np.array}
|
||||
:return: self
|
||||
"""
|
||||
tinit = time.time()
|
||||
assert set(lX.keys()) == set(ly.keys()), "inconsistent language mappings in fit"
|
||||
langs = list(lX.keys())
|
||||
for lang in langs:
|
||||
_sort_if_sparse(lX[lang])
|
||||
|
||||
models = Parallel(n_jobs=self.n_jobs)(
|
||||
delayed(
|
||||
MonolingualClassifier(self.base_learner, parameters=self.parameters).fit
|
||||
)((lX[lang]), ly[lang])
|
||||
for lang in langs
|
||||
)
|
||||
|
||||
self.model = {lang: models[i] for i, lang in enumerate(langs)}
|
||||
self.empty_categories = {
|
||||
lang: self.model[lang].empty_categories for lang in langs
|
||||
}
|
||||
self.time = time.time() - tinit
|
||||
return self
|
||||
|
||||
def decision_function(self, lX):
|
||||
"""
|
||||
:param lX: a dictionary {language_label: X csr-matrix}
|
||||
:return: a dictionary of classification scores for each class
|
||||
"""
|
||||
assert self.model is not None, "predict called before fit"
|
||||
assert set(lX.keys()).issubset(
|
||||
set(self.model.keys())
|
||||
), "unknown languages requested in decision function"
|
||||
langs = list(lX.keys())
|
||||
scores = Parallel(n_jobs=self.n_jobs)(
|
||||
delayed(self.model[lang].decision_function)(lX[lang]) for lang in langs
|
||||
)
|
||||
return {lang: scores[i] for i, lang in enumerate(langs)}
|
||||
|
||||
def predict_proba(self, lX):
|
||||
"""
|
||||
:param lX: a dictionary {language_label: X csr-matrix}
|
||||
:return: a dictionary of probabilities that each document belongs to each class
|
||||
"""
|
||||
assert self.model is not None, "predict called before fit"
|
||||
assert set(lX.keys()).issubset(
|
||||
set(self.model.keys())
|
||||
), "unknown languages requested in decision function"
|
||||
langs = list(lX.keys())
|
||||
scores = Parallel(n_jobs=self.n_jobs, max_nbytes=None)(
|
||||
delayed(self.model[lang].predict_proba)(lX[lang]) for lang in langs
|
||||
)
|
||||
return {lang: scores[i] for i, lang in enumerate(langs)}
|
||||
|
||||
def predict(self, lX):
|
||||
"""
|
||||
:param lX: a dictionary {language_label: X csr-matrix}
|
||||
:return: a dictionary of predictions
|
||||
"""
|
||||
assert self.model is not None, "predict called before fit"
|
||||
assert set(lX.keys()).issubset(
|
||||
set(self.model.keys())
|
||||
), "unknown languages requested in predict"
|
||||
if self.n_jobs == 1:
|
||||
return {lang: self.model[lang].predict(lX[lang]) for lang in lX.keys()}
|
||||
else:
|
||||
langs = list(lX.keys())
|
||||
scores = Parallel(n_jobs=self.n_jobs)(
|
||||
delayed(self.model[lang].predict)(lX[lang]) for lang in langs
|
||||
)
|
||||
return {lang: scores[i] for i, lang in enumerate(langs)}
|
||||
|
||||
def best_params(self):
|
||||
return {lang: model.best_params() for lang, model in self.model.items()}
|
||||
|
||||
|
||||
class MetaClassifier:
|
||||
def __init__(
|
||||
self,
|
||||
meta_learner,
|
||||
meta_parameters=None,
|
||||
n_jobs=-1,
|
||||
standardize_range=None,
|
||||
verbose=True,
|
||||
):
|
||||
self.n_jobs = n_jobs
|
||||
self.model = MonolingualClassifier(
|
||||
base_learner=meta_learner, parameters=meta_parameters, n_jobs=self.n_jobs
|
||||
)
|
||||
self.standardize_range = standardize_range
|
||||
self.verbose = verbose
|
||||
|
||||
def fit(self, lZ, lY):
|
||||
tinit = time.time()
|
||||
Z, y = self.stack(lZ, lY)
|
||||
|
||||
self.standardizer = StandardizeTransformer(range=self.standardize_range)
|
||||
Z = self.standardizer.fit_transform(Z)
|
||||
|
||||
if self.verbose:
|
||||
print(f"- fitting the metaclassifier on data shape: {Z.shape}")
|
||||
self.model.fit(Z, y)
|
||||
self.time = time.time() - tinit
|
||||
|
||||
def stack(self, lZ, lY=None):
|
||||
langs = list(lZ.keys())
|
||||
Z = np.vstack([lZ[lang] for lang in langs])
|
||||
if lY is not None:
|
||||
y = np.vstack([lY[lang] for lang in langs])
|
||||
return Z, y
|
||||
else:
|
||||
return Z
|
||||
|
||||
# def stack(self, lZ, lY=None):
|
||||
# X_stacked = np.vstack(list(lZ.values()))
|
||||
# if lY is not None:
|
||||
# Y_stacked = np.vstack(list(lY.values()))
|
||||
# return X_stacked, Y_stacked
|
||||
# else:
|
||||
# return X_stacked
|
||||
|
||||
def predict(self, lZ):
|
||||
lZ = _joblib_transform_multiling(
|
||||
self.standardizer.transform, lZ, n_jobs=self.n_jobs
|
||||
)
|
||||
return _joblib_transform_multiling(self.model.predict, lZ, n_jobs=self.n_jobs)
|
||||
|
||||
def predict_proba(self, lZ):
|
||||
lZ = _joblib_transform_multiling(
|
||||
self.standardizer.transform, lZ, n_jobs=self.n_jobs
|
||||
)
|
||||
return _joblib_transform_multiling(
|
||||
self.model.predict_proba, lZ, n_jobs=self.n_jobs
|
||||
)
|
||||
|
||||
|
||||
class StandardizeTransformer:
|
||||
def __init__(self, axis=0, range=None):
|
||||
"""
|
||||
|
||||
:param axis: axis along which the mean and standard deviation are computed
|
||||
:param range: optional slice; when given, only that slice of features is standardized
|
||||
"""
|
||||
assert range is None or isinstance(
|
||||
range, slice
|
||||
), "wrong format for range, should either be None or a slice"
|
||||
self.axis = axis
|
||||
self.yetfit = False
|
||||
self.range = range
|
||||
|
||||
def fit(self, X):
|
||||
# print("Applying z-score standardization...")
|
||||
std = np.std(X, axis=self.axis, ddof=1)
|
||||
self.std = np.clip(std, 1e-5, None)
|
||||
self.mean = np.mean(X, axis=self.axis)
|
||||
if self.range is not None:
|
||||
ones = np.ones_like(self.std)
|
||||
zeros = np.zeros_like(self.mean)
|
||||
ones[self.range] = self.std[self.range]
|
||||
zeros[self.range] = self.mean[self.range]
|
||||
self.std = ones
|
||||
self.mean = zeros
|
||||
self.yetfit = True
|
||||
return self
|
||||
|
||||
def transform(self, X):
|
||||
assert self.yetfit, "transform called before fit"
|
||||
return (X - self.mean) / self.std
|
||||
|
||||
def fit_transform(self, X):
|
||||
return self.fit(X).transform(X)
|
||||
|
||||
|
||||
class FeatureSet2Posteriors:
|
||||
"""
|
||||
Recasts the features produced by the embedders into vectors of posterior probabilities by means of
|
||||
a multiclass SVM.
|
||||
"""
|
||||
|
||||
def __init__(self, verbose=True, l2=True, n_jobs=-1):
|
||||
"""
|
||||
Init the class.
|
||||
:param verbose: bool, whether to print status messages while fitting the underlying MetaClassifier.
|
||||
:param l2: bool, whether to apply L2 normalization to the projection
|
||||
:param n_jobs: int, number of concurrent workers.
|
||||
"""
|
||||
# self.embedder = embedder
|
||||
self.l2 = l2
|
||||
self.n_jobs = n_jobs
|
||||
self.prob_classifier = MetaClassifier(
|
||||
SVC(
|
||||
kernel="rbf",
|
||||
gamma="auto",
|
||||
probability=True,
|
||||
cache_size=1000,
|
||||
random_state=1,
|
||||
),
|
||||
n_jobs=n_jobs,
|
||||
verbose=verbose,
|
||||
)
|
||||
|
||||
def fit(self, lX, lY):
|
||||
self.prob_classifier.fit(lX, lY)
|
||||
return self
|
||||
|
||||
def transform(self, lX):
|
||||
lP = self.predict_proba(lX)
|
||||
lP = _normalize(lP, self.l2)
|
||||
return lP
|
||||
|
||||
def fit_transform(self, lX, lY):
|
||||
return self.fit(lX, lY).transform(lX)
|
||||
|
||||
def predict(self, lX):
|
||||
return self.prob_classifier.predict(lX)
|
||||
|
||||
def predict_proba(self, lX):
|
||||
return self.prob_classifier.predict_proba(lX)
|
||||
|
||||
|
||||
def _normalize(lX, l2=True):
|
||||
return {lang: normalize(np.asarray(X)) for lang, X in lX.items()} if l2 else lX
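A hedged end-to-end sketch of how the classes in this module compose (editorial addition; all data below is synthetic): the first-tier NaivePolylingualClassifier yields per-language posterior probabilities, and the MetaClassifier is then trained on their stacked, standardized representation.

import numpy as np

langs = ["en", "it"]
lX = {lang: np.random.rand(30, 10) for lang in langs}                 # toy feature matrices
lY = {lang: np.random.randint(0, 2, size=(30, 4)) for lang in langs}  # 4 binary labels

first_tier = NaivePolylingualClassifier(get_learner(calibrate=True), n_jobs=1)
first_tier.fit(lX, lY)
lP = first_tier.predict_proba(lX)       # {lang: (30, 4) posterior matrix}

meta = MetaClassifier(meta_learner=get_learner(calibrate=False), n_jobs=1)
meta.fit(lP, lY)                        # stacks languages, z-standardizes, fits one-vs-rest SVCs
l_pred = meta.predict(lP)               # {lang: (30, 4) label-indicator predictions}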
|
|
@ -0,0 +1,176 @@
|
|||
from os.path import expanduser, join
|
||||
|
||||
import torch
|
||||
import numpy as np
|
||||
from torchtext.vocab import Vectors
|
||||
from joblib import Parallel, delayed
|
||||
from vgfs.viewGen import ViewGen
|
||||
from vgfs.commons import _normalize, XdotM
|
||||
from vgfs.learners.svms import FeatureSet2Posteriors
|
||||
|
||||
|
||||
class MultilingualGen(ViewGen):
|
||||
def __init__(
|
||||
self,
|
||||
cached=False,
|
||||
langs=["en", "it"],
|
||||
embed_dir="~/embeddings",
|
||||
n_jobs=-1,
|
||||
probabilistic=False,
|
||||
):
|
||||
print("- init Multilingual View Generating Function")
|
||||
self.embed_dir = embed_dir
|
||||
self.langs = langs
|
||||
self.n_jobs = n_jobs
|
||||
self.cached = cached
|
||||
self.vectorizer = None
|
||||
self.sif = True
|
||||
self.probabilistic = probabilistic
|
||||
self.fitted = False
|
||||
self._init()
|
||||
|
||||
def _init(self):
|
||||
if self.probabilistic:
|
||||
self.feature2posterior_projector = FeatureSet2Posteriors(
|
||||
n_jobs=self.n_jobs, verbose=False
|
||||
)
|
||||
|
||||
def fit(self, lX, lY):
|
||||
"""
|
||||
Fitting the Multilingual View Generating Function consists in
|
||||
building/extracting the word embedding matrix for
|
||||
each language;
|
||||
"""
|
||||
print("- fitting Multilingual View Generating Function")
|
||||
self.l_vocab = self.vectorizer.vocabulary()
|
||||
self.multi_embeddings, self.langs = self._load_embeddings(
|
||||
self.embed_dir, self.cached
|
||||
)
|
||||
|
||||
if self.probabilistic:
|
||||
self.feature2posterior_projector.fit(self.transform(lX), lY)
|
||||
|
||||
self.fitted = True
|
||||
|
||||
return self
|
||||
|
||||
def transform(self, lX):
|
||||
lX = self.vectorizer.transform(lX)
|
||||
|
||||
XdotMulti = Parallel(n_jobs=self.n_jobs)(
|
||||
delayed(XdotM)(lX[lang], self.multi_embeddings[lang], sif=self.sif)
|
||||
for lang in self.langs
|
||||
)
|
||||
lZ = {lang: XdotMulti[i] for i, lang in enumerate(self.langs)}
|
||||
lZ = _normalize(lZ, l2=True)
|
||||
if self.probabilistic and self.fitted:
|
||||
lZ = self.feature2posterior_projector.transform(lZ)
|
||||
return lZ
|
||||
|
||||
def fit_transform(self, lX, lY):
|
||||
return self.fit(lX, lY).transform(lX)
|
||||
|
||||
def _load_embeddings(self, embed_dir, cached):
|
||||
if "muse" in self.embed_dir.lower():
|
||||
multi_embeddings = load_MUSEs(
|
||||
langs=self.langs,
|
||||
l_vocab=self.vectorizer.vocabulary(),
|
||||
dir_path=embed_dir,
|
||||
cached=cached,
|
||||
)
|
||||
return multi_embeddings, sorted(multi_embeddings.keys())
|
||||
|
||||
def get_config(self):
|
||||
return {
|
||||
"name": "Multilingual VGF",
|
||||
"embed_dir": self.embed_dir,
|
||||
"langs": self.langs,
|
||||
"n_jobs": self.n_jobs,
|
||||
"cached": self.cached,
|
||||
"sif": self.sif,
|
||||
"probabilistic": self.probabilistic,
|
||||
}
|
||||
|
||||
|
||||
def load_MUSEs(langs, l_vocab, dir_path, cached=False):
|
||||
dir_path = expanduser(dir_path)
|
||||
cached_dir = join(dir_path, "cached")
|
||||
nmax = 50000
|
||||
|
||||
l_embeddings = {}
|
||||
|
||||
for lang in langs:
|
||||
embed_path = f"wiki.multi.{lang}.vec"
|
||||
if cached:
|
||||
l_embeddings[lang] = Vectors(embed_path, cache=cached_dir)
|
||||
print(f"-- Loaded cached {lang} embeddings")
|
||||
else:
|
||||
(
|
||||
_embed_matrix,
|
||||
_,
|
||||
_,
|
||||
) = _load_vec(join(dir_path, embed_path), nmax)
|
||||
l_embeddings[lang] = _embed_matrix
|
||||
print(f"-- Loaded {nmax} {lang} embeddings")
|
||||
|
||||
# print("-- Extracting embeddings")
|
||||
l_embeddings = extract(l_vocab, l_embeddings)
|
||||
|
||||
return l_embeddings
|
||||
|
||||
|
||||
def _load_vec(emb_path, nmax=50000):
|
||||
import io
|
||||
|
||||
import numpy as np
|
||||
|
||||
vectors = []
|
||||
word2id = {}
|
||||
with io.open(emb_path, "r", encoding="utf-8", newline="\n", errors="ignore") as f:
|
||||
next(f)
|
||||
for i, line in enumerate(f):
|
||||
word, vect = line.rstrip().split(" ", 1)
|
||||
vect = np.fromstring(vect, sep=" ")
|
||||
assert word not in word2id, "word found twice"
|
||||
vectors.append(vect)
|
||||
word2id[word] = len(word2id)
|
||||
if len(word2id) == nmax:
|
||||
break
|
||||
id2word = {v: k for k, v in word2id.items()}
|
||||
embeddings = np.vstack(vectors)
|
||||
return embeddings, id2word, word2id
|
||||
|
||||
|
||||
def extract(l_voc, l_embeddings):
|
||||
"""
|
||||
Reindex the loaded pretrained embeddings so that they match the indexes
|
||||
assigned by scikit vectorizer. Such indexes are consistent with
|
||||
those used by Word Class Embeddings (since we deploy the same vectorizer)
|
||||
:param l_voc: dict {lang : {word : id}}
|
||||
:return: dict {lang: torch matrix of the extracted embeddings, i.e., of the words in l_voc}
|
||||
"""
|
||||
l_extracted = {}
|
||||
for lang, words in l_voc.items():
|
||||
source_id, target_id = reindex(words, l_embeddings[lang].stoi)
|
||||
extraction = torch.zeros((len(words), l_embeddings[lang].vectors.shape[-1]))
|
||||
extraction[source_id] = l_embeddings[lang].vectors[target_id]
|
||||
l_extracted[lang] = extraction
|
||||
return l_extracted
|
||||
|
||||
|
||||
def reindex(vectorizer_words, pretrained_word2index):
|
||||
if isinstance(vectorizer_words, dict):
|
||||
vectorizer_words = list(
|
||||
zip(*sorted(vectorizer_words.items(), key=lambda x: x[1]))
|
||||
)[0]
|
||||
|
||||
source_idx, target_idx = [], []
|
||||
for i, word in enumerate(vectorizer_words):
|
||||
if word not in pretrained_word2index:
|
||||
continue
|
||||
j = pretrained_word2index[word]
|
||||
source_idx.append(i)
|
||||
target_idx.append(j)
|
||||
source_idx = np.asarray(source_idx)
|
||||
target_idx = np.asarray(target_idx)
|
||||
return source_idx, target_idx
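A small worked sketch of reindex (editorial addition, toy vocabularies): it maps positions in the vectorizer vocabulary to rows of the pretrained embedding matrix, silently dropping out-of-vocabulary words.

# vectorizer vocabulary: word -> column index, as produced by the scikit-learn vectorizer
vectorizer_vocab = {"dog": 0, "cat": 1, "zebra": 2}
# pretrained vocabulary: word -> row index in the embedding matrix
pretrained_stoi = {"cat": 10, "dog": 42}

src, tgt = reindex(vectorizer_vocab, pretrained_stoi)
# src -> array([0, 1]): positions of "dog" and "cat" in the vectorizer vocabulary
# tgt -> array([42, 10]): the corresponding pretrained rows ("zebra" is skipped)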
|
|
@ -0,0 +1,390 @@
|
|||
import os
|
||||
|
||||
os.environ["TOKENIZERS_PARALLELISM"] = "true"
|
||||
|
||||
from collections import defaultdict
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
import transformers
|
||||
from sklearn.model_selection import train_test_split
|
||||
from torch.optim import AdamW
|
||||
from torch.utils.data import DataLoader, Dataset
|
||||
from transformers import AutoModelForSequenceClassification, AutoTokenizer
|
||||
from vgfs.learners.svms import FeatureSet2Posteriors
|
||||
|
||||
from evaluation.evaluate import evaluate, log_eval
|
||||
|
||||
transformers.logging.set_verbosity_error()
|
||||
|
||||
|
||||
# TODO: early stopping, checkpointing, logging, model loading
|
||||
# TODO: experiment name
|
||||
|
||||
|
||||
class TransformerGen:
|
||||
def __init__(
|
||||
self,
|
||||
model_name,
|
||||
epochs=10,
|
||||
lr=1e-5,
|
||||
batch_size=4,
|
||||
max_length=512,
|
||||
print_steps=50,
|
||||
device="cpu",
|
||||
probabilistic=False,
|
||||
n_jobs=-1,
|
||||
evaluate_step=10,
|
||||
verbose=False,
|
||||
patience=5,
|
||||
):
|
||||
self.model_name = model_name
|
||||
self.device = device
|
||||
self.model = None
|
||||
self.lr = lr
|
||||
self.epochs = epochs
|
||||
self.tokenizer = None
|
||||
self.max_length = max_length
|
||||
self.batch_size = batch_size
|
||||
self.print_steps = print_steps
|
||||
self.probabilistic = probabilistic
|
||||
self.n_jobs = n_jobs
|
||||
self.fitted = False
|
||||
self.datasets = {}
|
||||
self.evaluate_step = evaluate_step
|
||||
self.verbose = verbose
|
||||
self.patience = patience
|
||||
self._init()
|
||||
|
||||
def _init(self):
|
||||
if self.probabilistic:
|
||||
self.feature2posterior_projector = FeatureSet2Posteriors(
|
||||
n_jobs=self.n_jobs, verbose=False
|
||||
)
|
||||
self.model_name = self._get_model_name(self.model_name)
|
||||
print(
|
||||
f"- init TransformerModel model_name: {self.model_name}, device: {self.device}]"
|
||||
)
|
||||
|
||||
def _get_model_name(self, name):
|
||||
if "bert" == name:
|
||||
name_model = "bert-base-uncased"
|
||||
elif "mbert" == name:
|
||||
name_model = "bert-base-multilingual-uncased"
|
||||
elif "xlm" == name:
|
||||
name_model = "xlm-roberta-base"
|
||||
else:
|
||||
raise NotImplementedError
|
||||
return name_model
|
||||
|
||||
def load_pretrained_model(self, model_name, num_labels):
|
||||
return AutoModelForSequenceClassification.from_pretrained(
|
||||
model_name, num_labels=num_labels, output_hidden_states=True
|
||||
)
|
||||
|
||||
def load_tokenizer(self, model_name):
|
||||
return AutoTokenizer.from_pretrained(model_name)
|
||||
|
||||
def init_model(self, model_name, num_labels):
|
||||
return self.load_pretrained_model(model_name, num_labels), self.load_tokenizer(
|
||||
model_name
|
||||
)
|
||||
|
||||
def get_train_val_data(self, lX, lY, split=0.2, seed=42):
|
||||
tr_lX, tr_lY, val_lX, val_lY = {}, {}, {}, {}
|
||||
|
||||
for lang in lX.keys():
|
||||
tr_X, val_X, tr_Y, val_Y = train_test_split(
|
||||
lX[lang], lY[lang], test_size=split, random_state=seed, shuffle=False
|
||||
)
|
||||
tr_lX[lang] = tr_X
|
||||
tr_lY[lang] = tr_Y
|
||||
val_lX[lang] = val_X
|
||||
val_lY[lang] = val_Y
|
||||
|
||||
return tr_lX, tr_lY, val_lX, val_lY
|
||||
|
||||
def build_dataloader(self, lX, lY, batch_size, split="train", shuffle=True):
|
||||
l_tokenized = {lang: self._tokenize(data) for lang, data in lX.items()}
|
||||
self.datasets[split] = MultilingualDatasetTorch(l_tokenized, lY, split=split)
|
||||
return DataLoader(self.datasets[split], batch_size=batch_size, shuffle=shuffle)
|
||||
|
||||
def _tokenize(self, X):
|
||||
return self.tokenizer(
|
||||
X,
|
||||
return_tensors="pt",
|
||||
padding="max_length",
|
||||
truncation=True,
|
||||
max_length=self.max_length,
|
||||
)
|
||||
|
||||
def fit(self, lX, lY):
|
||||
if self.fitted:
|
||||
return self
|
||||
print("- fitting Transformer View Generating Function")
|
||||
_l = list(lX.keys())[0]
|
||||
self.num_labels = lY[_l].shape[-1]
|
||||
self.model, self.tokenizer = self.init_model(
|
||||
self.model_name, num_labels=self.num_labels
|
||||
)
|
||||
|
||||
tr_lX, tr_lY, val_lX, val_lY = self.get_train_val_data(
|
||||
lX, lY, split=0.2, seed=42
|
||||
)
|
||||
|
||||
tra_dataloader = self.build_dataloader(
|
||||
tr_lX, tr_lY, self.batch_size, split="train", shuffle=True
|
||||
)
|
||||
|
||||
val_dataloader = self.build_dataloader(
|
||||
val_lX, val_lY, self.batch_size, split="val", shuffle=False
|
||||
)
|
||||
|
||||
experiment_name = f"{self.model_name}-{self.epochs}-{self.batch_size}" # TODO: add more params
|
||||
trainer = Trainer(
|
||||
model=self.model,
|
||||
optimizer_name="adamW",
|
||||
lr=self.lr,
|
||||
device=self.device,
|
||||
loss_fn=torch.nn.CrossEntropyLoss(),
|
||||
print_steps=self.print_steps,
|
||||
evaluate_step=self.evaluate_step,
|
||||
patience=self.patience,
|
||||
experiment_name=experiment_name,
|
||||
)
|
||||
trainer.train(
|
||||
train_dataloader=tra_dataloader,
|
||||
eval_dataloader=val_dataloader,
|
||||
epochs=self.epochs,
|
||||
)
|
||||
|
||||
if self.probabilistic:
|
||||
self.feature2posterior_projector.fit(self.transform(lX), lY)
|
||||
|
||||
self.fitted = True
|
||||
|
||||
# self.save_vgf(path="models/vgf/transformers/")
|
||||
|
||||
return self
|
||||
|
||||
def transform(self, lX):
|
||||
_embeds = []
|
||||
l_embeds = defaultdict(list)
|
||||
|
||||
dataloader = self.build_dataloader(
|
||||
lX, lY=None, batch_size=self.batch_size, split="whole", shuffle=False
|
||||
)
|
||||
|
||||
self.model.eval()
|
||||
with torch.no_grad():
|
||||
for input_ids, lang in dataloader:
|
||||
input_ids = input_ids.to(self.device)
|
||||
out = self.model(input_ids).hidden_states[-1]
|
||||
batch_embeddings = out[:, 0, :].cpu().numpy()
|
||||
_embeds.append((batch_embeddings, lang))
|
||||
|
||||
for embed, lang in _embeds:
|
||||
for sample_embed, sample_lang in zip(embed, lang):
|
||||
l_embeds[sample_lang].append(sample_embed)
|
||||
|
||||
if self.probabilistic and self.fitted:
|
||||
l_embeds = self.feature2posterior_projector.transform(l_embeds)
|
||||
|
||||
return l_embeds
|
||||
|
||||
def fit_transform(self, lX, lY):
|
||||
return self.fit(lX, lY).transform(lX)
|
||||
|
||||
def save_vgf(self, path):
|
||||
print(f"- saving Transformer View Generating Function to {path}")
|
||||
return
|
||||
|
||||
def get_config(self):
|
||||
return {
|
||||
"name": "Transformer VGF",
|
||||
"model_name": self.model_name,
|
||||
"max_length": self.max_length,
|
||||
"batch_size": self.batch_size,
|
||||
"lr": self.lr,
|
||||
"epochs": self.epochs,
|
||||
"device": self.device,
|
||||
"print_steps": self.print_steps,
|
||||
"evaluate_step": self.evaluate_step,
|
||||
"patience": self.patience,
|
||||
"probabilistic": self.probabilistic,
|
||||
}
|
||||
|
||||
class MultilingualDatasetTorch(Dataset):
|
||||
def __init__(self, lX, lY, split="train"):
|
||||
self.lX = lX
|
||||
self.lY = lY
|
||||
self.split = split
|
||||
self.langs = []
|
||||
self.init()
|
||||
|
||||
def init(self):
|
||||
self.X = torch.vstack([data.input_ids for data in self.lX.values()])
|
||||
if self.split != "whole":
|
||||
self.Y = torch.vstack([torch.Tensor(data) for data in self.lY.values()])
|
||||
self.langs = sum(
|
||||
[
|
||||
v
|
||||
for v in {
|
||||
lang: [lang] * len(data.input_ids) for lang, data in self.lX.items()
|
||||
}.values()
|
||||
],
|
||||
[],
|
||||
)
|
||||
|
||||
return self
|
||||
|
||||
def __len__(self):
|
||||
return len(self.X)
|
||||
|
||||
def __getitem__(self, index):
|
||||
if self.split == "whole":
|
||||
return self.X[index], self.langs[index]
|
||||
return self.X[index], self.Y[index], self.langs[index]
|
||||
|
||||
|
||||
class Trainer:
|
||||
def __init__(
|
||||
self,
|
||||
model,
|
||||
optimizer_name,
|
||||
device,
|
||||
loss_fn,
|
||||
lr,
|
||||
print_steps,
|
||||
evaluate_step,
|
||||
patience,
|
||||
experiment_name,
|
||||
):
|
||||
self.device = device
|
||||
self.model = model.to(device)
|
||||
self.optimizer = self.init_optimizer(optimizer_name, lr)
|
||||
self.evaluate_steps = evaluate_step
|
||||
self.loss_fn = loss_fn.to(device)
|
||||
self.print_steps = print_steps
|
||||
self.earlystopping = EarlyStopping(
|
||||
patience=patience,
|
||||
checkpoint_path="models/vgfs/transformers/",
|
||||
verbose=True,
|
||||
experiment_name=experiment_name,
|
||||
)
|
||||
|
||||
def init_optimizer(self, optimizer_name, lr):
|
||||
if optimizer_name.lower() == "adamw":
|
||||
return AdamW(self.model.parameters(), lr=lr)
|
||||
else:
|
||||
raise ValueError(f"Optimizer {optimizer_name} not supported")
|
||||
|
||||
def train(self, train_dataloader, eval_dataloader, epochs=10):
|
||||
print(
|
||||
f"""- Training params:
|
||||
- epochs: {epochs}
|
||||
- learning rate: {self.optimizer.defaults['lr']}
|
||||
- train batch size: {train_dataloader.batch_size}
|
||||
- eval batch size: {'TODO'}
|
||||
- max len: {train_dataloader.dataset.X.shape[-1]}\n""",
|
||||
)
|
||||
for epoch in range(epochs):
|
||||
self.train_epoch(train_dataloader, epoch)
|
||||
if (epoch + 1) % self.evaluate_steps == 0:
|
||||
metric_watcher = self.evaluate(eval_dataloader)
|
||||
stop = self.earlystopping(metric_watcher, self.model, epoch + 1)
|
||||
if stop:
|
||||
break
|
||||
return self.model
|
||||
|
||||
def train_epoch(self, dataloader, epoch):
|
||||
self.model.train()
|
||||
for b_idx, (x, y, lang) in enumerate(dataloader):
|
||||
self.optimizer.zero_grad()
|
||||
y_hat = self.model(x.to(self.device))
|
||||
loss = self.loss_fn(y_hat.logits, y.to(self.device))
|
||||
loss.backward()
|
||||
self.optimizer.step()
|
||||
if b_idx % self.print_steps == 0:
|
||||
print(f"Epoch: {epoch+1} Step: {b_idx+1} Loss: {loss:.4f}")
|
||||
return self
|
||||
|
||||
def evaluate(self, dataloader):
|
||||
self.model.eval()
|
||||
|
||||
lY = defaultdict(list)
|
||||
lY_hat = defaultdict(list)
|
||||
|
||||
for b_idx, (x, y, lang) in enumerate(dataloader):
|
||||
y_hat = self.model(x.to(self.device))
|
||||
loss = self.loss_fn(y_hat.logits, y.to(self.device))
|
||||
predictions = predict(y_hat.logits, classification_type="multilabel")
|
||||
|
||||
for l, _true, _pred in zip(lang, y, predictions):
|
||||
lY[l].append(_true.detach().cpu().numpy())
|
||||
lY_hat[l].append(_pred)
|
||||
|
||||
for lang in lY:
|
||||
lY[lang] = np.vstack(lY[lang])
|
||||
lY_hat[lang] = np.vstack(lY_hat[lang])
|
||||
|
||||
l_eval = evaluate(lY, lY_hat)
|
||||
average_metrics = log_eval(l_eval, phase="validation")
|
||||
return average_metrics[0] # macro-F1
|
||||
|
||||
|
||||
class EarlyStopping:
|
||||
def __init__(
|
||||
self,
|
||||
patience=5,
|
||||
min_delta=0,
|
||||
verbose=True,
|
||||
checkpoint_path="checkpoint.pt",
|
||||
experiment_name="experiment",
|
||||
):
|
||||
self.patience = patience
|
||||
self.min_delta = min_delta
|
||||
self.counter = 0
|
||||
self.best_score = 0
|
||||
self.best_epoch = None
|
||||
self.verbose = verbose
|
||||
self.checkpoint_path = checkpoint_path
|
||||
self.experiment_name = experiment_name
|
||||
|
||||
def __call__(self, validation, model, epoch):
|
||||
if validation > self.best_score:
|
||||
print(
|
||||
f"- earlystopping: Validation score improved from {self.best_score:.3f} to {validation:.3f}"
|
||||
)
|
||||
self.best_score = validation
|
||||
self.counter = 0
|
||||
# self.save_model(model)
|
||||
elif validation < (self.best_score + self.min_delta):
|
||||
self.counter += 1
|
||||
print(
|
||||
f"- earlystopping: Validation score decreased from {self.best_score:.3f} to {validation:.3f}, current patience: {self.patience - self.counter}"
|
||||
)
|
||||
if self.counter >= self.patience:
|
||||
if self.verbose:
|
||||
print(f"- earlystopping: Early stopping at epoch {epoch}")
|
||||
return True
|
||||
|
||||
def save_model(self, model):
|
||||
_checkpoint_dir = os.path.join(self.checkpoint_path, self.experiment_name)
|
||||
print(f"- saving model to {_checkpoint_dir}")
|
||||
os.makedirs(_checkpoint_dir, exist_ok=True)
|
||||
model.save_pretrained(_checkpoint_dir)
|
||||
|
||||
|
||||
def predict(logits, classification_type="multilabel"):
|
||||
"""
|
||||
Converts soft predictions (logits) to hard predictions in {0, 1}
|
||||
"""
|
||||
if classification_type == "multilabel":
|
||||
prediction = torch.sigmoid(logits) > 0.5
|
||||
elif classification_type == "singlelabel":
|
||||
prediction = torch.argmax(logits, dim=1).view(-1, 1)
|
||||
else:
|
||||
print("unknown classification type")
|
||||
|
||||
return prediction.detach().cpu().numpy()
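A quick illustrative check of predict above (editorial addition): in the multilabel case each logit is thresholded independently at sigmoid(logit) > 0.5, i.e., at logit > 0.

import torch

logits = torch.tensor([[2.0, -1.0, 0.3], [-0.2, 0.0, 4.0]])
hard = predict(logits, classification_type="multilabel")
# hard -> [[ True False  True]
#          [False False  True]]   (sigmoid(0.0) == 0.5 is not strictly greater than 0.5)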
|
|
@ -0,0 +1,59 @@
|
|||
from vgfs.viewGen import ViewGen
|
||||
from vgfs.learners.svms import NaivePolylingualClassifier
|
||||
from vgfs.commons import _normalize
|
||||
|
||||
|
||||
class VanillaFunGen(ViewGen):
|
||||
"""
|
||||
View Generator (x): original funnelling architecture proposed by Moreo, Esuli and
|
||||
Sebastiani in DOI: https://doi.org/10.1145/3326065
|
||||
"""
|
||||
|
||||
def __init__(self, base_learner, first_tier_parameters=None, n_jobs=-1):
|
||||
"""
|
||||
Init Posterior Probabilities embedder (i.e., VanillaFunGen)
|
||||
:param base_learner: naive monolingual learners to be deployed as first-tier
|
||||
learners. Should be able to return posterior probabilities.
|
||||
:param first_tier_parameters: optional parameter grid for the first-tier learners
|
||||
:param n_jobs: integer, number of concurrent workers
|
||||
"""
|
||||
print("- init VanillaFun View Generating Function")
|
||||
self.learners = base_learner
|
||||
self.first_tier_parameters = first_tier_parameters
|
||||
self.n_jobs = n_jobs
|
||||
self.doc_projector = NaivePolylingualClassifier(
|
||||
base_learner=self.learners,
|
||||
parameters=self.first_tier_parameters,
|
||||
n_jobs=self.n_jobs,
|
||||
)
|
||||
self.vectorizer = None
|
||||
|
||||
def fit(self, lX, lY):
|
||||
print("- fitting VanillaFun View Generating Function")
|
||||
lX = self.vectorizer.transform(lX)
|
||||
self.doc_projector.fit(lX, lY)
|
||||
return self
|
||||
|
||||
def transform(self, lX):
|
||||
"""
|
||||
(1) Vectorize documents;
|
||||
(2) Project them according to the first-tier SVMs;
|
||||
(3) Apply L2 normalization to the projection and return it.
|
||||
:param lX: dict {lang: indexed documents}
|
||||
:return: document projection to the common latent space.
|
||||
"""
|
||||
lX = self.vectorizer.transform(lX)
|
||||
lZ = self.doc_projector.predict_proba(lX)
|
||||
lZ = _normalize(lZ, l2=True)
|
||||
return lZ
|
||||
|
||||
def fit_transform(self, lX, lY):
|
||||
return self.fit(lX, lY).transform(lX)
|
||||
|
||||
def get_config(self):
|
||||
return {
|
||||
"name": "VanillaFunnelling VGF",
|
||||
"base_learner": self.learners,
|
||||
"first_tier_parameters": self.first_tier_parameters,
|
||||
"n_jobs": self.n_jobs,
|
||||
}
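A hedged construction sketch (editorial addition): the vectorizer attribute is left as None above and is assumed to be injected by the surrounding GeneralizedFunnelling pipeline; the stand-in _IdentityVectorizer and the toy matrices below are assumptions made only for illustration.

import numpy as np
from vgfs.learners.svms import get_learner

class _IdentityVectorizer:
    # stand-in for the shared multilingual vectorizer normally set by the pipeline
    def transform(self, lX):
        return lX

langs = ["en", "it"]
lX = {lang: np.random.rand(30, 10) for lang in langs}
lY = {lang: np.random.randint(0, 2, size=(30, 4)) for lang in langs}

vgf = VanillaFunGen(base_learner=get_learner(calibrate=True), n_jobs=1)
vgf.vectorizer = _IdentityVectorizer()
lZ = vgf.fit_transform(lX, lY)   # {lang: L2-normalized (30, 4) posterior matrix}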
|
|
@ -0,0 +1,20 @@
|
|||
from abc import ABC, abstractmethod
|
||||
|
||||
|
||||
class ViewGen(ABC):
|
||||
"""
|
||||
Abstract base class for View Generating Function (VGF) implementations. Every ViewGen should implement these three methods in order to
|
||||
be seamlessly integrated in the overall architecture.
|
||||
"""
|
||||
|
||||
@abstractmethod
|
||||
def fit(self, lX, lY):
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
def transform(self, lX):
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
def fit_transform(self, lX, lY):
|
||||
pass
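A minimal illustrative subclass (editorial addition, not part of the commit) showing the contract these abstract methods impose: fit learns per-language state, transform maps {lang: X} to {lang: Z}, and fit_transform chains the two.

import numpy as np

class IdentityViewGen(ViewGen):
    # toy VGF that returns the (dense) input features unchanged
    def fit(self, lX, lY):
        self.langs = sorted(lX.keys())
        return self

    def transform(self, lX):
        return {lang: np.asarray(X) for lang, X in lX.items()}

    def fit_transform(self, lX, lY):
        return self.fit(lX, lY).transform(lX)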
|
|
@ -0,0 +1,66 @@
|
|||
import numpy as np
|
||||
from joblib import Parallel, delayed
|
||||
from vgfs.commons import XdotM, _normalize
|
||||
from vgfs.viewGen import ViewGen
|
||||
|
||||
|
||||
class WceGen(ViewGen):
|
||||
def __init__(self, n_jobs=-1):
|
||||
print("- init Word-Class-Embeddings View Generating Function")
|
||||
self.n_jobs = n_jobs
|
||||
self.sif = True
|
||||
|
||||
def fit(self, lX, lY):
|
||||
print("- fitting Word-Class-Embeddings View Generating Function")
|
||||
lX = self.vectorizer.transform(lX)
|
||||
self.langs = sorted(lX.keys())
|
||||
wce = Parallel(n_jobs=self.n_jobs)(
|
||||
delayed(wce_matrix)(lX[lang], lY[lang]) for lang in self.langs
|
||||
)
|
||||
self.l_wce = {lang: wce[i] for i, lang in enumerate(self.langs)}
|
||||
return self
|
||||
|
||||
def transform(self, lX):
|
||||
lX = self.vectorizer.transform(lX)
|
||||
XdotWce = Parallel(n_jobs=self.n_jobs)(
|
||||
delayed(XdotM)(lX[lang], self.l_wce[lang], sif=self.sif)
|
||||
for lang in self.langs
|
||||
)
|
||||
lZ = {l: XdotWce[i] for i, l in enumerate(self.langs)}
|
||||
lZ = _normalize(lZ, l2=True)
|
||||
return lZ
|
||||
|
||||
def fit_transform(self, lX, lY):
|
||||
return self.fit(lX, lY).transform(lX)
|
||||
|
||||
def get_config(self):
|
||||
return {
|
||||
"name": "Word-Class Embeddings VGF",
|
||||
"n_jobs": self.n_jobs,
|
||||
"sif": self.sif,
|
||||
}
|
||||
|
||||
|
||||
def wce_matrix(X, Y):
|
||||
wce = supervised_embeddings_tfidf(X, Y)
|
||||
wce = zscores(wce, axis=0)
|
||||
return wce
|
||||
|
||||
|
||||
def supervised_embeddings_tfidf(X, Y):
|
||||
tfidf_norm = X.sum(axis=0)
|
||||
tfidf_norm[tfidf_norm == 0] = 1
|
||||
F = (X.T).dot(Y) / tfidf_norm.T
|
||||
return np.asarray(F)
|
||||
|
||||
|
||||
def zscores(X, axis=0):
|
||||
"""
|
||||
scipy.stats.zscore does not avoid division by 0, which can indeed occur here
|
||||
:param X:
|
||||
:param axis:
|
||||
:return:
|
||||
"""
|
||||
std = np.clip(np.std(X, ddof=1, axis=axis), 1e-5, None)
|
||||
mean = np.mean(X, axis=axis)
|
||||
return (X - mean) / std
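A tiny worked example of the supervised embedding above (editorial addition, toy data): entry (t, c) of F is the label mass of class c among the documents containing term t, normalized by the term's total tf-idf mass.

import numpy as np
from scipy.sparse import csr_matrix

# 3 documents x 2 terms (tf-idf-like weights) and 3 documents x 2 classes (binary labels)
X = csr_matrix(np.array([[1.0, 0.0],
                         [1.0, 1.0],
                         [0.0, 1.0]]))
Y = np.array([[1, 0],
              [1, 0],
              [0, 1]])

F = supervised_embeddings_tfidf(X, Y)
# column sums of X are [2, 2] and X.T @ Y = [[2, 0], [1, 1]], hence
# F = [[1.0, 0.0],
#      [0.5, 0.5]]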
|
|
@ -0,0 +1,128 @@
|
|||
from os.path import expanduser
|
||||
from argparse import ArgumentParser
|
||||
|
||||
from dataManager.multiNewsDataset import MultiNewsDataset
|
||||
from dataManager.amazonDataset import AmazonDataset
|
||||
from dataManager.multilingualDatset import MultilingualDataset
|
||||
|
||||
from gfun.generalizedFunnelling import GeneralizedFunnelling
|
||||
|
||||
from evaluation.evaluate import evaluate, log_eval
|
||||
|
||||
from time import time
|
||||
import pickle
|
||||
|
||||
|
||||
# TODO: a cleaner way to save the model?
|
||||
|
||||
|
||||
def main(args):
|
||||
# Loading dataset ------------------------
|
||||
RCV_DATAPATH = expanduser(
|
||||
"~/datasets/rcv1-2/rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle"
|
||||
)
|
||||
# dataset = MultiNewsDataset(expanduser(args.dataset_path))
|
||||
# dataset = AmazonDataset(domains=args.domains,nrows=args.nrows,min_count=args.min_count,max_labels=args.max_labels)
|
||||
dataset = (
|
||||
MultilingualDataset(dataset_name="rcv1-2")
|
||||
.load(RCV_DATAPATH)
|
||||
.reduce_data(langs=["en", "it", "fr"], maxn=250)
|
||||
)
|
||||
|
||||
if isinstance(dataset, MultilingualDataset):
|
||||
lX, lY = dataset.training()
|
||||
lX_te, lY_te = dataset.test()
|
||||
else:
|
||||
_lX = dataset.dX
|
||||
_lY = dataset.dY
|
||||
# ----------------------------------------
|
||||
|
||||
tinit = time()
|
||||
|
||||
if args.load_pretrained is None:
|
||||
assert any(
|
||||
[
|
||||
args.posteriors,
|
||||
args.wce,
|
||||
args.multilingual,
|
||||
args.transformer,
|
||||
]
|
||||
), "At least one of VGF must be True"
|
||||
|
||||
gfun = GeneralizedFunnelling(
|
||||
posterior=args.posteriors,
|
||||
multilingual=args.multilingual,
|
||||
wce=args.wce,
|
||||
transformer=args.transformer,
|
||||
langs=dataset.langs(),
|
||||
embed_dir="~/resources/muse_embeddings",
|
||||
n_jobs=args.n_jobs,
|
||||
max_length=args.max_length,
|
||||
batch_size=args.batch_size,
|
||||
epochs=args.epochs,
|
||||
lr=args.lr,
|
||||
patience=args.patience,
|
||||
evaluate_step=args.evaluate_step,
|
||||
transformer_name=args.transformer_name,
|
||||
)
|
||||
|
||||
gfun.get_config()
|
||||
|
||||
gfun.fit(lX, lY)
|
||||
|
||||
# Saving Model ------------------------
|
||||
with open("models/gfun/gfun_model.pkl", "wb") as f:
|
||||
print(f"- saving model to {f.name}")
|
||||
pickle.dump(gfun, f)
|
||||
# -------------------------------------
|
||||
|
||||
preds = gfun.transform(lX)
|
||||
|
||||
train_eval = evaluate(lY, preds)
|
||||
log_eval(train_eval, phase="train")
|
||||
|
||||
timetr = time()
|
||||
print(f"- training completed in {timetr - tinit:.2f} seconds")
|
||||
|
||||
# Loading Model ------------------------
|
||||
if args.load_pretrained is not None:
|
||||
with open("models/gfun/gfun_model.pkl", "rb") as f:
|
||||
print(f"- loading model from {f.name}")
|
||||
gfun = pickle.load(f)
|
||||
timetr = time()
|
||||
# --------------------------------------
|
||||
|
||||
test_eval = evaluate(lY_te, gfun.transform(lX_te))
|
||||
log_eval(test_eval, phase="test")
|
||||
|
||||
timeval = time()
|
||||
print(f"- testing completed in {timeval - timetr:.2f} seconds")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
parser = ArgumentParser()
|
||||
parser.add_argument("--load_pretrained", type=str, default=None)
|
||||
# Dataset parameters -------------------
|
||||
parser.add_argument("--domains", type=str, default="all")
|
||||
parser.add_argument("--nrows", type=int, default=10000)
|
||||
parser.add_argument("--min_count", type=int, default=10)
|
||||
parser.add_argument("--max_labels", type=int, default=50)
|
||||
# gFUN parameters ----------------------
|
||||
parser.add_argument("-p", "--posteriors", action="store_true")
|
||||
parser.add_argument("-m", "--multilingual", action="store_true")
|
||||
parser.add_argument("-w", "--wce", action="store_true")
|
||||
parser.add_argument("-t", "--transformer", action="store_true")
|
||||
parser.add_argument("--n_jobs", type=int, default=1)
|
||||
# transformer parameters ---------------
|
||||
parser.add_argument("--transformer_name", type=str, default="mbert")
|
||||
parser.add_argument("--batch_size", type=int, default=32)
|
||||
parser.add_argument("--epochs", type=int, default=10)
|
||||
parser.add_argument("--lr", type=float, default=1e-5)
|
||||
parser.add_argument("--max_length", type=int, default=512)
|
||||
parser.add_argument("--patience", type=int, default=5)
|
||||
parser.add_argument("--evaluate_step", type=int, default=10)
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
main(args)
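For reference, a hedged example of how this entry point might be invoked (the script name and the flag combination are assumptions; the RCV1/RCV2 pickle path is hard-coded above):

# python main.py -p -m -w --n_jobs 4 --epochs 5 --transformer_name mbert
#   enables the posteriors (-p), MUSE multilingual (-m) and WCE (-w) VGFs;
#   add -t to also train the transformer VGF, or pass --load_pretrained <anything>
#   to reload the previously pickled model instead of training a new one.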
|
|
@ -0,0 +1,60 @@
|
|||
import matplotlib.pyplot as plt
|
||||
import datetime
|
||||
|
||||
|
||||
def plot_distribution(
|
||||
x,
|
||||
y,
|
||||
labels,
|
||||
title,
|
||||
figsize=(10, 5),
|
||||
logscale=False,
|
||||
notes="",
|
||||
max_labels=-1,
|
||||
save=False,
|
||||
path=None,
|
||||
):
|
||||
# sort values and labels accordingly
|
||||
y, labels = zip(*sorted(zip(y, labels), reverse=True))
|
||||
|
||||
if max_labels != -1:
|
||||
x = x[:max_labels]
|
||||
y = y[:max_labels]
|
||||
labels = labels[:max_labels]
|
||||
|
||||
plt.figure(figsize=figsize)
|
||||
plt.bar(x, y)
|
||||
plt.xticks(x, labels, rotation=90)
|
||||
|
||||
_title = title if len(notes) == 0 else f"{title} - {notes}"
if max_labels != -1:
_title += f" - Showing {max_labels} top labels"
|
||||
|
||||
plt.title(_title)
|
||||
|
||||
if logscale:
|
||||
plt.yscale("symlog")
|
||||
|
||||
plt.tight_layout()
|
||||
|
||||
# plt.show()
|
||||
if save:
|
||||
now = datetime.datetime.now()
|
||||
path = f"{path}/{title}_{now.strftime('%m%d_%H%M')}.png"
|
||||
plt.savefig(path)
|
||||
plt.close()
|
||||
|
||||
|
||||
def plot_histogram(x, title, figsize=(10, 5), save=False, path=None):
|
||||
plt.figure(figsize=figsize)
|
||||
plt.hist(x)
|
||||
# plt.xticks(x, lables, rotation=90)
|
||||
plt.yscale("symlog")
|
||||
plt.title(title)
|
||||
# plt.show()
|
||||
if save:
|
||||
now = datetime.datetime.now()
|
||||
path = f"{path}/{title}_{now.strftime('%m%d_%H%M')}.png"
|
||||
plt.savefig(path)
|
||||
plt.close()
|