
trying stuff with multilabels

Alejandro Moreo Fernandez 2021-07-05 19:17:29 +02:00
commit a4fea89122
25 changed files with 2222 additions and 228 deletions

0
MultiLabel/data/__init__.py Executable file

229
MultiLabel/data/dataset.py Executable file

@@ -0,0 +1,229 @@
import os,sys
from sklearn.datasets import get_data_home, fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
from jrcacquis_reader import fetch_jrcacquis, JRCAcquis_Document
from ohsumed_reader import fetch_ohsumed50k
from reuters21578_reader import fetch_reuters21578
from rcv_reader import fetch_RCV1
from wipo_reader import fetch_WIPOgamma, WipoGammaDocument
import pickle
import numpy as np
from tqdm import tqdm
from os.path import join
import re
def init_vectorizer():
return TfidfVectorizer(min_df=5, sublinear_tf=True)
class Dataset:
dataset_available = {'reuters21578', '20newsgroups', 'ohsumed', 'rcv1', 'jrcall',
'wipo-sl-mg','wipo-ml-mg','wipo-sl-sc','wipo-ml-sc'}
def __init__(self, name):
assert name in Dataset.dataset_available, f'dataset {name} is not available'
if name=='reuters21578':
self._load_reuters()
elif name == '20newsgroups':
self._load_20news()
elif name == 'rcv1':
self._load_rcv1()
elif name == 'ohsumed':
self._load_ohsumed()
elif name == 'jrcall':
self._load_jrc(version='all')
elif name == 'wipo-sl-mg':
self._load_wipo('singlelabel', 'maingroup')
elif name == 'wipo-ml-mg':
self._load_wipo('multilabel', 'maingroup')
elif name == 'wipo-sl-sc':
self._load_wipo('singlelabel', 'subclass')
elif name == 'wipo-ml-sc':
self._load_wipo('multilabel', 'subclass')
self.nC = self.devel_labelmatrix.shape[1]
self._vectorizer = init_vectorizer()
self._vectorizer.fit(self.devel_raw)
self.vocabulary = self._vectorizer.vocabulary_
def show(self):
nTr_docs = len(self.devel_raw)
nTe_docs = len(self.test_raw)
nfeats = len(self._vectorizer.vocabulary_)
nC = self.devel_labelmatrix.shape[1]
nD=nTr_docs+nTe_docs
print(f'{self.classification_type}, nD={nD}=({nTr_docs}+{nTe_docs}), nF={nfeats}, nC={nC}')
return self
def _load_reuters(self):
data_path = os.path.join(get_data_home(), 'reuters21578')
devel = fetch_reuters21578(subset='train', data_path=data_path)
test = fetch_reuters21578(subset='test', data_path=data_path)
self.classification_type = 'multilabel'
self.devel_raw, self.test_raw = mask_numbers(devel.data), mask_numbers(test.data)
self.devel_labelmatrix, self.test_labelmatrix = _label_matrix(devel.target, test.target)
self.devel_target, self.test_target = self.devel_labelmatrix, self.test_labelmatrix
def _load_rcv1(self):
data_path = '../datasets/RCV1-v2/unprocessed_corpus' #TODO: check when missing
devel = fetch_RCV1(subset='train', data_path=data_path)
test = fetch_RCV1(subset='test', data_path=data_path)
self.classification_type = 'multilabel'
self.devel_raw, self.test_raw = mask_numbers(devel.data), mask_numbers(test.data)
self.devel_labelmatrix, self.test_labelmatrix = _label_matrix(devel.target, test.target)
self.devel_target, self.test_target = self.devel_labelmatrix, self.test_labelmatrix
def _load_jrc(self, version):
assert version in ['300','all'], 'allowed versions are "300" or "all"'
data_path = "../datasets/JRC_Acquis_v3"
tr_years=list(range(1986, 2006))
te_years=[2006]
if version=='300':
training_docs, tr_cats = fetch_jrcacquis(data_path=data_path, years=tr_years, cat_threshold=1,most_frequent=300)
test_docs, te_cats = fetch_jrcacquis(data_path=data_path, years=te_years, cat_filter=tr_cats)
else:
training_docs, tr_cats = fetch_jrcacquis(data_path=data_path, years=tr_years, cat_threshold=1)
test_docs, te_cats = fetch_jrcacquis(data_path=data_path, years=te_years, cat_filter=tr_cats)
print(f'load jrc-acquis (English) with {len(tr_cats)} tr categories ({len(te_cats)} te categories)')
devel_data = JRCAcquis_Document.get_text(training_docs)
test_data = JRCAcquis_Document.get_text(test_docs)
devel_target = JRCAcquis_Document.get_target(training_docs)
test_target = JRCAcquis_Document.get_target(test_docs)
self.classification_type = 'multilabel'
self.devel_raw, self.test_raw = mask_numbers(devel_data), mask_numbers(test_data)
self.devel_labelmatrix, self.test_labelmatrix = _label_matrix(devel_target, test_target)
self.devel_target, self.test_target = self.devel_labelmatrix, self.test_labelmatrix
def _load_ohsumed(self):
data_path = os.path.join(get_data_home(), 'ohsumed50k')
devel = fetch_ohsumed50k(subset='train', data_path=data_path)
test = fetch_ohsumed50k(subset='test', data_path=data_path)
self.classification_type = 'multilabel'
self.devel_raw, self.test_raw = mask_numbers(devel.data), mask_numbers(test.data)
self.devel_labelmatrix, self.test_labelmatrix = _label_matrix(devel.target, test.target)
self.devel_target, self.test_target = self.devel_labelmatrix, self.test_labelmatrix
def _load_20news(self):
metadata = ('headers', 'footers', 'quotes')
devel = fetch_20newsgroups(subset='train', remove=metadata)
test = fetch_20newsgroups(subset='test', remove=metadata)
self.classification_type = 'singlelabel'
self.devel_raw, self.test_raw = mask_numbers(devel.data), mask_numbers(test.data)
self.devel_target, self.test_target = devel.target, test.target
self.devel_labelmatrix, self.test_labelmatrix = _label_matrix(self.devel_target.reshape(-1,1), self.test_target.reshape(-1,1))
def _load_fasttext_data(self,name):
data_path='../datasets/fastText'
self.classification_type = 'singlelabel'
name=name.replace('-','_')
train_file = join(data_path,f'{name}.train')
assert os.path.exists(train_file), f'file {name} not found, please place the fasttext data in {data_path}' #' or specify the path' #todo
self.devel_raw, self.devel_target = load_fasttext_format(train_file)
self.test_raw, self.test_target = load_fasttext_format(join(data_path, f'{name}.test'))
self.devel_raw = mask_numbers(self.devel_raw)
self.test_raw = mask_numbers(self.test_raw)
self.devel_labelmatrix, self.test_labelmatrix = _label_matrix(self.devel_target.reshape(-1, 1), self.test_target.reshape(-1, 1))
def _load_wipo(self, classmode, classlevel):
assert classmode in {'singlelabel', 'multilabel'}, 'available class_mode are sl (single-label) or ml (multi-label)'
data_path = '../datasets/WIPO/wipo-gamma/en'
data_proc = '../datasets/WIPO-extracted'
devel = fetch_WIPOgamma(subset='train', classification_level=classlevel, data_home=data_path, extracted_path=data_proc, text_fields=['abstract'])
test = fetch_WIPOgamma(subset='test', classification_level=classlevel, data_home=data_path, extracted_path=data_proc, text_fields=['abstract'])
devel_data = [d.text for d in devel]
test_data = [d.text for d in test]
self.devel_raw, self.test_raw = mask_numbers(devel_data), mask_numbers(test_data)
self.classification_type = classmode
if classmode== 'multilabel':
devel_target = [d.all_labels for d in devel]
test_target = [d.all_labels for d in test]
self.devel_labelmatrix, self.test_labelmatrix = _label_matrix(devel_target, test_target)
self.devel_target, self.test_target = self.devel_labelmatrix, self.test_labelmatrix
else:
devel_target = [d.main_label for d in devel]
test_target = [d.main_label for d in test]
# only for labels with at least one training document
class_id = {labelname:index for index,labelname in enumerate(sorted(set(devel_target)))}
devel_target = np.array([class_id[id] for id in devel_target]).astype(int)
test_target = np.array([class_id.get(id,None) for id in test_target])
if None in test_target:
print(f'deleting {(test_target==None).sum()} test documents without valid categories')
keep_pos = test_target!=None
self.test_raw = (np.asarray(self.test_raw)[keep_pos]).tolist()
test_target = test_target[keep_pos]
test_target=test_target.astype(int)
self.devel_target, self.test_target = devel_target, test_target
self.devel_labelmatrix, self.test_labelmatrix = _label_matrix(self.devel_target.reshape(-1, 1), self.test_target.reshape(-1, 1))
def vectorize(self):
if not hasattr(self, 'Xtr') or not hasattr(self, 'Xte'):
self.Xtr = self._vectorizer.transform(self.devel_raw)
self.Xte = self._vectorizer.transform(self.test_raw)
self.Xtr.sort_indices()
self.Xte.sort_indices()
return self.Xtr, self.Xte
def analyzer(self):
return self._vectorizer.build_analyzer()
@classmethod
def load(cls, dataset_name, pickle_path=None):
if pickle_path:
if os.path.exists(pickle_path):
print(f'loading pickled dataset from {pickle_path}')
dataset = pickle.load(open(pickle_path, 'rb'))
else:
print(f'fetching dataset and dumping it into {pickle_path}')
dataset = Dataset(name=dataset_name)
print('vectorizing for faster processing')
dataset.vectorize()
print('dumping')
pickle.dump(dataset, open(pickle_path, 'wb'), protocol=pickle.HIGHEST_PROTOCOL)
else:
print(f'loading dataset {dataset_name}')
dataset = Dataset(name=dataset_name)
print('[Done]')
return dataset
def _label_matrix(tr_target, te_target):
mlb = MultiLabelBinarizer(sparse_output=True)
ytr = mlb.fit_transform(tr_target)
yte = mlb.transform(te_target)
print(mlb.classes_)
return ytr, yte
def load_fasttext_format(path):
print(f'loading {path}')
labels,docs=[],[]
for line in tqdm(open(path, 'rt').readlines()):
space = line.strip().find(' ')
label = int(line[:space].replace('__label__',''))-1
labels.append(label)
docs.append(line[space+1:])
labels=np.asarray(labels,dtype=int)
return docs,labels
def mask_numbers(data, number_mask='numbermask'):
mask = re.compile(r'\b[0-9][0-9.,-]*\b')
masked = []
for text in tqdm(data, desc='masking numbers'):
masked.append(mask.sub(number_mask, text))
return masked
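# --- usage sketch (not part of the original commit) ---
# Minimal example of how this module is meant to be driven, mirroring the call made later in
# MultiLabel/multi_label.py; the pickle path is an assumption (the ./pickles folder must exist)
# and only serves as a cache for the vectorized dataset.
if __name__ == '__main__':
    dataset = Dataset.load('reuters21578', pickle_path='./pickles/reuters21578.pickle')
    dataset.show()
    Xtr, Xte = dataset.vectorize()
    print(f'Xtr: {Xtr.shape}, Xte: {Xte.shape}')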

263
MultiLabel/data/jrcacquis_reader.py Executable file

@@ -0,0 +1,263 @@
import os, sys
from os.path import join
import tarfile
import xml.etree.ElementTree as ET
from sklearn.datasets import get_data_home
import pickle
import rdflib
from rdflib.namespace import RDF, SKOS
from rdflib import URIRef
import zipfile
from collections import Counter
from tqdm import tqdm
from random import shuffle
from util.file import *
class JRCAcquis_Document:
def __init__(self, id, name, lang, year, head, body, categories):
self.id = id
self.parallel_id = name
self.lang = lang
self.year = year
self.text = body if not head else head + "\n" + body
self.categories = categories
@classmethod
def get_text(cls, jrc_documents):
return [d.text for d in jrc_documents]
@classmethod
def get_target(cls, jrc_documents):
return [d.categories for d in jrc_documents]
# this is a workaround... for some reason, acute accents are encoded in a non-standard manner in titles;
# however, the title often appears as the first paragraph of the text/body (with standard encoding),
# so it might be preferable not to read the header at all (as done here by default)
def _proc_acute(text):
for ch in ['a','e','i','o','u']:
text = text.replace('%'+ch+'acute%',ch)
return text
def parse_document(file, year, head=False):
root = ET.parse(file).getroot()
doc_name = root.attrib['n'] # e.g., '22006A0211(01)'
doc_lang = root.attrib['lang'] # e.g., 'es'
doc_id = root.attrib['id'] # e.g., 'jrc22006A0211_01-es'
doc_categories = [cat.text for cat in root.findall('.//teiHeader/profileDesc/textClass/classCode[@scheme="eurovoc"]')]
doc_head = _proc_acute(root.find('.//text/body/head').text) if head else ''
doc_body = '\n'.join([p.text for p in root.findall('.//text/body/div[@type="body"]/p')])
def raise_if_empty(field, from_file):
if isinstance(field, str):
if not field.strip():
raise ValueError("Empty field in file %s" % from_file)
raise_if_empty(doc_name, file)
raise_if_empty(doc_lang, file)
raise_if_empty(doc_id, file)
if head: raise_if_empty(doc_head, file)
raise_if_empty(doc_body, file)
return JRCAcquis_Document(id=doc_id, name=doc_name, lang=doc_lang, year=year, head=doc_head, body=doc_body, categories=doc_categories)
#filters out documents that do not contain any category in the cat_filter list, and removes all labels not in cat_filter
def _filter_by_category(doclist, cat_filter):
if not isinstance(cat_filter, frozenset):
cat_filter = frozenset(cat_filter)
filtered = []
for doc in doclist:
doc.categories = list(cat_filter & set(doc.categories))
if doc.categories:
doc.categories.sort()
filtered.append(doc)
print("filtered %d documents out without categories in the filter list" % (len(doclist) - len(filtered)))
return filtered
#filters out categories with fewer than cat_threshold documents (and drops documents left with no remaining category)
def _filter_by_frequency(doclist, cat_threshold):
cat_count = Counter()
for d in doclist:
cat_count.update(d.categories)
freq_categories = [cat for cat,count in cat_count.items() if count>cat_threshold]
freq_categories.sort()
return _filter_by_category(doclist, freq_categories), freq_categories
#selects the most_frequent top categories (and drops documents left with no remaining category)
def _most_common(doclist, most_frequent):
cat_count = Counter()
for d in doclist:
cat_count.update(d.categories)
freq_categories = [cat for cat,count in cat_count.most_common(most_frequent)]
freq_categories.sort()
return _filter_by_category(doclist, freq_categories), freq_categories
def _get_categories(request):
final_cats = set()
for d in request:
final_cats.update(d.categories)
return list(final_cats)
def fetch_jrcacquis(lang='en', data_path=None, years=None, ignore_unclassified=True,
cat_filter=None, cat_threshold=0, most_frequent=-1,
DOWNLOAD_URL_BASE ='http://optima.jrc.it/Acquis/JRC-Acquis.3.0/corpus/'):
if not data_path:
data_path = get_data_home()
if not os.path.exists(data_path):
os.mkdir(data_path)
request = []
total_read = 0
file_name = 'jrc-' + lang + '.tgz'
archive_path = join(data_path, file_name)
if not os.path.exists(archive_path):
print("downloading language-specific dataset (once and for all) into %s" % data_path)
DOWNLOAD_URL = join(DOWNLOAD_URL_BASE, file_name)
download_file(DOWNLOAD_URL, archive_path)
print("untarring dataset...")
tarfile.open(archive_path, 'r:gz').extractall(data_path)
documents_dir = join(data_path, lang)
print("Reading documents...")
read = 0
for dir in list_dirs(documents_dir):
year = int(dir)
if years is None or year in years:
year_dir = join(documents_dir,dir)
l_y_documents = []
all_documents = list_files(year_dir)
empty = 0
pbar = tqdm(enumerate(all_documents))
for i,doc_file in pbar:
try:
jrc_doc = parse_document(join(year_dir, doc_file), year)
except ValueError:
jrc_doc = None
if jrc_doc and (not ignore_unclassified or jrc_doc.categories):
l_y_documents.append(jrc_doc)
else: empty += 1
read+=1
pbar.set_description(f'from {year_dir}: discarded {empty} without categories or empty fields')
request += l_y_documents
print("Read %d documents for language %s\n" % (read, lang))
total_read += read
final_cats = _get_categories(request)
if cat_filter:
request = _filter_by_category(request, cat_filter)
final_cats = _get_categories(request)
if cat_threshold > 0:
request, final_cats = _filter_by_frequency(request, cat_threshold)
if most_frequent != -1 and len(final_cats) > most_frequent:
request, final_cats = _most_common(request, most_frequent)
return request, final_cats
def print_cat_analysis(request):
cat_count = Counter()
for d in request:
cat_count.update(d.categories)
print("Number of active categories: {}".format(len(cat_count)))
print(cat_count.most_common())
# inspects the EuroVoc thesaurus in order to select a subset of categories
# currently, the 'broadest' policy (i.e., all categories with no parent category), 'leaves', and 'all' are implemented
def inspect_eurovoc(data_path, eurovoc_skos_core_concepts_filename='eurovoc_in_skos_core_concepts.rdf',
eurovoc_url="http://publications.europa.eu/mdr/resource/thesaurus/eurovoc-20160630-0/skos/eurovoc_in_skos_core_concepts.zip",
select="broadest"):
fullpath_pickle = join(data_path, select+'_concepts.pickle')
if os.path.exists(fullpath_pickle):
print("Pickled object found in %s. Loading it." % fullpath_pickle)
return pickle.load(open(fullpath_pickle,'rb'))
fullpath = join(data_path, eurovoc_skos_core_concepts_filename)
if not os.path.exists(fullpath):
print("Path %s does not exist. Trying to download the skos EuroVoc file from %s" % (data_path, eurovoc_url))
download_file(eurovoc_url, fullpath)
print("Unzipping file...")
zipped = zipfile.ZipFile(data_path + '.zip', 'r')
zipped.extract("eurovoc_in_skos_core_concepts.rdf", data_path)
zipped.close()
print("Parsing %s" %fullpath)
g = rdflib.Graph()
g.parse(location=fullpath, format="application/rdf+xml")
if select == "all":
print("Selecting all concepts")
all_concepts = list(g.subjects(RDF.type, SKOS.Concept))
all_concepts = [c.toPython().split('/')[-1] for c in all_concepts]
all_concepts.sort()
selected_concepts = all_concepts
elif select=="broadest":
print("Selecting broadest concepts (those without any other broader concept linked to it)")
all_concepts = set(g.subjects(RDF.type, SKOS.Concept))
narrower_concepts = set(g.subjects(SKOS.broader, None))
broadest_concepts = [c.toPython().split('/')[-1] for c in (all_concepts - narrower_concepts)]
broadest_concepts.sort()
selected_concepts = broadest_concepts
elif select=="leaves":
print("Selecting leaves concepts (those not linked as broader of any other concept)")
all_concepts = set(g.subjects(RDF.type, SKOS.Concept))
broad_concepts = set(g.objects(None, SKOS.broader))
leave_concepts = [c.toPython().split('/')[-1] for c in (all_concepts - broad_concepts)]
leave_concepts.sort()
selected_concepts = leave_concepts
else:
raise ValueError("Selection policy %s is not currently supported" % select)
print("%d %s concepts found" % (len(selected_concepts), leave_concepts))
print("Pickling concept list for faster further requests in %s" % fullpath_pickle)
pickle.dump(selected_concepts, open(fullpath_pickle, 'wb'), pickle.HIGHEST_PROTOCOL)
return selected_concepts
if __name__ == '__main__':
# example code
train_years = list(range(1986, 2006))
test_years = [2006]
cat_policy = 'all' #'leaves'
most_common_cat = 300
JRC_DATAPATH = "../datasets/JRC_Acquis_v3"
cat_list = inspect_eurovoc(JRC_DATAPATH, select=cat_policy)
training_docs, tr_cats = fetch_jrcacquis(lang='en', data_path=JRC_DATAPATH, years=train_years,
cat_filter=None, cat_threshold=1,
most_frequent=most_common_cat)
test_docs, te_cats = fetch_jrcacquis(lang='en', data_path=JRC_DATAPATH, years=test_years,
cat_filter=tr_cats, cat_threshold=1)
# training_cats = jrc_get_categories(training_docs)
# test_cats = jrc_get_categories(test_docs)
# intersection_cats = [c for c in training_cats if c in test_cats]
# training_docs = jrc_filter_by_category(training_docs, intersection_cats)
# test_docs = jrc_filter_by_category(test_docs, intersection_cats)
print(f'JRC-train: {len(training_docs)} documents')
print(f'JRC-test: {len(test_docs)} documents')
print_cat_analysis(training_docs)
print_cat_analysis(test_docs)
"""
JRC-train: 12615 documents, 300 cats
JRC-test: 7055 documents, 300 cats
"""

5
MultiLabel/data/labeled.py Executable file

@@ -0,0 +1,5 @@
class LabelledDocuments:
def __init__(self, data, target, target_names):
self.data=data
self.target=target
self.target_names=target_names

63
MultiLabel/data/ohsumed_reader.py Executable file

@@ -0,0 +1,63 @@
import os
import pickle
import tarfile
from os.path import join
import urllib.request
from data.labeled import LabelledDocuments
from util.file import create_if_not_exist, download_file_if_not_exists
import math
def fetch_ohsumed50k(data_path=None, subset='train', train_test_split=0.7):
_dataname = 'ohsumed50k'
if data_path is None:
data_path = join(os.path.expanduser('~'), _dataname)
create_if_not_exist(data_path)
pickle_file = join(data_path, _dataname + '.' + subset + str(train_test_split) + '.pickle')
if not os.path.exists(pickle_file):
DOWNLOAD_URL = ('http://disi.unitn.it/moschitti/corpora/ohsumed-all-docs.tar.gz')
archive_path = os.path.join(data_path, 'ohsumed-all-docs.tar.gz')
download_file_if_not_exists(DOWNLOAD_URL, archive_path)
untardir = 'ohsumed-all'
if not os.path.exists(os.path.join(data_path, untardir)):
print("untarring ohsumed...")
tarfile.open(archive_path, 'r:gz').extractall(data_path)
target_names = []
doc_classes = dict()
class_docs = dict()
content = dict()
doc_ids = set()
for cat_id in os.listdir(join(data_path, untardir)):
target_names.append(cat_id)
class_docs[cat_id] = []
for doc_id in os.listdir(join(data_path, untardir, cat_id)):
doc_ids.add(doc_id)
text_content = open(join(data_path, untardir, cat_id, doc_id), 'r').read()
if doc_id not in doc_classes: doc_classes[doc_id] = []
doc_classes[doc_id].append(cat_id)
if doc_id not in content: content[doc_id] = text_content
class_docs[cat_id].append(doc_id)
target_names.sort()
print('Read %d different documents' % len(doc_ids))
splitdata = dict({'train': [], 'test': []})
for cat_id in target_names:
free_docs = [d for d in class_docs[cat_id] if (d not in splitdata['train'] and d not in splitdata['test'])]
if len(free_docs) > 0:
split_point = int(math.floor(len(free_docs) * train_test_split))
splitdata['train'].extend(free_docs[:split_point])
splitdata['test'].extend(free_docs[split_point:])
for split in ['train', 'test']:
dataset = LabelledDocuments([], [], target_names)
for doc_id in splitdata[split]:
dataset.data.append(content[doc_id])
dataset.target.append([target_names.index(cat_id) for cat_id in doc_classes[doc_id]])
pickle.dump(dataset,
open(join(data_path, _dataname + '.' + split + str(train_test_split) + '.pickle'), 'wb'),
protocol=pickle.HIGHEST_PROTOCOL)
print(pickle_file)
return pickle.load(open(pickle_file, 'rb'))
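# --- usage sketch (not part of the original commit) ---
# fetch_ohsumed50k downloads and caches the corpus under ~/ohsumed50k by default and returns a
# LabelledDocuments object; the 0.7 proportion below is just the default train/test split.
if __name__ == '__main__':
    ohsumed_train = fetch_ohsumed50k(subset='train', train_test_split=0.7)
    print(f'{len(ohsumed_train.data)} training documents, {len(ohsumed_train.target_names)} categories')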

152
MultiLabel/data/rcv_reader.py Executable file

@@ -0,0 +1,152 @@
from zipfile import ZipFile
import xml.etree.ElementTree as ET
from data.labeled import LabelledDocuments
from util.file import list_files
from os.path import join, exists
from util.file import download_file_if_not_exists
import re
from collections import Counter
RCV1_TOPICHIER_URL = "http://www.ai.mit.edu/projects/jmlr/papers/volume5/lewis04a/a02-orig-topics-hierarchy/rcv1.topics.hier.orig"
RCV1_BASE_URL = "http://www.daviddlewis.com/resources/testcollections/rcv1/"
rcv1_test_data_gz = ['lyrl2004_tokens_test_pt0.dat.gz',
'lyrl2004_tokens_test_pt1.dat.gz',
'lyrl2004_tokens_test_pt2.dat.gz',
'lyrl2004_tokens_test_pt3.dat.gz']
rcv1_train_data_gz = ['lyrl2004_tokens_train.dat.gz']
rcv1_doc_cats_data_gz = 'rcv1-v2.topics.qrels.gz'
class RCV_Document:
def __init__(self, id, text, categories, date=''):
self.id = id
self.date = date
self.text = text
self.categories = categories
class IDRangeException(Exception): pass
nwords = []
def parse_document(xml_content, valid_id_range=None):
root = ET.fromstring(xml_content)
doc_id = root.attrib['itemid']
if valid_id_range is not None:
if not valid_id_range[0] <= int(doc_id) <= valid_id_range[1]:
raise IDRangeException
doc_categories = [cat.attrib['code'] for cat in
root.findall('.//metadata/codes[@class="bip:topics:1.0"]/code')]
doc_date = root.attrib['date']
doc_title = root.find('.//title').text
doc_headline = root.find('.//headline').text
doc_body = '\n'.join([p.text for p in root.findall('.//text/p')])
if not doc_body:
raise ValueError('Empty document')
if doc_title is None: doc_title = ''
if doc_headline is None or doc_headline in doc_title: doc_headline = ''
text = '\n'.join([doc_title, doc_headline, doc_body]).strip()
return RCV_Document(id=doc_id, text=text, categories=doc_categories, date=doc_date)
def fetch_RCV1(data_path, subset='all'):
assert subset in ['train', 'test', 'all'], 'split should either be "train", "test", or "all"'
request = []
labels = set()
read_documents = 0
training_documents = 23149
test_documents = 781265
if subset == 'all':
split_range = (2286, 810596)
expected = training_documents+test_documents
elif subset == 'train':
split_range = (2286, 26150)
expected = training_documents
else:
split_range = (26151, 810596)
expected = test_documents
# global nwords
# nwords=[]
for part in list_files(data_path):
if not re.match(r'\d+\.zip', part): continue
target_file = join(data_path, part)
assert exists(target_file), \
"You don't seem to have the file "+part+" in " + data_path + ", and the RCV1 corpus can not be downloaded"+\
" w/o a formal permission. Please, refer to " + RCV1_BASE_URL + " for more information."
zipfile = ZipFile(target_file)
for xmlfile in zipfile.namelist():
xmlcontent = zipfile.open(xmlfile).read()
try:
doc = parse_document(xmlcontent, valid_id_range=split_range)
labels.update(doc.categories)
request.append(doc)
read_documents += 1
except (IDRangeException,ValueError) as e:
pass
print('\r[{}] read {} documents'.format(part, len(request)), end='')
if read_documents == expected: break
if read_documents == expected: break
print()
# print('ave:{} std {} min {} max {}'.format(np.mean(nwords), np.std(nwords), np.min(nwords), np.max(nwords)))
return LabelledDocuments(data=[d.text for d in request], target=[d.categories for d in request], target_names=list(labels))
def fetch_topic_hierarchy(path, topics='all'):
assert topics in ['all', 'leaves']
download_file_if_not_exists(RCV1_TOPICHIER_URL, path)
hierarchy = {}
for line in open(path, 'rt'):
parts = line.strip().split()
parent,child = parts[1],parts[3]
if parent not in hierarchy:
hierarchy[parent]=[]
hierarchy[parent].append(child)
del hierarchy['None']
del hierarchy['Root']
print(hierarchy)
if topics=='all':
topics = set(hierarchy.keys())
for parent in hierarchy.keys():
topics.update(hierarchy[parent])
return list(topics)
elif topics=='leaves':
parents = set(hierarchy.keys())
childs = set()
for parent in hierarchy.keys():
childs.update(hierarchy[parent])
return list(childs.difference(parents))
if __name__=='__main__':
# example
RCV1_PATH = '../../datasets/RCV1-v2/unprocessed_corpus'
rcv1_train = fetch_RCV1(RCV1_PATH, subset='train')
rcv1_test = fetch_RCV1(RCV1_PATH, subset='test')
print('read {} documents in rcv1-train, and {} labels'.format(len(rcv1_train.data), len(rcv1_train.target_names)))
print('read {} documents in rcv1-test, and {} labels'.format(len(rcv1_test.data), len(rcv1_test.target_names)))
cat_count = Counter()
for doc_cats in rcv1_train.target: cat_count.update(doc_cats)
print('RCV1', cat_count)

189
MultiLabel/data/reuters21578_reader.py Executable file

@@ -0,0 +1,189 @@
# Modified version of the code originally implemented by Eustache Diemert <eustache@diemert.fr>
# @FedericoV <https://github.com/FedericoV/>
# with License: BSD 3 clause
import os.path
import re
import tarfile
from sklearn.datasets import get_data_home
from six.moves import html_parser
from six.moves import urllib
import pickle
from glob import glob
import numpy as np
from data.labeled import LabelledDocuments
def _not_in_sphinx():
# Hack to detect whether we are running by the sphinx builder
return '__file__' in globals()
class ReutersParser(html_parser.HTMLParser):
"""Utility class to parse a SGML file and yield documents one at a time."""
def __init__(self, encoding='latin-1', data_path=None):
self.data_path = data_path
self.download_if_not_exist()
self.tr_docs = []
self.te_docs = []
html_parser.HTMLParser.__init__(self)
self._reset()
self.encoding = encoding
self.empty_docs = 0
def handle_starttag(self, tag, attrs):
method = 'start_' + tag
getattr(self, method, lambda x: None)(attrs)
def handle_endtag(self, tag):
method = 'end_' + tag
getattr(self, method, lambda: None)()
def _reset(self):
self.in_title = 0
self.in_body = 0
self.in_topics = 0
self.in_topic_d = 0
self.in_unproc_text = 0
self.title = ""
self.body = ""
self.topics = []
self.topic_d = ""
self.text = ""
def parse(self, fd):
for chunk in fd:
self.feed(chunk.decode(self.encoding))
self.close()
def handle_data(self, data):
if self.in_body:
self.body += data
elif self.in_title:
self.title += data
elif self.in_topic_d:
self.topic_d += data
elif self.in_unproc_text:
self.text += data
def start_reuters(self, attributes):
topic_attr = attributes[0][1]
lewissplit_attr = attributes[1][1]
self.lewissplit = u'unused'
if topic_attr==u'YES':
if lewissplit_attr == u'TRAIN':
self.lewissplit = 'train'
elif lewissplit_attr == u'TEST':
self.lewissplit = 'test'
pass
def end_reuters(self):
self.body = re.sub(r'\s+', r' ', self.body)
if self.lewissplit != u'unused':
parsed_doc = {'title': self.title, 'body': self.body, 'unproc':self.text, 'topics': self.topics}
if (self.title+self.body+self.text).strip() == '':
self.empty_docs += 1
if self.lewissplit == u'train':
self.tr_docs.append(parsed_doc)
elif self.lewissplit == u'test':
self.te_docs.append(parsed_doc)
self._reset()
def start_title(self, attributes):
self.in_title = 1
def end_title(self):
self.in_title = 0
def start_body(self, attributes):
self.in_body = 1
def end_body(self):
self.in_body = 0
def start_topics(self, attributes):
self.in_topics = 1
def end_topics(self):
self.in_topics = 0
def start_text(self, attributes):
if len(attributes)>0 and attributes[0][1] == u'UNPROC':
self.in_unproc_text = 1
def end_text(self):
self.in_unproc_text = 0
def start_d(self, attributes):
self.in_topic_d = 1
def end_d(self):
if self.in_topics:
self.topics.append(self.topic_d)
self.in_topic_d = 0
self.topic_d = ""
def download_if_not_exist(self):
DOWNLOAD_URL = ('http://archive.ics.uci.edu/ml/machine-learning-databases/'
'reuters21578-mld/reuters21578.tar.gz')
ARCHIVE_FILENAME = 'reuters21578.tar.gz'
if self.data_path is None:
self.data_path = os.path.join(get_data_home(), "reuters")
if not os.path.exists(self.data_path):
"""Download the dataset."""
print("downloading dataset (once and for all) into %s" % self.data_path)
os.mkdir(self.data_path)
def progress(blocknum, bs, size):
total_sz_mb = '%.2f MB' % (size / 1e6)
current_sz_mb = '%.2f MB' % ((blocknum * bs) / 1e6)
if _not_in_sphinx():
print('\rdownloaded %s / %s' % (current_sz_mb, total_sz_mb), end='')
archive_path = os.path.join(self.data_path, ARCHIVE_FILENAME)
urllib.request.urlretrieve(DOWNLOAD_URL, filename=archive_path,
reporthook=progress)
if _not_in_sphinx():
print('\r', end='')
print("untarring Reuters dataset...")
tarfile.open(archive_path, 'r:gz').extractall(self.data_path)
print("done.")
def fetch_reuters21578(data_path=None, subset='train'):
if data_path is None:
data_path = os.path.join(get_data_home(), 'reuters21578')
reuters_pickle_path = os.path.join(data_path, "reuters." + subset + ".pickle")
if not os.path.exists(reuters_pickle_path):
parser = ReutersParser(data_path=data_path)
for filename in glob(os.path.join(data_path, "*.sgm")):
parser.parse(open(filename, 'rb'))
# index category names with a unique numerical code (only considering categories with training examples)
tr_categories = np.unique(np.concatenate([doc['topics'] for doc in parser.tr_docs])).tolist()
def pickle_documents(docs, subset):
for doc in docs:
doc['topics'] = [tr_categories.index(t) for t in doc['topics'] if t in tr_categories]
pickle_docs = {'categories': tr_categories, 'documents': docs}
pickle.dump(pickle_docs, open(os.path.join(data_path, "reuters." + subset + ".pickle"), 'wb'),
protocol=pickle.HIGHEST_PROTOCOL)
return pickle_docs
pickle_tr = pickle_documents(parser.tr_docs, "train")
pickle_te = pickle_documents(parser.te_docs, "test")
# self.sout('Empty docs %d' % parser.empty_docs)
requested_subset = pickle_tr if subset == 'train' else pickle_te
else:
requested_subset = pickle.load(open(reuters_pickle_path, 'rb'))
data = [(u'{title}\n{body}\n{unproc}'.format(**doc), doc['topics']) for doc in requested_subset['documents']]
text_data, topics = zip(*data)
return LabelledDocuments(data=text_data, target=topics, target_names=requested_subset['categories'])
if __name__=='__main__':
reuters_train = fetch_reuters21578(subset='train')
print(reuters_train.data)

280
MultiLabel/data/tsr_function__.py Executable file

@@ -0,0 +1,280 @@
import math
import numpy as np
from scipy.stats import t
from scipy.stats import norm
from joblib import Parallel, delayed
import time
from scipy.sparse import csr_matrix, csc_matrix
STWFUNCTIONS = ['dotn', 'ppmi', 'ig', 'chi2', 'cw', 'wp']
def get_probs(tpr, fpr, pc):
# tpr = p(t|c) = p(tp)/p(c) = p(tp)/(p(tp)+p(fn))
# fpr = p(t|_c) = p(fp)/p(_c) = p(fp)/(p(fp)+p(tn))
pnc = 1.0 - pc
tp = tpr * pc
fn = pc - tp
fp = fpr * pnc
tn = pnc - fp
return ContTable(tp=tp, fn=fn, fp=fp, tn=tn)
def apply_tsr(tpr, fpr, pc, tsr):
cell = get_probs(tpr, fpr, pc)
return tsr(cell)
def positive_information_gain(cell):
if cell.tpr() < cell.fpr():
return 0.0
else:
return information_gain(cell)
def posneg_information_gain(cell):
ig = information_gain(cell)
if cell.tpr() < cell.fpr():
return -ig
else:
return ig
def __ig_factor(p_tc, p_t, p_c):
den = p_t * p_c
if den != 0.0 and p_tc != 0:
return p_tc * math.log(p_tc / den, 2)
else:
return 0.0
def information_gain(cell):
return __ig_factor(cell.p_tp(), cell.p_f(), cell.p_c()) + \
__ig_factor(cell.p_fp(), cell.p_f(), cell.p_not_c()) +\
__ig_factor(cell.p_fn(), cell.p_not_f(), cell.p_c()) + \
__ig_factor(cell.p_tn(), cell.p_not_f(), cell.p_not_c())
def information_gain_mod(cell):
return (__ig_factor(cell.p_tp(), cell.p_f(), cell.p_c()) + __ig_factor(cell.p_tn(), cell.p_not_f(), cell.p_not_c())) \
- (__ig_factor(cell.p_fp(), cell.p_f(), cell.p_not_c()) + __ig_factor(cell.p_fn(), cell.p_not_f(), cell.p_c()))
def pointwise_mutual_information(cell):
return __ig_factor(cell.p_tp(), cell.p_f(), cell.p_c())
def gain_ratio(cell):
pc = cell.p_c()
pnc = 1.0 - pc
norm = pc * math.log(pc, 2) + pnc * math.log(pnc, 2)
return information_gain(cell) / (-norm)
def chi_square(cell):
den = cell.p_f() * cell.p_not_f() * cell.p_c() * cell.p_not_c()
if den==0.0: return 0.0
num = gss(cell)**2
return num / den
def relevance_frequency(cell):
a = cell.tp
c = cell.fp
if c == 0: c = 1
return math.log(2.0 + (a * 1.0 / c), 2)
def idf(cell):
if cell.p_f()>0:
return math.log(1.0 / cell.p_f())
return 0.0
def gss(cell):
return cell.p_tp()*cell.p_tn() - cell.p_fp()*cell.p_fn()
def conf_interval(xt, n):
if n>30:
z2 = 3.84145882069 # norm.ppf(0.5+0.95/2.0)**2
else:
z2 = t.ppf(0.5 + 0.95 / 2.0, df=max(n-1,1)) ** 2
p = (xt + 0.5 * z2) / (n + z2)
amplitude = 0.5 * z2 * math.sqrt((p * (1.0 - p)) / (n + z2))
return p, amplitude
def strength(minPosRelFreq, minPos, maxNeg):
if minPos > maxNeg:
return math.log(2.0 * minPosRelFreq, 2.0)
else:
return 0.0
# set cancel_features=True to allow some features to be weighted as 0 (as in the original article);
# however, for some extremely imbalanced datasets this caused all documents to become zero vectors
def conf_weight(cell, cancel_features=False):
c = cell.get_c()
not_c = cell.get_not_c()
tp = cell.tp
fp = cell.fp
pos_p, pos_amp = conf_interval(tp, c)
neg_p, neg_amp = conf_interval(fp, not_c)
min_pos = pos_p-pos_amp
max_neg = neg_p+neg_amp
den = (min_pos + max_neg)
minpos_relfreq = min_pos / (den if den != 0 else 1)
str_tplus = strength(minpos_relfreq, min_pos, max_neg)
if str_tplus == 0 and not cancel_features:
return 1e-20
return str_tplus
def word_prob(cell):
return cell.tpr()
class ContTable:
def __init__(self, tp=0, tn=0, fp=0, fn=0):
self.tp=tp
self.tn=tn
self.fp=fp
self.fn=fn
def get_d(self): return self.tp + self.tn + self.fp + self.fn
def get_c(self): return self.tp + self.fn
def get_not_c(self): return self.tn + self.fp
def get_f(self): return self.tp + self.fp
def get_not_f(self): return self.tn + self.fn
def p_c(self): return (1.0*self.get_c())/self.get_d()
def p_not_c(self): return 1.0-self.p_c()
def p_f(self): return (1.0*self.get_f())/self.get_d()
def p_not_f(self): return 1.0-self.p_f()
def p_tp(self): return (1.0*self.tp) / self.get_d()
def p_tn(self): return (1.0*self.tn) / self.get_d()
def p_fp(self): return (1.0*self.fp) / self.get_d()
def p_fn(self): return (1.0*self.fn) / self.get_d()
def tpr(self):
c = 1.0*self.get_c()
return self.tp / c if c > 0.0 else 0.0
def fpr(self):
_c = 1.0*self.get_not_c()
return self.fp / _c if _c > 0.0 else 0.0
def round_robin_selection(X, Y, k, tsr_function=positive_information_gain):
print(f'[selecting {k} terms]')
nC = Y.shape[1]
FC = get_tsr_matrix(get_supervised_matrix(X, Y), tsr_function).T
best_features_idx = np.argsort(-FC, axis=0).flatten()
tsr_values = FC.flatten()
selected_indexes_set = set()
selected_indexes = list()
selected_value = list()
from_category = list()
round_robin = iter(best_features_idx)
values_iter = iter(tsr_values)
round=0
while len(selected_indexes) < k:
term_idx = next(round_robin)
term_val = next(values_iter)
if term_idx not in selected_indexes_set:
selected_indexes_set.add(term_idx)
selected_indexes.append(term_idx)
selected_value.append(term_val)
from_category.append(round)
round = (round + 1) % nC
return np.asarray(selected_indexes, dtype=int), np.asarray(selected_value, dtype=float), np.asarray(from_category)
def feature_label_contingency_table(positive_document_indexes, feature_document_indexes, nD):
tp_ = len(positive_document_indexes & feature_document_indexes)
fp_ = len(feature_document_indexes - positive_document_indexes)
fn_ = len(positive_document_indexes - feature_document_indexes)
tn_ = nD - (tp_ + fp_ + fn_)
return ContTable(tp=tp_, tn=tn_, fp=fp_, fn=fn_)
def category_tables(feature_sets, category_sets, c, nD, nF):
return [feature_label_contingency_table(category_sets[c], feature_sets[f], nD) for f in range(nF)]
"""
Computes the nC x nF supervised matrix M where Mcf is the 4-cell contingency table for feature f and class c.
Efficiency O(nF x nC x log(S)) where S is the sparse factor
"""
def get_supervised_matrix(coocurrence_matrix, label_matrix, n_jobs=-1):
nD, nF = coocurrence_matrix.shape
nD2, nC = label_matrix.shape
if nD != nD2:
raise ValueError('Number of rows in coocurrence matrix shape %s and label matrix shape %s is not consistent' %
(coocurrence_matrix.shape,label_matrix.shape))
def nonzero_set(matrix, col):
return set(matrix[:, col].nonzero()[0])
if isinstance(coocurrence_matrix, csr_matrix):
coocurrence_matrix = csc_matrix(coocurrence_matrix)
feature_sets = [nonzero_set(coocurrence_matrix, f) for f in range(nF)]
category_sets = [nonzero_set(label_matrix, c) for c in range(nC)]
cell_matrix = Parallel(n_jobs=n_jobs, backend="threading")(delayed(category_tables)(feature_sets, category_sets, c, nD, nF) for c in range(nC))
return np.array(cell_matrix)
# obtains the matrix T where Tcf=tsr(f,c) is the tsr score for category c and feature f
def get_tsr_matrix(cell_matrix, tsr_score_function):
nC,nF = cell_matrix.shape
tsr_matrix = [[tsr_score_function(cell_matrix[c,f]) for f in range(nF)] for c in range(nC)]
return np.array(tsr_matrix)
""" The Fisher-score [1] is not computed on the 4-cell contingency table, but can
take as input any real-valued feature column (e.g., tf-idf weights).
feat is the feature vector, and c is a binary classification vector.
This implementation covers only the binary case, while the formula is defined for multiclass
single-label scenarios, for which the version [2] might be preferred.
[1] R.O. Duda, P.E. Hart, and D.G. Stork. Pattern classification. Wiley-interscience, 2012.
[2] Gu, Q., Li, Z., & Han, J. (2012). Generalized fisher score for feature selection. arXiv preprint arXiv:1202.3725.
"""
def fisher_score_binary(feat, c):
neg = np.ones_like(c) - c
npos = np.sum(c)
nneg = np.sum(neg)
mupos = np.mean(feat[c == 1])
muneg = np.mean(feat[neg == 1])
mu = np.mean(feat)
stdpos = np.std(feat[c == 1])
stdneg = np.std(feat[neg == 1])
num = npos * ((mupos - mu) ** 2) + nneg * ((muneg - mu) ** 2)
den = npos * (stdpos ** 2) + nneg * (stdneg ** 2)
if den>0:
return num / den
else:
return num
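# --- usage sketch (not part of the original commit) ---
# Illustrates the intended pipeline on synthetic data: build the nC x nF matrix of 4-cell
# contingency tables with get_supervised_matrix, score it with a TSR function via get_tsr_matrix,
# and pick k terms in round-robin order across categories. The shapes and random data are assumptions.
if __name__ == '__main__':
    rng = np.random.RandomState(0)
    X = csr_matrix((rng.rand(100, 50) > 0.8).astype(float))   # 100 documents, 50 features
    Y = (rng.rand(100, 5) > 0.7).astype(int)                   # 5 binary categories
    sel_idx, sel_scores, sel_cat = round_robin_selection(X, Y, k=10)
    print(sel_idx, sel_cat)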

212
MultiLabel/data/wipo_reader.py Executable file

@@ -0,0 +1,212 @@
#https://www.wipo.int/classifications/ipc/en/ITsupport/Categorization/dataset/
import os, sys
from os.path import exists, join
from util.file import *
from zipfile import ZipFile
import xml.etree.ElementTree as ET
from tqdm import tqdm
import numpy as np
import pickle
from joblib import Parallel, delayed
WIPO_URL= 'https://www.wipo.int/classifications/ipc/en/ITsupport/Categorization/dataset/'
class WipoGammaDocument:
def __init__(self, id, text, main_label, all_labels):
self.id = id
self.text = text
self.main_label = main_label
self.all_labels = all_labels
def remove_nested_claimtext_tags(xmlcontent):
from_pos = xmlcontent.find(b'<claims')
to_pos = xmlcontent.find(b'</claims>')
if from_pos > -1 and to_pos > -1:
in_between = xmlcontent[from_pos:to_pos].replace(b'<claim-text>',b'').replace(b'</claim-text>',b'')
xmlcontent = (xmlcontent[:from_pos]+in_between+xmlcontent[to_pos:]).strip()
return xmlcontent
def parse_document(xml_content, text_fields, limit_description):
root = ET.fromstring(remove_nested_claimtext_tags(xml_content))
doc_id = root.attrib['ucid']
lang = root.attrib['lang']
#take categories from the categorization up to the "sub-class" level
main_group = set(t.text[:6] for t in root.findall('.//bibliographic-data/technical-data/classifications-ipcr/classification-ipcr[@computed="from_ecla_to_ipc_SG"][@generated_main_IPC="true"]'))
sec_groups = set(t.text[:6] for t in root.findall('.//bibliographic-data/technical-data/classifications-ipcr/classification-ipcr[@computed="from_ecla_to_ipc_SG"][@generated_main_IPC="false"]'))
sec_groups.update(main_group)
assert len(main_group) == 1, 'more than one main group'
main_group = list(main_group)[0]
sec_groups = sorted(list(sec_groups))
assert lang == 'EN', f'only English documents allowed (doc {doc_id})'
doc_text_fields=[]
if 'abstract' in text_fields:
abstract = '\n'.join(filter(None, [t.text for t in root.findall('.//abstract[@lang="EN"]/p')]))
doc_text_fields.append(abstract)
if 'description' in text_fields:
description = '\n'.join(filter(None, [t.text for t in root.findall('.//description[@lang="EN"]/p')]))
if limit_description>-1:
description=' '.join(description.split()[:limit_description])
doc_text_fields.append(description)
if 'claims' in text_fields:
claims = '\n'.join(filter(None, [t.text for t in root.findall('.//claims[@lang="EN"]/claim')]))
doc_text_fields.append(claims)
text = '\n'.join(doc_text_fields)
if text:
return WipoGammaDocument(doc_id, text, main_group, sec_groups)
else:
return None
def extract(fin, fout, text_fields, limit_description):
zipfile = ZipFile(fin)
ndocs=0
with open(fout, 'wt') as out:
for xmlfile in tqdm(zipfile.namelist()):
if xmlfile.endswith('.xml'):
xmlcontent = zipfile.open(xmlfile).read()
document = parse_document(xmlcontent, text_fields, limit_description)
if document:
line_text = document.text.replace('\n', ' ').replace('\t', ' ').strip()
assert line_text, f'empty document in {xmlfile}'
all_labels = ' '.join(document.all_labels)
out.write('\t'.join([document.id, document.main_label, all_labels, line_text]))
out.write('\n')
ndocs+=1
out.flush()
def read_classification_file(data_path, classification_level):
assert classification_level in ['subclass', 'maingroup'], 'wrong classification requested'
z = ZipFile(join(data_path,'EnglishWipoGamma1.zip'))
inpath='Wipo_Gamma/English/TrainTestSpits'
document_labels = dict()
train_ids, test_ids = set(), set()
labelcut = LabelCut(classification_level)
for subset in tqdm(['train', 'test'], desc='loading classification file'):
target_subset = train_ids if subset=='train' else test_ids
if classification_level == 'subclass':
file = f'{subset}set_en_sc.parts' #sub-class level
else:
file = f'{subset}set_en_mg.parts' #main-group level
for line in z.open(f'{inpath}/{file}').readlines():
line = line.decode().strip().split(',')
id = line[0]
id = id[id.rfind('/')+1:].replace('.xml','')
labels = labelcut.trim(line[1:])
document_labels[id]=labels
target_subset.add(id)
return document_labels, train_ids, test_ids
class LabelCut:
"""
Labels consist of 1 char for the section, 2 chars for the class, 1 char for the subclass, 2 chars for the main group, and so on.
This class cuts the label at the desired level (4 chars for subclass, or 6 for maingroup).
"""
def __init__(self, classification_level):
assert classification_level in {'subclass','maingroup'}, 'unknown classification level'
if classification_level == 'subclass': self.cut = 4
else: self.cut = 6
def trim(self, label):
if isinstance(label, list):
return sorted(set([l[:self.cut] for l in label]))
else:
return label[:self.cut]
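# e.g. (illustrative, not in the original commit):
#   LabelCut('subclass').trim('A01B33') == 'A01B'
#   LabelCut('maingroup').trim(['A01B33', 'A01B35']) == ['A01B33', 'A01B35']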
def fetch_WIPOgamma(subset, classification_level, data_home, extracted_path, text_fields = ['abstract', 'description'], limit_description=300):
"""
Fetches the WIPO-gamma dataset
:param subset: 'train' or 'test' split
:param classification_level: the classification level, either 'subclass' or 'maingroup'
:param data_home: directory containing the original 11 English zips
:param extracted_path: directory used to extract and process the original files
:param text_fields: indicates the fields to extract, in 'abstract', 'description', 'claims'
:param limit_description: the maximum number of words to take from the description field (default 300); set to -1 for all
:return:
"""
assert subset in {"train", "test"}, 'unknown target request (valid ones are "train" or "test")'
assert len(text_fields)>0, 'at least some text field should be indicated'
if not exists(data_home):
raise ValueError(f'{data_home} does not exist, and the dataset cannot be automatically downloaded, '
f'since you need to request permission. Please refer to {WIPO_URL}')
create_if_not_exist(extracted_path)
config = f'{"-".join(text_fields)}'
if 'description' in text_fields: config += f'-{limit_description}'
pickle_path=join(extracted_path, f'wipo-{subset}-{classification_level}-{config}.pickle')
if exists(pickle_path):
print(f'loading pickled file in {pickle_path}')
return pickle.load(open(pickle_path,'rb'))
print('pickle file not found, processing...(this will take some minutes)')
extracted = sum([exists(f'{extracted_path}/EnglishWipoGamma{(i+1)}-{config}.txt') for i in range(11)])==11
if not extracted:
print(f'extraction files not found, extracting files in {data_home}... (this will take some additional minutes)')
Parallel(n_jobs=-1)(
delayed(extract)(
join(data_home, file), join(extracted_path, file.replace('.zip', f'-{config}.txt')), text_fields, limit_description
)
for file in list_files(data_home)
)
doc_labels, train_ids, test_ids = read_classification_file(data_home, classification_level=classification_level) # or maingroup
print(f'{len(doc_labels)} documents classified split in {len(train_ids)} train and {len(test_ids)} test documents')
train_request = []
test_request = []
pbar = tqdm([filename for filename in list_files(extracted_path) if filename.endswith(f'-{config}.txt')])
labelcut = LabelCut(classification_level)
errors=0
for proc_file in pbar:
pbar.set_description(f'processing {proc_file} [errors={errors}]')
if not proc_file.endswith(f'-{config}.txt'): continue
lines = open(f'{extracted_path}/{proc_file}', 'rt').readlines()
for lineno,line in enumerate(lines):
parts = line.split('\t')
assert len(parts)==4, f'wrong format in {extracted_path}/{proc_file} line {lineno}'
id,mainlabel,alllabels,text=parts
mainlabel = labelcut.trim(mainlabel)
alllabels = labelcut.trim(alllabels.split())
# assert id in train_ids or id in test_ids, f'id {id} out of scope'
if id not in train_ids and id not in test_ids:
errors+=1
else:
# assert mainlabel == doc_labels[id][0], 'main label not consistent'
request = train_request if id in train_ids else test_request
request.append(WipoGammaDocument(id, text, mainlabel, alllabels))
print('pickling requests for faster subsequent runs')
pickle.dump(train_request, open(join(extracted_path,f'wipo-train-{classification_level}-{config}.pickle'), 'wb'), protocol=pickle.HIGHEST_PROTOCOL)
pickle.dump(test_request, open(join(extracted_path, f'wipo-test-{classification_level}-{config}.pickle'), 'wb'), protocol=pickle.HIGHEST_PROTOCOL)
if subset== 'train':
return train_request
else:
return test_request
if __name__=='__main__':
data_home = '../../datasets/WIPO/wipo-gamma/en'
extracted_path = '../../datasets/WIPO-extracted'
train = fetch_WIPOgamma(subset='train', classification_level='subclass', data_home=data_home, extracted_path=extracted_path, text_fields=['abstract'])
test = fetch_WIPOgamma(subset='test', classification_level='subclass', data_home=data_home, extracted_path=extracted_path, text_fields=['abstract'])
# train = fetch_WIPOgamma(subset='train', classification_level='maingroup', data_home=data_home, extracted_path=extracted_path)
# test = fetch_WIPOgamma(subset='test', classification_level='maingroup', data_home=data_home, extracted_path=extracted_path)
print('Done')

334
MultiLabel/multi_label.py Normal file

@@ -0,0 +1,334 @@
from copy import deepcopy
from sklearn.calibration import CalibratedClassifierCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.kernel_ridge import KernelRidge
from sklearn.linear_model import LogisticRegression, Ridge, Lasso, LassoCV, MultiTaskLassoCV, LassoLars, LassoLarsCV, \
ElasticNet, MultiTaskElasticNetCV, MultiTaskElasticNet, LinearRegression, ARDRegression, BayesianRidge, SGDRegressor
from sklearn.metrics import f1_score
from sklearn.multiclass import OneVsRestClassifier
from sklearn.multioutput import MultiOutputRegressor
from sklearn.svm import LinearSVC
from tqdm import tqdm
import quapy as qp
from functional import artificial_prevalence_sampling
from method.aggregative import PACC, CC, EMQ, PCC, ACC, HDy
from method.base import BaseQuantifier
from quapy.data import from_rcv2_lang_file, LabelledCollection
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer, StandardScaler
import numpy as np
from data.dataset import Dataset
def cls():
# return LinearSVC()
return LogisticRegression(max_iter=1000, solver='lbfgs', n_jobs=-1)
def calibratedCls():
return CalibratedClassifierCV(cls())
class MultilabelledCollection:
def __init__(self, instances, labels):
assert labels.ndim==2, 'data does not seem to be multilabel'
self.instances = instances
self.labels = labels
self.classes_ = np.arange(labels.shape[1])
@classmethod
def load(cls, path: str, loader_func: callable):
return MultilabelledCollection(*loader_func(path))
def __len__(self):
return self.instances.shape[0]
def prevalence(self):
# return self.labels.mean(axis=0)
pos = self.labels.mean(axis=0)
neg = 1-pos
return np.asarray([neg, pos]).T
def counts(self):
return self.labels.sum(axis=0)
@property
def n_classes(self):
return len(self.classes_)
@property
def binary(self):
return False
def __gen_index(self):
return np.arange(len(self))
def sampling_multi_index(self, size, cat, prev=None):
if prev is None: # no prevalence was indicated; returns an index for uniform sampling
return np.random.choice(len(self), size, replace=size>len(self))
aux = LabelledCollection(self.__gen_index(), self.labels[:,cat])
return aux.sampling_index(size, *[1-prev, prev])
def uniform_sampling_multi_index(self, size):
return np.random.choice(len(self), size, replace=size>len(self))
def uniform_sampling(self, size):
unif_index = self.uniform_sampling_multi_index(size)
return self.sampling_from_index(unif_index)
def sampling(self, size, category, prev=None):
prev_index = self.sampling_multi_index(size, category, prev)
return self.sampling_from_index(prev_index)
def sampling_from_index(self, index):
documents = self.instances[index]
labels = self.labels[index, :]
return MultilabelledCollection(documents, labels)
def train_test_split(self, train_prop=0.6, random_state=None):
tr_docs, te_docs, tr_labels, te_labels = \
train_test_split(self.instances, self.labels, train_size=train_prop, random_state=random_state)
return MultilabelledCollection(tr_docs, tr_labels), MultilabelledCollection(te_docs, te_labels)
def artificial_sampling_generator(self, sample_size, category, n_prevalences=101, repeats=1):
dimensions = 2
for prevs in artificial_prevalence_sampling(dimensions, n_prevalences, repeats).flatten():
yield self.sampling(sample_size, category, prevs)
def artificial_sampling_index_generator(self, sample_size, category, n_prevalences=101, repeats=1):
dimensions = 2
for prevs in artificial_prevalence_sampling(dimensions, n_prevalences, repeats).flatten():
yield self.sampling_multi_index(sample_size, category, prevs)
def natural_sampling_generator(self, sample_size, repeats=100):
for _ in range(repeats):
yield self.uniform_sampling(sample_size)
def natural_sampling_index_generator(self, sample_size, repeats=100):
for _ in range(repeats):
yield self.uniform_sampling_multi_index(sample_size)
def asLabelledCollection(self, category):
return LabelledCollection(self.instances, self.labels[:,category])
def genLabelledCollections(self):
for c in self.classes_:
yield self.asLabelledCollection(c)
@property
def Xy(self):
return self.instances, self.labels
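# --- usage sketch (not part of the original commit) ---
# A MultilabelledCollection wraps a (sparse) doc-by-feature matrix and an nD x nC binary label
# matrix; per-category samples at a requested prevalence can then be drawn, e.g.:
#   train = MultilabelledCollection(Xtr, ytr)
#   sample = train.sampling(250, category=0, prev=0.3)   # 250 docs, ~30% positives for class 0
#   print(sample.prevalence()[:, 1])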
class MultilabelClassifier: # aka Funnelling Monolingual
def __init__(self, base_estimator=LogisticRegression()):
if not hasattr(base_estimator, 'predict_proba'):
print('the estimator does not seem to be probabilistic: calibrating')
base_estimator = CalibratedClassifierCV(base_estimator)
self.base = deepcopy(OneVsRestClassifier(base_estimator))
self.meta = deepcopy(OneVsRestClassifier(base_estimator))
self.norm = StandardScaler()
def fit(self, X, y):
assert y.ndim==2, 'the dataset does not seem to be multi-label'
self.base.fit(X, y)
P = self.base.predict_proba(X)
P = self.norm.fit_transform(P)
self.meta.fit(P, y)
return self
def predict(self, X):
P = self.base.predict_proba(X)
P = self.norm.transform(P)
return self.meta.predict(P)
def predict_proba(self, X):
P = self.base.predict_proba(X)
P = self.norm.transform(P)
return self.meta.predict_proba(P)
class MLCC:
def __init__(self, mlcls:MultilabelClassifier):
self.mlcls = mlcls
def fit(self, data:MultilabelledCollection):
self.mlcls.fit(*data.Xy)
def quantify(self, instances):
pred = self.mlcls.predict(instances)
pos_prev = pred.mean(axis=0)
neg_prev = 1-pos_prev
return np.asarray([neg_prev, pos_prev]).T
class MLPCC:
def __init__(self, mlcls: MultilabelClassifier):
self.mlcls = mlcls
def fit(self, data: MultilabelledCollection):
self.mlcls.fit(*data.Xy)
def quantify(self, instances):
pred = self.mlcls.predict_proba(instances)
pos_prev = pred.mean(axis=0)
neg_prev = 1 - pos_prev
return np.asarray([neg_prev, pos_prev]).T
class MultilabelQuantifier:
def __init__(self, q:BaseQuantifier, n_jobs=-1):
self.q = q
self.estimators = None
self.n_jobs = n_jobs
def fit(self, data:MultilabelledCollection):
self.classes_ = data.classes_
def cat_job(lc):
return deepcopy(self.q).fit(lc)
self.estimators = qp.util.parallel(cat_job, data.genLabelledCollections(), n_jobs=self.n_jobs)
return self
def quantify(self, instances):
pos_prevs = np.zeros(len(self.classes_), dtype=float)
for c in self.classes_:
pos_prevs[c] = self.estimators[c].quantify(instances)[1]
neg_prevs = 1-pos_prevs
return np.asarray([neg_prevs, pos_prevs]).T
class MultilabelRegressionQuantification:
def __init__(self, base_quantifier=CC(LinearSVC()), regression='ridge', n_samples=500, sample_size=500, norm=True,
means=True, stds=True):
assert regression in ['ridge'], 'unknown regression model'
self.estimator = MultilabelQuantifier(base_quantifier)
if regression == 'ridge':
self.reg = Ridge(normalize=norm)
# self.reg = MultiTaskLassoCV(normalize=norm)
# self.reg = KernelRidge(kernel='rbf')
# self.reg = LassoLarsCV(normalize=norm)
# self.reg = MultiTaskElasticNetCV(normalize=norm) <- good
#self.reg = LinearRegression(normalize=norm) # <- good
# self.reg = MultiOutputRegressor(ARDRegression(normalize=norm)) # <- quite good, even without norm
# self.reg = MultiOutputRegressor(BayesianRidge(normalize=False)) # <- quite good, even without norm
# self.reg = MultiOutputRegressor(SGDRegressor()) # slow, does not work
self.regression = regression
self.n_samples = n_samples
self.sample_size = sample_size
# self.norm = StandardScaler()
self.means = means
self.stds = stds
def fit(self, data:MultilabelledCollection):
self.classes_ = data.classes_
tr, te = data.train_test_split()
self.estimator.fit(tr)
samples_mean = []
samples_std = []
Xs = []
ys = []
for sample in te.natural_sampling_generator(sample_size=self.sample_size, repeats=self.n_samples):
ys.append(sample.prevalence()[:,1])
Xs.append(self.estimator.quantify(sample.instances)[:,1])
if self.means:
samples_mean.append(sample.instances.mean(axis=0).getA().flatten())
if self.stds:
samples_std.append(sample.instances.todense().std(axis=0).getA().flatten())
Xs = np.asarray(Xs)
ys = np.asarray(ys)
if self.means:
samples_mean = np.asarray(samples_mean)
Xs = np.hstack([Xs, samples_mean])
if self.stds:
samples_std = np.asarray(samples_std)
Xs = np.hstack([Xs, samples_std])
# Xs = self.norm.fit_transform(Xs)
self.reg.fit(Xs, ys)
return self
def quantify(self, instances):
Xs = self.estimator.quantify(instances)[:,1].reshape(1,-1)
if self.means:
sample_mean = instances.mean(axis=0).getA()
Xs = np.hstack([Xs, sample_mean])
if self.stds:
sample_std = instances.todense().std(axis=0).getA()
Xs = np.hstack([Xs, sample_std])
# Xs = self.norm.transform(Xs)
adjusted = self.reg.predict(Xs)
adjusted = np.clip(adjusted, 0, 1)
adjusted = adjusted.flatten()
neg_prevs = 1-adjusted
return np.asarray([neg_prevs, adjusted]).T
sample_size = 250
n_samples = 1000
def models():
yield 'CC', MultilabelQuantifier(CC(cls()))
yield 'PCC', MultilabelQuantifier(PCC(cls()))
yield 'MLCC', MLCC(MultilabelClassifier(cls()))
yield 'MLPCC', MLPCC(MultilabelClassifier(cls()))
# yield 'PACC', MultilabelQuantifier(PACC(cls()))
# yield 'EMQ', MultilabelQuantifier(EMQ(calibratedCls()))
common={'sample_size':sample_size, 'n_samples': n_samples, 'norm': True}
# yield 'MRQ-CC', MultilabelRegressionQuantification(base_quantifier=CC(cls()), **common)
yield 'MRQ-PCC', MultilabelRegressionQuantification(base_quantifier=PCC(cls()), **common)
yield 'MRQ-PACC', MultilabelRegressionQuantification(base_quantifier=PACC(cls()), **common)
dataset = 'reuters21578'
data = Dataset.load(dataset, pickle_path=f'./pickles/{dataset}.pickle')
Xtr, Xte = data.vectorize()
ytr = data.devel_labelmatrix.todense().getA()
yte = data.test_labelmatrix.todense().getA()
most_populated = np.argsort(ytr.sum(axis=0))[-25:]
ytr = ytr[:, most_populated]
yte = yte[:, most_populated]
train = MultilabelledCollection(Xtr, ytr)
test = MultilabelledCollection(Xte, yte)
print(f'Train-prev: {train.prevalence()[:,1]}')
print(f'Test-prev: {test.prevalence()[:,1]}')
print(f'MLPE: {qp.error.mae(train.prevalence(), test.prevalence()):.5f}')
# print('NPP:')
# test_indexes = list(test.natural_sampling_index_generator(sample_size=sample_size, repeats=100))
# for model_name, model in models():
# model.fit(train)
# errs = []
# for index in test_indexes:
# sample = test.sampling_from_index(index)
# estim_prevs = model.quantify(sample.instances)
# true_prevs = sample.prevalence()
# errs.append(qp.error.mae(true_prevs, estim_prevs))
# print(f'{model_name:10s}\tmae={np.mean(errs):.5f}')
print('APP:')
test_indexes = []
for cat in train.classes_:
test_indexes.append(list(test.artificial_sampling_index_generator(sample_size=sample_size, category=cat, n_prevalences=21, repeats=10)))
for model_name, model in models():
model.fit(train)
macro_errs = []
for cat_indexes in test_indexes:
errs = []
for index in cat_indexes:
sample = test.sampling_from_index(index)
estim_prevs = model.quantify(sample.instances)
true_prevs = sample.prevalence()
errs.append(qp.error.mae(true_prevs, estim_prevs))
macro_errs.append(np.mean(errs))
print(f'{model_name:10s}\tmae={np.mean(macro_errs):.5f}')
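# An illustrative, self-contained sketch of the regression-based correction idea used by
# MultilabelRegressionQuantification above: per-class quantifiers are applied to many held-out
# samples and a Ridge regressor maps estimated prevalences to true ones. It relies on synthetic
# data and plain scikit-learn only, not on the classes defined in this file.
import numpy as np
from sklearn.linear_model import LogisticRegression, Ridge

rng = np.random.RandomState(0)
n_classes, n_feats = 5, 20
X = rng.randn(2000, n_feats)
W = rng.randn(n_feats, n_classes)
Y = (X @ W + rng.randn(2000, n_classes) > 0).astype(int)  # synthetic multilabel matrix

# one probabilistic classifier per class; its mean posterior acts as a PCC-style quantifier
clfs = [LogisticRegression(max_iter=1000).fit(X[:1000], Y[:1000, c]) for c in range(n_classes)]
pcc = lambda Xs: np.asarray([clf.predict_proba(Xs)[:, 1].mean() for clf in clfs])

# regression dataset: estimated prevalences -> true prevalences, computed on held-out samples
sample_idx = [rng.choice(1000, 250) + 1000 for _ in range(200)]
Xreg = np.asarray([pcc(X[idx]) for idx in sample_idx])
yreg = np.asarray([Y[idx].mean(axis=0) for idx in sample_idx])
reg = Ridge().fit(Xreg, yreg)

# corrected prevalence estimate for a new sample
new_idx = rng.choice(1000, 250) + 1000
print(np.clip(reg.predict(pcc(X[new_idx]).reshape(1, -1)), 0, 1))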

0
MultiLabel/util/__init__.py Executable file
View File

145
MultiLabel/util/common.py Executable file
View File

@ -0,0 +1,145 @@
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
import numpy as np
from tqdm import tqdm
import torch
from scipy.sparse import vstack, issparse
from joblib import Parallel, delayed
import multiprocessing
import itertools
def index(data, vocab, known_words, analyzer, unk_index, out_of_vocabulary):
"""
    Indexes (i.e., replaces word strings with numerical indexes) a list of string documents
:param data: list of string documents
:param vocab: a fixed mapping [str]->[int] of words to indexes
:param known_words: a set of known words (e.g., words that, despite not being included in the vocab, can be retained
because they are anyway contained in a pre-trained embedding set that we know in advance)
:param analyzer: the preprocessor in charge of transforming the document string into a chain of string words
:param unk_index: the index of the 'unknown token', i.e., a symbol that characterizes all words that we cannot keep
:param out_of_vocabulary: an incremental mapping [str]->[int] of words to indexes that will index all those words that
are not in the original vocab but that are in the known_words
    :return: a list of lists of indexes, one list per document
"""
indexes=[]
vocabsize = len(vocab)
unk_count = 0
knw_count = 0
out_count = 0
pbar = tqdm(data, desc=f'indexing documents')
for text in pbar:
words = analyzer(text)
index = []
for word in words:
if word in vocab:
idx = vocab[word]
else:
if word in known_words:
if word not in out_of_vocabulary:
out_of_vocabulary[word] = vocabsize+len(out_of_vocabulary)
idx = out_of_vocabulary[word]
out_count += 1
else:
idx = unk_index
unk_count += 1
index.append(idx)
indexes.append(index)
knw_count += len(index)
pbar.set_description(f'[unk = {unk_count}/{knw_count}={(100.*unk_count/knw_count):.2f}%]'
f'[out = {out_count}/{knw_count}={(100.*out_count/knw_count):.2f}%]')
return indexes
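# Illustrative sketch of how index() is meant to be used (not part of the module; the
# vectorizer, the toy documents, and the 'bark' known word are assumptions for the example).
from sklearn.feature_extraction.text import CountVectorizer
docs = ['the cat sat on the mat', 'dogs chase cats']
vect = CountVectorizer().fit(docs)
oov = {}
indexed = index(['dogs bark loudly'], vocab=vect.vocabulary_, known_words={'bark'},
                analyzer=vect.build_analyzer(), unk_index=-1, out_of_vocabulary=oov)
# 'dogs' keeps its vocabulary index, 'bark' gets a fresh index appended after the vocabulary
# (and is registered in oov), and 'loudly' falls back to unk_index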
def define_pad_length(index_list):
lengths = [len(index) for index in index_list]
return int(np.mean(lengths)+np.std(lengths))
def pad(index_list, pad_index, max_pad_length=None):
pad_length = np.max([len(index) for index in index_list])
if max_pad_length is not None:
pad_length = min(pad_length, max_pad_length)
for i,indexes in enumerate(index_list):
index_list[i] = [pad_index]*(pad_length-len(indexes)) + indexes[:pad_length]
return index_list
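# Quick illustration of pad(): sequences are left-padded with pad_index up to the longest
# (or capped) length; note that longer sequences are truncated from the right.
example = [[3, 4], [5, 6, 7, 8]]
print(pad(example, pad_index=0, max_pad_length=3))
# -> [[0, 3, 4], [5, 6, 7]]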
def get_word_list(word2index1, word2index2=None): #TODO: redo
def extract_word_list(word2index):
return [w for w,i in sorted(word2index.items(), key=lambda x: x[1])]
word_list = extract_word_list(word2index1)
if word2index2 is not None:
word_list += extract_word_list(word2index2)
return word_list
def batchify(index_list, labels, batchsize, pad_index, device, target_long=False, max_pad_length=500):
nsamples = len(index_list)
nbatches = nsamples // batchsize + 1*(nsamples%batchsize>0)
for b in range(nbatches):
batch = index_list[b*batchsize:(b+1)*batchsize]
batch_labels = labels[b*batchsize:(b+1)*batchsize]
if issparse(batch_labels):
batch_labels = batch_labels.toarray()
batch = pad(batch, pad_index=pad_index, max_pad_length=max_pad_length)
batch = torch.LongTensor(batch)
totype = torch.LongTensor if target_long else torch.FloatTensor
target = totype(batch_labels)
yield batch.to(device), target.to(device)
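# Minimal usage sketch for batchify() (assumes torch is available; the index lists and the
# dense label matrix below are toy inputs, not part of the module).
toy_indexes = [[1, 2, 3], [4, 5], [6]]
toy_labels = np.asarray([[1, 0], [0, 1], [1, 1]], dtype=float)
for Xb, yb in batchify(toy_indexes, toy_labels, batchsize=2, pad_index=0, device='cpu'):
    print(Xb.shape, yb.shape)  # torch.Size([2, 3]) torch.Size([2, 2]), then torch.Size([1, 1]) torch.Size([1, 2])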
def batchify_unlabelled(index_list, batchsize, pad_index, device, max_pad_length=500):
nsamples = len(index_list)
nbatches = nsamples // batchsize + 1*(nsamples%batchsize>0)
for b in range(nbatches):
batch = index_list[b*batchsize:(b+1)*batchsize]
batch = pad(batch, pad_index=pad_index, max_pad_length=max_pad_length)
batch = torch.LongTensor(batch)
yield batch.to(device)
def clip_gradient(model, clip_value=1e-1):
params = list(filter(lambda p: p.grad is not None, model.parameters()))
for p in params:
p.grad.data.clamp_(-clip_value, clip_value)
def predict(logits, classification_type='singlelabel'):
if classification_type == 'multilabel':
prediction = torch.sigmoid(logits) > 0.5
elif classification_type == 'singlelabel':
prediction = torch.argmax(logits, dim=1).view(-1, 1)
    else:
        raise ValueError(f'unknown classification type: {classification_type}')
return prediction.detach().cpu().numpy()
def count_parameters(model):
return sum(p.numel() for p in model.parameters() if p.requires_grad)
def get_parallel_slices(n_tasks, n_jobs=-1):
if n_jobs==-1:
n_jobs = multiprocessing.cpu_count()
batch = int(n_tasks / n_jobs)
remainder = n_tasks % n_jobs
return [slice(job*batch, (job+1)*batch+ (remainder if job == n_jobs - 1 else 0)) for job in range(n_jobs)]
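# Quick check of get_parallel_slices(): the last job absorbs the remainder.
print(get_parallel_slices(n_tasks=10, n_jobs=3))
# -> [slice(0, 3), slice(3, 6), slice(6, 10)]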
def tokenize_job(documents, tokenizer, max_tokens, job):
return [tokenizer(d)[:max_tokens] for d in tqdm(documents, desc=f'tokenizing [job: {job}]')]
def tokenize_parallel(documents, tokenizer, max_tokens, n_jobs=-1):
slices = get_parallel_slices(n_tasks=len(documents), n_jobs=n_jobs)
tokens = Parallel(n_jobs=n_jobs)(
delayed(tokenize_job)(
documents[slice_i], tokenizer, max_tokens, job
)
for job, slice_i in enumerate(slices)
)
return list(itertools.chain.from_iterable(tokens))

60
MultiLabel/util/csv_log.py Executable file
View File

@ -0,0 +1,60 @@
import os
import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
class CSVLog:
def __init__(self, file, columns=None, autoflush=True, verbose=False, overwrite=False):
self.file = file
self.autoflush = autoflush
self.verbose = verbose
if os.path.exists(file) and not overwrite:
self.tell('Loading existing file from {}'.format(file))
self.df = pd.read_csv(file, sep='\t')
self.columns = sorted(self.df.columns.values.tolist())
else:
self.tell('File {} does not exist or overwrite=True. Creating new frame.'.format(file))
assert columns is not None, 'columns cannot be None'
self.columns = sorted(columns)
dir = os.path.dirname(self.file)
if dir and not os.path.exists(dir): os.makedirs(dir)
self.df = pd.DataFrame(columns=self.columns)
self.defaults = {}
def already_calculated(self, **kwargs):
df = self.df
if df.shape[0] == 0:
return False
if len(kwargs) == 0:
kwargs = self.defaults
for key,val in kwargs.items():
df = df.loc[df[key] == val]
if df.shape[0] == 0:
return False
return True
def set_default(self, param, value):
self.defaults[param] = value
def add_row(self, **kwargs):
for key in self.defaults.keys():
if key not in kwargs:
kwargs[key]=self.defaults[key]
        columns = sorted(list(kwargs.keys()))
        values = [kwargs[col_i] for col_i in columns]
s = pd.Series(values, index=self.columns)
self.df = self.df.append(s, ignore_index=True)
if self.autoflush: self.flush()
self.tell(kwargs)
def flush(self):
self.df.to_csv(self.file, index=False, sep='\t')
def tell(self, msg):
if self.verbose: print(msg)
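# Hedged usage sketch for CSVLog (the file name, column names, and values are placeholders).
log = CSVLog('demo_log.csv', columns=['dataset', 'method', 'mae'], verbose=True)
log.set_default('dataset', 'reuters21578')
if not log.already_calculated(dataset='reuters21578', method='MRQ-PCC'):
    log.add_row(method='MRQ-PCC', mae=0.042)  # dummy value, for illustration only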

View File

@ -0,0 +1,33 @@
from data.dataset import Dataset
from tqdm import tqdm
import os
import numpy as np
def write_data(documents, labels, fout):
print(f'there are {len(documents)} documents')
written, empty = 0, 0
with open(fout, 'wt') as foo:
for doc, label in tqdm(list(zip(documents, labels))):
doc = doc.replace('\t', ' ').replace('\n', ' ').strip()
label = np.squeeze(np.asarray(label.todense()))
label = ' '.join([f'{x}' for x in label])
if doc:
foo.write(f'{label}\t{doc}\n')
written += 1
else:
foo.write(f'{label}\tempty document\n')
empty += 1
print(f'written = {written}')
print(f'empty = {empty}')
for dataset_name in ['reuters21578', 'ohsumed', 'jrcall', 'rcv1', 'wipo-sl-sc']: #'20newsgroups'
dataset = Dataset.load(dataset_name=dataset_name, pickle_path=f'../pickles/{dataset_name}.pickle').show()
os.makedirs(f'../leam/{dataset_name}', exist_ok=True)
write_data(dataset.devel_raw, dataset.devel_labelmatrix, f'../leam/{dataset_name}/train.csv')
#write_data(dataset.test_raw, dataset.test_labelmatrix, f'../leam/{dataset_name}/test.csv')
print('done')

View File

@ -0,0 +1,3 @@
def warn(*args, **kwargs): pass
import warnings
warnings.warn = warn

54
MultiLabel/util/early_stop.py Executable file
View File

@ -0,0 +1,54 @@
#adapted from https://github.com/Bjarten/early-stopping-pytorch/blob/master/pytorchtools.py
import torch
from time import time
from util.file import create_if_not_exist
class EarlyStopping:
def __init__(self, model, patience=20, verbose=True, checkpoint='./checkpoint.pt'):
# set patience to 0 or -1 to avoid stopping, but still keeping track of the best value and model parameters
self.patience_limit = patience
self.patience = patience
self.verbose = verbose
self.best_score = None
self.best_epoch = None
self.stop_time = None
self.checkpoint = checkpoint
self.model = model
self.STOP = False
def __call__(self, watch_score, epoch):
if self.STOP:
return #done
if self.best_score is None or watch_score >= self.best_score:
self.best_score = watch_score
self.best_epoch = epoch
self.stop_time = time()
if self.checkpoint:
self.print(f'[early-stop] improved, saving model in {self.checkpoint}')
torch.save(self.model, self.checkpoint)
else:
self.print(f'[early-stop] improved')
self.patience = self.patience_limit
else:
self.patience -= 1
if self.patience == 0:
self.STOP = True
self.print(f'[early-stop] patience exhausted')
else:
if self.patience>0: # if negative, then early-stop is ignored
self.print(f'[early-stop] patience={self.patience}')
def reinit_counter(self):
self.STOP = False
self.patience=self.patience_limit
def restore_checkpoint(self):
return torch.load(self.checkpoint)
def print(self, msg):
if self.verbose:
print(msg)
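# A minimal runnable sketch of the early-stopping protocol (the model and the validation
# scores below are synthetic placeholders, not part of the module).
model = torch.nn.Linear(4, 2)
early_stop = EarlyStopping(model, patience=3, checkpoint='./checkpoint_demo.pt')
fake_scores = [0.10, 0.20, 0.25, 0.24, 0.23, 0.22, 0.21]  # made-up validation F1 per epoch
for epoch, score in enumerate(fake_scores):
    early_stop(score, epoch)
    if early_stop.STOP:
        break
best_model = early_stop.restore_checkpoint()  # reloads the checkpoint saved at the best epoch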

38
MultiLabel/util/file.py Executable file
View File

@ -0,0 +1,38 @@
import urllib.request
from os import listdir, makedirs
from os.path import isdir, isfile, join, exists, dirname
def download_file(url, archive_filename):
def progress(blocknum, bs, size):
total_sz_mb = '%.2f MB' % (size / 1e6)
current_sz_mb = '%.2f MB' % ((blocknum * bs) / 1e6)
print('\rdownloaded %s / %s' % (current_sz_mb, total_sz_mb), end='')
print("Downloading %s" % url)
urllib.request.urlretrieve(url, filename=archive_filename, reporthook=progress)
print("")
def download_file_if_not_exists(url, archive_path):
if exists(archive_path): return
create_if_not_exist(dirname(archive_path))
download_file(url,archive_path)
def ls(dir, typecheck):
el = [f for f in listdir(dir) if typecheck(join(dir, f))]
el.sort()
return el
def list_dirs(dir):
return ls(dir, typecheck=isdir)
def list_files(dir):
return ls(dir, typecheck=isfile)
def create_if_not_exist(path):
if not exists(path): makedirs(path)
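# Quick illustration of the directory helpers (uses a temporary directory as a placeholder path).
import tempfile
tmp = tempfile.mkdtemp()
create_if_not_exist(join(tmp, 'a', 'b'))
open(join(tmp, 'notes.txt'), 'wt').close()
print(list_dirs(tmp))   # ['a']
print(list_files(tmp))  # ['notes.txt']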

86
MultiLabel/util/metrics.py Executable file
View File

@ -0,0 +1,86 @@
import numpy as np
from scipy.sparse import lil_matrix, issparse
from sklearn.metrics import f1_score, accuracy_score
"""
Scikit-learn provides a full set of evaluation metrics, but it treats special cases differently.
That is, when the numbers of true positives, false positives, and false negatives all amount to 0,
the affected metrics (precision, recall, and thus F1) are set to 0 in scikit-learn.
We adhere to the common practice of outputting 1 in this case, since the classifier has correctly
classified all examples as negatives.
"""
def evaluation(y_true, y_pred, classification_type):
if classification_type == 'multilabel':
eval_function = multilabel_eval
    elif classification_type == 'singlelabel':
        eval_function = singlelabel_eval
    else:
        raise ValueError(f'unknown classification type: {classification_type}')
Mf1, mf1, accuracy = eval_function(y_true, y_pred)
return Mf1, mf1, accuracy
def multilabel_eval(y, y_):
tp = y.multiply(y_)
fn = lil_matrix(y.shape)
true_ones = y==1
fn[true_ones]=1-tp[true_ones]
fp = lil_matrix(y.shape)
pred_ones = y_==1
if pred_ones.nnz>0:
fp[pred_ones]=1-tp[pred_ones]
#macro-f1
tp_macro = np.asarray(tp.sum(axis=0), dtype=int).flatten()
fn_macro = np.asarray(fn.sum(axis=0), dtype=int).flatten()
fp_macro = np.asarray(fp.sum(axis=0), dtype=int).flatten()
pos_pred = tp_macro+fp_macro
pos_true = tp_macro+fn_macro
prec=np.zeros(shape=tp_macro.shape,dtype=float)
rec=np.zeros(shape=tp_macro.shape,dtype=float)
np.divide(tp_macro, pos_pred, out=prec, where=pos_pred>0)
np.divide(tp_macro, pos_true, out=rec, where=pos_true>0)
den=prec+rec
macrof1=np.zeros(shape=tp_macro.shape,dtype=float)
np.divide(np.multiply(prec,rec),den,out=macrof1,where=den>0)
macrof1 *=2
macrof1[(pos_pred==0)*(pos_true==0)]=1
macrof1 = np.mean(macrof1)
#micro-f1
tp_micro = tp_macro.sum()
fn_micro = fn_macro.sum()
fp_micro = fp_macro.sum()
pos_pred = tp_micro + fp_micro
pos_true = tp_micro + fn_micro
prec = (tp_micro / pos_pred) if pos_pred>0 else 0
rec = (tp_micro / pos_true) if pos_true>0 else 0
den = prec+rec
microf1 = 2*prec*rec/den if den>0 else 0
if pos_pred==pos_true==0:
microf1=1
#accuracy
ndecisions = np.multiply(*y.shape)
tn = ndecisions - (tp_micro+fn_micro+fp_micro)
acc = (tp_micro+tn)/ndecisions
return macrof1,microf1,acc
def singlelabel_eval(y, y_):
if issparse(y_): y_ = y_.toarray().flatten()
macrof1 = f1_score(y, y_, average='macro')
microf1 = f1_score(y, y_, average='micro')
acc = accuracy_score(y, y_)
return macrof1,microf1,acc
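# Small sanity check of the multilabel metrics (toy values; assumes the sparse label matrices
# this module is normally fed with).
from scipy.sparse import csr_matrix
y_true = csr_matrix([[1, 0, 0], [0, 1, 0]])
y_pred = csr_matrix([[1, 0, 0], [0, 0, 0]])
Mf1, mf1, acc = evaluation(y_true, y_pred, classification_type='multilabel')
print(f'macro-F1={Mf1:.3f} micro-F1={mf1:.3f} acc={acc:.3f}')
# the third class has no positive labels and no positive predictions, so its per-class F1 counts as 1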

View File

@ -0,0 +1,65 @@
from sklearn.svm import LinearSVC
from sklearn.model_selection import GridSearchCV
import numpy as np
from joblib import Parallel, delayed
from time import time
class MLSVC:
"""
Multi-Label Support Vector Machine, with individual optimizations per binary problem.
"""
def __init__(self, n_jobs=1, estimator=LinearSVC, *args, **kwargs):
self.n_jobs = n_jobs
self.args = args
self.kwargs = kwargs
self.verbose = False if 'verbose' not in self.kwargs else self.kwargs['verbose']
self.estimator = estimator
def fit(self, X, y, **grid_search_params):
tini = time()
assert len(y.shape)==2 and set(np.unique(y).tolist()) == {0,1}, 'data format is not multi-label'
nD,nC = y.shape
prevalence = np.sum(y, axis=0)
self.svms = np.array([self.estimator(*self.args, **self.kwargs) for _ in range(nC)])
if grid_search_params and grid_search_params['param_grid']:
self._print('grid_search activated with: {}'.format(grid_search_params))
# Grid search cannot be performed if the category prevalence is less than the parameter cv.
            # In those cases we place a plain SVM instead of a GridSearchCV.
cv = 5 if 'cv' not in grid_search_params else grid_search_params['cv']
assert isinstance(cv, int), 'cv must be an int (other policies are not supported yet)'
self.svms = [GridSearchCV(svm_i, refit=True, **grid_search_params) if prevalence[i]>=cv else svm_i
for i,svm_i in enumerate(self.svms)]
for i in np.argwhere(prevalence==0).flatten():
self.svms[i] = TrivialRejector()
self.svms = Parallel(n_jobs=self.n_jobs)(
delayed(self.svms[c].fit)(X,y[:,c]) for c,svm in enumerate(self.svms)
)
self.training_time = time() - tini
def predict(self, X):
return np.vstack(list(map(lambda svmi: svmi.predict(X), self.svms))).T
def predict_proba(self, X):
        return np.vstack([svmi.predict_proba(X)[:, np.argwhere(svmi.classes_ == 1)[0, 0]] for svmi in self.svms]).T
def _print(self, msg):
if self.verbose>0:
print(msg)
def best_params(self):
return [svmi.best_params_ if isinstance(svmi, GridSearchCV) else None for svmi in self.svms]
class TrivialRejector:
def fit(self,*args,**kwargs): return self
def predict(self, X): return np.zeros(X.shape[0])
def predict_proba(self, X): return np.zeros(X.shape[0])
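# Hedged usage sketch of MLSVC with per-class grid search (synthetic data; the parameter grid
# is an assumption for the example, not part of the module).
X_demo = np.random.RandomState(0).rand(100, 10)
y_demo = (np.random.RandomState(1).rand(100, 5) > 0.7).astype(int)
mlsvc = MLSVC(n_jobs=2)
mlsvc.fit(X_demo, y_demo, param_grid={'C': [0.1, 1, 10]}, cv=3)
print(mlsvc.predict(X_demo).shape)  # (100, 5): one binary column per class
print(mlsvc.best_params())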

View File

@ -1,224 +0,0 @@
from copy import deepcopy
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression, Ridge
from sklearn.metrics import f1_score
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import LinearSVC
import quapy as qp
from functional import artificial_prevalence_sampling
from method.aggregative import PACC, CC, EMQ
from method.base import BaseQuantifier
from quapy.data import from_rcv2_lang_file, LabelledCollection
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
import numpy as np
class MultilabelledCollection:
def __init__(self, instances, labels):
assert labels.ndim==2, 'data does not seem to be multilabel'
self.instances = instances
self.labels = labels
self.classes_ = np.arange(labels.shape[1])
@classmethod
def load(cls, path: str, loader_func: callable):
return MultilabelledCollection(*loader_func(path))
def __len__(self):
return self.instances.shape[0]
def prevalence(self):
# return self.labels.mean(axis=0)
pos = self.labels.mean(axis=0)
neg = 1-pos
return np.asarray([neg, pos]).T
def counts(self):
return self.labels.sum(axis=0)
@property
def n_classes(self):
return len(self.classes_)
@property
def binary(self):
return False
def __gen_index(self):
return np.arange(len(self))
def sampling_multi_index(self, size, cat, prev=None):
if prev is None: # no prevalence was indicated; returns an index for uniform sampling
return np.random.choice(len(self), size, replace=size>len(self))
aux = LabelledCollection(self.__gen_index(), self.instances[:,cat])
return aux.sampling_index(size, *[1-prev, prev])
def uniform_sampling_multi_index(self, size):
return np.random.choice(len(self), size, replace=size>len(self))
def uniform_sampling(self, size):
unif_index = self.uniform_sampling_multi_index(size)
return self.sampling_from_index(unif_index)
def sampling(self, size, category, prev=None):
prev_index = self.sampling_multi_index(size, category, prev)
return self.sampling_from_index(prev_index)
def sampling_from_index(self, index):
documents = self.instances[index]
labels = self.labels[index, :]
return MultilabelledCollection(documents, labels)
def train_test_split(self, train_prop=0.6, random_state=None):
tr_docs, te_docs, tr_labels, te_labels = \
train_test_split(self.instances, self.labels, train_size=train_prop, random_state=random_state)
return MultilabelledCollection(tr_docs, tr_labels), MultilabelledCollection(te_docs, te_labels)
def artificial_sampling_generator(self, sample_size, category, n_prevalences=101, repeats=1):
dimensions = 2
for prevs in artificial_prevalence_sampling(dimensions, n_prevalences, repeats):
yield self.sampling(sample_size, category, prevs[1])
def artificial_sampling_index_generator(self, sample_size, category, n_prevalences=101, repeats=1):
dimensions = 2
for prevs in artificial_prevalence_sampling(dimensions, n_prevalences, repeats):
yield self.sampling_multi_index(sample_size, category, prevs[1])
def natural_sampling_generator(self, sample_size, repeats=100):
for _ in range(repeats):
yield self.uniform_sampling(sample_size)
def natural_sampling_index_generator(self, sample_size, repeats=100):
for _ in range(repeats):
yield self.uniform_sampling_multi_index(sample_size)
def asLabelledCollection(self, category):
return LabelledCollection(self.instances, self.labels[:,category])
def genLabelledCollections(self):
for c in self.classes_:
yield self.asLabelledCollection(c)
@property
def Xy(self):
return self.instances, self.labels
class MultilabelQuantifier:
def __init__(self, q:BaseQuantifier):
self.q = q
self.estimators = {}
def fit(self, data:MultilabelledCollection):
self.classes_ = data.classes_
for cat, lc in enumerate(data.genLabelledCollections()):
self.estimators[cat] = deepcopy(self.q).fit(lc)
return self
def quantify(self, instances):
pos_prevs = np.zeros(len(self.classes_), dtype=float)
for c in self.classes_:
pos_prevs[c] = self.estimators[c].quantify(instances)[1]
neg_prevs = 1-pos_prevs
return np.asarray([neg_prevs, pos_prevs]).T
class MultilabelRegressionQuantification:
def __init__(self, base_quantifier=CC(LinearSVC()), regression='ridge', n_samples=500, sample_size=500):
self.estimator = MultilabelQuantifier(base_quantifier)
self.regression = regression
self.n_samples = n_samples
self.sample_size = sample_size
def fit(self, data:MultilabelledCollection):
self.classes_ = data.classes_
tr, te = data.train_test_split()
self.estimator.fit(tr)
Xs = []
ys = []
for sample in te.natural_sampling_generator(sample_size=self.sample_size, repeats=self.n_samples):
ys.append(sample.prevalence()[:,1])
Xs.append(self.estimator.quantify(sample.instances)[:,1])
Xs = np.asarray(Xs)
ys = np.asarray(ys)
print(f'Xs in {Xs.shape}')
print(f'ys in {ys.shape}')
self.reg = Ridge().fit(Xs, ys) #normalize?
return self
def quantify(self, instances):
Xs = self.estimator.quantify(instances)[:,1].reshape(1,-1)
adjusted = self.reg.predict(Xs)
adjusted = np.clip(adjusted, 0, 1)
adjusted = adjusted.flatten()
neg_prevs = 1-adjusted
return np.asarray([neg_prevs, adjusted]).T
# read documents
path = f'./crosslingual_data/rcv12/en.small.txt'
docs, cats = from_rcv2_lang_file(path)
# split train-test
tr_docs, te_docs, tr_cats, te_cats = train_test_split(docs, cats, test_size=0.2, random_state=42)
# generate Y matrices
mlb = MultiLabelBinarizer()
ytr = mlb.fit_transform([cats.split(' ') for cats in tr_cats])
yte = mlb.transform([cats.split(' ') for cats in te_cats])
# retain 10 most populated categories
most_populated = np.argsort(ytr.sum(axis=0))[-10:]
ytr = ytr[:,most_populated]
yte = yte[:,most_populated]
tfidf = TfidfVectorizer(min_df=5)
Xtr = tfidf.fit_transform(tr_docs)
Xte = tfidf.transform(te_docs)
train = MultilabelledCollection(Xtr, ytr)
test = MultilabelledCollection(Xte, yte)
model = MultilabelQuantifier(PACC(LogisticRegression()))
model.fit(train)
estim_prevs = model.quantify(test.instances)
true_prevs = test.prevalence()
print('PACC:')
print(estim_prevs)
print(true_prevs)
model = MultilabelQuantifier(CC(LogisticRegression()))
model.fit(train)
estim_prevs = model.quantify(test.instances)
true_prevs = test.prevalence()
print('CC:')
print(estim_prevs)
print(true_prevs)
# model = MultilabelQuantifier(EMQ(LogisticRegression()))
# model.fit(train)
# estim_prevs = model.quantify(test.instances)
# true_prevs = test.prevalence()
# print('EMQ:')
# print(estim_prevs)
# print(true_prevs)
model = MultilabelRegressionQuantification(sample_size=200, n_samples=500)
model.fit(train)
estim_prevs = model.quantify(test.instances)
true_prevs = test.prevalence()
print('MRQ:')
print(estim_prevs)
print(true_prevs)
qp.environ['SAMPLE_SIZE']=100
mae = qp.error.mae(true_prevs, estim_prevs)
print(mae)

View File

@ -3,6 +3,13 @@ from scipy.sparse import dok_matrix
from tqdm import tqdm
def from_rcv2_lang_file(path, encoding='utf-8'):
lines = open(path, 'rt', encoding=encoding).readlines()
parts = [l.split('\t') for l in lines]
docs, cats = list(zip(*[(parts_i[1], parts_i[2]) for parts_i in parts]))
return docs, cats
def from_text(path, encoding='utf-8'):
"""
    Reads a labelled collection of documents.

View File

@ -105,7 +105,7 @@ def _predict_from_indexes(
estim_prevalence = quantification_func(sample.instances)
return true_prevalence, estim_prevalence
pbar = tqdm(indexes, desc='[artificial sampling protocol] generating predictions') if verbose else indexes
pbar = tqdm(indexes, desc='[sampling protocol] generating predictions') if verbose else indexes
results = qp.util.parallel(_predict_prevalences, pbar, n_jobs=n_jobs)
true_prevalences, estim_prevalences = zip(*results)

View File

@ -227,7 +227,7 @@ def _delayed_new_instance(args):
if val_split is not None:
if isinstance(val_split, float):
assert 0 < val_split < 1, 'val_split should be in (0,1)'
data, val_split = data.split_stratified(train_prop=1 - val_split)
data, val_split = data.train_test_split(train_prop=1 - val_split)
sample_index = data.sampling_index(sample_size, *prev)
sample = data.sampling_from_index(sample_index)

View File

@ -73,7 +73,7 @@ class QuaNetTrainer(BaseQuantifier):
if fit_learner:
classifier_data, unused_data = data.split_stratified(0.4)
train_data, valid_data = unused_data.split_stratified(0.66) # 0.66 split of 60% makes 40% and 20%
train_data, valid_data = unused_data.train_test_split(0.66) # 0.66 split of 60% makes 40% and 20%
self.learner.fit(*classifier_data.Xy)
else:
classifier_data = None

View File

@ -97,7 +97,7 @@ class GridSearchQ(BaseQuantifier):
return training, validation
elif isinstance(validation, float):
assert 0. < validation < 1., 'validation proportion should be in (0,1)'
training, validation = training.split_stratified(train_prop=1 - validation)
training, validation = training.train_test_split(train_prop=1 - validation)
return training, validation
else:
raise ValueError(f'"validation" must either be a LabelledCollection or a float in (0,1) indicating the'