forked from moreo/QuaPy
trying stuff with multilabels
commit a4fea89122
@ -0,0 +1,229 @@
import os,sys
from sklearn.datasets import get_data_home, fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
from jrcacquis_reader import fetch_jrcacquis, JRCAcquis_Document
from ohsumed_reader import fetch_ohsumed50k
from reuters21578_reader import fetch_reuters21578
from rcv_reader import fetch_RCV1
from wipo_reader import fetch_WIPOgamma, WipoGammaDocument
import pickle
import numpy as np
from tqdm import tqdm
from os.path import join
import re


def init_vectorizer():
    return TfidfVectorizer(min_df=5, sublinear_tf=True)


class Dataset:

    dataset_available = {'reuters21578', '20newsgroups', 'ohsumed', 'rcv1', 'jrcall',
                         'wipo-sl-mg', 'wipo-ml-mg', 'wipo-sl-sc', 'wipo-ml-sc'}

    def __init__(self, name):
        assert name in Dataset.dataset_available, f'dataset {name} is not available'
        if name=='reuters21578':
            self._load_reuters()
        elif name == '20newsgroups':
            self._load_20news()
        elif name == 'rcv1':
            self._load_rcv1()
        elif name == 'ohsumed':
            self._load_ohsumed()
        elif name == 'jrcall':
            self._load_jrc(version='all')
        elif name == 'wipo-sl-mg':
            self._load_wipo('singlelabel', 'maingroup')
        elif name == 'wipo-ml-mg':
            self._load_wipo('multilabel', 'maingroup')
        elif name == 'wipo-sl-sc':
            self._load_wipo('singlelabel', 'subclass')
        elif name == 'wipo-ml-sc':
            self._load_wipo('multilabel', 'subclass')

        self.nC = self.devel_labelmatrix.shape[1]
        self._vectorizer = init_vectorizer()
        self._vectorizer.fit(self.devel_raw)
        self.vocabulary = self._vectorizer.vocabulary_

    def show(self):
        nTr_docs = len(self.devel_raw)
        nTe_docs = len(self.test_raw)
        nfeats = len(self._vectorizer.vocabulary_)
        nC = self.devel_labelmatrix.shape[1]
        nD = nTr_docs + nTe_docs
        print(f'{self.classification_type}, nD={nD}=({nTr_docs}+{nTe_docs}), nF={nfeats}, nC={nC}')
        return self

    def _load_reuters(self):
        data_path = os.path.join(get_data_home(), 'reuters21578')
        devel = fetch_reuters21578(subset='train', data_path=data_path)
        test = fetch_reuters21578(subset='test', data_path=data_path)

        self.classification_type = 'multilabel'
        self.devel_raw, self.test_raw = mask_numbers(devel.data), mask_numbers(test.data)
        self.devel_labelmatrix, self.test_labelmatrix = _label_matrix(devel.target, test.target)
        self.devel_target, self.test_target = self.devel_labelmatrix, self.test_labelmatrix

    def _load_rcv1(self):
        data_path = '../datasets/RCV1-v2/unprocessed_corpus'  # TODO: check when missing
        devel = fetch_RCV1(subset='train', data_path=data_path)
        test = fetch_RCV1(subset='test', data_path=data_path)

        self.classification_type = 'multilabel'
        self.devel_raw, self.test_raw = mask_numbers(devel.data), mask_numbers(test.data)
        self.devel_labelmatrix, self.test_labelmatrix = _label_matrix(devel.target, test.target)
        self.devel_target, self.test_target = self.devel_labelmatrix, self.test_labelmatrix

    def _load_jrc(self, version):
        assert version in ['300','all'], 'allowed versions are "300" or "all"'
        data_path = "../datasets/JRC_Acquis_v3"
        tr_years = list(range(1986, 2006))
        te_years = [2006]
        if version=='300':
            training_docs, tr_cats = fetch_jrcacquis(data_path=data_path, years=tr_years, cat_threshold=1, most_frequent=300)
            test_docs, te_cats = fetch_jrcacquis(data_path=data_path, years=te_years, cat_filter=tr_cats)
        else:
            training_docs, tr_cats = fetch_jrcacquis(data_path=data_path, years=tr_years, cat_threshold=1)
            test_docs, te_cats = fetch_jrcacquis(data_path=data_path, years=te_years, cat_filter=tr_cats)
        print(f'load jrc-acquis (English) with {len(tr_cats)} tr categories ({len(te_cats)} te categories)')

        devel_data = JRCAcquis_Document.get_text(training_docs)
        test_data = JRCAcquis_Document.get_text(test_docs)
        devel_target = JRCAcquis_Document.get_target(training_docs)
        test_target = JRCAcquis_Document.get_target(test_docs)

        self.classification_type = 'multilabel'
        self.devel_raw, self.test_raw = mask_numbers(devel_data), mask_numbers(test_data)
        self.devel_labelmatrix, self.test_labelmatrix = _label_matrix(devel_target, test_target)
        self.devel_target, self.test_target = self.devel_labelmatrix, self.test_labelmatrix

    def _load_ohsumed(self):
        data_path = os.path.join(get_data_home(), 'ohsumed50k')
        devel = fetch_ohsumed50k(subset='train', data_path=data_path)
        test = fetch_ohsumed50k(subset='test', data_path=data_path)

        self.classification_type = 'multilabel'
        self.devel_raw, self.test_raw = mask_numbers(devel.data), mask_numbers(test.data)
        self.devel_labelmatrix, self.test_labelmatrix = _label_matrix(devel.target, test.target)
        self.devel_target, self.test_target = self.devel_labelmatrix, self.test_labelmatrix

    def _load_20news(self):
        metadata = ('headers', 'footers', 'quotes')
        devel = fetch_20newsgroups(subset='train', remove=metadata)
        test = fetch_20newsgroups(subset='test', remove=metadata)
        self.classification_type = 'singlelabel'
        self.devel_raw, self.test_raw = mask_numbers(devel.data), mask_numbers(test.data)
        self.devel_target, self.test_target = devel.target, test.target
        self.devel_labelmatrix, self.test_labelmatrix = _label_matrix(self.devel_target.reshape(-1,1), self.test_target.reshape(-1,1))

    def _load_fasttext_data(self, name):
        data_path = '../datasets/fastText'
        self.classification_type = 'singlelabel'
        name = name.replace('-','_')
        train_file = join(data_path, f'{name}.train')
        assert os.path.exists(train_file), f'file {name} not found, please place the fasttext data in {data_path}'  # todo: or allow to specify the path
        self.devel_raw, self.devel_target = load_fasttext_format(train_file)
        self.test_raw, self.test_target = load_fasttext_format(join(data_path, f'{name}.test'))
        self.devel_raw = mask_numbers(self.devel_raw)
        self.test_raw = mask_numbers(self.test_raw)
        self.devel_labelmatrix, self.test_labelmatrix = _label_matrix(self.devel_target.reshape(-1, 1), self.test_target.reshape(-1, 1))

    def _load_wipo(self, classmode, classlevel):
        assert classmode in {'singlelabel', 'multilabel'}, 'available class_mode are sl (single-label) or ml (multi-label)'
        data_path = '../datasets/WIPO/wipo-gamma/en'
        data_proc = '../datasets/WIPO-extracted'

        devel = fetch_WIPOgamma(subset='train', classification_level=classlevel, data_home=data_path, extracted_path=data_proc, text_fields=['abstract'])
        test = fetch_WIPOgamma(subset='test', classification_level=classlevel, data_home=data_path, extracted_path=data_proc, text_fields=['abstract'])

        devel_data = [d.text for d in devel]
        test_data = [d.text for d in test]
        self.devel_raw, self.test_raw = mask_numbers(devel_data), mask_numbers(test_data)

        self.classification_type = classmode
        if classmode == 'multilabel':
            devel_target = [d.all_labels for d in devel]
            test_target = [d.all_labels for d in test]
            self.devel_labelmatrix, self.test_labelmatrix = _label_matrix(devel_target, test_target)
            self.devel_target, self.test_target = self.devel_labelmatrix, self.test_labelmatrix
        else:
            devel_target = [d.main_label for d in devel]
            test_target = [d.main_label for d in test]
            # only classes with at least one training document are kept
            class_id = {labelname: index for index, labelname in enumerate(sorted(set(devel_target)))}
            devel_target = np.array([class_id[id] for id in devel_target]).astype(int)
            test_target = np.array([class_id.get(id, None) for id in test_target])
            if None in test_target:
                print(f'deleting {(test_target==None).sum()} test documents without valid categories')
                keep_pos = test_target!=None
                self.test_raw = (np.asarray(self.test_raw)[keep_pos]).tolist()
                test_target = test_target[keep_pos]
            test_target = test_target.astype(int)
            self.devel_target, self.test_target = devel_target, test_target
            self.devel_labelmatrix, self.test_labelmatrix = _label_matrix(self.devel_target.reshape(-1, 1), self.test_target.reshape(-1, 1))

    def vectorize(self):
        if not hasattr(self, 'Xtr') or not hasattr(self, 'Xte'):
            self.Xtr = self._vectorizer.transform(self.devel_raw)
            self.Xte = self._vectorizer.transform(self.test_raw)
            self.Xtr.sort_indices()
            self.Xte.sort_indices()
        return self.Xtr, self.Xte

    def analyzer(self):
        return self._vectorizer.build_analyzer()

    @classmethod
    def load(cls, dataset_name, pickle_path=None):
        if pickle_path:
            if os.path.exists(pickle_path):
                print(f'loading pickled dataset from {pickle_path}')
                dataset = pickle.load(open(pickle_path, 'rb'))
            else:
                print(f'fetching dataset and dumping it into {pickle_path}')
                dataset = Dataset(name=dataset_name)
                print('vectorizing for faster processing')
                dataset.vectorize()
                print('dumping')
                pickle.dump(dataset, open(pickle_path, 'wb'), pickle.HIGHEST_PROTOCOL)
        else:
            print(f'loading dataset {dataset_name}')
            dataset = Dataset(name=dataset_name)

        print('[Done]')
        return dataset


def _label_matrix(tr_target, te_target):
    mlb = MultiLabelBinarizer(sparse_output=True)
    ytr = mlb.fit_transform(tr_target)
    yte = mlb.transform(te_target)
    print(mlb.classes_)
    return ytr, yte


def load_fasttext_format(path):
    print(f'loading {path}')
    labels, docs = [], []
    for line in tqdm(open(path, 'rt').readlines()):
        space = line.strip().find(' ')
        label = int(line[:space].replace('__label__','')) - 1
        labels.append(label)
        docs.append(line[space+1:])
    labels = np.asarray(labels, dtype=int)
    return docs, labels


def mask_numbers(data, number_mask='numbermask'):
    mask = re.compile(r'\b[0-9][0-9.,-]*\b')
    masked = []
    for text in tqdm(data, desc='masking numbers'):
        masked.append(mask.sub(number_mask, text))
    return masked
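# --- Added usage sketch (not part of the original commit). It assumes the Reuters-21578 corpus can
# be downloaded by fetch_reuters21578 (see reuters21578_reader) and that '../pickles' is writable.
if __name__ == '__main__':
    os.makedirs('../pickles', exist_ok=True)
    dataset = Dataset.load('reuters21578', pickle_path='../pickles/reuters21578.pickle').show()
    Xtr, Xte = dataset.vectorize()  # tf-idf matrices with sorted indices
    ytr, yte = dataset.devel_labelmatrix, dataset.test_labelmatrix  # sparse binary label matrices
    print(f'Xtr={Xtr.shape} ytr={ytr.shape} | Xte={Xte.shape} yte={yte.shape}')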
@ -0,0 +1,263 @@
import os, sys
from os.path import join
import tarfile
import xml.etree.ElementTree as ET
from sklearn.datasets import get_data_home
import pickle
import rdflib
from rdflib.namespace import RDF, SKOS
from rdflib import URIRef
import zipfile
from collections import Counter
from tqdm import tqdm
from random import shuffle
from util.file import *


class JRCAcquis_Document:
    def __init__(self, id, name, lang, year, head, body, categories):
        self.id = id
        self.parallel_id = name
        self.lang = lang
        self.year = year
        self.text = body if not head else head + "\n" + body
        self.categories = categories

    @classmethod
    def get_text(cls, jrc_documents):
        return [d.text for d in jrc_documents]

    @classmethod
    def get_target(cls, jrc_documents):
        return [d.categories for d in jrc_documents]


# this is a workaround... for some reason, acutes are codified in a non-standard manner in titles;
# however, the title often appears as the first paragraph of the text/body (with standard
# codification), so it might be preferable not to read the header after all (as here by default)
def _proc_acute(text):
    for ch in ['a','e','i','o','u']:
        text = text.replace('%'+ch+'acute%', ch)
    return text


def parse_document(file, year, head=False):
    root = ET.parse(file).getroot()

    doc_name = root.attrib['n']  # e.g., '22006A0211(01)'
    doc_lang = root.attrib['lang']  # e.g., 'es'
    doc_id = root.attrib['id']  # e.g., 'jrc22006A0211_01-es'
    doc_categories = [cat.text for cat in root.findall('.//teiHeader/profileDesc/textClass/classCode[@scheme="eurovoc"]')]
    doc_head = _proc_acute(root.find('.//text/body/head').text) if head else ''
    doc_body = '\n'.join([p.text for p in root.findall('.//text/body/div[@type="body"]/p')])

    def raise_if_empty(field, from_file):
        if isinstance(field, str):
            if not field.strip():
                raise ValueError("Empty field in file %s" % from_file)

    raise_if_empty(doc_name, file)
    raise_if_empty(doc_lang, file)
    raise_if_empty(doc_id, file)
    if head: raise_if_empty(doc_head, file)
    raise_if_empty(doc_body, file)

    return JRCAcquis_Document(id=doc_id, name=doc_name, lang=doc_lang, year=year, head=doc_head, body=doc_body, categories=doc_categories)


# filters out documents that do not contain any category in the cat_filter list, and removes from
# the remaining documents all labels that are not in cat_filter
def _filter_by_category(doclist, cat_filter):
    if not isinstance(cat_filter, frozenset):
        cat_filter = frozenset(cat_filter)
    filtered = []
    for doc in doclist:
        doc.categories = list(cat_filter & set(doc.categories))
        if doc.categories:
            doc.categories.sort()
            filtered.append(doc)
    print("filtered out %d documents without categories in the filter list" % (len(doclist) - len(filtered)))
    return filtered


# filters out categories with no more than cat_threshold documents, and then discards the
# documents left without any remaining category
def _filter_by_frequency(doclist, cat_threshold):
    cat_count = Counter()
    for d in doclist:
        cat_count.update(d.categories)

    freq_categories = [cat for cat, count in cat_count.items() if count > cat_threshold]
    freq_categories.sort()
    return _filter_by_category(doclist, freq_categories), freq_categories


# selects the most_frequent most common categories, and then discards the documents left without
# any remaining category
def _most_common(doclist, most_frequent):
    cat_count = Counter()
    for d in doclist:
        cat_count.update(d.categories)

    freq_categories = [cat for cat, count in cat_count.most_common(most_frequent)]
    freq_categories.sort()
    return _filter_by_category(doclist, freq_categories), freq_categories


def _get_categories(request):
    final_cats = set()
    for d in request:
        final_cats.update(d.categories)
    return list(final_cats)


def fetch_jrcacquis(lang='en', data_path=None, years=None, ignore_unclassified=True,
                    cat_filter=None, cat_threshold=0, most_frequent=-1,
                    DOWNLOAD_URL_BASE='http://optima.jrc.it/Acquis/JRC-Acquis.3.0/corpus/'):

    if not data_path:
        data_path = get_data_home()

    if not os.path.exists(data_path):
        os.mkdir(data_path)

    request = []
    total_read = 0
    file_name = 'jrc-' + lang + '.tgz'
    archive_path = join(data_path, file_name)

    if not os.path.exists(archive_path):
        print("downloading language-specific dataset (once and for all) into %s" % data_path)
        DOWNLOAD_URL = join(DOWNLOAD_URL_BASE, file_name)
        download_file(DOWNLOAD_URL, archive_path)
        print("untarring dataset...")
        tarfile.open(archive_path, 'r:gz').extractall(data_path)

    documents_dir = join(data_path, lang)

    print("Reading documents...")
    read = 0
    for dir in list_dirs(documents_dir):
        year = int(dir)
        if years is None or year in years:
            year_dir = join(documents_dir, dir)
            l_y_documents = []
            all_documents = list_files(year_dir)
            empty = 0
            pbar = tqdm(enumerate(all_documents))
            for i, doc_file in pbar:
                try:
                    jrc_doc = parse_document(join(year_dir, doc_file), year)
                except ValueError:
                    jrc_doc = None

                if jrc_doc and (not ignore_unclassified or jrc_doc.categories):
                    l_y_documents.append(jrc_doc)
                else: empty += 1
                read += 1
                pbar.set_description(f'from {year_dir}: discarded {empty} documents without categories or with empty fields')
            request += l_y_documents
    print("Read %d documents for language %s\n" % (read, lang))
    total_read += read

    final_cats = _get_categories(request)

    if cat_filter:
        request = _filter_by_category(request, cat_filter)
        final_cats = _get_categories(request)
    if cat_threshold > 0:
        request, final_cats = _filter_by_frequency(request, cat_threshold)
    if most_frequent != -1 and len(final_cats) > most_frequent:
        request, final_cats = _most_common(request, most_frequent)

    return request, final_cats


def print_cat_analysis(request):
    cat_count = Counter()
    for d in request:
        cat_count.update(d.categories)
    print("Number of active categories: {}".format(len(cat_count)))
    print(cat_count.most_common())


# inspects the EuroVoc thesaurus in order to select a subset of categories;
# currently, the policies 'all' (every SKOS concept), 'broadest' (concepts with no broader
# concept), and 'leaves' (concepts that are not broader than any other concept) are implemented
def inspect_eurovoc(data_path, eurovoc_skos_core_concepts_filename='eurovoc_in_skos_core_concepts.rdf',
                    eurovoc_url="http://publications.europa.eu/mdr/resource/thesaurus/eurovoc-20160630-0/skos/eurovoc_in_skos_core_concepts.zip",
                    select="broadest"):

    fullpath_pickle = join(data_path, select+'_concepts.pickle')
    if os.path.exists(fullpath_pickle):
        print("Pickled object found in %s. Loading it." % fullpath_pickle)
        return pickle.load(open(fullpath_pickle, 'rb'))

    fullpath = join(data_path, eurovoc_skos_core_concepts_filename)
    if not os.path.exists(fullpath):
        print("Path %s does not exist. Trying to download the skos EuroVoc file from %s" % (data_path, eurovoc_url))
        zip_path = fullpath + '.zip'
        download_file(eurovoc_url, zip_path)
        print("Unzipping file...")
        zipped = zipfile.ZipFile(zip_path, 'r')
        zipped.extract("eurovoc_in_skos_core_concepts.rdf", data_path)
        zipped.close()

    print("Parsing %s" % fullpath)
    g = rdflib.Graph()
    g.parse(location=fullpath, format="application/rdf+xml")

    if select == "all":
        print("Selecting all concepts")
        all_concepts = list(g.subjects(RDF.type, SKOS.Concept))
        all_concepts = [c.toPython().split('/')[-1] for c in all_concepts]
        all_concepts.sort()
        selected_concepts = all_concepts
    elif select=="broadest":
        print("Selecting broadest concepts (those without any other broader concept linked to them)")
        all_concepts = set(g.subjects(RDF.type, SKOS.Concept))
        narrower_concepts = set(g.subjects(SKOS.broader, None))
        broadest_concepts = [c.toPython().split('/')[-1] for c in (all_concepts - narrower_concepts)]
        broadest_concepts.sort()
        selected_concepts = broadest_concepts
    elif select=="leaves":
        print("Selecting leaf concepts (those not linked as broader of any other concept)")
        all_concepts = set(g.subjects(RDF.type, SKOS.Concept))
        broad_concepts = set(g.objects(None, SKOS.broader))
        leave_concepts = [c.toPython().split('/')[-1] for c in (all_concepts - broad_concepts)]
        leave_concepts.sort()
        selected_concepts = leave_concepts
    else:
        raise ValueError("Selection policy %s is not currently supported" % select)

    print("%d %s concepts found" % (len(selected_concepts), select))
    print("Pickling concept list for faster further requests in %s" % fullpath_pickle)
    pickle.dump(selected_concepts, open(fullpath_pickle, 'wb'), pickle.HIGHEST_PROTOCOL)

    return selected_concepts


if __name__ == '__main__':

    # example code
    train_years = list(range(1986, 2006))
    test_years = [2006]
    cat_policy = 'all'  # 'leaves'
    most_common_cat = 300
    JRC_DATAPATH = "../datasets/JRC_Acquis_v3"
    cat_list = inspect_eurovoc(JRC_DATAPATH, select=cat_policy)

    training_docs, tr_cats = fetch_jrcacquis(lang='en', data_path=JRC_DATAPATH, years=train_years,
                                             cat_filter=None, cat_threshold=1,
                                             most_frequent=most_common_cat)
    test_docs, te_cats = fetch_jrcacquis(lang='en', data_path=JRC_DATAPATH, years=test_years,
                                         cat_filter=tr_cats, cat_threshold=1)
    # training_cats = jrc_get_categories(training_docs)
    # test_cats = jrc_get_categories(test_docs)
    # intersection_cats = [c for c in training_cats if c in test_cats]

    # training_docs = jrc_filter_by_category(training_docs, intersection_cats)
    # test_docs = jrc_filter_by_category(test_docs, intersection_cats)

    print(f'JRC-train: {len(training_docs)} documents')
    print(f'JRC-test: {len(test_docs)} documents')

    print_cat_analysis(training_docs)
    print_cat_analysis(test_docs)

    """
    JRC-train: 12615 documents, 300 cats
    JRC-test: 7055 documents, 300 cats
    """
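# --- Added note (not part of the original commit): util/file.py is imported throughout these
# readers but is not included in this diff. The sketch below shows the behavior assumed here for
# the helpers being used; the real implementations may differ.
#
#   import os, urllib.request
#
#   def create_if_not_exist(path):
#       os.makedirs(path, exist_ok=True)
#
#   def list_dirs(path):
#       return sorted(d for d in os.listdir(path) if os.path.isdir(os.path.join(path, d)))
#
#   def list_files(path):
#       return sorted(f for f in os.listdir(path) if os.path.isfile(os.path.join(path, f)))
#
#   def download_file(url, destination):
#       urllib.request.urlretrieve(url, destination)
#
#   def download_file_if_not_exists(url, destination):
#       if not os.path.exists(destination):
#           download_file(url, destination)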
@ -0,0 +1,5 @@
class LabelledDocuments:
    def __init__(self, data, target, target_names):
        self.data = data
        self.target = target
        self.target_names = target_names
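# --- Added example (not part of the original commit): LabelledDocuments is a plain container in
# the style of sklearn's Bunch objects; target holds one list of class indexes per document, e.g.
#   docs = LabelledDocuments(data=['wheat prices rise'], target=[[1, 2]], target_names=['acq', 'grain', 'wheat'])
#   assert len(docs.data) == len(docs.target)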
@ -0,0 +1,63 @@
import os
import pickle
import tarfile
from os.path import join
import urllib.request
from data.labeled import LabelledDocuments
from util.file import create_if_not_exist, download_file_if_not_exists
import math


def fetch_ohsumed50k(data_path=None, subset='train', train_test_split=0.7):
    _dataname = 'ohsumed50k'
    if data_path is None:
        data_path = join(os.path.expanduser('~'), _dataname)
    create_if_not_exist(data_path)

    pickle_file = join(data_path, _dataname + '.' + subset + str(train_test_split) + '.pickle')
    if not os.path.exists(pickle_file):
        DOWNLOAD_URL = 'http://disi.unitn.it/moschitti/corpora/ohsumed-all-docs.tar.gz'
        archive_path = os.path.join(data_path, 'ohsumed-all-docs.tar.gz')
        download_file_if_not_exists(DOWNLOAD_URL, archive_path)
        untardir = 'ohsumed-all'
        if not os.path.exists(os.path.join(data_path, untardir)):
            print("untarring ohsumed...")
            tarfile.open(archive_path, 'r:gz').extractall(data_path)

        target_names = []
        doc_classes = dict()
        class_docs = dict()
        content = dict()
        doc_ids = set()
        for cat_id in os.listdir(join(data_path, untardir)):
            target_names.append(cat_id)
            class_docs[cat_id] = []
            for doc_id in os.listdir(join(data_path, untardir, cat_id)):
                doc_ids.add(doc_id)
                text_content = open(join(data_path, untardir, cat_id, doc_id), 'r').read()
                if doc_id not in doc_classes: doc_classes[doc_id] = []
                doc_classes[doc_id].append(cat_id)
                if doc_id not in content: content[doc_id] = text_content
                class_docs[cat_id].append(doc_id)
        target_names.sort()
        print('Read %d different documents' % len(doc_ids))

        splitdata = dict({'train': [], 'test': []})
        for cat_id in target_names:
            free_docs = [d for d in class_docs[cat_id] if (d not in splitdata['train'] and d not in splitdata['test'])]
            if len(free_docs) > 0:
                split_point = int(math.floor(len(free_docs) * train_test_split))
                splitdata['train'].extend(free_docs[:split_point])
                splitdata['test'].extend(free_docs[split_point:])
        for split in ['train', 'test']:
            dataset = LabelledDocuments([], [], target_names)
            for doc_id in splitdata[split]:
                dataset.data.append(content[doc_id])
                dataset.target.append([target_names.index(cat_id) for cat_id in doc_classes[doc_id]])
            pickle.dump(dataset,
                        open(join(data_path, _dataname + '.' + split + str(train_test_split) + '.pickle'), 'wb'),
                        protocol=pickle.HIGHEST_PROTOCOL)

    print(pickle_file)
    return pickle.load(open(pickle_file, 'rb'))
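# --- Added example (not part of the original commit): fetch both splits and inspect their sizes.
# It assumes the Moschitti download URL above is still reachable, or that the archive has already
# been placed in data_path.
if __name__ == '__main__':
    train = fetch_ohsumed50k(subset='train')
    test = fetch_ohsumed50k(subset='test')
    print(f'ohsumed50k: {len(train.data)} training and {len(test.data)} test documents, '
          f'{len(train.target_names)} classes (multilabel targets as lists of class indexes)')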
@ -0,0 +1,152 @@
from zipfile import ZipFile
import xml.etree.ElementTree as ET
from data.labeled import LabelledDocuments
from util.file import list_files
from os.path import join, exists
from util.file import download_file_if_not_exists
import re
from collections import Counter


RCV1_TOPICHIER_URL = "http://www.ai.mit.edu/projects/jmlr/papers/volume5/lewis04a/a02-orig-topics-hierarchy/rcv1.topics.hier.orig"
RCV1_BASE_URL = "http://www.daviddlewis.com/resources/testcollections/rcv1/"

rcv1_test_data_gz = ['lyrl2004_tokens_test_pt0.dat.gz',
                     'lyrl2004_tokens_test_pt1.dat.gz',
                     'lyrl2004_tokens_test_pt2.dat.gz',
                     'lyrl2004_tokens_test_pt3.dat.gz']

rcv1_train_data_gz = ['lyrl2004_tokens_train.dat.gz']

rcv1_doc_cats_data_gz = 'rcv1-v2.topics.qrels.gz'


class RCV_Document:
    def __init__(self, id, text, categories, date=''):
        self.id = id
        self.date = date
        self.text = text
        self.categories = categories


class IDRangeException(Exception): pass


nwords = []


def parse_document(xml_content, valid_id_range=None):
    root = ET.fromstring(xml_content)

    doc_id = root.attrib['itemid']
    if valid_id_range is not None:
        if not valid_id_range[0] <= int(doc_id) <= valid_id_range[1]:
            raise IDRangeException

    doc_categories = [cat.attrib['code'] for cat in
                      root.findall('.//metadata/codes[@class="bip:topics:1.0"]/code')]

    doc_date = root.attrib['date']
    doc_title = root.find('.//title').text
    doc_headline = root.find('.//headline').text
    doc_body = '\n'.join([p.text for p in root.findall('.//text/p')])

    if not doc_body:
        raise ValueError('Empty document')

    if doc_title is None: doc_title = ''
    if doc_headline is None or doc_headline in doc_title: doc_headline = ''
    text = '\n'.join([doc_title, doc_headline, doc_body]).strip()

    return RCV_Document(id=doc_id, text=text, categories=doc_categories, date=doc_date)


def fetch_RCV1(data_path, subset='all'):

    assert subset in ['train', 'test', 'all'], 'split should either be "train", "test", or "all"'

    request = []
    labels = set()
    read_documents = 0

    training_documents = 23149
    test_documents = 781265

    if subset == 'all':
        split_range = (2286, 810596)
        expected = training_documents + test_documents
    elif subset == 'train':
        split_range = (2286, 26150)
        expected = training_documents
    else:
        split_range = (26151, 810596)
        expected = test_documents

    # global nwords
    # nwords=[]
    for part in list_files(data_path):
        if not re.match(r'\d+\.zip', part): continue
        target_file = join(data_path, part)
        assert exists(target_file), \
            "You don't seem to have the file " + part + " in " + data_path + ", and the RCV1 corpus cannot be downloaded" + \
            " w/o a formal permission. Please, refer to " + RCV1_BASE_URL + " for more information."
        zipfile = ZipFile(target_file)
        for xmlfile in zipfile.namelist():
            xmlcontent = zipfile.open(xmlfile).read()
            try:
                doc = parse_document(xmlcontent, valid_id_range=split_range)
                labels.update(doc.categories)
                request.append(doc)
                read_documents += 1
            except (IDRangeException, ValueError) as e:
                pass
            print('\r[{}] read {} documents'.format(part, len(request)), end='')
            if read_documents == expected: break
        if read_documents == expected: break

    print()
    # print('ave:{} std {} min {} max {}'.format(np.mean(nwords), np.std(nwords), np.min(nwords), np.max(nwords)))

    return LabelledDocuments(data=[d.text for d in request], target=[d.categories for d in request], target_names=list(labels))


def fetch_topic_hierarchy(path, topics='all'):
    assert topics in ['all', 'leaves']

    download_file_if_not_exists(RCV1_TOPICHIER_URL, path)
    hierarchy = {}
    for line in open(path, 'rt'):
        parts = line.strip().split()
        parent, child = parts[1], parts[3]
        if parent not in hierarchy:
            hierarchy[parent] = []
        hierarchy[parent].append(child)

    del hierarchy['None']
    del hierarchy['Root']
    print(hierarchy)

    if topics=='all':
        topics = set(hierarchy.keys())
        for parent in hierarchy.keys():
            topics.update(hierarchy[parent])
        return list(topics)
    elif topics=='leaves':
        parents = set(hierarchy.keys())
        childs = set()
        for parent in hierarchy.keys():
            childs.update(hierarchy[parent])
        return list(childs.difference(parents))


if __name__=='__main__':

    # example
    RCV1_PATH = '../../datasets/RCV1-v2/unprocessed_corpus'

    rcv1_train = fetch_RCV1(RCV1_PATH, subset='train')
    rcv1_test = fetch_RCV1(RCV1_PATH, subset='test')

    print('read {} documents in rcv1-train, and {} labels'.format(len(rcv1_train.data), len(rcv1_train.target_names)))
    print('read {} documents in rcv1-test, and {} labels'.format(len(rcv1_test.data), len(rcv1_test.target_names)))

    cat_counts = Counter()
    for doc_cats in rcv1_train.target:
        cat_counts.update(doc_cats)
    print('RCV1', cat_counts)
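# --- Added example (not part of the original commit): unlike the corpus zips, the topic hierarchy
# file is freely downloadable, so fetch_topic_hierarchy can be tried on its own (the parent
# directory of the given path must already exist):
#   leaves = fetch_topic_hierarchy('../../datasets/RCV1-v2/rcv1.topics.hier.orig', topics='leaves')
#   print(f'{len(leaves)} leaf topics in the RCV1 topic hierarchy')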
@ -0,0 +1,189 @@
# Modified version of the code originally implemented by Eustache Diemert <eustache@diemert.fr>
# and @FedericoV <https://github.com/FedericoV/>
# License: BSD 3 clause

import os.path
import re
import tarfile
from sklearn.datasets import get_data_home
from six.moves import html_parser
from six.moves import urllib
import pickle
from glob import glob
import numpy as np
from data.labeled import LabelledDocuments


def _not_in_sphinx():
    # Hack to detect whether we are being run by the sphinx builder
    return '__file__' in globals()


class ReutersParser(html_parser.HTMLParser):
    """Utility class to parse a SGML file, accumulating its documents into train/test splits."""

    def __init__(self, encoding='latin-1', data_path=None):
        self.data_path = data_path
        self.download_if_not_exist()
        self.tr_docs = []
        self.te_docs = []
        html_parser.HTMLParser.__init__(self)
        self._reset()
        self.encoding = encoding
        self.empty_docs = 0

    def handle_starttag(self, tag, attrs):
        method = 'start_' + tag
        getattr(self, method, lambda x: None)(attrs)

    def handle_endtag(self, tag):
        method = 'end_' + tag
        getattr(self, method, lambda: None)()

    def _reset(self):
        self.in_title = 0
        self.in_body = 0
        self.in_topics = 0
        self.in_topic_d = 0
        self.in_unproc_text = 0
        self.title = ""
        self.body = ""
        self.topics = []
        self.topic_d = ""
        self.text = ""

    def parse(self, fd):
        for chunk in fd:
            self.feed(chunk.decode(self.encoding))
        self.close()

    def handle_data(self, data):
        if self.in_body:
            self.body += data
        elif self.in_title:
            self.title += data
        elif self.in_topic_d:
            self.topic_d += data
        elif self.in_unproc_text:
            self.text += data

    def start_reuters(self, attributes):
        topic_attr = attributes[0][1]
        lewissplit_attr = attributes[1][1]
        self.lewissplit = u'unused'
        if topic_attr == u'YES':
            if lewissplit_attr == u'TRAIN':
                self.lewissplit = 'train'
            elif lewissplit_attr == u'TEST':
                self.lewissplit = 'test'

    def end_reuters(self):
        self.body = re.sub(r'\s+', r' ', self.body)
        if self.lewissplit != u'unused':
            parsed_doc = {'title': self.title, 'body': self.body, 'unproc': self.text, 'topics': self.topics}
            if (self.title + self.body + self.text).strip() == '':
                self.empty_docs += 1
            if self.lewissplit == u'train':
                self.tr_docs.append(parsed_doc)
            elif self.lewissplit == u'test':
                self.te_docs.append(parsed_doc)
        self._reset()

    def start_title(self, attributes):
        self.in_title = 1

    def end_title(self):
        self.in_title = 0

    def start_body(self, attributes):
        self.in_body = 1

    def end_body(self):
        self.in_body = 0

    def start_topics(self, attributes):
        self.in_topics = 1

    def end_topics(self):
        self.in_topics = 0

    def start_text(self, attributes):
        if len(attributes) > 0 and attributes[0][1] == u'UNPROC':
            self.in_unproc_text = 1

    def end_text(self):
        self.in_unproc_text = 0

    def start_d(self, attributes):
        self.in_topic_d = 1

    def end_d(self):
        if self.in_topics:
            self.topics.append(self.topic_d)
        self.in_topic_d = 0
        self.topic_d = ""

    def download_if_not_exist(self):
        DOWNLOAD_URL = ('http://archive.ics.uci.edu/ml/machine-learning-databases/'
                        'reuters21578-mld/reuters21578.tar.gz')
        ARCHIVE_FILENAME = 'reuters21578.tar.gz'

        if self.data_path is None:
            self.data_path = os.path.join(get_data_home(), "reuters")
        if not os.path.exists(self.data_path):
            # download the dataset
            print("downloading dataset (once and for all) into %s" % self.data_path)
            os.mkdir(self.data_path)

            def progress(blocknum, bs, size):
                total_sz_mb = '%.2f MB' % (size / 1e6)
                current_sz_mb = '%.2f MB' % ((blocknum * bs) / 1e6)
                if _not_in_sphinx():
                    print('\rdownloaded %s / %s' % (current_sz_mb, total_sz_mb), end='')

            archive_path = os.path.join(self.data_path, ARCHIVE_FILENAME)
            urllib.request.urlretrieve(DOWNLOAD_URL, filename=archive_path,
                                       reporthook=progress)
            if _not_in_sphinx():
                print('\r', end='')
            print("untarring Reuters dataset...")
            tarfile.open(archive_path, 'r:gz').extractall(self.data_path)
            print("done.")


def fetch_reuters21578(data_path=None, subset='train'):
    if data_path is None:
        data_path = os.path.join(get_data_home(), 'reuters21578')
    reuters_pickle_path = os.path.join(data_path, "reuters." + subset + ".pickle")
    if not os.path.exists(reuters_pickle_path):
        parser = ReutersParser(data_path=data_path)
        for filename in glob(os.path.join(data_path, "*.sgm")):
            parser.parse(open(filename, 'rb'))
        # index category names with a unique numerical code (only considering categories with training examples)
        tr_categories = np.unique(np.concatenate([doc['topics'] for doc in parser.tr_docs])).tolist()

        def pickle_documents(docs, subset):
            for doc in docs:
                doc['topics'] = [tr_categories.index(t) for t in doc['topics'] if t in tr_categories]
            pickle_docs = {'categories': tr_categories, 'documents': docs}
            pickle.dump(pickle_docs, open(os.path.join(data_path, "reuters." + subset + ".pickle"), 'wb'),
                        protocol=pickle.HIGHEST_PROTOCOL)
            return pickle_docs

        pickle_tr = pickle_documents(parser.tr_docs, "train")
        pickle_te = pickle_documents(parser.te_docs, "test")
        # self.sout('Empty docs %d' % parser.empty_docs)
        requested_subset = pickle_tr if subset == 'train' else pickle_te
    else:
        requested_subset = pickle.load(open(reuters_pickle_path, 'rb'))

    data = [(u'{title}\n{body}\n{unproc}'.format(**doc), doc['topics']) for doc in requested_subset['documents']]
    text_data, topics = zip(*data)
    return LabelledDocuments(data=text_data, target=topics, target_names=requested_subset['categories'])


if __name__=='__main__':
    reuters_train = fetch_reuters21578(subset='train')
    print(reuters_train.data)
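# --- Added note (not part of the original commit): only documents with TOPICS="YES" and
# LEWISSPLIT="TRAIN"/"TEST" are kept (a ModApte-style split), and category indexes are assigned
# from the training documents only. The test side is fetched the same way:
#   reuters_test = fetch_reuters21578(subset='test')
#   print(len(reuters_test.data), 'test documents,', len(reuters_test.target_names), 'categories')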
@ -0,0 +1,280 @@
import math
import numpy as np
from scipy.stats import t
from scipy.stats import norm
from joblib import Parallel, delayed
import time
from scipy.sparse import csr_matrix, csc_matrix


STWFUNCTIONS = ['dotn', 'ppmi', 'ig', 'chi2', 'cw', 'wp']


def get_probs(tpr, fpr, pc):
    # tpr = p(t|c) = p(tp)/p(c) = p(tp)/(p(tp)+p(fn))
    # fpr = p(t|_c) = p(fp)/p(_c) = p(fp)/(p(fp)+p(tn))
    pnc = 1.0 - pc
    tp = tpr * pc
    fn = pc - tp
    fp = fpr * pnc
    tn = pnc - fp
    return ContTable(tp=tp, fn=fn, fp=fp, tn=tn)


def apply_tsr(tpr, fpr, pc, tsr):
    cell = get_probs(tpr, fpr, pc)
    return tsr(cell)


def positive_information_gain(cell):
    if cell.tpr() < cell.fpr():
        return 0.0
    else:
        return information_gain(cell)


def posneg_information_gain(cell):
    ig = information_gain(cell)
    if cell.tpr() < cell.fpr():
        return -ig
    else:
        return ig


def __ig_factor(p_tc, p_t, p_c):
    den = p_t * p_c
    if den != 0.0 and p_tc != 0:
        return p_tc * math.log(p_tc / den, 2)
    else:
        return 0.0


def information_gain(cell):
    return __ig_factor(cell.p_tp(), cell.p_f(), cell.p_c()) + \
           __ig_factor(cell.p_fp(), cell.p_f(), cell.p_not_c()) + \
           __ig_factor(cell.p_fn(), cell.p_not_f(), cell.p_c()) + \
           __ig_factor(cell.p_tn(), cell.p_not_f(), cell.p_not_c())


def information_gain_mod(cell):
    return (__ig_factor(cell.p_tp(), cell.p_f(), cell.p_c()) + __ig_factor(cell.p_tn(), cell.p_not_f(), cell.p_not_c())) \
           - (__ig_factor(cell.p_fp(), cell.p_f(), cell.p_not_c()) + __ig_factor(cell.p_fn(), cell.p_not_f(), cell.p_c()))


def pointwise_mutual_information(cell):
    return __ig_factor(cell.p_tp(), cell.p_f(), cell.p_c())


def gain_ratio(cell):
    pc = cell.p_c()
    pnc = 1.0 - pc
    norm = pc * math.log(pc, 2) + pnc * math.log(pnc, 2)
    return information_gain(cell) / (-norm)


def chi_square(cell):
    den = cell.p_f() * cell.p_not_f() * cell.p_c() * cell.p_not_c()
    if den==0.0: return 0.0
    num = gss(cell)**2
    return num / den


def relevance_frequency(cell):
    a = cell.tp
    c = cell.fp
    if c == 0: c = 1
    return math.log(2.0 + (a * 1.0 / c), 2)


def idf(cell):
    if cell.p_f()>0:
        return math.log(1.0 / cell.p_f())
    return 0.0


def gss(cell):
    return cell.p_tp()*cell.p_tn() - cell.p_fp()*cell.p_fn()


def conf_interval(xt, n):
    if n>30:
        z2 = 3.84145882069  # norm.ppf(0.5+0.95/2.0)**2
    else:
        z2 = t.ppf(0.5 + 0.95 / 2.0, df=max(n-1,1)) ** 2
    p = (xt + 0.5 * z2) / (n + z2)
    amplitude = 0.5 * z2 * math.sqrt((p * (1.0 - p)) / (n + z2))
    return p, amplitude
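# --- Added note (not part of the original commit): conf_interval returns the shrunk proportion
# estimate p = (xt + z^2/2) / (n + z^2) together with a symmetric half-width, i.e. a Wilson-style
# 95% confidence interval (t-based for n <= 30). conf_weight below compares the lower bound for
# the positive class against the upper bound for the negative class, as in ConfWeight-like schemes.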
def strength(minPosRelFreq, minPos, maxNeg):
    if minPos > maxNeg:
        return math.log(2.0 * minPosRelFreq, 2.0)
    else:
        return 0.0


# set cancel_features=True to allow some features to be weighted as 0 (as in the original article);
# however, for some extremely imbalanced datasets this caused all documents to become 0 vectors
def conf_weight(cell, cancel_features=False):
    c = cell.get_c()
    not_c = cell.get_not_c()
    tp = cell.tp
    fp = cell.fp

    pos_p, pos_amp = conf_interval(tp, c)
    neg_p, neg_amp = conf_interval(fp, not_c)

    min_pos = pos_p - pos_amp
    max_neg = neg_p + neg_amp
    den = (min_pos + max_neg)
    minpos_relfreq = min_pos / (den if den != 0 else 1)

    str_tplus = strength(minpos_relfreq, min_pos, max_neg)

    if str_tplus == 0 and not cancel_features:
        return 1e-20

    return str_tplus


def word_prob(cell):
    return cell.tpr()


class ContTable:

    def __init__(self, tp=0, tn=0, fp=0, fn=0):
        self.tp = tp
        self.tn = tn
        self.fp = fp
        self.fn = fn

    def get_d(self): return self.tp + self.tn + self.fp + self.fn

    def get_c(self): return self.tp + self.fn

    def get_not_c(self): return self.tn + self.fp

    def get_f(self): return self.tp + self.fp

    def get_not_f(self): return self.tn + self.fn

    def p_c(self): return (1.0*self.get_c())/self.get_d()

    def p_not_c(self): return 1.0-self.p_c()

    def p_f(self): return (1.0*self.get_f())/self.get_d()

    def p_not_f(self): return 1.0-self.p_f()

    def p_tp(self): return (1.0*self.tp) / self.get_d()

    def p_tn(self): return (1.0*self.tn) / self.get_d()

    def p_fp(self): return (1.0*self.fp) / self.get_d()

    def p_fn(self): return (1.0*self.fn) / self.get_d()

    def tpr(self):
        c = 1.0*self.get_c()
        return self.tp / c if c > 0.0 else 0.0

    def fpr(self):
        _c = 1.0*self.get_not_c()
        return self.fp / _c if _c > 0.0 else 0.0


def round_robin_selection(X, Y, k, tsr_function=positive_information_gain):
    print(f'[selecting {k} terms]')
    nC = Y.shape[1]
    FC = get_tsr_matrix(get_supervised_matrix(X, Y), tsr_function).T
    best_features_idx = np.argsort(-FC, axis=0).flatten()
    tsr_values = (-np.sort(-FC, axis=0)).flatten()  # scores in the same (rank-major) order as best_features_idx
    selected_indexes_set = set()
    selected_indexes = list()
    selected_value = list()
    from_category = list()
    round_robin = iter(best_features_idx)
    values_iter = iter(tsr_values)
    round = 0
    while len(selected_indexes) < k:
        term_idx = next(round_robin)
        term_val = next(values_iter)
        if term_idx not in selected_indexes_set:
            selected_indexes_set.add(term_idx)
            selected_indexes.append(term_idx)
            selected_value.append(term_val)
            from_category.append(round)
        round = (round + 1) % nC
    return np.asarray(selected_indexes, dtype=int), np.asarray(selected_value, dtype=float), np.asarray(from_category)


def feature_label_contingency_table(positive_document_indexes, feature_document_indexes, nD):
    tp_ = len(positive_document_indexes & feature_document_indexes)
    fp_ = len(feature_document_indexes - positive_document_indexes)
    fn_ = len(positive_document_indexes - feature_document_indexes)
    tn_ = nD - (tp_ + fp_ + fn_)
    return ContTable(tp=tp_, tn=tn_, fp=fp_, fn=fn_)


def category_tables(feature_sets, category_sets, c, nD, nF):
    return [feature_label_contingency_table(category_sets[c], feature_sets[f], nD) for f in range(nF)]


def get_supervised_matrix(coocurrence_matrix, label_matrix, n_jobs=-1):
    """
    Computes the nC x nF supervised matrix M, where Mcf is the 4-cell contingency table for feature f and class c.
    Efficiency is O(nF x nC x log(S)), where S is the sparsity factor.
    """
    nD, nF = coocurrence_matrix.shape
    nD2, nC = label_matrix.shape

    if nD != nD2:
        raise ValueError('Number of rows in coocurrence matrix shape %s and label matrix shape %s is not consistent' %
                         (coocurrence_matrix.shape, label_matrix.shape))

    def nonzero_set(matrix, col):
        return set(matrix[:, col].nonzero()[0])

    if isinstance(coocurrence_matrix, csr_matrix):
        coocurrence_matrix = csc_matrix(coocurrence_matrix)
    feature_sets = [nonzero_set(coocurrence_matrix, f) for f in range(nF)]
    category_sets = [nonzero_set(label_matrix, c) for c in range(nC)]
    cell_matrix = Parallel(n_jobs=n_jobs, backend="threading")(delayed(category_tables)(feature_sets, category_sets, c, nD, nF) for c in range(nC))
    return np.array(cell_matrix)


# obtains the matrix T, where Tcf = tsr(f,c) is the tsr score for category c and feature f
def get_tsr_matrix(cell_matrix, tsr_score_funtion):
    nC, nF = cell_matrix.shape
    tsr_matrix = [[tsr_score_funtion(cell_matrix[c, f]) for f in range(nF)] for c in range(nC)]
    return np.array(tsr_matrix)


def fisher_score_binary(feat, c):
    """
    The Fisher score [1] is not computed on the 4-cell contingency table; it can take as input any
    real-valued feature column (e.g., tf-idf weights).
    feat is the feature vector, and c is a binary classification vector.
    This implementation covers only the binary case, while the formula is defined for multiclass
    single-label scenarios, for which the version of [2] might be preferred.
    [1] R.O. Duda, P.E. Hart, and D.G. Stork. Pattern classification. Wiley-Interscience, 2012.
    [2] Gu, Q., Li, Z., & Han, J. (2012). Generalized Fisher score for feature selection. arXiv preprint arXiv:1202.3725.
    """
    neg = np.ones_like(c) - c

    npos = np.sum(c)
    nneg = np.sum(neg)

    mupos = np.mean(feat[c == 1])
    muneg = np.mean(feat[neg == 1])
    mu = np.mean(feat)

    stdpos = np.std(feat[c == 1])
    stdneg = np.std(feat[neg == 1])

    num = npos * ((mupos - mu) ** 2) + nneg * ((muneg - mu) ** 2)
    den = npos * (stdpos ** 2) + nneg * (stdneg ** 2)

    if den > 0:
        return num / den
    else:
        return num
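# --- Added example (not part of the original commit): a minimal round-robin selection on a toy
# problem (6 documents, 4 terms, 2 complementary classes). Term 0 occurs only in class-0 documents
# and term 3 only in class-1 documents, so these two should be the ones selected.
if __name__ == '__main__':
    X = csr_matrix(np.array([[1, 0, 1, 0],
                             [1, 0, 0, 0],
                             [1, 1, 0, 0],
                             [0, 1, 0, 1],
                             [0, 1, 1, 0],
                             [0, 0, 0, 1]]))
    Y = np.array([[1, 0],
                  [1, 0],
                  [1, 0],
                  [0, 1],
                  [0, 1],
                  [0, 1]])
    sel_idx, sel_score, sel_cat = round_robin_selection(X, Y, k=2, tsr_function=information_gain)
    print('selected terms:', sel_idx, 'scores:', sel_score, 'picked while serving class:', sel_cat)

    # the Fisher score above is F = (n+ (mu+ - mu)^2 + n- (mu- - mu)^2) / (n+ std+^2 + n- std-^2)
    print('fisher(term 0, class 0):', fisher_score_binary(np.asarray(X.todense())[:, 0], Y[:, 0]))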
@ -0,0 +1,212 @@
# https://www.wipo.int/classifications/ipc/en/ITsupport/Categorization/dataset/
import os, sys
from os.path import exists, join
from util.file import *
from zipfile import ZipFile
import xml.etree.ElementTree as ET
from tqdm import tqdm
import numpy as np
import pickle
from joblib import Parallel, delayed

WIPO_URL = 'https://www.wipo.int/classifications/ipc/en/ITsupport/Categorization/dataset/'


class WipoGammaDocument:
    def __init__(self, id, text, main_label, all_labels):
        self.id = id
        self.text = text
        self.main_label = main_label
        self.all_labels = all_labels


def remove_nested_claimtext_tags(xmlcontent):
    from_pos = xmlcontent.find(b'<claims')
    to_pos = xmlcontent.find(b'</claims>')
    if from_pos > -1 and to_pos > -1:
        in_between = xmlcontent[from_pos:to_pos].replace(b'<claim-text>', b'').replace(b'</claim-text>', b'')
        xmlcontent = (xmlcontent[:from_pos] + in_between + xmlcontent[to_pos:]).strip()
    return xmlcontent


def parse_document(xml_content, text_fields, limit_description):
    root = ET.fromstring(remove_nested_claimtext_tags(xml_content))

    doc_id = root.attrib['ucid']
    lang = root.attrib['lang']

    # take categories from the categorization up to the "sub-class" level
    main_group = set(t.text[:6] for t in root.findall('.//bibliographic-data/technical-data/classifications-ipcr/classification-ipcr[@computed="from_ecla_to_ipc_SG"][@generated_main_IPC="true"]'))
    sec_groups = set(t.text[:6] for t in root.findall('.//bibliographic-data/technical-data/classifications-ipcr/classification-ipcr[@computed="from_ecla_to_ipc_SG"][@generated_main_IPC="false"]'))
    sec_groups.update(main_group)

    assert len(main_group) == 1, 'more than one main group'
    main_group = list(main_group)[0]
    sec_groups = sorted(list(sec_groups))

    assert lang == 'EN', f'only English documents allowed (doc {doc_id})'

    doc_text_fields = []
    if 'abstract' in text_fields:
        abstract = '\n'.join(filter(None, [t.text for t in root.findall('.//abstract[@lang="EN"]/p')]))
        doc_text_fields.append(abstract)
    if 'description' in text_fields:
        description = '\n'.join(filter(None, [t.text for t in root.findall('.//description[@lang="EN"]/p')]))
        if limit_description > -1:
            description = ' '.join(description.split()[:limit_description])
        doc_text_fields.append(description)
    if 'claims' in text_fields:
        claims = '\n'.join(filter(None, [t.text for t in root.findall('.//claims[@lang="EN"]/claim')]))
        doc_text_fields.append(claims)

    text = '\n'.join(doc_text_fields)
    if text:
        return WipoGammaDocument(doc_id, text, main_group, sec_groups)
    else:
        return None


def extract(fin, fout, text_fields, limit_description):
    zipfile = ZipFile(fin)
    ndocs = 0
    with open(fout, 'wt') as out:
        for xmlfile in tqdm(zipfile.namelist()):
            if xmlfile.endswith('.xml'):
                xmlcontent = zipfile.open(xmlfile).read()
                document = parse_document(xmlcontent, text_fields, limit_description)
                if document:
                    line_text = document.text.replace('\n', ' ').replace('\t', ' ').strip()
                    assert line_text, f'empty document in {xmlfile}'
                    all_labels = ' '.join(document.all_labels)
                    out.write('\t'.join([document.id, document.main_label, all_labels, line_text]))
                    out.write('\n')
                    ndocs += 1
                    out.flush()


def read_classification_file(data_path, classification_level):
    assert classification_level in ['subclass', 'maingroup'], 'wrong classification requested'
    z = ZipFile(join(data_path, 'EnglishWipoGamma1.zip'))
    inpath = 'Wipo_Gamma/English/TrainTestSpits'
    document_labels = dict()
    train_ids, test_ids = set(), set()
    labelcut = LabelCut(classification_level)
    for subset in tqdm(['train', 'test'], desc='loading classification file'):
        target_subset = train_ids if subset=='train' else test_ids
        if classification_level == 'subclass':
            file = f'{subset}set_en_sc.parts'  # sub-class level
        else:
            file = f'{subset}set_en_mg.parts'  # main-group level

        for line in z.open(f'{inpath}/{file}').readlines():
            line = line.decode().strip().split(',')
            id = line[0]
            id = id[id.rfind('/')+1:].replace('.xml','')
            labels = labelcut.trim(line[1:])
            document_labels[id] = labels
            target_subset.add(id)

    return document_labels, train_ids, test_ids


class LabelCut:
    """
    Labels consist of 1 char for the section, 2 chars for the class, 1 char for the subclass,
    2 chars for the main group, and so on. This class cuts a label at the desired level
    (4 chars for subclass, or 6 chars for maingroup).
    """
    def __init__(self, classification_level):
        assert classification_level in {'subclass','maingroup'}, 'unknown classification level'
        if classification_level == 'subclass': self.cut = 4
        else: self.cut = 6

    def trim(self, label):
        if isinstance(label, list):
            return sorted(set([l[:self.cut] for l in label]))
        else:
            return label[:self.cut]
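# --- Added example (not part of the original commit): LabelCut only slices the label string; for a
# hypothetical symbol 'A61K003100' (section A, class 61, subclass K, main group 0031):
#   LabelCut('subclass').trim('A61K003100')                      ->  'A61K'
#   LabelCut('maingroup').trim(['A61K003100', 'A61K003500'])     ->  ['A61K00']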
def fetch_WIPOgamma(subset, classification_level, data_home, extracted_path, text_fields=['abstract', 'description'], limit_description=300):
    """
    Fetches the WIPO-gamma dataset
    :param subset: 'train' or 'test' split
    :param classification_level: the classification level, either 'subclass' or 'maingroup'
    :param data_home: directory containing the original 11 English zips
    :param extracted_path: directory used to extract and process the original files
    :param text_fields: indicates the fields to extract, among 'abstract', 'description', and 'claims'
    :param limit_description: the maximum number of words to take from the description field (default 300); set to -1 for all
    :return: the list of WipoGammaDocument objects for the requested subset
    """
    assert subset in {"train", "test"}, 'unknown target request (valid ones are "train" or "test")'
    assert len(text_fields) > 0, 'at least one text field should be indicated'
    if not exists(data_home):
        raise ValueError(f'{data_home} does not exist, and the dataset cannot be automatically downloaded, '
                         f'since you need to request permission for it. Please refer to {WIPO_URL}')

    create_if_not_exist(extracted_path)
    config = f'{"-".join(text_fields)}'
    if 'description' in text_fields: config += f'-{limit_description}'
    pickle_path = join(extracted_path, f'wipo-{subset}-{classification_level}-{config}.pickle')
    if exists(pickle_path):
        print(f'loading pickled file in {pickle_path}')
        return pickle.load(open(pickle_path, 'rb'))

    print('pickle file not found, processing... (this will take some minutes)')
    extracted = sum([exists(f'{extracted_path}/EnglishWipoGamma{(i+1)}-{config}.txt') for i in range(11)]) == 11
    if not extracted:
        print(f'extraction files not found, extracting files in {data_home}... (this will take some additional minutes)')
        Parallel(n_jobs=-1)(
            delayed(extract)(
                join(data_home, file), join(extracted_path, file.replace('.zip', f'-{config}.txt')), text_fields, limit_description
            )
            for file in list_files(data_home)
        )
    doc_labels, train_ids, test_ids = read_classification_file(data_home, classification_level=classification_level)  # or maingroup
    print(f'{len(doc_labels)} classified documents, split into {len(train_ids)} train and {len(test_ids)} test documents')

    train_request = []
    test_request = []
    pbar = tqdm([filename for filename in list_files(extracted_path) if filename.endswith(f'-{config}.txt')])
    labelcut = LabelCut(classification_level)
    errors = 0
    for proc_file in pbar:
        pbar.set_description(f'processing {proc_file} [errors={errors}]')
        if not proc_file.endswith(f'-{config}.txt'): continue
        lines = open(f'{extracted_path}/{proc_file}', 'rt').readlines()
        for lineno, line in enumerate(lines):
            parts = line.split('\t')
            assert len(parts)==4, f'wrong format in {extracted_path}/{proc_file} line {lineno}'
            id, mainlabel, alllabels, text = parts
            mainlabel = labelcut.trim(mainlabel)
            alllabels = labelcut.trim(alllabels.split())

            # assert id in train_ids or id in test_ids, f'id {id} out of scope'
            if id not in train_ids and id not in test_ids:
                errors += 1
            else:
                # assert mainlabel == doc_labels[id][0], 'main label not consistent'
                request = train_request if id in train_ids else test_request
                request.append(WipoGammaDocument(id, text, mainlabel, alllabels))

    print('pickling requests for faster subsequent runs')
    pickle.dump(train_request, open(join(extracted_path, f'wipo-train-{classification_level}-{config}.pickle'), 'wb'), pickle.HIGHEST_PROTOCOL)
    pickle.dump(test_request, open(join(extracted_path, f'wipo-test-{classification_level}-{config}.pickle'), 'wb'), pickle.HIGHEST_PROTOCOL)

    if subset == 'train':
        return train_request
    else:
        return test_request


if __name__=='__main__':
    data_home = '../../datasets/WIPO/wipo-gamma/en'
    extracted_path = '../../datasets/WIPO-extracted'

    train = fetch_WIPOgamma(subset='train', classification_level='subclass', data_home=data_home, extracted_path=extracted_path, text_fields=['abstract'])
    test = fetch_WIPOgamma(subset='test', classification_level='subclass', data_home=data_home, extracted_path=extracted_path, text_fields=['abstract'])
    # train = fetch_WIPOgamma(subset='train', classification_level='maingroup', data_home=data_home, extracted_path=extracted_path)
    # test = fetch_WIPOgamma(subset='test', classification_level='maingroup', data_home=data_home, extracted_path=extracted_path)

    print('Done')
@@ -0,0 +1,334 @@
|
|||
from copy import deepcopy
|
||||
|
||||
from sklearn.calibration import CalibratedClassifierCV
|
||||
from sklearn.feature_extraction.text import TfidfVectorizer
|
||||
from sklearn.kernel_ridge import KernelRidge
|
||||
from sklearn.linear_model import LogisticRegression, Ridge, Lasso, LassoCV, MultiTaskLassoCV, LassoLars, LassoLarsCV, \
|
||||
ElasticNet, MultiTaskElasticNetCV, MultiTaskElasticNet, LinearRegression, ARDRegression, BayesianRidge, SGDRegressor
|
||||
from sklearn.metrics import f1_score
|
||||
from sklearn.multiclass import OneVsRestClassifier
|
||||
from sklearn.multioutput import MultiOutputRegressor
|
||||
from sklearn.svm import LinearSVC
|
||||
from tqdm import tqdm
|
||||
|
||||
import quapy as qp
|
||||
from functional import artificial_prevalence_sampling
|
||||
from method.aggregative import PACC, CC, EMQ, PCC, ACC, HDy
|
||||
from method.base import BaseQuantifier
|
||||
from quapy.data import from_rcv2_lang_file, LabelledCollection
|
||||
from sklearn.model_selection import train_test_split
|
||||
from sklearn.preprocessing import MultiLabelBinarizer, StandardScaler
|
||||
import numpy as np
|
||||
from data.dataset import Dataset
|
||||
|
||||
|
||||
|
||||
|
||||
def cls():
|
||||
# return LinearSVC()
|
||||
return LogisticRegression(max_iter=1000, solver='lbfgs', n_jobs=-1)
|
||||
|
||||
|
||||
def calibratedCls():
|
||||
return CalibratedClassifierCV(cls())
|
||||
|
||||
|
||||
class MultilabelledCollection:
|
||||
def __init__(self, instances, labels):
|
||||
assert labels.ndim==2, 'data does not seem to be multilabel'
|
||||
self.instances = instances
|
||||
self.labels = labels
|
||||
self.classes_ = np.arange(labels.shape[1])
|
||||
|
||||
@classmethod
|
||||
def load(cls, path: str, loader_func: callable):
|
||||
return MultilabelledCollection(*loader_func(path))
|
||||
|
||||
def __len__(self):
|
||||
return self.instances.shape[0]
|
||||
|
||||
def prevalence(self):
|
||||
# return self.labels.mean(axis=0)
|
||||
pos = self.labels.mean(axis=0)
|
||||
neg = 1-pos
|
||||
return np.asarray([neg, pos]).T
|
||||
|
||||
def counts(self):
|
||||
return self.labels.sum(axis=0)
|
||||
|
||||
@property
|
||||
def n_classes(self):
|
||||
return len(self.classes_)
|
||||
|
||||
@property
|
||||
def binary(self):
|
||||
return False
|
||||
|
||||
def __gen_index(self):
|
||||
return np.arange(len(self))
|
||||
|
||||
def sampling_multi_index(self, size, cat, prev=None):
|
||||
if prev is None: # no prevalence was indicated; returns an index for uniform sampling
|
||||
return np.random.choice(len(self), size, replace=size>len(self))
|
||||
aux = LabelledCollection(self.__gen_index(), self.labels[:,cat])
|
||||
return aux.sampling_index(size, *[1-prev, prev])
|
||||
|
||||
def uniform_sampling_multi_index(self, size):
|
||||
return np.random.choice(len(self), size, replace=size>len(self))
|
||||
|
||||
def uniform_sampling(self, size):
|
||||
unif_index = self.uniform_sampling_multi_index(size)
|
||||
return self.sampling_from_index(unif_index)
|
||||
|
||||
def sampling(self, size, category, prev=None):
|
||||
prev_index = self.sampling_multi_index(size, category, prev)
|
||||
return self.sampling_from_index(prev_index)
|
||||
|
||||
def sampling_from_index(self, index):
|
||||
documents = self.instances[index]
|
||||
labels = self.labels[index, :]
|
||||
return MultilabelledCollection(documents, labels)
|
||||
|
||||
def train_test_split(self, train_prop=0.6, random_state=None):
|
||||
tr_docs, te_docs, tr_labels, te_labels = \
|
||||
train_test_split(self.instances, self.labels, train_size=train_prop, random_state=random_state)
|
||||
return MultilabelledCollection(tr_docs, tr_labels), MultilabelledCollection(te_docs, te_labels)
|
||||
|
||||
def artificial_sampling_generator(self, sample_size, category, n_prevalences=101, repeats=1):
|
||||
dimensions = 2
|
||||
for prevs in artificial_prevalence_sampling(dimensions, n_prevalences, repeats).flatten():
|
||||
yield self.sampling(sample_size, category, prevs)
|
||||
|
||||
def artificial_sampling_index_generator(self, sample_size, category, n_prevalences=101, repeats=1):
|
||||
dimensions = 2
|
||||
for prevs in artificial_prevalence_sampling(dimensions, n_prevalences, repeats).flatten():
|
||||
yield self.sampling_multi_index(sample_size, category, prevs)
|
||||
|
||||
def natural_sampling_generator(self, sample_size, repeats=100):
|
||||
for _ in range(repeats):
|
||||
yield self.uniform_sampling(sample_size)
|
||||
|
||||
def natural_sampling_index_generator(self, sample_size, repeats=100):
|
||||
for _ in range(repeats):
|
||||
yield self.uniform_sampling_multi_index(sample_size)
|
||||
|
||||
def asLabelledCollection(self, category):
|
||||
return LabelledCollection(self.instances, self.labels[:,category])
|
||||
|
||||
def genLabelledCollections(self):
|
||||
for c in self.classes_:
|
||||
yield self.asLabelledCollection(c)
|
||||
|
||||
@property
|
||||
def Xy(self):
|
||||
return self.instances, self.labels
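
The collection class above pairs an instance matrix with a binary label matrix and adds per-category sampling on top of QuaPy's LabelledCollection. A minimal usage sketch with made-up dense data (the real experiments below use tf-idf matrices):

# Sketch (toy data): build a collection and draw a sample at a target prevalence.
import numpy as np

X_toy = np.random.rand(1000, 50)                       # 1000 instances, 50 features
y_toy = (np.random.rand(1000, 5) > 0.7).astype(int)    # 5 binary categories

toy = MultilabelledCollection(X_toy, y_toy)
print(toy.n_classes, toy.prevalence()[:, 1])           # positive prevalence per category
sample = toy.sampling(size=100, category=0, prev=0.3)  # ~30% positives for category 0
print(sample.prevalence()[0])                          # [neg, pos] for category 0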
|
||||
|
||||
|
||||
class MultilabelClassifier: # aka Funnelling Monolingual
|
||||
def __init__(self, base_estimator=LogisticRegression()):
|
||||
if not hasattr(base_estimator, 'predict_proba'):
|
||||
print('the estimator does not seem to be probabilistic: calibrating')
|
||||
base_estimator = CalibratedClassifierCV(base_estimator)
|
||||
self.base = deepcopy(OneVsRestClassifier(base_estimator))
|
||||
self.meta = deepcopy(OneVsRestClassifier(base_estimator))
|
||||
self.norm = StandardScaler()
|
||||
|
||||
def fit(self, X, y):
|
||||
assert y.ndim==2, 'the dataset does not seem to be multi-label'
|
||||
self.base.fit(X, y)
|
||||
P = self.base.predict_proba(X)
|
||||
P = self.norm.fit_transform(P)
|
||||
self.meta.fit(P, y)
|
||||
return self
|
||||
|
||||
def predict(self, X):
|
||||
P = self.base.predict_proba(X)
|
||||
P = self.norm.transform(P)
|
||||
return self.meta.predict(P)
|
||||
|
||||
def predict_proba(self, X):
|
||||
P = self.base.predict_proba(X)
|
||||
P = self.norm.transform(P)
|
||||
return self.meta.predict_proba(P)
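
A quick sanity check of the stacked (funnelling-style) classifier defined above; the shapes and the random 0/1 label matrix are assumptions made only for illustration:

# Sketch: fit the two-tier classifier on toy data and inspect output shapes.
import numpy as np

X_rnd = np.random.rand(500, 20)
y_rnd = (np.random.rand(500, 4) > 0.6).astype(int)

mlc = MultilabelClassifier()                 # defaults to LogisticRegression as base estimator
mlc.fit(X_rnd, y_rnd)
print(mlc.predict(X_rnd[:5]).shape)          # (5, 4) hard decisions
print(mlc.predict_proba(X_rnd[:5]).shape)    # (5, 4) posterior estimates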
|
||||
|
||||
class MLCC:
|
||||
def __init__(self, mlcls:MultilabelClassifier):
|
||||
self.mlcls = mlcls
|
||||
|
||||
def fit(self, data:MultilabelledCollection):
|
||||
self.mlcls.fit(*data.Xy)
|
||||
|
||||
def quantify(self, instances):
|
||||
pred = self.mlcls.predict(instances)
|
||||
pos_prev = pred.mean(axis=0)
|
||||
neg_prev = 1-pos_prev
|
||||
return np.asarray([neg_prev, pos_prev]).T
|
||||
|
||||
|
||||
class MLPCC:
|
||||
def __init__(self, mlcls: MultilabelClassifier):
|
||||
self.mlcls = mlcls
|
||||
|
||||
def fit(self, data: MultilabelledCollection):
|
||||
self.mlcls.fit(*data.Xy)
|
||||
|
||||
def quantify(self, instances):
|
||||
pred = self.mlcls.predict_proba(instances)
|
||||
pos_prev = pred.mean(axis=0)
|
||||
neg_prev = 1 - pos_prev
|
||||
return np.asarray([neg_prev, pos_prev]).T
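
MLCC and MLPCC are the multilabel counterparts of classify-and-count and probabilistic classify-and-count: the former averages hard decisions per category, the latter averages posteriors. A tiny worked example of the aggregation step alone (the arrays are invented):

# Aggregation step only, for 4 documents and 2 categories.
import numpy as np

hard = np.array([[1, 0], [0, 0], [1, 1], [0, 0]])           # hard predictions (MLCC)
post = np.array([[.9, .2], [.1, .4], [.8, .7], [.2, .1]])   # posteriors (MLPCC)

print(hard.mean(axis=0))   # [0.5  0.25] -> CC-style positive prevalences
print(post.mean(axis=0))   # [0.5  0.35] -> PCC-style positive prevalences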
|
||||
|
||||
|
||||
class MultilabelQuantifier:
|
||||
def __init__(self, q:BaseQuantifier, n_jobs=-1):
|
||||
self.q = q
|
||||
self.estimators = None
|
||||
self.n_jobs = n_jobs
|
||||
|
||||
def fit(self, data:MultilabelledCollection):
|
||||
self.classes_ = data.classes_
|
||||
|
||||
def cat_job(lc):
|
||||
return deepcopy(self.q).fit(lc)
|
||||
|
||||
self.estimators = qp.util.parallel(cat_job, data.genLabelledCollections(), n_jobs=self.n_jobs)
|
||||
return self
|
||||
|
||||
def quantify(self, instances):
|
||||
pos_prevs = np.zeros(len(self.classes_), dtype=float)
|
||||
for c in self.classes_:
|
||||
pos_prevs[c] = self.estimators[c].quantify(instances)[1]
|
||||
neg_prevs = 1-pos_prevs
|
||||
return np.asarray([neg_prevs, pos_prevs]).T
|
||||
|
||||
|
||||
class MultilabelRegressionQuantification:
|
||||
def __init__(self, base_quantifier=CC(LinearSVC()), regression='ridge', n_samples=500, sample_size=500, norm=True,
|
||||
means=True, stds=True):
|
||||
assert regression in ['ridge'], 'unknown regression model'
|
||||
self.estimator = MultilabelQuantifier(base_quantifier)
|
||||
if regression == 'ridge':
|
||||
self.reg = Ridge(normalize=norm)
|
||||
# self.reg = MultiTaskLassoCV(normalize=norm)
|
||||
# self.reg = KernelRidge(kernel='rbf')
|
||||
# self.reg = LassoLarsCV(normalize=norm)
|
||||
# self.reg = MultiTaskElasticNetCV(normalize=norm) <- good
|
||||
#self.reg = LinearRegression(normalize=norm) # <- good
|
||||
# self.reg = MultiOutputRegressor(ARDRegression(normalize=norm)) # <- quite good, even without norm
|
||||
# self.reg = MultiOutputRegressor(BayesianRidge(normalize=False)) # <- quite good, even without norm
|
||||
# self.reg = MultiOutputRegressor(SGDRegressor()) # slow, does not work well
|
||||
self.regression = regression
|
||||
self.n_samples = n_samples
|
||||
self.sample_size = sample_size
|
||||
# self.norm = StandardScaler()
|
||||
self.means = means
|
||||
self.stds = stds
|
||||
|
||||
def fit(self, data:MultilabelledCollection):
|
||||
self.classes_ = data.classes_
|
||||
tr, te = data.train_test_split()
|
||||
self.estimator.fit(tr)
|
||||
samples_mean = []
|
||||
samples_std = []
|
||||
Xs = []
|
||||
ys = []
|
||||
for sample in te.natural_sampling_generator(sample_size=self.sample_size, repeats=self.n_samples):
|
||||
ys.append(sample.prevalence()[:,1])
|
||||
Xs.append(self.estimator.quantify(sample.instances)[:,1])
|
||||
if self.means:
|
||||
samples_mean.append(sample.instances.mean(axis=0).getA().flatten())
|
||||
if self.stds:
|
||||
samples_std.append(sample.instances.todense().std(axis=0).getA().flatten())
|
||||
Xs = np.asarray(Xs)
|
||||
ys = np.asarray(ys)
|
||||
if self.means:
|
||||
samples_mean = np.asarray(samples_mean)
|
||||
Xs = np.hstack([Xs, samples_mean])
|
||||
if self.stds:
|
||||
samples_std = np.asarray(samples_std)
|
||||
Xs = np.hstack([Xs, samples_std])
|
||||
# Xs = self.norm.fit_transform(Xs)
|
||||
self.reg.fit(Xs, ys)
|
||||
return self
|
||||
|
||||
def quantify(self, instances):
|
||||
Xs = self.estimator.quantify(instances)[:,1].reshape(1,-1)
|
||||
if self.means:
|
||||
sample_mean = instances.mean(axis=0).getA()
|
||||
Xs = np.hstack([Xs, sample_mean])
|
||||
if self.stds:
|
||||
sample_std = instances.todense().std(axis=0).getA()
|
||||
Xs = np.hstack([Xs, sample_std])
|
||||
# Xs = self.norm.transform(Xs)
|
||||
adjusted = self.reg.predict(Xs)
|
||||
adjusted = np.clip(adjusted, 0, 1)
|
||||
adjusted = adjusted.flatten()
|
||||
neg_prevs = 1-adjusted
|
||||
return np.asarray([neg_prevs, adjusted]).T
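
The regression-based quantifier above learns a mapping from raw per-category estimates (optionally augmented with sample means and stds) to true prevalences, and at prediction time applies that mapping and clips to [0,1]. A rough sketch of the same correction idea in isolation, with synthetic biased estimates:

# Sketch: regress true prevalences on (biased) estimated prevalences.
import numpy as np
from sklearn.linear_model import Ridge

rng = np.random.RandomState(0)
true_prevs = rng.rand(200, 10)                                   # 200 samples, 10 categories
estim_prevs = 0.8 * true_prevs + 0.05 + rng.normal(0, 0.02, true_prevs.shape)

reg = Ridge().fit(estim_prevs, true_prevs)
corrected = np.clip(reg.predict(estim_prevs[:1]), 0, 1)
print(corrected.shape)                                           # (1, 10)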
|
||||
|
||||
sample_size = 250
|
||||
n_samples = 1000
|
||||
|
||||
def models():
|
||||
yield 'CC', MultilabelQuantifier(CC(cls()))
|
||||
yield 'PCC', MultilabelQuantifier(PCC(cls()))
|
||||
yield 'MLCC', MLCC(MultilabelClassifier(cls()))
|
||||
yield 'MLPCC', MLPCC(MultilabelClassifier(cls()))
|
||||
# yield 'PACC', MultilabelQuantifier(PACC(cls()))
|
||||
# yield 'EMQ', MultilabelQuantifier(EMQ(calibratedCls()))
|
||||
common={'sample_size':sample_size, 'n_samples': n_samples, 'norm': True}
|
||||
# yield 'MRQ-CC', MultilabelRegressionQuantification(base_quantifier=CC(cls()), **common)
|
||||
yield 'MRQ-PCC', MultilabelRegressionQuantification(base_quantifier=PCC(cls()), **common)
|
||||
yield 'MRQ-PACC', MultilabelRegressionQuantification(base_quantifier=PACC(cls()), **common)
|
||||
|
||||
|
||||
dataset = 'reuters21578'
|
||||
data = Dataset.load(dataset, pickle_path=f'./pickles/{dataset}.pickle')
|
||||
|
||||
Xtr, Xte = data.vectorize()
|
||||
ytr = data.devel_labelmatrix.todense().getA()
|
||||
yte = data.test_labelmatrix.todense().getA()
|
||||
|
||||
most_populated = np.argsort(ytr.sum(axis=0))[-25:]
|
||||
ytr = ytr[:, most_populated]
|
||||
yte = yte[:, most_populated]
|
||||
|
||||
train = MultilabelledCollection(Xtr, ytr)
|
||||
test = MultilabelledCollection(Xte, yte)
|
||||
|
||||
print(f'Train-prev: {train.prevalence()[:,1]}')
|
||||
print(f'Test-prev: {test.prevalence()[:,1]}')
|
||||
print(f'MLPE: {qp.error.mae(train.prevalence(), test.prevalence()):.5f}')
|
||||
|
||||
# print('NPP:')
|
||||
# test_indexes = list(test.natural_sampling_index_generator(sample_size=sample_size, repeats=100))
|
||||
# for model_name, model in models():
|
||||
# model.fit(train)
|
||||
# errs = []
|
||||
# for index in test_indexes:
|
||||
# sample = test.sampling_from_index(index)
|
||||
# estim_prevs = model.quantify(sample.instances)
|
||||
# true_prevs = sample.prevalence()
|
||||
# errs.append(qp.error.mae(true_prevs, estim_prevs))
|
||||
# print(f'{model_name:10s}\tmae={np.mean(errs):.5f}')
|
||||
|
||||
print('APP:')
|
||||
test_indexes = []
|
||||
for cat in train.classes_:
|
||||
test_indexes.append(list(test.artificial_sampling_index_generator(sample_size=sample_size, category=cat, n_prevalences=21, repeats=10)))
|
||||
|
||||
for model_name, model in models():
|
||||
model.fit(train)
|
||||
macro_errs = []
|
||||
for cat_indexes in test_indexes:
|
||||
errs = []
|
||||
for index in cat_indexes:
|
||||
sample = test.sampling_from_index(index)
|
||||
estim_prevs = model.quantify(sample.instances)
|
||||
true_prevs = sample.prevalence()
|
||||
errs.append(qp.error.mae(true_prevs, estim_prevs))
|
||||
macro_errs.append(np.mean(errs))
|
||||
print(f'{model_name:10s}\tmae={np.mean(macro_errs):.5f}')
|
||||
|
||||
|
||||
|
|
@@ -0,0 +1,145 @@
|
|||
import warnings
|
||||
warnings.filterwarnings("ignore", category=DeprecationWarning)
|
||||
import numpy as np
|
||||
from tqdm import tqdm
|
||||
import torch
|
||||
from scipy.sparse import vstack, issparse
|
||||
from joblib import Parallel, delayed
|
||||
import multiprocessing
|
||||
import itertools
|
||||
|
||||
|
||||
def index(data, vocab, known_words, analyzer, unk_index, out_of_vocabulary):
|
||||
"""
|
||||
Index (i.e., replaces word strings with numerical indexes) a list of string documents
|
||||
:param data: list of string documents
|
||||
:param vocab: a fixed mapping [str]->[int] of words to indexes
|
||||
:param known_words: a set of known words (e.g., words that, despite not being included in the vocab, can be retained
|
||||
because they are anyway contained in a pre-trained embedding set that we know in advance)
|
||||
:param analyzer: the preprocessor in charge of transforming the document string into a chain of string words
|
||||
:param unk_index: the index of the 'unknown token', i.e., a symbol that characterizes all words that we cannot keep
|
||||
:param out_of_vocabulary: an incremental mapping [str]->[int] of words to indexes that will index all those words that
|
||||
are not in the original vocab but that are in the known_words
|
||||
:return:
|
||||
"""
|
||||
indexes=[]
|
||||
vocabsize = len(vocab)
|
||||
unk_count = 0
|
||||
knw_count = 0
|
||||
out_count = 0
|
||||
pbar = tqdm(data, desc=f'indexing documents')
|
||||
for text in pbar:
|
||||
words = analyzer(text)
|
||||
index = []
|
||||
for word in words:
|
||||
if word in vocab:
|
||||
idx = vocab[word]
|
||||
else:
|
||||
if word in known_words:
|
||||
if word not in out_of_vocabulary:
|
||||
out_of_vocabulary[word] = vocabsize+len(out_of_vocabulary)
|
||||
idx = out_of_vocabulary[word]
|
||||
out_count += 1
|
||||
else:
|
||||
idx = unk_index
|
||||
unk_count += 1
|
||||
index.append(idx)
|
||||
indexes.append(index)
|
||||
knw_count += len(index)
|
||||
pbar.set_description(f'[unk = {unk_count}/{knw_count}={(100.*unk_count/knw_count):.2f}%]'
|
||||
f'[out = {out_count}/{knw_count}={(100.*out_count/knw_count):.2f}%]')
|
||||
return indexes
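
A small sketch of how index() might be driven, using the analyzer and vocabulary of a fitted CountVectorizer; the 'known words' set and the unknown-token sentinel are toy assumptions:

# Toy driver for index(): two in-vocabulary words, one known OOV word, one unknown word.
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer().fit(['the cat sat', 'the dog barked loudly'])
vocab = cv.vocabulary_
analyzer = cv.build_analyzer()

known = {'meowed'}   # pretend this word is covered by a pretrained embedding
oov = {}
idx = index(['the cat meowed weirdly'], vocab, known, analyzer,
            unk_index=-1, out_of_vocabulary=oov)
print(idx, oov)      # 'meowed' receives a fresh index; 'weirdly' maps to the unk sentinel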
|
||||
|
||||
|
||||
def define_pad_length(index_list):
|
||||
lengths = [len(index) for index in index_list]
|
||||
return int(np.mean(lengths)+np.std(lengths))
|
||||
|
||||
|
||||
def pad(index_list, pad_index, max_pad_length=None):
|
||||
pad_length = np.max([len(index) for index in index_list])
|
||||
if max_pad_length is not None:
|
||||
pad_length = min(pad_length, max_pad_length)
|
||||
for i,indexes in enumerate(index_list):
|
||||
index_list[i] = [pad_index]*(pad_length-len(indexes)) + indexes[:pad_length]
|
||||
return index_list
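
pad() left-pads every index list to a common length (capped by max_pad_length) and truncates longer ones; a quick check of what it produces:

# pad() example: shorter lists are left-padded with pad_index, longer ones truncated.
batch = [[5, 6], [1, 2, 3, 4], [7]]
print(pad(batch, pad_index=0, max_pad_length=3))
# [[0, 5, 6], [1, 2, 3], [0, 0, 7]]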
|
||||
|
||||
|
||||
def get_word_list(word2index1, word2index2=None): #TODO: redo
|
||||
def extract_word_list(word2index):
|
||||
return [w for w,i in sorted(word2index.items(), key=lambda x: x[1])]
|
||||
word_list = extract_word_list(word2index1)
|
||||
if word2index2 is not None:
|
||||
word_list += extract_word_list(word2index2)
|
||||
return word_list
|
||||
|
||||
|
||||
def batchify(index_list, labels, batchsize, pad_index, device, target_long=False, max_pad_length=500):
|
||||
nsamples = len(index_list)
|
||||
nbatches = nsamples // batchsize + 1*(nsamples%batchsize>0)
|
||||
for b in range(nbatches):
|
||||
batch = index_list[b*batchsize:(b+1)*batchsize]
|
||||
batch_labels = labels[b*batchsize:(b+1)*batchsize]
|
||||
if issparse(batch_labels):
|
||||
batch_labels = batch_labels.toarray()
|
||||
batch = pad(batch, pad_index=pad_index, max_pad_length=max_pad_length)
|
||||
batch = torch.LongTensor(batch)
|
||||
totype = torch.LongTensor if target_long else torch.FloatTensor
|
||||
target = totype(batch_labels)
|
||||
yield batch.to(device), target.to(device)
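
How the batch generator above is typically consumed; the index lists, label matrix, and device choice are placeholders for illustration:

# Sketch: iterate padded LongTensor batches together with float targets.
import numpy as np
import torch

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
toy_indexes = [[3, 1, 4], [1, 5], [9, 2, 6, 5]]
toy_labels = np.array([[1, 0], [0, 1], [1, 1]], dtype=float)

for batch, target in batchify(toy_indexes, toy_labels, batchsize=2, pad_index=0, device=device):
    print(batch.shape, target.shape)   # e.g. torch.Size([2, 3]) torch.Size([2, 2])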
|
||||
|
||||
|
||||
def batchify_unlabelled(index_list, batchsize, pad_index, device, max_pad_length=500):
|
||||
nsamples = len(index_list)
|
||||
nbatches = nsamples // batchsize + 1*(nsamples%batchsize>0)
|
||||
for b in range(nbatches):
|
||||
batch = index_list[b*batchsize:(b+1)*batchsize]
|
||||
batch = pad(batch, pad_index=pad_index, max_pad_length=max_pad_length)
|
||||
batch = torch.LongTensor(batch)
|
||||
yield batch.to(device)
|
||||
|
||||
|
||||
def clip_gradient(model, clip_value=1e-1):
|
||||
params = list(filter(lambda p: p.grad is not None, model.parameters()))
|
||||
for p in params:
|
||||
p.grad.data.clamp_(-clip_value, clip_value)
|
||||
|
||||
|
||||
def predict(logits, classification_type='singlelabel'):
|
||||
if classification_type == 'multilabel':
|
||||
prediction = torch.sigmoid(logits) > 0.5
|
||||
elif classification_type == 'singlelabel':
|
||||
prediction = torch.argmax(logits, dim=1).view(-1, 1)
|
||||
else:
|
||||
print('unknown classification type')
|
||||
|
||||
return prediction.detach().cpu().numpy()
|
||||
|
||||
|
||||
def count_parameters(model):
|
||||
return sum(p.numel() for p in model.parameters() if p.requires_grad)
|
||||
|
||||
|
||||
def get_parallel_slices(n_tasks, n_jobs=-1):
|
||||
if n_jobs==-1:
|
||||
n_jobs = multiprocessing.cpu_count()
|
||||
batch = int(n_tasks / n_jobs)
|
||||
remainder = n_tasks % n_jobs
|
||||
return [slice(job*batch, (job+1)*batch+ (remainder if job == n_jobs - 1 else 0)) for job in range(n_jobs)]
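
get_parallel_slices() splits n_tasks into one contiguous slice per job, with the remainder absorbed by the last slice; for instance:

# 10 tasks over 3 jobs: the last slice takes the remainder.
print(get_parallel_slices(n_tasks=10, n_jobs=3))
# [slice(0, 3, None), slice(3, 6, None), slice(6, 10, None)]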
|
||||
|
||||
|
||||
def tokenize_job(documents, tokenizer, max_tokens, job):
|
||||
return [tokenizer(d)[:max_tokens] for d in tqdm(documents, desc=f'tokenizing [job: {job}]')]
|
||||
|
||||
|
||||
def tokenize_parallel(documents, tokenizer, max_tokens, n_jobs=-1):
|
||||
slices = get_parallel_slices(n_tasks=len(documents), n_jobs=n_jobs)
|
||||
tokens = Parallel(n_jobs=n_jobs)(
|
||||
delayed(tokenize_job)(
|
||||
documents[slice_i], tokenizer, max_tokens, job
|
||||
)
|
||||
for job, slice_i in enumerate(slices)
|
||||
)
|
||||
return list(itertools.chain.from_iterable(tokens))
|
||||
|
||||
|
|
@@ -0,0 +1,60 @@
|
|||
import os
|
||||
import pandas as pd
|
||||
pd.set_option('display.max_rows', 500)
|
||||
pd.set_option('display.max_columns', 500)
|
||||
pd.set_option('display.width', 1000)
|
||||
|
||||
|
||||
class CSVLog:
|
||||
|
||||
def __init__(self, file, columns=None, autoflush=True, verbose=False, overwrite=False):
|
||||
self.file = file
|
||||
self.autoflush = autoflush
|
||||
self.verbose = verbose
|
||||
if os.path.exists(file) and not overwrite:
|
||||
self.tell('Loading existing file from {}'.format(file))
|
||||
self.df = pd.read_csv(file, sep='\t')
|
||||
self.columns = sorted(self.df.columns.values.tolist())
|
||||
else:
|
||||
self.tell('File {} does not exist or overwrite=True. Creating new frame.'.format(file))
|
||||
assert columns is not None, 'columns cannot be None'
|
||||
self.columns = sorted(columns)
|
||||
dir = os.path.dirname(self.file)
|
||||
if dir and not os.path.exists(dir): os.makedirs(dir)
|
||||
self.df = pd.DataFrame(columns=self.columns)
|
||||
self.defaults = {}
|
||||
|
||||
def already_calculated(self, **kwargs):
|
||||
df = self.df
|
||||
if df.shape[0] == 0:
|
||||
return False
|
||||
if len(kwargs) == 0:
|
||||
kwargs = self.defaults
|
||||
for key,val in kwargs.items():
|
||||
df = df.loc[df[key] == val]
|
||||
if df.shape[0] == 0:
|
||||
return False
|
||||
return True
|
||||
|
||||
def set_default(self, param, value):
|
||||
self.defaults[param] = value
|
||||
|
||||
def add_row(self, **kwargs):
|
||||
for key in self.defaults.keys():
|
||||
if key not in kwargs:
|
||||
kwargs[key]=self.defaults[key]
|
||||
columns = sorted(list(kwargs.keys()))
|
||||
values = [kwargs[col_i] for col_i in columns]
|
||||
s = pd.Series(values, index=self.columns)
|
||||
self.df = self.df.append(s, ignore_index=True)
|
||||
if self.autoflush: self.flush()
|
||||
self.tell(kwargs)
|
||||
|
||||
def flush(self):
|
||||
self.df.to_csv(self.file, index=False, sep='\t')
|
||||
|
||||
def tell(self, msg):
|
||||
if self.verbose: print(msg)
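
A brief usage sketch of the logger defined above; the file path, column names, and the error value are illustrative only:

# Sketch: create a log, set a default shared by all rows, and append one result row.
log = CSVLog('../results/example_log.csv', columns=['dataset', 'method', 'mae'], verbose=True)
log.set_default('dataset', 'reuters21578')
if not log.already_calculated(method='MRQ-PCC'):
    log.add_row(method='MRQ-PCC', mae=0.042)   # placeholder value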
|
||||
|
||||
|
||||
|
|
@@ -0,0 +1,33 @@
|
|||
from data.dataset import Dataset
|
||||
from tqdm import tqdm
|
||||
import os
|
||||
import numpy as np
|
||||
|
||||
|
||||
def write_data(documents, labels, fout):
|
||||
print(f'there are {len(documents)} documents')
|
||||
written, empty = 0, 0
|
||||
with open(fout, 'wt') as foo:
|
||||
for doc, label in tqdm(list(zip(documents, labels))):
|
||||
doc = doc.replace('\t', ' ').replace('\n', ' ').strip()
|
||||
label = np.squeeze(np.asarray(label.todense()))
|
||||
label = ' '.join([f'{x}' for x in label])
|
||||
if doc:
|
||||
foo.write(f'{label}\t{doc}\n')
|
||||
written += 1
|
||||
else:
|
||||
foo.write(f'{label}\tempty document\n')
|
||||
empty += 1
|
||||
print(f'written = {written}')
|
||||
print(f'empty = {empty}')
|
||||
|
||||
|
||||
for dataset_name in ['reuters21578', 'ohsumed', 'jrcall', 'rcv1', 'wipo-sl-sc']: #'20newsgroups'
|
||||
|
||||
dataset = Dataset.load(dataset_name=dataset_name, pickle_path=f'../pickles/{dataset_name}.pickle').show()
|
||||
|
||||
os.makedirs(f'../leam/{dataset_name}', exist_ok=True)
|
||||
write_data(dataset.devel_raw, dataset.devel_labelmatrix, f'../leam/{dataset_name}/train.csv')
|
||||
#write_data(dataset.test_raw, dataset.test_labelmatrix, f'../leam/{dataset_name}/test.csv')
|
||||
print('done')
|
||||
|
|
@@ -0,0 +1,3 @@
|
|||
def warn(*args, **kwargs): pass
|
||||
import warnings
|
||||
warnings.warn = warn
|
|
@@ -0,0 +1,54 @@
|
|||
#adapted from https://github.com/Bjarten/early-stopping-pytorch/blob/master/pytorchtools.py
|
||||
import torch
|
||||
from time import time
|
||||
from util.file import create_if_not_exist
|
||||
|
||||
|
||||
class EarlyStopping:
|
||||
|
||||
def __init__(self, model, patience=20, verbose=True, checkpoint='./checkpoint.pt'):
|
||||
# set patience to 0 or -1 to avoid stopping, but still keeping track of the best value and model parameters
|
||||
self.patience_limit = patience
|
||||
self.patience = patience
|
||||
self.verbose = verbose
|
||||
self.best_score = None
|
||||
self.best_epoch = None
|
||||
self.stop_time = None
|
||||
self.checkpoint = checkpoint
|
||||
self.model = model
|
||||
self.STOP = False
|
||||
|
||||
def __call__(self, watch_score, epoch):
|
||||
|
||||
if self.STOP:
|
||||
return #done
|
||||
|
||||
if self.best_score is None or watch_score >= self.best_score:
|
||||
self.best_score = watch_score
|
||||
self.best_epoch = epoch
|
||||
self.stop_time = time()
|
||||
if self.checkpoint:
|
||||
self.print(f'[early-stop] improved, saving model in {self.checkpoint}')
|
||||
torch.save(self.model, self.checkpoint)
|
||||
else:
|
||||
self.print(f'[early-stop] improved')
|
||||
self.patience = self.patience_limit
|
||||
else:
|
||||
self.patience -= 1
|
||||
if self.patience == 0:
|
||||
self.STOP = True
|
||||
self.print(f'[early-stop] patience exhausted')
|
||||
else:
|
||||
if self.patience>0: # if negative, then early-stop is ignored
|
||||
self.print(f'[early-stop] patience={self.patience}')
|
||||
|
||||
def reinit_counter(self):
|
||||
self.STOP = False
|
||||
self.patience=self.patience_limit
|
||||
|
||||
def restore_checkpoint(self):
|
||||
return torch.load(self.checkpoint)
|
||||
|
||||
def print(self, msg):
|
||||
if self.verbose:
|
||||
print(msg)
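
A minimal training-loop sketch showing how the early stopper is meant to be called; the model and the validation score are placeholders (here the score peaks at epoch 10, so patience runs out at epoch 15):

# Sketch: watch a validation score (higher is better) and stop once patience is exhausted.
import torch

toy_model = torch.nn.Linear(10, 2)               # placeholder model
early_stop = EarlyStopping(toy_model, patience=5)

for epoch in range(1, 101):
    val_score = -abs(epoch - 10)                 # placeholder metric, best at epoch 10
    early_stop(val_score, epoch)
    if early_stop.STOP:
        break

best_model = early_stop.restore_checkpoint()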
|
|
@@ -0,0 +1,38 @@
|
|||
import urllib.request
|
||||
from os import listdir, makedirs
|
||||
from os.path import isdir, isfile, join, exists, dirname
|
||||
|
||||
|
||||
def download_file(url, archive_filename):
|
||||
def progress(blocknum, bs, size):
|
||||
total_sz_mb = '%.2f MB' % (size / 1e6)
|
||||
current_sz_mb = '%.2f MB' % ((blocknum * bs) / 1e6)
|
||||
print('\rdownloaded %s / %s' % (current_sz_mb, total_sz_mb), end='')
|
||||
print("Downloading %s" % url)
|
||||
urllib.request.urlretrieve(url, filename=archive_filename, reporthook=progress)
|
||||
print("")
|
||||
|
||||
|
||||
def download_file_if_not_exists(url, archive_path):
|
||||
if exists(archive_path): return
|
||||
create_if_not_exist(dirname(archive_path))
|
||||
download_file(url,archive_path)
|
||||
|
||||
|
||||
def ls(dir, typecheck):
|
||||
el = [f for f in listdir(dir) if typecheck(join(dir, f))]
|
||||
el.sort()
|
||||
return el
|
||||
|
||||
|
||||
def list_dirs(dir):
|
||||
return ls(dir, typecheck=isdir)
|
||||
|
||||
|
||||
def list_files(dir):
|
||||
return ls(dir, typecheck=isfile)
|
||||
|
||||
|
||||
def create_if_not_exist(path):
|
||||
if not exists(path): makedirs(path)
|
||||
|
|
@@ -0,0 +1,86 @@
|
|||
import numpy as np
|
||||
from scipy.sparse import lil_matrix, issparse
|
||||
from sklearn.metrics import f1_score, accuracy_score
|
||||
|
||||
|
||||
"""
|
||||
Scikit learn provides a full set of evaluation metrics, but they treat special cases differently.
|
||||
I.e., when the number of true positives, false positives, and false negatives ammount to 0, all
|
||||
affected metrices (precision, recall, and thus f1) output 0 in Scikit learn.
|
||||
We adhere to the common practice of outputting 1 in this case since the classifier has correctly
|
||||
classified all examples as negatives.
|
||||
"""
|
||||
|
||||
def evaluation(y_true, y_pred, classification_type):
|
||||
|
||||
if classification_type == 'multilabel':
|
||||
eval_function = multilabel_eval
|
||||
elif classification_type == 'singlelabel':
|
||||
eval_function = singlelabel_eval
|
||||
|
||||
Mf1, mf1, accuracy = eval_function(y_true, y_pred)
|
||||
|
||||
return Mf1, mf1, accuracy
|
||||
|
||||
|
||||
def multilabel_eval(y, y_):
|
||||
|
||||
tp = y.multiply(y_)
|
||||
|
||||
fn = lil_matrix(y.shape)
|
||||
true_ones = y==1
|
||||
fn[true_ones]=1-tp[true_ones]
|
||||
|
||||
fp = lil_matrix(y.shape)
|
||||
pred_ones = y_==1
|
||||
if pred_ones.nnz>0:
|
||||
fp[pred_ones]=1-tp[pred_ones]
|
||||
|
||||
#macro-f1
|
||||
tp_macro = np.asarray(tp.sum(axis=0), dtype=int).flatten()
|
||||
fn_macro = np.asarray(fn.sum(axis=0), dtype=int).flatten()
|
||||
fp_macro = np.asarray(fp.sum(axis=0), dtype=int).flatten()
|
||||
|
||||
pos_pred = tp_macro+fp_macro
|
||||
pos_true = tp_macro+fn_macro
|
||||
prec=np.zeros(shape=tp_macro.shape,dtype=float)
|
||||
rec=np.zeros(shape=tp_macro.shape,dtype=float)
|
||||
np.divide(tp_macro, pos_pred, out=prec, where=pos_pred>0)
|
||||
np.divide(tp_macro, pos_true, out=rec, where=pos_true>0)
|
||||
den=prec+rec
|
||||
|
||||
macrof1=np.zeros(shape=tp_macro.shape,dtype=float)
|
||||
np.divide(np.multiply(prec,rec),den,out=macrof1,where=den>0)
|
||||
macrof1 *=2
|
||||
|
||||
macrof1[(pos_pred==0)*(pos_true==0)]=1
|
||||
macrof1 = np.mean(macrof1)
|
||||
|
||||
#micro-f1
|
||||
tp_micro = tp_macro.sum()
|
||||
fn_micro = fn_macro.sum()
|
||||
fp_micro = fp_macro.sum()
|
||||
pos_pred = tp_micro + fp_micro
|
||||
pos_true = tp_micro + fn_micro
|
||||
prec = (tp_micro / pos_pred) if pos_pred>0 else 0
|
||||
rec = (tp_micro / pos_true) if pos_true>0 else 0
|
||||
den = prec+rec
|
||||
microf1 = 2*prec*rec/den if den>0 else 0
|
||||
if pos_pred==pos_true==0:
|
||||
microf1=1
|
||||
|
||||
#accuracy
|
||||
ndecisions = np.multiply(*y.shape)
|
||||
tn = ndecisions - (tp_micro+fn_micro+fp_micro)
|
||||
acc = (tp_micro+tn)/ndecisions
|
||||
|
||||
return macrof1,microf1,acc
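
A small check of the convention described in the docstring, on a toy case with two categories where the second has no positives at all (neither true nor predicted) and therefore contributes F1 = 1 to the macro average; the matrices are invented:

# Toy check: category 0 is predicted imperfectly, category 1 has no positives anywhere.
import numpy as np
from scipy.sparse import csr_matrix

y_true = csr_matrix(np.array([[1, 0], [1, 0], [0, 0], [0, 0]]))
y_pred = csr_matrix(np.array([[1, 0], [0, 0], [1, 0], [0, 0]]))

Mf1, mf1, acc = multilabel_eval(y_true, y_pred)
print(Mf1, mf1, acc)   # macro-F1 = (0.5 + 1.0)/2 = 0.75, micro-F1 = 0.5, acc = 0.75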
|
||||
|
||||
|
||||
def singlelabel_eval(y, y_):
|
||||
if issparse(y_): y_ = y_.toarray().flatten()
|
||||
macrof1 = f1_score(y, y_, average='macro')
|
||||
microf1 = f1_score(y, y_, average='micro')
|
||||
acc = accuracy_score(y, y_)
|
||||
return macrof1,microf1,acc
|
||||
|
|
@@ -0,0 +1,65 @@
|
|||
from sklearn.svm import LinearSVC
|
||||
from sklearn.model_selection import GridSearchCV
|
||||
import numpy as np
|
||||
from joblib import Parallel, delayed
|
||||
from time import time
|
||||
|
||||
|
||||
class MLSVC:
|
||||
"""
|
||||
Multi-Label Support Vector Machine, with individual optimizations per binary problem.
|
||||
"""
|
||||
|
||||
def __init__(self, n_jobs=1, estimator=LinearSVC, *args, **kwargs):
|
||||
self.n_jobs = n_jobs
|
||||
self.args = args
|
||||
self.kwargs = kwargs
|
||||
self.verbose = False if 'verbose' not in self.kwargs else self.kwargs['verbose']
|
||||
self.estimator = estimator
|
||||
|
||||
|
||||
def fit(self, X, y, **grid_search_params):
|
||||
tini = time()
|
||||
assert len(y.shape)==2 and set(np.unique(y).tolist()) == {0,1}, 'data format is not multi-label'
|
||||
nD,nC = y.shape
|
||||
prevalence = np.sum(y, axis=0)
|
||||
self.svms = np.array([self.estimator(*self.args, **self.kwargs) for _ in range(nC)])
|
||||
if grid_search_params and grid_search_params['param_grid']:
|
||||
self._print('grid_search activated with: {}'.format(grid_search_params))
|
||||
# Grid search cannot be performed if the category prevalence is less than the parameter cv.
|
||||
# In those cases we place an SVM instead of a GridSearchCV
|
||||
cv = 5 if 'cv' not in grid_search_params else grid_search_params['cv']
|
||||
assert isinstance(cv, int), 'cv must be an int (other policies are not supported yet)'
|
||||
self.svms = [GridSearchCV(svm_i, refit=True, **grid_search_params) if prevalence[i]>=cv else svm_i
|
||||
for i,svm_i in enumerate(self.svms)]
|
||||
for i in np.argwhere(prevalence==0).flatten():
|
||||
self.svms[i] = TrivialRejector()
|
||||
|
||||
self.svms = Parallel(n_jobs=self.n_jobs)(
|
||||
delayed(self.svms[c].fit)(X,y[:,c]) for c,svm in enumerate(self.svms)
|
||||
)
|
||||
self.training_time = time() - tini
|
||||
|
||||
|
||||
def predict(self, X):
|
||||
return np.vstack(list(map(lambda svmi: svmi.predict(X), self.svms))).T
|
||||
|
||||
|
||||
def predict_proba(self, X):
|
||||
return np.vstack(list(map(lambda svmi: svmi.predict_proba(X)[:, np.argwhere(svmi.classes_ == 1)[0, 0]], self.svms))).T
|
||||
|
||||
|
||||
def _print(self, msg):
|
||||
if self.verbose>0:
|
||||
print(msg)
|
||||
|
||||
|
||||
def best_params(self):
|
||||
return [svmi.best_params_ if isinstance(svmi, GridSearchCV) else None for svmi in self.svms]
|
||||
|
||||
|
||||
class TrivialRejector:
|
||||
def fit(self,*args,**kwargs): return self
|
||||
def predict(self, X): return np.zeros(X.shape[0])
|
||||
def predict_proba(self, X): return np.zeros(X.shape[0])
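
A usage sketch for the multilabel SVM wrapper above, including a small per-category grid search; the data shapes and the C grid are illustrative assumptions:

# Sketch: one LinearSVC per category, grid-searching C where the category prevalence allows it.
import numpy as np

X_rand = np.random.rand(300, 30)
y_rand = (np.random.rand(300, 4) > 0.7).astype(int)

mlsvc = MLSVC(n_jobs=2, verbose=True)
mlsvc.fit(X_rand, y_rand, param_grid={'C': [0.1, 1, 10]}, cv=3)
print(mlsvc.predict(X_rand).shape)   # (300, 4)
print(mlsvc.best_params())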
|
||||
|
224
multi_label.py
|
@@ -1,224 +0,0 @@
|
|||
from copy import deepcopy
|
||||
|
||||
from sklearn.feature_extraction.text import TfidfVectorizer
|
||||
from sklearn.linear_model import LogisticRegression, Ridge
|
||||
from sklearn.metrics import f1_score
|
||||
from sklearn.multiclass import OneVsRestClassifier
|
||||
from sklearn.svm import LinearSVC
|
||||
|
||||
import quapy as qp
|
||||
from functional import artificial_prevalence_sampling
|
||||
from method.aggregative import PACC, CC, EMQ
|
||||
from method.base import BaseQuantifier
|
||||
from quapy.data import from_rcv2_lang_file, LabelledCollection
|
||||
from sklearn.model_selection import train_test_split
|
||||
from sklearn.preprocessing import MultiLabelBinarizer
|
||||
import numpy as np
|
||||
|
||||
|
||||
class MultilabelledCollection:
|
||||
def __init__(self, instances, labels):
|
||||
assert labels.ndim==2, 'data does not seem to be multilabel'
|
||||
self.instances = instances
|
||||
self.labels = labels
|
||||
self.classes_ = np.arange(labels.shape[1])
|
||||
|
||||
@classmethod
|
||||
def load(cls, path: str, loader_func: callable):
|
||||
return MultilabelledCollection(*loader_func(path))
|
||||
|
||||
def __len__(self):
|
||||
return self.instances.shape[0]
|
||||
|
||||
def prevalence(self):
|
||||
# return self.labels.mean(axis=0)
|
||||
pos = self.labels.mean(axis=0)
|
||||
neg = 1-pos
|
||||
return np.asarray([neg, pos]).T
|
||||
|
||||
def counts(self):
|
||||
return self.labels.sum(axis=0)
|
||||
|
||||
@property
|
||||
def n_classes(self):
|
||||
return len(self.classes_)
|
||||
|
||||
@property
|
||||
def binary(self):
|
||||
return False
|
||||
|
||||
def __gen_index(self):
|
||||
return np.arange(len(self))
|
||||
|
||||
def sampling_multi_index(self, size, cat, prev=None):
|
||||
if prev is None: # no prevalence was indicated; returns an index for uniform sampling
|
||||
return np.random.choice(len(self), size, replace=size>len(self))
|
||||
aux = LabelledCollection(self.__gen_index(), self.instances[:,cat])
|
||||
return aux.sampling_index(size, *[1-prev, prev])
|
||||
|
||||
def uniform_sampling_multi_index(self, size):
|
||||
return np.random.choice(len(self), size, replace=size>len(self))
|
||||
|
||||
def uniform_sampling(self, size):
|
||||
unif_index = self.uniform_sampling_multi_index(size)
|
||||
return self.sampling_from_index(unif_index)
|
||||
|
||||
def sampling(self, size, category, prev=None):
|
||||
prev_index = self.sampling_multi_index(size, category, prev)
|
||||
return self.sampling_from_index(prev_index)
|
||||
|
||||
def sampling_from_index(self, index):
|
||||
documents = self.instances[index]
|
||||
labels = self.labels[index, :]
|
||||
return MultilabelledCollection(documents, labels)
|
||||
|
||||
def train_test_split(self, train_prop=0.6, random_state=None):
|
||||
tr_docs, te_docs, tr_labels, te_labels = \
|
||||
train_test_split(self.instances, self.labels, train_size=train_prop, random_state=random_state)
|
||||
return MultilabelledCollection(tr_docs, tr_labels), MultilabelledCollection(te_docs, te_labels)
|
||||
|
||||
def artificial_sampling_generator(self, sample_size, category, n_prevalences=101, repeats=1):
|
||||
dimensions = 2
|
||||
for prevs in artificial_prevalence_sampling(dimensions, n_prevalences, repeats):
|
||||
yield self.sampling(sample_size, category, prevs[1])
|
||||
|
||||
def artificial_sampling_index_generator(self, sample_size, category, n_prevalences=101, repeats=1):
|
||||
dimensions = 2
|
||||
for prevs in artificial_prevalence_sampling(dimensions, n_prevalences, repeats):
|
||||
yield self.sampling_multi_index(sample_size, category, prevs[1])
|
||||
|
||||
def natural_sampling_generator(self, sample_size, repeats=100):
|
||||
for _ in range(repeats):
|
||||
yield self.uniform_sampling(sample_size)
|
||||
|
||||
def natural_sampling_index_generator(self, sample_size, repeats=100):
|
||||
for _ in range(repeats):
|
||||
yield self.uniform_sampling_multi_index(sample_size)
|
||||
|
||||
def asLabelledCollection(self, category):
|
||||
return LabelledCollection(self.instances, self.labels[:,category])
|
||||
|
||||
def genLabelledCollections(self):
|
||||
for c in self.classes_:
|
||||
yield self.asLabelledCollection(c)
|
||||
|
||||
@property
|
||||
def Xy(self):
|
||||
return self.instances, self.labels
|
||||
|
||||
|
||||
class MultilabelQuantifier:
|
||||
def __init__(self, q:BaseQuantifier):
|
||||
self.q = q
|
||||
self.estimators = {}
|
||||
|
||||
def fit(self, data:MultilabelledCollection):
|
||||
self.classes_ = data.classes_
|
||||
for cat, lc in enumerate(data.genLabelledCollections()):
|
||||
self.estimators[cat] = deepcopy(self.q).fit(lc)
|
||||
return self
|
||||
|
||||
def quantify(self, instances):
|
||||
pos_prevs = np.zeros(len(self.classes_), dtype=float)
|
||||
for c in self.classes_:
|
||||
pos_prevs[c] = self.estimators[c].quantify(instances)[1]
|
||||
neg_prevs = 1-pos_prevs
|
||||
return np.asarray([neg_prevs, pos_prevs]).T
|
||||
|
||||
|
||||
class MultilabelRegressionQuantification:
|
||||
def __init__(self, base_quantifier=CC(LinearSVC()), regression='ridge', n_samples=500, sample_size=500):
|
||||
self.estimator = MultilabelQuantifier(base_quantifier)
|
||||
self.regression = regression
|
||||
self.n_samples = n_samples
|
||||
self.sample_size = sample_size
|
||||
|
||||
def fit(self, data:MultilabelledCollection):
|
||||
self.classes_ = data.classes_
|
||||
tr, te = data.train_test_split()
|
||||
self.estimator.fit(tr)
|
||||
Xs = []
|
||||
ys = []
|
||||
for sample in te.natural_sampling_generator(sample_size=self.sample_size, repeats=self.n_samples):
|
||||
ys.append(sample.prevalence()[:,1])
|
||||
Xs.append(self.estimator.quantify(sample.instances)[:,1])
|
||||
Xs = np.asarray(Xs)
|
||||
ys = np.asarray(ys)
|
||||
print(f'Xs in {Xs.shape}')
|
||||
print(f'ys in {ys.shape}')
|
||||
self.reg = Ridge().fit(Xs, ys) #normalize?
|
||||
return self
|
||||
|
||||
def quantify(self, instances):
|
||||
Xs = self.estimator.quantify(instances)[:,1].reshape(1,-1)
|
||||
adjusted = self.reg.predict(Xs)
|
||||
adjusted = np.clip(adjusted, 0, 1)
|
||||
adjusted = adjusted.flatten()
|
||||
neg_prevs = 1-adjusted
|
||||
return np.asarray([neg_prevs, adjusted]).T
|
||||
|
||||
|
||||
|
||||
# read documents
|
||||
path = f'./crosslingual_data/rcv12/en.small.txt'
|
||||
docs, cats = from_rcv2_lang_file(path)
|
||||
|
||||
# split train-test
|
||||
tr_docs, te_docs, tr_cats, te_cats = train_test_split(docs, cats, test_size=0.2, random_state=42)
|
||||
|
||||
# generate Y matrices
|
||||
mlb = MultiLabelBinarizer()
|
||||
ytr = mlb.fit_transform([cats.split(' ') for cats in tr_cats])
|
||||
yte = mlb.transform([cats.split(' ') for cats in te_cats])
|
||||
# retain 10 most populated categories
|
||||
most_populated = np.argsort(ytr.sum(axis=0))[-10:]
|
||||
ytr = ytr[:,most_populated]
|
||||
yte = yte[:,most_populated]
|
||||
|
||||
tfidf = TfidfVectorizer(min_df=5)
|
||||
Xtr = tfidf.fit_transform(tr_docs)
|
||||
Xte = tfidf.transform(te_docs)
|
||||
|
||||
train = MultilabelledCollection(Xtr, ytr)
|
||||
test = MultilabelledCollection(Xte, yte)
|
||||
|
||||
model = MultilabelQuantifier(PACC(LogisticRegression()))
|
||||
model.fit(train)
|
||||
estim_prevs = model.quantify(test.instances)
|
||||
true_prevs = test.prevalence()
|
||||
print('PACC:')
|
||||
print(estim_prevs)
|
||||
print(true_prevs)
|
||||
|
||||
|
||||
model = MultilabelQuantifier(CC(LogisticRegression()))
|
||||
model.fit(train)
|
||||
estim_prevs = model.quantify(test.instances)
|
||||
true_prevs = test.prevalence()
|
||||
print('CC:')
|
||||
print(estim_prevs)
|
||||
print(true_prevs)
|
||||
|
||||
|
||||
# model = MultilabelQuantifier(EMQ(LogisticRegression()))
|
||||
# model.fit(train)
|
||||
# estim_prevs = model.quantify(test.instances)
|
||||
# true_prevs = test.prevalence()
|
||||
# print('EMQ:')
|
||||
# print(estim_prevs)
|
||||
# print(true_prevs)
|
||||
|
||||
model = MultilabelRegressionQuantification(sample_size=200, n_samples=500)
|
||||
model.fit(train)
|
||||
estim_prevs = model.quantify(test.instances)
|
||||
true_prevs = test.prevalence()
|
||||
print('MRQ:')
|
||||
print(estim_prevs)
|
||||
print(true_prevs)
|
||||
|
||||
qp.environ['SAMPLE_SIZE']=100
|
||||
mae = qp.error.mae(true_prevs, estim_prevs)
|
||||
print(mae)
|
||||
|
||||
|
||||
|
|
@@ -3,6 +3,13 @@ from scipy.sparse import dok_matrix
|
|||
from tqdm import tqdm
|
||||
|
||||
|
||||
def from_rcv2_lang_file(path, encoding='utf-8'):
|
||||
lines = open(path, 'rt', encoding=encoding).readlines()
|
||||
parts = [l.split('\t') for l in lines]
|
||||
docs, cats = list(zip(*[(parts_i[1], parts_i[2]) for parts_i in parts]))
|
||||
return docs, cats
|
||||
|
||||
|
||||
def from_text(path, encoding='utf-8'):
|
||||
"""
|
||||
Reads a labelled collection of documents.
|
||||
|
|
|
@@ -105,7 +105,7 @@ def _predict_from_indexes(
|
|||
estim_prevalence = quantification_func(sample.instances)
|
||||
return true_prevalence, estim_prevalence
|
||||
|
||||
pbar = tqdm(indexes, desc='[artificial sampling protocol] generating predictions') if verbose else indexes
|
||||
pbar = tqdm(indexes, desc='[sampling protocol] generating predictions') if verbose else indexes
|
||||
results = qp.util.parallel(_predict_prevalences, pbar, n_jobs=n_jobs)
|
||||
|
||||
true_prevalences, estim_prevalences = zip(*results)
|
||||
|
|
|
@@ -227,7 +227,7 @@ def _delayed_new_instance(args):
|
|||
if val_split is not None:
|
||||
if isinstance(val_split, float):
|
||||
assert 0 < val_split < 1, 'val_split should be in (0,1)'
|
||||
data, val_split = data.split_stratified(train_prop=1 - val_split)
|
||||
data, val_split = data.train_test_split(train_prop=1 - val_split)
|
||||
|
||||
sample_index = data.sampling_index(sample_size, *prev)
|
||||
sample = data.sampling_from_index(sample_index)
|
||||
|
|
|
@@ -73,7 +73,7 @@ class QuaNetTrainer(BaseQuantifier):
|
|||
|
||||
if fit_learner:
|
||||
classifier_data, unused_data = data.split_stratified(0.4)
|
||||
train_data, valid_data = unused_data.split_stratified(0.66) # 0.66 split of 60% makes 40% and 20%
|
||||
train_data, valid_data = unused_data.train_test_split(0.66) # 0.66 split of 60% makes 40% and 20%
|
||||
self.learner.fit(*classifier_data.Xy)
|
||||
else:
|
||||
classifier_data = None
|
||||
|
|
|
@@ -97,7 +97,7 @@ class GridSearchQ(BaseQuantifier):
|
|||
return training, validation
|
||||
elif isinstance(validation, float):
|
||||
assert 0. < validation < 1., 'validation proportion should be in (0,1)'
|
||||
training, validation = training.split_stratified(train_prop=1 - validation)
|
||||
training, validation = training.train_test_split(train_prop=1 - validation)
|
||||
return training, validation
|
||||
else:
|
||||
raise ValueError(f'"validation" must either be a LabelledCollection or a float in (0,1) indicating the'
|
||||
|
|