Forked from moreo/QuaPy. Comparing branches: master...crosslingu (22 commits).
@@ -0,0 +1,38 @@
Classifiers

- Binary, single-label classifiers, used via OneVsRest or MultiOutput:
    - LR
    - LinearSVC (?)

- Classifiers natively multi-label:
    - from scikit-multilearn (x11)

Protocols:
- NPP
- APP (for each class)


Things to test:
- MultiChain for classification, MultiChain for regression...
- Reimplement stacking with sklearn.ensemble.StackingClassifier? It does not seem easy.

- Independent classifiers + independent quantifiers (see the sketch after these notes)
- Stacking + independent quantifiers
- ClassifierChain + independent quantifiers
- Independent quantifiers + cross-class regression (independent?)
- Stacking + cross-class regression
- ClassifierChain + cross-class regression
- Covariates (Means, CovMatrix from samples) + multioutput regression?
- Covariates concatenated with quantifier predictions + cross-class regression?

- Model Selection for specific protocols?

TODO:
- decide methods
- decide binary classifiers
- decide multi-label classifiers
- decide naive quantifiers
- decide multi-label quantifiers
- decide datasets
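The "independent classifiers + independent quantifiers" baseline can be realized by training one binary quantifier per label. Below is a minimal sketch using scikit-learn only, with classify-and-count (CC) as the per-label quantifier; the class name and hyperparameters are illustrative, not part of the codebase:

import numpy as np
from sklearn.linear_model import LogisticRegression

class IndependentCC:
    """One independent binary classifier per label; the rate of positive predictions
    on a test sample is the classify-and-count (CC) estimate of that label's prevalence."""

    def fit(self, X, Y):
        # X: (n_docs, n_feats); Y: (n_docs, n_classes) binary label matrix (dense)
        self.classifiers_ = [LogisticRegression(max_iter=1000).fit(X, Y[:, j]) for j in range(Y.shape[1])]
        return self

    def quantify(self, X):
        # one prevalence estimate per label for the sample X
        return np.asarray([clf.predict(X).mean() for clf in self.classifiers_])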
@@ -0,0 +1,229 @@
import os, sys
from sklearn.datasets import get_data_home, fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
from MultiLabel.data.jrcacquis_reader import fetch_jrcacquis, JRCAcquis_Document
from MultiLabel.data.ohsumed_reader import fetch_ohsumed50k
from MultiLabel.data.reuters21578_reader import fetch_reuters21578
from MultiLabel.data.rcv_reader import fetch_RCV1
from MultiLabel.data.wipo_reader import fetch_WIPOgamma, WipoGammaDocument
import pickle
import numpy as np
from tqdm import tqdm
from os.path import join
import re


def init_vectorizer():
    return TfidfVectorizer(min_df=5, sublinear_tf=True)


class Dataset:

    dataset_available = {'reuters21578', '20newsgroups', 'ohsumed', 'rcv1', 'jrcall',
                         'wipo-sl-mg', 'wipo-ml-mg', 'wipo-sl-sc', 'wipo-ml-sc'}

    def __init__(self, name):
        assert name in Dataset.dataset_available, f'dataset {name} is not available'
        if name == 'reuters21578':
            self._load_reuters()
        elif name == '20newsgroups':
            self._load_20news()
        elif name == 'rcv1':
            self._load_rcv1()
        elif name == 'ohsumed':
            self._load_ohsumed()
        elif name == 'jrcall':
            self._load_jrc(version='all')
        elif name == 'wipo-sl-mg':
            self._load_wipo('singlelabel', 'maingroup')
        elif name == 'wipo-ml-mg':
            self._load_wipo('multilabel', 'maingroup')
        elif name == 'wipo-sl-sc':
            self._load_wipo('singlelabel', 'subclass')
        elif name == 'wipo-ml-sc':
            self._load_wipo('multilabel', 'subclass')

        self.nC = self.devel_labelmatrix.shape[1]
        self._vectorizer = init_vectorizer()
        self._vectorizer.fit(self.devel_raw)
        self.vocabulary = self._vectorizer.vocabulary_

    def show(self):
        nTr_docs = len(self.devel_raw)
        nTe_docs = len(self.test_raw)
        nfeats = len(self._vectorizer.vocabulary_)
        nC = self.devel_labelmatrix.shape[1]
        nD = nTr_docs + nTe_docs
        print(f'{self.classification_type}, nD={nD}=({nTr_docs}+{nTe_docs}), nF={nfeats}, nC={nC}')
        return self

    def _load_reuters(self):
        data_path = os.path.join(get_data_home(), 'reuters21578')
        devel = fetch_reuters21578(subset='train', data_path=data_path)
        test = fetch_reuters21578(subset='test', data_path=data_path)

        self.classification_type = 'multilabel'
        self.devel_raw, self.test_raw = mask_numbers(devel.data), mask_numbers(test.data)
        self.devel_labelmatrix, self.test_labelmatrix = _label_matrix(devel.target, test.target)
        self.devel_target, self.test_target = self.devel_labelmatrix, self.test_labelmatrix

    def _load_rcv1(self):
        data_path = '../datasets/RCV1-v2/unprocessed_corpus'  # TODO: check when missing
        devel = fetch_RCV1(subset='train', data_path=data_path)
        test = fetch_RCV1(subset='test', data_path=data_path)

        self.classification_type = 'multilabel'
        self.devel_raw, self.test_raw = mask_numbers(devel.data), mask_numbers(test.data)
        self.devel_labelmatrix, self.test_labelmatrix = _label_matrix(devel.target, test.target)
        self.devel_target, self.test_target = self.devel_labelmatrix, self.test_labelmatrix

    def _load_jrc(self, version):
        assert version in ['300', 'all'], 'allowed versions are "300" or "all"'
        data_path = "../datasets/JRC_Acquis_v3"
        tr_years = list(range(1986, 2006))
        te_years = [2006]
        if version == '300':
            training_docs, tr_cats = fetch_jrcacquis(data_path=data_path, years=tr_years, cat_threshold=1, most_frequent=300)
            test_docs, te_cats = fetch_jrcacquis(data_path=data_path, years=te_years, cat_filter=tr_cats)
        else:
            training_docs, tr_cats = fetch_jrcacquis(data_path=data_path, years=tr_years, cat_threshold=1)
            test_docs, te_cats = fetch_jrcacquis(data_path=data_path, years=te_years, cat_filter=tr_cats)
        print(f'load jrc-acquis (English) with {len(tr_cats)} tr categories ({len(te_cats)} te categories)')

        devel_data = JRCAcquis_Document.get_text(training_docs)
        test_data = JRCAcquis_Document.get_text(test_docs)
        devel_target = JRCAcquis_Document.get_target(training_docs)
        test_target = JRCAcquis_Document.get_target(test_docs)

        self.classification_type = 'multilabel'
        self.devel_raw, self.test_raw = mask_numbers(devel_data), mask_numbers(test_data)
        self.devel_labelmatrix, self.test_labelmatrix = _label_matrix(devel_target, test_target)
        self.devel_target, self.test_target = self.devel_labelmatrix, self.test_labelmatrix

    def _load_ohsumed(self):
        data_path = os.path.join(get_data_home(), 'ohsumed50k')
        devel = fetch_ohsumed50k(subset='train', data_path=data_path)
        test = fetch_ohsumed50k(subset='test', data_path=data_path)

        self.classification_type = 'multilabel'
        self.devel_raw, self.test_raw = mask_numbers(devel.data), mask_numbers(test.data)
        self.devel_labelmatrix, self.test_labelmatrix = _label_matrix(devel.target, test.target)
        self.devel_target, self.test_target = self.devel_labelmatrix, self.test_labelmatrix

    def _load_20news(self):
        metadata = ('headers', 'footers', 'quotes')
        devel = fetch_20newsgroups(subset='train', remove=metadata)
        test = fetch_20newsgroups(subset='test', remove=metadata)
        self.classification_type = 'singlelabel'
        self.devel_raw, self.test_raw = mask_numbers(devel.data), mask_numbers(test.data)
        self.devel_target, self.test_target = devel.target, test.target
        self.devel_labelmatrix, self.test_labelmatrix = _label_matrix(self.devel_target.reshape(-1, 1), self.test_target.reshape(-1, 1))

    def _load_fasttext_data(self, name):
        data_path = '../datasets/fastText'
        self.classification_type = 'singlelabel'
        name = name.replace('-', '_')
        train_file = join(data_path, f'{name}.train')
        assert os.path.exists(train_file), f'file {name} not found, please place the fasttext data in {data_path}'  # or specify the path (todo)
        self.devel_raw, self.devel_target = load_fasttext_format(train_file)
        self.test_raw, self.test_target = load_fasttext_format(join(data_path, f'{name}.test'))
        self.devel_raw = mask_numbers(self.devel_raw)
        self.test_raw = mask_numbers(self.test_raw)
        self.devel_labelmatrix, self.test_labelmatrix = _label_matrix(self.devel_target.reshape(-1, 1), self.test_target.reshape(-1, 1))

    def _load_wipo(self, classmode, classlevel):
        assert classmode in {'singlelabel', 'multilabel'}, 'available class_mode are sl (single-label) or ml (multi-label)'
        data_path = '../datasets/WIPO/wipo-gamma/en'
        data_proc = '../datasets/WIPO-extracted'

        devel = fetch_WIPOgamma(subset='train', classification_level=classlevel, data_home=data_path, extracted_path=data_proc, text_fields=['abstract'])
        test = fetch_WIPOgamma(subset='test', classification_level=classlevel, data_home=data_path, extracted_path=data_proc, text_fields=['abstract'])

        devel_data = [d.text for d in devel]
        test_data = [d.text for d in test]
        self.devel_raw, self.test_raw = mask_numbers(devel_data), mask_numbers(test_data)

        self.classification_type = classmode
        if classmode == 'multilabel':
            devel_target = [d.all_labels for d in devel]
            test_target = [d.all_labels for d in test]
            self.devel_labelmatrix, self.test_labelmatrix = _label_matrix(devel_target, test_target)
            self.devel_target, self.test_target = self.devel_labelmatrix, self.test_labelmatrix
        else:
            devel_target = [d.main_label for d in devel]
            test_target = [d.main_label for d in test]
            # only keep labels with at least one training document
            class_id = {labelname: index for index, labelname in enumerate(sorted(set(devel_target)))}
            devel_target = np.array([class_id[id] for id in devel_target]).astype(int)
            test_target = np.array([class_id.get(id, None) for id in test_target])
            if None in test_target:
                print(f'deleting {(test_target==None).sum()} test documents without valid categories')
                keep_pos = test_target != None
                self.test_raw = (np.asarray(self.test_raw)[keep_pos]).tolist()
                test_target = test_target[keep_pos]
            test_target = test_target.astype(int)
            self.devel_target, self.test_target = devel_target, test_target
            self.devel_labelmatrix, self.test_labelmatrix = _label_matrix(self.devel_target.reshape(-1, 1), self.test_target.reshape(-1, 1))

    def vectorize(self):
        if not hasattr(self, 'Xtr') or not hasattr(self, 'Xte'):
            self.Xtr = self._vectorizer.transform(self.devel_raw)
            self.Xte = self._vectorizer.transform(self.test_raw)
            self.Xtr.sort_indices()
            self.Xte.sort_indices()
        return self.Xtr, self.Xte

    def analyzer(self):
        return self._vectorizer.build_analyzer()

    @classmethod
    def load(cls, dataset_name, pickle_path=None):

        if pickle_path:
            if os.path.exists(pickle_path):
                print(f'loading pickled dataset from {pickle_path}')
                dataset = pickle.load(open(pickle_path, 'rb'))
            else:
                print(f'fetching dataset and dumping it into {pickle_path}')
                dataset = Dataset(name=dataset_name)
                print('vectorizing for faster processing')
                dataset.vectorize()
                print('dumping')
                pickle.dump(dataset, open(pickle_path, 'wb'), protocol=pickle.HIGHEST_PROTOCOL)
        else:
            print(f'loading dataset {dataset_name}')
            dataset = Dataset(name=dataset_name)

        print('[Done]')
        return dataset


def _label_matrix(tr_target, te_target):
    mlb = MultiLabelBinarizer(sparse_output=True)
    ytr = mlb.fit_transform(tr_target)
    yte = mlb.transform(te_target)
    print(mlb.classes_)
    return ytr, yte
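# For instance (illustrative): MultiLabelBinarizer maps lists of label sets to an indicator
# matrix, e.g., fit_transform([[0, 1], [1]]) yields rows [1, 1] and [0, 1]; fitting on the
# training labels only guarantees that the test columns stay aligned with the training columns.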


def load_fasttext_format(path):
    print(f'loading {path}')
    labels, docs = [], []
    for line in tqdm(open(path, 'rt').readlines()):
        space = line.strip().find(' ')
        label = int(line[:space].replace('__label__', '')) - 1
        labels.append(label)
        docs.append(line[space+1:])
    labels = np.asarray(labels, dtype=int)
    return docs, labels


def mask_numbers(data, number_mask='numbermask'):
    mask = re.compile(r'\b[0-9][0-9.,-]*\b')
    masked = []
    for text in tqdm(data, desc='masking numbers'):
        masked.append(mask.sub(number_mask, text))
    return masked
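A usage sketch for the Dataset class above (the pickle path and module path are illustrative):

from MultiLabel.data.dataset import Dataset  # module path assumed from the imports above

dataset = Dataset.load('reuters21578', pickle_path='./pickles/reuters21578.pickle').show()
Xtr, Xte = dataset.vectorize()   # tf-idf matrices; cached after the first call
ytr = dataset.devel_labelmatrix  # sparse (nD x nC) binary label matrix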
@@ -0,0 +1,263 @@
import os, sys
from os.path import join
import tarfile
import xml.etree.ElementTree as ET
from sklearn.datasets import get_data_home
import pickle
import rdflib
from rdflib.namespace import RDF, SKOS
from rdflib import URIRef
import zipfile
from collections import Counter
from tqdm import tqdm
from random import shuffle
from util.file import *


class JRCAcquis_Document:
    def __init__(self, id, name, lang, year, head, body, categories):
        self.id = id
        self.parallel_id = name
        self.lang = lang
        self.year = year
        self.text = body if not head else head + "\n" + body
        self.categories = categories

    @classmethod
    def get_text(cls, jrc_documents):
        return [d.text for d in jrc_documents]

    @classmethod
    def get_target(cls, jrc_documents):
        return [d.categories for d in jrc_documents]


# This is a workaround: for some reason, acutes are codified in a non-standard manner in titles;
# however, the title often appears as the first paragraph of the text/body (with standard
# codification), so it might be preferable not to read the header after all (as here by default).
def _proc_acute(text):
    for ch in ['a', 'e', 'i', 'o', 'u']:
        text = text.replace('%'+ch+'acute%', ch)
    return text
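# Illustrative example of the workaround above:
#   _proc_acute('L%iacute%nea a%eacute%rea') -> 'Linea aerea'
# (each non-standard acute code is replaced by its unaccented base letter)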


def parse_document(file, year, head=False):
    root = ET.parse(file).getroot()

    doc_name = root.attrib['n']  # e.g., '22006A0211(01)'
    doc_lang = root.attrib['lang']  # e.g., 'es'
    doc_id = root.attrib['id']  # e.g., 'jrc22006A0211_01-es'
    doc_categories = [cat.text for cat in root.findall('.//teiHeader/profileDesc/textClass/classCode[@scheme="eurovoc"]')]
    doc_head = _proc_acute(root.find('.//text/body/head').text) if head else ''
    doc_body = '\n'.join([p.text for p in root.findall('.//text/body/div[@type="body"]/p')])

    def raise_if_empty(field, from_file):
        if isinstance(field, str):
            if not field.strip():
                raise ValueError("Empty field in file %s" % from_file)

    raise_if_empty(doc_name, file)
    raise_if_empty(doc_lang, file)
    raise_if_empty(doc_id, file)
    if head: raise_if_empty(doc_head, file)
    raise_if_empty(doc_body, file)

    return JRCAcquis_Document(id=doc_id, name=doc_name, lang=doc_lang, year=year, head=doc_head, body=doc_body, categories=doc_categories)


# filters out documents which do not contain any category in the cat_filter list, and removes,
# from the remaining documents, all labels not in cat_filter
def _filter_by_category(doclist, cat_filter):
    if not isinstance(cat_filter, frozenset):
        cat_filter = frozenset(cat_filter)
    filtered = []
    for doc in doclist:
        doc.categories = list(cat_filter & set(doc.categories))
        if doc.categories:
            doc.categories.sort()
            filtered.append(doc)
    print("filtered out %d documents without categories in the filter list" % (len(doclist) - len(filtered)))
    return filtered


# filters out categories with cat_threshold documents or fewer (and removes those categories
# from the documents that contain them)
def _filter_by_frequency(doclist, cat_threshold):
    cat_count = Counter()
    for d in doclist:
        cat_count.update(d.categories)

    freq_categories = [cat for cat, count in cat_count.items() if count > cat_threshold]
    freq_categories.sort()
    return _filter_by_category(doclist, freq_categories), freq_categories
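# Note that the frequency filter is strict: with cat_threshold=1, a category needs at least two
# documents to survive. For instance (illustrative), if 'ENV' occurs in a single document,
# _filter_by_frequency(docs, 1) drops 'ENV', and also drops any document left without categories.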


# selects the top `most_frequent` categories (and restricts the documents' labels to those categories)
def _most_common(doclist, most_frequent):
    cat_count = Counter()
    for d in doclist:
        cat_count.update(d.categories)

    freq_categories = [cat for cat, count in cat_count.most_common(most_frequent)]
    freq_categories.sort()
    return _filter_by_category(doclist, freq_categories), freq_categories


def _get_categories(request):
    final_cats = set()
    for d in request:
        final_cats.update(d.categories)
    return list(final_cats)


def fetch_jrcacquis(lang='en', data_path=None, years=None, ignore_unclassified=True,
                    cat_filter=None, cat_threshold=0, most_frequent=-1,
                    DOWNLOAD_URL_BASE='http://optima.jrc.it/Acquis/JRC-Acquis.3.0/corpus/'):

    if not data_path:
        data_path = get_data_home()

    if not os.path.exists(data_path):
        os.mkdir(data_path)

    request = []
    total_read = 0
    file_name = 'jrc-' + lang + '.tgz'
    archive_path = join(data_path, file_name)

    if not os.path.exists(archive_path):
        print("downloading language-specific dataset (once and for all) into %s" % data_path)
        DOWNLOAD_URL = join(DOWNLOAD_URL_BASE, file_name)
        download_file(DOWNLOAD_URL, archive_path)
        print("untarring dataset...")
        tarfile.open(archive_path, 'r:gz').extractall(data_path)

    documents_dir = join(data_path, lang)

    print("Reading documents...")
    read = 0
    for dir in list_dirs(documents_dir):
        year = int(dir)
        if years is None or year in years:
            year_dir = join(documents_dir, dir)
            l_y_documents = []
            all_documents = list_files(year_dir)
            empty = 0
            pbar = tqdm(enumerate(all_documents))
            for i, doc_file in pbar:
                try:
                    jrc_doc = parse_document(join(year_dir, doc_file), year)
                except ValueError:
                    jrc_doc = None

                if jrc_doc and (not ignore_unclassified or jrc_doc.categories):
                    l_y_documents.append(jrc_doc)
                else:
                    empty += 1
                read += 1
                pbar.set_description(f'from {year_dir}: discarded {empty} without categories or empty fields')
            request += l_y_documents
    print("Read %d documents for language %s\n" % (read, lang))
    total_read += read

    final_cats = _get_categories(request)

    if cat_filter:
        request = _filter_by_category(request, cat_filter)
        final_cats = _get_categories(request)
    if cat_threshold > 0:
        request, final_cats = _filter_by_frequency(request, cat_threshold)
    if most_frequent != -1 and len(final_cats) > most_frequent:
        request, final_cats = _most_common(request, most_frequent)

    return request, final_cats


def print_cat_analysis(request):
    cat_count = Counter()
    for d in request:
        cat_count.update(d.categories)
    print("Number of active categories: {}".format(len(cat_count)))
    print(cat_count.most_common())


# inspects the EuroVoc thesaurus in order to select a subset of categories;
# the 'broadest' policy (i.e., take all categories with no parent category), the 'leaves'
# policy, and 'all' are currently implemented
def inspect_eurovoc(data_path, eurovoc_skos_core_concepts_filename='eurovoc_in_skos_core_concepts.rdf',
                    eurovoc_url="http://publications.europa.eu/mdr/resource/thesaurus/eurovoc-20160630-0/skos/eurovoc_in_skos_core_concepts.zip",
                    select="broadest"):

    fullpath_pickle = join(data_path, select+'_concepts.pickle')
    if os.path.exists(fullpath_pickle):
        print("Pickled object found in %s. Loading it." % fullpath_pickle)
        return pickle.load(open(fullpath_pickle, 'rb'))

    fullpath = join(data_path, eurovoc_skos_core_concepts_filename)
    if not os.path.exists(fullpath):
        print("Path %s does not exist. Trying to download the skos EuroVoc file from %s" % (data_path, eurovoc_url))
        download_file(eurovoc_url, fullpath + '.zip')
        print("Unzipping file...")
        zipped = zipfile.ZipFile(fullpath + '.zip', 'r')
        zipped.extract("eurovoc_in_skos_core_concepts.rdf", data_path)
        zipped.close()

    print("Parsing %s" % fullpath)
    g = rdflib.Graph()
    g.parse(location=fullpath, format="application/rdf+xml")

    if select == "all":
        print("Selecting all concepts")
        all_concepts = list(g.subjects(RDF.type, SKOS.Concept))
        all_concepts = [c.toPython().split('/')[-1] for c in all_concepts]
        all_concepts.sort()
        selected_concepts = all_concepts
    elif select == "broadest":
        print("Selecting broadest concepts (those without any other broader concept linked to them)")
        all_concepts = set(g.subjects(RDF.type, SKOS.Concept))
        narrower_concepts = set(g.subjects(SKOS.broader, None))
        broadest_concepts = [c.toPython().split('/')[-1] for c in (all_concepts - narrower_concepts)]
        broadest_concepts.sort()
        selected_concepts = broadest_concepts
    elif select == "leaves":
        print("Selecting leaf concepts (those not linked as broader of any other concept)")
        all_concepts = set(g.subjects(RDF.type, SKOS.Concept))
        broad_concepts = set(g.objects(None, SKOS.broader))
        leave_concepts = [c.toPython().split('/')[-1] for c in (all_concepts - broad_concepts)]
        leave_concepts.sort()
        selected_concepts = leave_concepts
    else:
        raise ValueError("Selection policy %s is not currently supported" % select)

    print("%d %s concepts found" % (len(selected_concepts), select))
    print("Pickling concept list for faster further requests in %s" % fullpath_pickle)
    pickle.dump(selected_concepts, open(fullpath_pickle, 'wb'), pickle.HIGHEST_PROTOCOL)

    return selected_concepts


if __name__ == '__main__':

    # example code

    train_years = list(range(1986, 2006))
    test_years = [2006]
    cat_policy = 'all'  # 'leaves'
    most_common_cat = 300
    JRC_DATAPATH = "../datasets/JRC_Acquis_v3"
    cat_list = inspect_eurovoc(JRC_DATAPATH, select=cat_policy)

    training_docs, tr_cats = fetch_jrcacquis(lang='en', data_path=JRC_DATAPATH, years=train_years,
                                             cat_filter=None, cat_threshold=1,
                                             most_frequent=most_common_cat)
    test_docs, te_cats = fetch_jrcacquis(lang='en', data_path=JRC_DATAPATH, years=test_years,
                                         cat_filter=tr_cats, cat_threshold=1)
    # training_cats = jrc_get_categories(training_docs)
    # test_cats = jrc_get_categories(test_docs)
    # intersection_cats = [c for c in training_cats if c in test_cats]

    # training_docs = jrc_filter_by_category(training_docs, intersection_cats)
    # test_docs = jrc_filter_by_category(test_docs, intersection_cats)

    print(f'JRC-train: {len(training_docs)} documents')
    print(f'JRC-test: {len(test_docs)} documents')

    print_cat_analysis(training_docs)
    print_cat_analysis(test_docs)

    """
    JRC-train: 12615 documents, 300 cats
    JRC-test: 7055 documents, 300 cats
    """
@@ -0,0 +1,5 @@
class LabelledDocuments:
    def __init__(self, data, target, target_names):
        self.data = data
        self.target = target
        self.target_names = target_names
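LabelledDocuments mirrors the (data, target, target_names) fields of the objects returned by scikit-learn loaders such as fetch_20newsgroups, which is what lets the Dataset class above treat both kinds of loaders uniformly. A minimal illustration:

from data.labeled import LabelledDocuments

docs = LabelledDocuments(data=['some text'], target=[[0, 2]], target_names=['acq', 'corn', 'earn'])
print(docs.target_names[0])  # -> 'acq'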
@@ -0,0 +1,63 @@
import os
import pickle
import tarfile
from os.path import join
import urllib.request
from data.labeled import LabelledDocuments
from util.file import create_if_not_exist, download_file_if_not_exists
import math


def fetch_ohsumed50k(data_path=None, subset='train', train_test_split=0.7):
    _dataname = 'ohsumed50k'
    if data_path is None:
        data_path = join(os.path.expanduser('~'), _dataname)
    create_if_not_exist(data_path)

    pickle_file = join(data_path, _dataname + '.' + subset + str(train_test_split) + '.pickle')
    if not os.path.exists(pickle_file):
        DOWNLOAD_URL = 'http://disi.unitn.it/moschitti/corpora/ohsumed-all-docs.tar.gz'
        archive_path = os.path.join(data_path, 'ohsumed-all-docs.tar.gz')
        download_file_if_not_exists(DOWNLOAD_URL, archive_path)
        untardir = 'ohsumed-all'
        if not os.path.exists(os.path.join(data_path, untardir)):
            print("untarring ohsumed...")
            tarfile.open(archive_path, 'r:gz').extractall(data_path)

        target_names = []
        doc_classes = dict()
        class_docs = dict()
        content = dict()
        doc_ids = set()
        for cat_id in os.listdir(join(data_path, untardir)):
            target_names.append(cat_id)
            class_docs[cat_id] = []
            for doc_id in os.listdir(join(data_path, untardir, cat_id)):
                doc_ids.add(doc_id)
                text_content = open(join(data_path, untardir, cat_id, doc_id), 'r').read()
                if doc_id not in doc_classes: doc_classes[doc_id] = []
                doc_classes[doc_id].append(cat_id)
                if doc_id not in content: content[doc_id] = text_content
                class_docs[cat_id].append(doc_id)
        target_names.sort()
        print('Read %d different documents' % len(doc_ids))

        splitdata = dict({'train': [], 'test': []})
        for cat_id in target_names:
            free_docs = [d for d in class_docs[cat_id] if (d not in splitdata['train'] and d not in splitdata['test'])]
            if len(free_docs) > 0:
                split_point = int(math.floor(len(free_docs) * train_test_split))
                splitdata['train'].extend(free_docs[:split_point])
                splitdata['test'].extend(free_docs[split_point:])
        for split in ['train', 'test']:
            dataset = LabelledDocuments([], [], target_names)
            for doc_id in splitdata[split]:
                dataset.data.append(content[doc_id])
                dataset.target.append([target_names.index(cat_id) for cat_id in doc_classes[doc_id]])
            pickle.dump(dataset,
                        open(join(data_path, _dataname + '.' + split + str(train_test_split) + '.pickle'), 'wb'),
                        protocol=pickle.HIGHEST_PROTOCOL)

    print(pickle_file)
    return pickle.load(open(pickle_file, 'rb'))
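A usage sketch (by default the data is placed under ~/ohsumed50k, and the 0.7 train/test split is computed greedily per category, as in the code above):

from MultiLabel.data.ohsumed_reader import fetch_ohsumed50k  # module path assumed from dataset.py

train = fetch_ohsumed50k(subset='train')
test = fetch_ohsumed50k(subset='test')
print(f'{len(train.data)} training docs, {len(train.target_names)} categories')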
@@ -0,0 +1,152 @@
from zipfile import ZipFile
import xml.etree.ElementTree as ET
from data.labeled import LabelledDocuments
from util.file import list_files
from os.path import join, exists
from util.file import download_file_if_not_exists
import re
from collections import Counter

RCV1_TOPICHIER_URL = "http://www.ai.mit.edu/projects/jmlr/papers/volume5/lewis04a/a02-orig-topics-hierarchy/rcv1.topics.hier.orig"
RCV1_BASE_URL = "http://www.daviddlewis.com/resources/testcollections/rcv1/"

rcv1_test_data_gz = ['lyrl2004_tokens_test_pt0.dat.gz',
                     'lyrl2004_tokens_test_pt1.dat.gz',
                     'lyrl2004_tokens_test_pt2.dat.gz',
                     'lyrl2004_tokens_test_pt3.dat.gz']

rcv1_train_data_gz = ['lyrl2004_tokens_train.dat.gz']

rcv1_doc_cats_data_gz = 'rcv1-v2.topics.qrels.gz'


class RCV_Document:
    def __init__(self, id, text, categories, date=''):
        self.id = id
        self.date = date
        self.text = text
        self.categories = categories


class IDRangeException(Exception): pass


nwords = []


def parse_document(xml_content, valid_id_range=None):
    root = ET.fromstring(xml_content)

    doc_id = root.attrib['itemid']
    if valid_id_range is not None:
        if not valid_id_range[0] <= int(doc_id) <= valid_id_range[1]:
            raise IDRangeException

    doc_categories = [cat.attrib['code'] for cat in
                      root.findall('.//metadata/codes[@class="bip:topics:1.0"]/code')]

    doc_date = root.attrib['date']
    doc_title = root.find('.//title').text
    doc_headline = root.find('.//headline').text
    doc_body = '\n'.join([p.text for p in root.findall('.//text/p')])

    if not doc_body:
        raise ValueError('Empty document')

    if doc_title is None: doc_title = ''
    if doc_headline is None or doc_headline in doc_title: doc_headline = ''
    text = '\n'.join([doc_title, doc_headline, doc_body]).strip()

    return RCV_Document(id=doc_id, text=text, categories=doc_categories, date=doc_date)


def fetch_RCV1(data_path, subset='all'):

    assert subset in ['train', 'test', 'all'], 'split should either be "train", "test", or "all"'

    request = []
    labels = set()
    read_documents = 0

    training_documents = 23149
    test_documents = 781265

    if subset == 'all':
        split_range = (2286, 810596)
        expected = training_documents + test_documents
    elif subset == 'train':
        split_range = (2286, 26150)
        expected = training_documents
    else:
        split_range = (26151, 810596)
        expected = test_documents

    # global nwords
    # nwords = []
    for part in list_files(data_path):
        if not re.match(r'\d+\.zip', part): continue
        target_file = join(data_path, part)
        assert exists(target_file), \
            "You don't seem to have the file " + part + " in " + data_path + ", and the RCV1 corpus cannot be " \
            "downloaded without formal permission. Please refer to " + RCV1_BASE_URL + " for more information."
        zipfile = ZipFile(target_file)
        for xmlfile in zipfile.namelist():
            xmlcontent = zipfile.open(xmlfile).read()
            try:
                doc = parse_document(xmlcontent, valid_id_range=split_range)
                labels.update(doc.categories)
                request.append(doc)
                read_documents += 1
            except (IDRangeException, ValueError):
                pass
            print('\r[{}] read {} documents'.format(part, len(request)), end='')
            if read_documents == expected: break
        if read_documents == expected: break

    print()
    # print('ave:{} std {} min {} max {}'.format(np.mean(nwords), np.std(nwords), np.min(nwords), np.max(nwords)))

    return LabelledDocuments(data=[d.text for d in request], target=[d.categories for d in request], target_names=list(labels))


def fetch_topic_hierarchy(path, topics='all'):
    assert topics in ['all', 'leaves']

    download_file_if_not_exists(RCV1_TOPICHIER_URL, path)
    hierarchy = {}
    for line in open(path, 'rt'):
        parts = line.strip().split()
        parent, child = parts[1], parts[3]
        if parent not in hierarchy:
            hierarchy[parent] = []
        hierarchy[parent].append(child)

    del hierarchy['None']
    del hierarchy['Root']
    print(hierarchy)

    if topics == 'all':
        topics = set(hierarchy.keys())
        for parent in hierarchy.keys():
            topics.update(hierarchy[parent])
        return list(topics)
    elif topics == 'leaves':
        parents = set(hierarchy.keys())
        childs = set()
        for parent in hierarchy.keys():
            childs.update(hierarchy[parent])
        return list(childs.difference(parents))
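# The topic-hierarchy file consists of lines like (illustrative excerpt):
#   parent: CCAT child: C11 child-description: STRATEGY/PLANS
# so parts[1] is the parent code and parts[3] the child code, as parsed above.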


if __name__ == '__main__':

    # example

    RCV1_PATH = '../../datasets/RCV1-v2/unprocessed_corpus'

    rcv1_train = fetch_RCV1(RCV1_PATH, subset='train')
    rcv1_test = fetch_RCV1(RCV1_PATH, subset='test')

    print('read {} documents in rcv1-train, and {} labels'.format(len(rcv1_train.data), len(rcv1_train.target_names)))
    print('read {} documents in rcv1-test, and {} labels'.format(len(rcv1_test.data), len(rcv1_test.target_names)))

    cat_count = Counter()
    for doc_categories in rcv1_train.target:
        cat_count.update(doc_categories)
    print('RCV1', cat_count)
@@ -0,0 +1,189 @@
# Modified version of the code originally implemented by Eustache Diemert <eustache@diemert.fr>
# and @FedericoV <https://github.com/FedericoV/>
# License: BSD 3 clause

import os.path
import re
import tarfile
from sklearn.datasets import get_data_home
from six.moves import html_parser
from six.moves import urllib
import pickle
from glob import glob
import numpy as np
from data.labeled import LabelledDocuments


def _not_in_sphinx():
    # Hack to detect whether we are running by the sphinx builder
    return '__file__' in globals()


class ReutersParser(html_parser.HTMLParser):
    """Utility class to parse a SGML file and yield documents one at a time."""

    def __init__(self, encoding='latin-1', data_path=None):
        self.data_path = data_path
        self.download_if_not_exist()
        self.tr_docs = []
        self.te_docs = []
        html_parser.HTMLParser.__init__(self)
        self._reset()
        self.encoding = encoding
        self.empty_docs = 0

    def handle_starttag(self, tag, attrs):
        method = 'start_' + tag
        getattr(self, method, lambda x: None)(attrs)

    def handle_endtag(self, tag):
        method = 'end_' + tag
        getattr(self, method, lambda: None)()

    def _reset(self):
        self.in_title = 0
        self.in_body = 0
        self.in_topics = 0
        self.in_topic_d = 0
        self.in_unproc_text = 0
        self.title = ""
        self.body = ""
        self.topics = []
        self.topic_d = ""
        self.text = ""

    def parse(self, fd):
        for chunk in fd:
            self.feed(chunk.decode(self.encoding))
        self.close()

    def handle_data(self, data):
        if self.in_body:
            self.body += data
        elif self.in_title:
            self.title += data
        elif self.in_topic_d:
            self.topic_d += data
        elif self.in_unproc_text:
            self.text += data

    def start_reuters(self, attributes):
        topic_attr = attributes[0][1]
        lewissplit_attr = attributes[1][1]
        self.lewissplit = u'unused'
        if topic_attr == u'YES':
            if lewissplit_attr == u'TRAIN':
                self.lewissplit = u'train'
            elif lewissplit_attr == u'TEST':
                self.lewissplit = u'test'

    def end_reuters(self):
        self.body = re.sub(r'\s+', r' ', self.body)
        if self.lewissplit != u'unused':
            parsed_doc = {'title': self.title, 'body': self.body, 'unproc': self.text, 'topics': self.topics}
            if (self.title + self.body + self.text).strip() == '':
                self.empty_docs += 1
            if self.lewissplit == u'train':
                self.tr_docs.append(parsed_doc)
            elif self.lewissplit == u'test':
                self.te_docs.append(parsed_doc)
        self._reset()

    def start_title(self, attributes):
        self.in_title = 1

    def end_title(self):
        self.in_title = 0

    def start_body(self, attributes):
        self.in_body = 1

    def end_body(self):
        self.in_body = 0

    def start_topics(self, attributes):
        self.in_topics = 1

    def end_topics(self):
        self.in_topics = 0

    def start_text(self, attributes):
        if len(attributes) > 0 and attributes[0][1] == u'UNPROC':
            self.in_unproc_text = 1

    def end_text(self):
        self.in_unproc_text = 0

    def start_d(self, attributes):
        self.in_topic_d = 1

    def end_d(self):
        if self.in_topics:
            self.topics.append(self.topic_d)
        self.in_topic_d = 0
        self.topic_d = ""

    def download_if_not_exist(self):
        DOWNLOAD_URL = ('http://archive.ics.uci.edu/ml/machine-learning-databases/'
                        'reuters21578-mld/reuters21578.tar.gz')
        ARCHIVE_FILENAME = 'reuters21578.tar.gz'

        if self.data_path is None:
            self.data_path = os.path.join(get_data_home(), "reuters")
        if not os.path.exists(self.data_path):
            """Download the dataset."""
            print("downloading dataset (once and for all) into %s" % self.data_path)
            os.mkdir(self.data_path)

            def progress(blocknum, bs, size):
                total_sz_mb = '%.2f MB' % (size / 1e6)
                current_sz_mb = '%.2f MB' % ((blocknum * bs) / 1e6)
                if _not_in_sphinx():
                    print('\rdownloaded %s / %s' % (current_sz_mb, total_sz_mb), end='')

            archive_path = os.path.join(self.data_path, ARCHIVE_FILENAME)
            urllib.request.urlretrieve(DOWNLOAD_URL, filename=archive_path,
                                       reporthook=progress)
            if _not_in_sphinx():
                print('\r', end='')
            print("untarring Reuters dataset...")
            tarfile.open(archive_path, 'r:gz').extractall(self.data_path)
            print("done.")
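# Note: keeping only documents with TOPICS="YES" and LEWISSPLIT="TRAIN"/"TEST", as done in
# start_reuters above, corresponds to the standard ModApte split of Reuters-21578.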


def fetch_reuters21578(data_path=None, subset='train'):
    if data_path is None:
        data_path = os.path.join(get_data_home(), 'reuters21578')
    reuters_pickle_path = os.path.join(data_path, "reuters." + subset + ".pickle")
    if not os.path.exists(reuters_pickle_path):
        parser = ReutersParser(data_path=data_path)
        for filename in glob(os.path.join(data_path, "*.sgm")):
            parser.parse(open(filename, 'rb'))
        # index category names with a unique numerical code (only considering categories with training examples)
        tr_categories = np.unique(np.concatenate([doc['topics'] for doc in parser.tr_docs])).tolist()

        def pickle_documents(docs, subset):
            for doc in docs:
                doc['topics'] = [tr_categories.index(t) for t in doc['topics'] if t in tr_categories]
            pickle_docs = {'categories': tr_categories, 'documents': docs}
            pickle.dump(pickle_docs, open(os.path.join(data_path, "reuters." + subset + ".pickle"), 'wb'),
                        protocol=pickle.HIGHEST_PROTOCOL)
            return pickle_docs

        pickle_tr = pickle_documents(parser.tr_docs, "train")
        pickle_te = pickle_documents(parser.te_docs, "test")
        # self.sout('Empty docs %d' % parser.empty_docs)
        requested_subset = pickle_tr if subset == 'train' else pickle_te
    else:
        requested_subset = pickle.load(open(reuters_pickle_path, 'rb'))

    data = [(u'{title}\n{body}\n{unproc}'.format(**doc), doc['topics']) for doc in requested_subset['documents']]
    text_data, topics = zip(*data)
    return LabelledDocuments(data=text_data, target=topics, target_names=requested_subset['categories'])


if __name__ == '__main__':
    reuters_train = fetch_reuters21578(subset='train')
    print(reuters_train.data)
@@ -0,0 +1,280 @@
import math
import numpy as np
from scipy.stats import t
from scipy.stats import norm
from joblib import Parallel, delayed
import time
from scipy.sparse import csr_matrix, csc_matrix


STWFUNCTIONS = ['dotn', 'ppmi', 'ig', 'chi2', 'cw', 'wp']


def get_probs(tpr, fpr, pc):
    # tpr = p(t|c) = p(tp)/p(c) = p(tp)/(p(tp)+p(fn))
    # fpr = p(t|_c) = p(fp)/p(_c) = p(fp)/(p(fp)+p(tn))
    pnc = 1.0 - pc
    tp = tpr * pc
    fn = pc - tp
    fp = fpr * pnc
    tn = pnc - fp
    return ContTable(tp=tp, fn=fn, fp=fp, tn=tn)


def apply_tsr(tpr, fpr, pc, tsr):
    cell = get_probs(tpr, fpr, pc)
    return tsr(cell)


def positive_information_gain(cell):
    if cell.tpr() < cell.fpr():
        return 0.0
    else:
        return information_gain(cell)


def posneg_information_gain(cell):
    ig = information_gain(cell)
    if cell.tpr() < cell.fpr():
        return -ig
    else:
        return ig


def __ig_factor(p_tc, p_t, p_c):
    den = p_t * p_c
    if den != 0.0 and p_tc != 0:
        return p_tc * math.log(p_tc / den, 2)
    else:
        return 0.0


def information_gain(cell):
    return __ig_factor(cell.p_tp(), cell.p_f(), cell.p_c()) + \
           __ig_factor(cell.p_fp(), cell.p_f(), cell.p_not_c()) + \
           __ig_factor(cell.p_fn(), cell.p_not_f(), cell.p_c()) + \
           __ig_factor(cell.p_tn(), cell.p_not_f(), cell.p_not_c())
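# For reference, the quantity computed above is the standard information gain (mutual information):
#   IG(t, c) = sum over t' in {t, ~t} and c' in {c, ~c} of p(t', c') * log2( p(t', c') / (p(t') p(c')) )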


def information_gain_mod(cell):
    return (__ig_factor(cell.p_tp(), cell.p_f(), cell.p_c()) + __ig_factor(cell.p_tn(), cell.p_not_f(), cell.p_not_c())) \
        - (__ig_factor(cell.p_fp(), cell.p_f(), cell.p_not_c()) + __ig_factor(cell.p_fn(), cell.p_not_f(), cell.p_c()))


def pointwise_mutual_information(cell):
    return __ig_factor(cell.p_tp(), cell.p_f(), cell.p_c())


def gain_ratio(cell):
    pc = cell.p_c()
    pnc = 1.0 - pc
    norm = pc * math.log(pc, 2) + pnc * math.log(pnc, 2)
    return information_gain(cell) / (-norm)


def chi_square(cell):
    den = cell.p_f() * cell.p_not_f() * cell.p_c() * cell.p_not_c()
    if den == 0.0: return 0.0
    num = gss(cell)**2
    return num / den


def relevance_frequency(cell):
    a = cell.tp
    c = cell.fp
    if c == 0: c = 1
    return math.log(2.0 + (a * 1.0 / c), 2)


def idf(cell):
    if cell.p_f() > 0:
        return math.log(1.0 / cell.p_f())
    return 0.0


def gss(cell):
    return cell.p_tp()*cell.p_tn() - cell.p_fp()*cell.p_fn()


def conf_interval(xt, n):
    if n > 30:
        z2 = 3.84145882069  # norm.ppf(0.5+0.95/2.0)**2
    else:
        z2 = t.ppf(0.5 + 0.95 / 2.0, df=max(n-1, 1)) ** 2
    p = (xt + 0.5 * z2) / (n + z2)
    amplitude = 0.5 * z2 * math.sqrt((p * (1.0 - p)) / (n + z2))
    return p, amplitude


def strength(minPosRelFreq, minPos, maxNeg):
    if minPos > maxNeg:
        return math.log(2.0 * minPosRelFreq, 2.0)
    else:
        return 0.0


# set cancel_features=True to allow some features to be weighted as 0 (as in the original article);
# however, for some extremely imbalanced datasets this caused all documents to be 0
def conf_weight(cell, cancel_features=False):
    c = cell.get_c()
    not_c = cell.get_not_c()
    tp = cell.tp
    fp = cell.fp

    pos_p, pos_amp = conf_interval(tp, c)
    neg_p, neg_amp = conf_interval(fp, not_c)

    min_pos = pos_p - pos_amp
    max_neg = neg_p + neg_amp
    den = (min_pos + max_neg)
    minpos_relfreq = min_pos / (den if den != 0 else 1)

    str_tplus = strength(minpos_relfreq, min_pos, max_neg)

    if str_tplus == 0 and not cancel_features:
        return 1e-20

    return str_tplus


def word_prob(cell):
    return cell.tpr()


class ContTable:

    def __init__(self, tp=0, tn=0, fp=0, fn=0):
        self.tp = tp
        self.tn = tn
        self.fp = fp
        self.fn = fn

    def get_d(self): return self.tp + self.tn + self.fp + self.fn

    def get_c(self): return self.tp + self.fn

    def get_not_c(self): return self.tn + self.fp

    def get_f(self): return self.tp + self.fp

    def get_not_f(self): return self.tn + self.fn

    def p_c(self): return (1.0*self.get_c())/self.get_d()

    def p_not_c(self): return 1.0-self.p_c()

    def p_f(self): return (1.0*self.get_f())/self.get_d()

    def p_not_f(self): return 1.0-self.p_f()

    def p_tp(self): return (1.0*self.tp) / self.get_d()

    def p_tn(self): return (1.0*self.tn) / self.get_d()

    def p_fp(self): return (1.0*self.fp) / self.get_d()

    def p_fn(self): return (1.0*self.fn) / self.get_d()

    def tpr(self):
        c = 1.0*self.get_c()
        return self.tp / c if c > 0.0 else 0.0

    def fpr(self):
        _c = 1.0*self.get_not_c()
        return self.fp / _c if _c > 0.0 else 0.0


def round_robin_selection(X, Y, k, tsr_function=positive_information_gain):
    print(f'[selecting {k} terms]')
    nC = Y.shape[1]
    FC = get_tsr_matrix(get_supervised_matrix(X, Y), tsr_function).T
    best_features_idx = np.argsort(-FC, axis=0).flatten()
    tsr_values = FC.flatten()
    selected_indexes_set = set()
    selected_indexes = list()
    selected_value = list()
    from_category = list()
    round_robin = iter(best_features_idx)
    values_iter = iter(tsr_values)
    round = 0
    while len(selected_indexes) < k:
        term_idx = next(round_robin)
        term_val = next(values_iter)
        if term_idx not in selected_indexes_set:
            selected_indexes_set.add(term_idx)
            selected_indexes.append(term_idx)
            selected_value.append(term_val)
            from_category.append(round)
        round = (round + 1) % nC
    return np.asarray(selected_indexes, dtype=int), np.asarray(selected_value, dtype=float), np.asarray(from_category)
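# Usage sketch (illustrative): X is a (nD x nF) sparse document-term matrix and Y a (nD x nC)
# binary label matrix; the call below keeps the k terms with the highest per-category TSR scores,
# visiting categories in round-robin fashion:
#   selected_idx, scores, categories = round_robin_selection(X, Y, k=1000)
#   X_reduced = X[:, selected_idx]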


def feature_label_contingency_table(positive_document_indexes, feature_document_indexes, nD):
    tp_ = len(positive_document_indexes & feature_document_indexes)
    fp_ = len(feature_document_indexes - positive_document_indexes)
    fn_ = len(positive_document_indexes - feature_document_indexes)
    tn_ = nD - (tp_ + fp_ + fn_)
    return ContTable(tp=tp_, tn=tn_, fp=fp_, fn=fn_)


def category_tables(feature_sets, category_sets, c, nD, nF):
    return [feature_label_contingency_table(category_sets[c], feature_sets[f], nD) for f in range(nF)]


def get_supervised_matrix(coocurrence_matrix, label_matrix, n_jobs=-1):
    """
    Computes the nC x nF supervised matrix M, where Mcf is the 4-cell contingency table for feature f and class c.
    Efficiency O(nF x nC x log(S)) where S is the sparse factor.
    """
    nD, nF = coocurrence_matrix.shape
    nD2, nC = label_matrix.shape

    if nD != nD2:
        raise ValueError('Number of rows in coocurrence matrix shape %s and label matrix shape %s is not consistent' %
                         (coocurrence_matrix.shape, label_matrix.shape))

    def nonzero_set(matrix, col):
        return set(matrix[:, col].nonzero()[0])

    if isinstance(coocurrence_matrix, csr_matrix):
        coocurrence_matrix = csc_matrix(coocurrence_matrix)
    feature_sets = [nonzero_set(coocurrence_matrix, f) for f in range(nF)]
    category_sets = [nonzero_set(label_matrix, c) for c in range(nC)]
    cell_matrix = Parallel(n_jobs=n_jobs, backend="threading")(delayed(category_tables)(feature_sets, category_sets, c, nD, nF) for c in range(nC))
    return np.array(cell_matrix)


# obtains the matrix T where Tcf=tsr(f,c) is the tsr score for category c and feature f
def get_tsr_matrix(cell_matrix, tsr_score_function):
    nC, nF = cell_matrix.shape
    tsr_matrix = [[tsr_score_function(cell_matrix[c, f]) for f in range(nF)] for c in range(nC)]
    return np.array(tsr_matrix)


def fisher_score_binary(feat, c):
    """
    The Fisher score [1] is not computed on the 4-cell contingency table, but can take as input
    any real-valued feature column (e.g., tf-idf weights). feat is the feature vector, and c is a
    binary classification vector. This implementation covers only the binary case, while the
    formula is defined for multiclass single-label scenarios, for which the version of [2] might
    be preferred.
    [1] R.O. Duda, P.E. Hart, and D.G. Stork. Pattern Classification. Wiley-Interscience, 2012.
    [2] Gu, Q., Li, Z., & Han, J. (2012). Generalized Fisher score for feature selection. arXiv preprint arXiv:1202.3725.
    """
    neg = np.ones_like(c) - c

    npos = np.sum(c)
    nneg = np.sum(neg)

    mupos = np.mean(feat[c == 1])
    muneg = np.mean(feat[neg == 1])
    mu = np.mean(feat)

    stdpos = np.std(feat[c == 1])
    stdneg = np.std(feat[neg == 1])

    num = npos * ((mupos - mu) ** 2) + nneg * ((muneg - mu) ** 2)
    den = npos * (stdpos ** 2) + nneg * (stdneg ** 2)

    if den > 0:
        return num / den
    else:
        return num
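# For reference, the Fisher score computed above is
#   F = (n+ (mu+ - mu)^2 + n- (mu- - mu)^2) / (n+ sigma+^2 + n- sigma-^2)
# where mu+/sigma+ (resp. mu-/sigma-) are the mean/std of the feature over the positive
# (resp. negative) examples, and mu is the overall mean.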
@@ -0,0 +1,212 @@
# https://www.wipo.int/classifications/ipc/en/ITsupport/Categorization/dataset/
import os, sys
from os.path import exists, join
from util.file import *
from zipfile import ZipFile
import xml.etree.ElementTree as ET
from tqdm import tqdm
import numpy as np
import pickle
from joblib import Parallel, delayed

WIPO_URL = 'https://www.wipo.int/classifications/ipc/en/ITsupport/Categorization/dataset/'


class WipoGammaDocument:
    def __init__(self, id, text, main_label, all_labels):
        self.id = id
        self.text = text
        self.main_label = main_label
        self.all_labels = all_labels


def remove_nested_claimtext_tags(xmlcontent):
    from_pos = xmlcontent.find(b'<claims')
    to_pos = xmlcontent.find(b'</claims>')
    if from_pos > -1 and to_pos > -1:
        in_between = xmlcontent[from_pos:to_pos].replace(b'<claim-text>', b'').replace(b'</claim-text>', b'')
        xmlcontent = (xmlcontent[:from_pos] + in_between + xmlcontent[to_pos:]).strip()
    return xmlcontent


def parse_document(xml_content, text_fields, limit_description):
    root = ET.fromstring(remove_nested_claimtext_tags(xml_content))

    doc_id = root.attrib['ucid']
    lang = root.attrib['lang']

    # take categories from the categorization up to the "sub-class" level
    main_group = set(t.text[:6] for t in root.findall('.//bibliographic-data/technical-data/classifications-ipcr/classification-ipcr[@computed="from_ecla_to_ipc_SG"][@generated_main_IPC="true"]'))
    sec_groups = set(t.text[:6] for t in root.findall('.//bibliographic-data/technical-data/classifications-ipcr/classification-ipcr[@computed="from_ecla_to_ipc_SG"][@generated_main_IPC="false"]'))
    sec_groups.update(main_group)

    assert len(main_group) == 1, 'more than one main group'
    main_group = list(main_group)[0]
    sec_groups = sorted(list(sec_groups))

    assert lang == 'EN', f'only English documents allowed (doc {doc_id})'

    doc_text_fields = []
    if 'abstract' in text_fields:
        abstract = '\n'.join(filter(None, [t.text for t in root.findall('.//abstract[@lang="EN"]/p')]))
        doc_text_fields.append(abstract)
    if 'description' in text_fields:
        description = '\n'.join(filter(None, [t.text for t in root.findall('.//description[@lang="EN"]/p')]))
        if limit_description > -1:
            description = ' '.join(description.split()[:limit_description])
        doc_text_fields.append(description)
    if 'claims' in text_fields:
        claims = '\n'.join(filter(None, [t.text for t in root.findall('.//claims[@lang="EN"]/claim')]))
        doc_text_fields.append(claims)

    text = '\n'.join(doc_text_fields)
    if text:
        return WipoGammaDocument(doc_id, text, main_group, sec_groups)
    else:
        return None


def extract(fin, fout, text_fields, limit_description):
    zipfile = ZipFile(fin)
    ndocs = 0
    with open(fout, 'wt') as out:
        for xmlfile in tqdm(zipfile.namelist()):
            if xmlfile.endswith('.xml'):
                xmlcontent = zipfile.open(xmlfile).read()
                document = parse_document(xmlcontent, text_fields, limit_description)
                if document:
                    line_text = document.text.replace('\n', ' ').replace('\t', ' ').strip()
                    assert line_text, f'empty document in {xmlfile}'
                    all_labels = ' '.join(document.all_labels)
                    out.write('\t'.join([document.id, document.main_label, all_labels, line_text]))
                    out.write('\n')
                    ndocs += 1
        out.flush()


def read_classification_file(data_path, classification_level):
    assert classification_level in ['subclass', 'maingroup'], 'wrong classification requested'
    z = ZipFile(join(data_path, 'EnglishWipoGamma1.zip'))
    inpath = 'Wipo_Gamma/English/TrainTestSpits'
    document_labels = dict()
    train_ids, test_ids = set(), set()
    labelcut = LabelCut(classification_level)
    for subset in tqdm(['train', 'test'], desc='loading classification file'):
        target_subset = train_ids if subset == 'train' else test_ids
        if classification_level == 'subclass':
            file = f'{subset}set_en_sc.parts'  # sub-class level
        else:
            file = f'{subset}set_en_mg.parts'  # main-group level

        for line in z.open(f'{inpath}/{file}').readlines():
            line = line.decode().strip().split(',')
            id = line[0]
            id = id[id.rfind('/')+1:].replace('.xml', '')
            labels = labelcut.trim(line[1:])
            document_labels[id] = labels
            target_subset.add(id)

    return document_labels, train_ids, test_ids


class LabelCut:
    """
    Labels consist of 1 char for the section, 2 chars for the class, 1 char for the subclass,
    2 chars for the main group, and so on. This class cuts the label at the desired level
    (4 chars for subclass, or 6 chars for maingroup).
    """
    def __init__(self, classification_level):
        assert classification_level in {'subclass', 'maingroup'}, 'unknown classification level'
        if classification_level == 'subclass':
            self.cut = 4
        else:
            self.cut = 6

    def trim(self, label):
        if isinstance(label, list):
            return sorted(set([l[:self.cut] for l in label]))
        else:
            return label[:self.cut]
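# Illustrative example: for an IPC-like code 'A61K39' (section 'A', class '61', subclass 'K',
# main group '39'):
#   LabelCut('subclass').trim('A61K39')  -> 'A61K'
#   LabelCut('maingroup').trim('A61K39') -> 'A61K39'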
|
||||
|
||||
|
||||
def fetch_WIPOgamma(subset, classification_level, data_home, extracted_path, text_fields = ['abstract', 'description'], limit_description=300):
|
||||
"""
|
||||
Fetchs the WIPO-gamma dataset
|
||||
:param subset: 'train' or 'test' split
|
||||
:param classification_level: the classification level, either 'subclass' or 'maingroup'
|
||||
:param data_home: directory containing the original 11 English zips
|
||||
:param extracted_path: directory used to extract and process the original files
|
||||
:param text_fields: indicates the fields to extract, in 'abstract', 'description', 'claims'
|
||||
:param limit_description: the maximum number of words to take from the description field (default 300); set to -1 for all
|
||||
:return:
|
||||
"""
|
||||
assert subset in {"train", "test"}, 'unknown target request (valid ones are "train" or "test")'
|
||||
assert len(text_fields)>0, 'at least some text field should be indicated'
|
||||
if not exists(data_home):
|
||||
raise ValueError(f'{data_home} does not exist, and the dataset cannot be automatically download, '
|
||||
f'since you need to request for permission. Please refer to {WIPO_URL}')
|
||||
|
||||
create_if_not_exist(extracted_path)
|
||||
config = f'{"-".join(text_fields)}'
|
||||
if 'description' in text_fields: config+='-{limit_description}'
|
||||
pickle_path=join(extracted_path, f'wipo-{subset}-{classification_level}-{config}.pickle')
|
||||
if exists(pickle_path):
|
||||
print(f'loading pickled file in {pickle_path}')
|
||||
return pickle.load(open(pickle_path,'rb'))
|
||||
|
||||
print('pickle file not found, processing...(this will take some minutes)')
|
||||
extracted = sum([exists(f'{extracted_path}/EnglishWipoGamma{(i+1)}-{config}.txt') for i in range(11)])==11
|
||||
if not extracted:
|
||||
print(f'extraction files not found, extracting files in {data_home}... (this will take some additional minutes)')
|
||||
Parallel(n_jobs=-1)(
|
||||
delayed(extract)(
|
||||
join(data_home, file), join(extracted_path, file.replace('.zip', f'-{config}.txt')), text_fields, limit_description
|
||||
)
|
||||
for file in list_files(data_home)
|
||||
)
|
||||
doc_labels, train_ids, test_ids = read_classification_file(data_home, classification_level=classification_level) # or maingroup
|
||||
print(f'{len(doc_labels)} documents classified split in {len(train_ids)} train and {len(test_ids)} test documents')
|
||||
|
||||
train_request = []
|
||||
test_request = []
|
||||
pbar = tqdm([filename for filename in list_files(extracted_path) if filename.endswith(f'-{config}.txt')])
|
||||
labelcut = LabelCut(classification_level)
|
||||
errors=0
|
||||
for proc_file in pbar:
|
||||
pbar.set_description(f'processing {proc_file} [errors={errors}]')
|
||||
if not proc_file.endswith(f'-{config}.txt'): continue
|
||||
lines = open(f'{extracted_path}/{proc_file}', 'rt').readlines()
|
||||
for lineno,line in enumerate(lines):
|
||||
parts = line.split('\t')
|
||||
assert len(parts)==4, f'wrong format in {extracted_path}/{proc_file} line {lineno}'
|
||||
id,mainlabel,alllabels,text=parts
|
||||
mainlabel = labelcut.trim(mainlabel)
|
||||
alllabels = labelcut.trim(alllabels.split())
|
||||
|
||||
# assert id in train_ids or id in test_ids, f'id {id} out of scope'
|
||||
if id not in train_ids and id not in test_ids:
|
||||
errors+=1
|
||||
else:
|
||||
# assert mainlabel == doc_labels[id][0], 'main label not consistent'
|
||||
request = train_request if id in train_ids else test_request
|
||||
request.append(WipoGammaDocument(id, text, mainlabel, alllabels))
|
||||
|
||||
print('pickling requests for faster subsequent runs')
|
||||
pickle.dump(train_request, open(join(extracted_path,f'wipo-train-{classification_level}-{config}.pickle'), 'wb', pickle.HIGHEST_PROTOCOL))
|
||||
pickle.dump(test_request, open(join(extracted_path, f'wipo-test-{classification_level}-{config}.pickle'), 'wb', pickle.HIGHEST_PROTOCOL))
|
||||
|
||||
if subset== 'train':
|
||||
return train_request
|
||||
else:
|
||||
return test_request
|
||||
|
||||
|
||||
if __name__=='__main__':
|
||||
data_home = '../../datasets/WIPO/wipo-gamma/en'
|
||||
extracted_path = '../../datasets/WIPO-extracted'
|
||||
|
||||
train = fetch_WIPOgamma(subset='train', classification_level='subclass', data_home=data_home, extracted_path=extracted_path, text_fields=('abstract'))
|
||||
test = fetch_WIPOgamma(subset='test', classification_level='subclass', data_home=data_home, extracted_path=extracted_path, text_fields=('abstract'))
|
||||
# train = fetch_WIPOgamma(subset='train', classification_level='maingroup', data_home=data_home, extracted_path=extracted_path)
|
||||
# test = fetch_WIPOgamma(subset='test', classification_level='maingroup', data_home=data_home, extracted_path=extracted_path)
|
||||
|
||||
print('Done')
|
|
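# A minimal sketch of how LabelCut trims classification codes (illustrative
# only; the 'A61K0031' code format is an assumption about how WIPO-gamma
# serializes IPC labels):
#   lc = LabelCut('subclass')               # cut = 4
#   lc.trim('A61K0031')                     # -> 'A61K'
#   lc.trim(['A61K0031', 'A61K0047'])       # -> ['A61K'] (trimmed, deduplicated, sorted)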
@ -0,0 +1,118 @@
import argparse
from sklearn.calibration import CalibratedClassifierCV
from sklearn.linear_model import LogisticRegression
import itertools
from sklearn.multioutput import ClassifierChain
from tqdm import tqdm
from skmultilearn.dataset import load_dataset, available_data_sets
from scipy.sparse import csr_matrix
import quapy as qp
from MultiLabel.main import load_results, SKMULTILEARN_RED_DATASETS, TC_DATASETS, sample_size
from MultiLabel.mlclassification import MLStackedClassifier
from MultiLabel.mldata import MultilabelledCollection
from MultiLabel.mlquantification import MLNaiveQuantifier, MLCC, MLPCC, MLRegressionQuantification, \
    MLACC, \
    MLPACC, MLNaiveAggregativeQuantifier
from MultiLabel.tabular import Table
from method.aggregative import PACC, CC, EMQ, PCC, ACC, HDy
import numpy as np
from data.dataset import Dataset
from mlevaluation import ml_natural_prevalence_prediction, ml_artificial_prevalence_prediction, check_error_str
import sys
import os
import pickle


models = [  # 'MLPE',
    'NaiveCC', 'NaivePCC', 'NaivePCCcal', 'NaiveACC', 'NaivePACC', 'NaivePACCcal', 'NaiveACCit', 'NaivePACCit',
    # 'NaiveHDy', 'NaiveSLD',
    'ChainCC', 'ChainPCC', 'ChainACC', 'ChainPACC',
    'StackCC', 'StackPCC', 'StackPCCcal', 'StackACC', 'StackPACC', 'StackPACCcal', 'StackACCit', 'StackPACCit',
    'MRQ-CC', 'MRQ-PCC', 'MRQ-ACC', 'MRQ-PACC', 'MRQ-ACCit', 'MRQ-PACCit',
    'StackMRQ-CC', 'StackMRQ-PCC', 'StackMRQ-ACC', 'StackMRQ-PACC',
    'MRQ-StackCC', 'MRQ-StackPCC', 'MRQ-StackACC', 'MRQ-StackPACC',
    'StackMRQ-StackCC', 'StackMRQ-StackPCC', 'StackMRQ-StackACC', 'StackMRQ-StackPACC',
    'MRQ-StackCC-app', 'MRQ-StackPCC-app', 'MRQ-StackACC-app', 'MRQ-StackPACC-app',
    'StackMRQ-StackCC-app', 'StackMRQ-StackPCC-app', 'StackMRQ-StackACC-app', 'StackMRQ-StackPACC-app',
    'LSP-CC', 'LSP-ACC', 'MLKNN-CC', 'MLKNN-ACC',
    'MLAdjustedC', 'MLStackAdjustedC', 'MLprobAdjustedC', 'MLStackProbAdjustedC'
]

# datasets = sorted(set([x[0] for x in available_data_sets().keys()]))
datasets = TC_DATASETS


def generate_table(path, protocol, error):

    def compute_score_job(args):
        dataset, model = args
        result_path = f'{opt.results}/{dataset}_{model}.pkl'
        if os.path.exists(result_path):
            print('+', end='')
            sys.stdout.flush()
            result = load_results(result_path)
            true_prevs, estim_prevs = result[protocol]
            scores = np.asarray([error(trues, estims) for trues, estims in zip(true_prevs, estim_prevs)]).flatten()
            return dataset, model, scores
        print('-', end='')
        sys.stdout.flush()
        return None

    print(f'\ngenerating {path}')
    table = Table(datasets, models, prec_mean=4, significance_test='wilcoxon')
    results = qp.util.parallel(compute_score_job, list(itertools.product(datasets, models)), n_jobs=-1)
    print()

    for r in results:
        if r is not None:
            dataset, model, scores = r
            table.add(dataset, model, scores)

    save_table(table, path)
    save_table(table.getRankTable(), path.replace('.tex', '.rank.tex'))


def save_table(table, path):
    tabular = """
    \\resizebox{\\textwidth}{!}{%
    \\begin{tabular}{|c||""" + ('c|' * len(models)) + """} \\hline
    """
    dataset_replace = {'tmc2007_500': 'tmc2007\\_500', 'tmc2007_500-red': 'tmc2007\\_500-red'}
    method_replace = {}

    tabular += table.latexTabularT(benchmark_replace=dataset_replace, method_replace=method_replace, side=True)
    tabular += """
    \\end{tabular}%
    }
    """
    with open(path, 'wt') as foo:
        foo.write(tabular)


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Experiments for multi-label quantification')
    parser.add_argument('--results', type=str, default='./results', metavar='str',
                        help='path where the results are stored')
    parser.add_argument('--tablepath', type=str, default='./tables', metavar='str',
                        help='path where to store the tables')
    opt = parser.parse_args()

    assert os.path.exists(opt.results), f'result directory {opt.results} does not exist'
    os.makedirs(opt.tablepath, exist_ok=True)

    qp.environ["SAMPLE_SIZE"] = sample_size
    absolute_error = qp.error.ae
    relative_absolute_error = qp.error.rae

    generate_table(f'{opt.tablepath}/npp.ae.tex', protocol='npp', error=absolute_error)
    generate_table(f'{opt.tablepath}/app.ae.tex', protocol='app', error=absolute_error)
    generate_table(f'{opt.tablepath}/npp.rae.tex', protocol='npp', error=relative_absolute_error)
    generate_table(f'{opt.tablepath}/app.rae.tex', protocol='app', error=relative_absolute_error)
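# Hedged usage note: assuming this script is saved as, e.g., gen_tables.py (its
# file name is not shown in this diff), the four LaTeX tables are produced with:
#   python gen_tables.py --results ./results --tablepath ./tables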
@ -0,0 +1,290 @@
import argparse
from sklearn.calibration import CalibratedClassifierCV
from sklearn.linear_model import LogisticRegression
import itertools

from sklearn.multiclass import OneVsRestClassifier
from sklearn.multioutput import ClassifierChain
from tqdm import tqdm
from skmultilearn.dataset import load_dataset, available_data_sets
from scipy.sparse import csr_matrix
import quapy as qp
from MultiLabel.mlclassification import MLStackedClassifier, LabelSpacePartion, MLTwinSVM, MLknn
from MultiLabel.mldata import MultilabelledCollection
from MultiLabel.mlquantification import MLNaiveQuantifier, MLCC, MLPCC, MLRegressionQuantification, \
    MLACC, \
    MLPACC, MLNaiveAggregativeQuantifier, MLMLPE, StackMLRQuantifier, MLadjustedCount, MLprobAdjustedCount
from method.aggregative import PACC, CC, EMQ, PCC, ACC, HDy
import numpy as np
from data.dataset import Dataset
from mlevaluation import ml_natural_prevalence_prediction, ml_artificial_prevalence_prediction
import sys
import os
import pickle


def cls():
    # return LinearSVC()
    return LogisticRegression(max_iter=1000, solver='lbfgs')


def calibratedCls():
    return CalibratedClassifierCV(cls())


# DEBUG=True

# if DEBUG:
sample_size = 100
n_samples = 5000

SKMULTILEARN_ALL_DATASETS = sorted(set([x[0] for x in available_data_sets().keys()]))
SKMULTILEARN_RED_DATASETS = [x + '-red' for x in SKMULTILEARN_ALL_DATASETS]
TC_DATASETS = ['reuters21578', 'jrcall', 'ohsumed', 'rcv1']

DATASETS = TC_DATASETS


def models():
    yield 'MLPE', MLMLPE()
    yield 'NaiveCC', MLNaiveAggregativeQuantifier(CC(cls()))
    yield 'NaivePCC', MLNaiveAggregativeQuantifier(PCC(cls()))
    yield 'NaivePCCcal', MLNaiveAggregativeQuantifier(PCC(calibratedCls()))
    yield 'NaiveACC', MLNaiveAggregativeQuantifier(ACC(cls()))
    yield 'NaivePACC', MLNaiveAggregativeQuantifier(PACC(cls()))
    yield 'NaivePACCcal', MLNaiveAggregativeQuantifier(PACC(calibratedCls()))
    yield 'NaiveACCit', MLNaiveAggregativeQuantifier(ACC(cls()))
    yield 'NaivePACCit', MLNaiveAggregativeQuantifier(PACC(cls()))
    # yield 'NaiveHDy', MLNaiveAggregativeQuantifier(HDy(cls()))
    # yield 'NaiveSLD', MLNaiveAggregativeQuantifier(EMQ(calibratedCls()))
    yield 'StackCC', MLCC(MLStackedClassifier(cls()))
    yield 'StackPCC', MLPCC(MLStackedClassifier(cls()))
    yield 'StackPCCcal', MLPCC(MLStackedClassifier(calibratedCls()))
    yield 'StackACC', MLACC(MLStackedClassifier(cls()))
    yield 'StackPACC', MLPACC(MLStackedClassifier(cls()))
    yield 'StackPACCcal', MLPACC(MLStackedClassifier(calibratedCls()))
    yield 'StackACCit', MLACC(MLStackedClassifier(cls()))
    yield 'StackPACCit', MLPACC(MLStackedClassifier(cls()))
    # yield 'ChainCC', MLCC(ClassifierChain(cls(), cv=None))
    # yield 'ChainPCC', MLPCC(ClassifierChain(cls(), cv=None))
    # yield 'ChainACC', MLACC(ClassifierChain(cls(), cv=None))
    # yield 'ChainPACC', MLPACC(ClassifierChain(cls(), cv=None))
    common = {'sample_size': sample_size, 'n_samples': n_samples, 'norm': True, 'means': False, 'stds': False, 'regression': 'svr'}
    yield 'MRQ-CC', MLRegressionQuantification(MLNaiveQuantifier(CC(cls())), **common)
    yield 'MRQ-PCC', MLRegressionQuantification(MLNaiveQuantifier(PCC(cls())), **common)
    yield 'MRQ-ACC', MLRegressionQuantification(MLNaiveQuantifier(ACC(cls())), **common)
    yield 'MRQ-PACC', MLRegressionQuantification(MLNaiveQuantifier(PACC(cls())), **common)
    yield 'MRQ-ACCit', MLRegressionQuantification(MLNaiveQuantifier(ACC(cls())), **common)
    yield 'MRQ-PACCit', MLRegressionQuantification(MLNaiveQuantifier(PACC(cls())), **common)
    yield 'MRQ-StackCC', MLRegressionQuantification(MLCC(MLStackedClassifier(cls())), **common)
    yield 'MRQ-StackPCC', MLRegressionQuantification(MLPCC(MLStackedClassifier(cls())), **common)
    yield 'MRQ-StackACC', MLRegressionQuantification(MLACC(MLStackedClassifier(cls())), **common)
    yield 'MRQ-StackPACC', MLRegressionQuantification(MLPACC(MLStackedClassifier(cls())), **common)
    yield 'MRQ-StackCC-app', MLRegressionQuantification(MLCC(MLStackedClassifier(cls())), protocol='app', **common)
    yield 'MRQ-StackPCC-app', MLRegressionQuantification(MLPCC(MLStackedClassifier(cls())), protocol='app', **common)
    yield 'MRQ-StackACC-app', MLRegressionQuantification(MLACC(MLStackedClassifier(cls())), protocol='app', **common)
    yield 'MRQ-StackPACC-app', MLRegressionQuantification(MLPACC(MLStackedClassifier(cls())), protocol='app', **common)
    yield 'StackMRQ-CC', StackMLRQuantifier(MLNaiveQuantifier(CC(cls())), **common)
    yield 'StackMRQ-PCC', StackMLRQuantifier(MLNaiveQuantifier(PCC(cls())), **common)
    yield 'StackMRQ-ACC', StackMLRQuantifier(MLNaiveQuantifier(ACC(cls())), **common)
    yield 'StackMRQ-PACC', StackMLRQuantifier(MLNaiveQuantifier(PACC(cls())), **common)
    yield 'StackMRQ-StackCC', StackMLRQuantifier(MLCC(MLStackedClassifier(cls())), **common)
    yield 'StackMRQ-StackPCC', StackMLRQuantifier(MLPCC(MLStackedClassifier(cls())), **common)
    yield 'StackMRQ-StackACC', StackMLRQuantifier(MLACC(MLStackedClassifier(cls())), **common)
    yield 'StackMRQ-StackPACC', StackMLRQuantifier(MLPACC(MLStackedClassifier(cls())), **common)
    yield 'StackMRQ-StackCC-app', StackMLRQuantifier(MLCC(MLStackedClassifier(cls())), protocol='app', **common)
    yield 'StackMRQ-StackPCC-app', StackMLRQuantifier(MLPCC(MLStackedClassifier(cls())), protocol='app', **common)
    yield 'StackMRQ-StackACC-app', StackMLRQuantifier(MLACC(MLStackedClassifier(cls())), protocol='app', **common)
    yield 'StackMRQ-StackPACC-app', StackMLRQuantifier(MLPACC(MLStackedClassifier(cls())), protocol='app', **common)
    yield 'MLAdjustedC', MLadjustedCount(OneVsRestClassifier(cls()))
    yield 'MLStackAdjustedC', MLadjustedCount(MLStackedClassifier(cls()))
    # yield 'MLprobAdjustedC', MLprobAdjustedCount(OneVsRestClassifier(calibratedCls()))
    # yield 'MLStackProbAdjustedC', MLprobAdjustedCount(MLStackedClassifier(calibratedCls()))

    # yield 'MRQ-ChainCC', MLRegressionQuantification(MLCC(ClassifierChain(cls())), **common)
    # yield 'MRQ-ChainPCC', MLRegressionQuantification(MLPCC(ClassifierChain(cls())), **common)
    # yield 'MRQ-ChainACC', MLRegressionQuantification(MLACC(ClassifierChain(cls())), **common)
    # yield 'MRQ-ChainPACC', MLRegressionQuantification(MLPACC(ClassifierChain(cls())), **common)
    # yield 'LSP-CC', MLCC(LabelSpacePartion(cls()))
    # yield 'LSP-ACC', MLACC(LabelSpacePartion(cls()))
    # yield 'TwinSVM-CC', MLCC(MLTwinSVM())
    # yield 'TwinSVM-ACC', MLACC(MLTwinSVM())
    # yield 'MLKNN-CC', MLCC(MLknn())
    # yield 'MLKNN-PCC', MLPCC(MLknn())
    # yield 'MLKNN-ACC', MLACC(MLknn())
    # yield 'MLKNN-PACC', MLPACC(MLknn())


def get_dataset(dataset_name, dopickle=True):
    datadir = f'{qp.util.get_quapy_home()}/pickles'
    datapath = f'{datadir}/{dataset_name}.pkl'
    if dopickle:
        if os.path.exists(datapath):
            print(f'returning pickled object in {datapath}')
            return pickle.load(open(datapath, 'rb'))

    if dataset_name in SKMULTILEARN_ALL_DATASETS + SKMULTILEARN_RED_DATASETS:
        clean_name = dataset_name.replace('-red', '')
        Xtr, ytr, feature_names, label_names = load_dataset(clean_name, 'train')
        Xte, yte, _, _ = load_dataset(clean_name, 'test')
        print(f'n-labels = {len(label_names)}')

        Xtr = csr_matrix(Xtr)
        Xte = csr_matrix(Xte)

        ytr = ytr.todense().getA()
        yte = yte.todense().getA()

        if dataset_name.endswith('-red'):
            TO_SELECT = 10
            nC = ytr.shape[1]
            tr_counts = ytr.sum(axis=0)
            te_counts = yte.sum(axis=0)
            if nC > TO_SELECT:
                Y = ytr.T.dot(ytr)  # class-class co-occurrence matrix
                Y[np.triu_indices(nC)] = 0  # zeroing all duplicate entries and the diagonal
                order_ij = np.argsort(-Y, axis=None)
                selected = set()
                p = 0
                while len(selected) < TO_SELECT:
                    highest_index = order_ij[p]
                    class_i = highest_index // nC
                    class_j = highest_index % nC
                    # if there is only one class to go, then add the most populated one
                    most_populated, least_populated = (class_i, class_j) if tr_counts[class_i] > tr_counts[class_j] else (class_j, class_i)
                    if te_counts[most_populated] > 0:
                        selected.add(most_populated)
                    if len(selected) < TO_SELECT:
                        if te_counts[least_populated] > 0:
                            selected.add(least_populated)
                    p += 1
                selected = np.asarray(sorted(selected))
                ytr = ytr[:, selected]
                yte = yte[:, selected]
            # else:
            # remove categories without positives in the training or test splits
            # valid_categories = np.logical_and(ytr.sum(axis=0)>5, yte.sum(axis=0)>5)
            # ytr = ytr[:, valid_categories]
            # yte = yte[:, valid_categories]

    elif dataset_name in TC_DATASETS:
        picklepath = '/home/moreo/word-class-embeddings/pickles'
        data = Dataset.load(dataset_name, pickle_path=f'{picklepath}/{dataset_name}.pickle')
        Xtr, Xte = data.vectorize()
        ytr = data.devel_labelmatrix.todense().getA()
        yte = data.test_labelmatrix.todense().getA()

        # remove categories with < 50 training or test documents
        # to_keep = np.logical_and(ytr.sum(axis=0)>=50, yte.sum(axis=0)>=50)
        # keep the 10 most populated categories
        to_keep = np.argsort(ytr.sum(axis=0))[-10:]
        ytr = ytr[:, to_keep]
        yte = yte[:, to_keep]
        print(f'num categories = {ytr.shape[1]}')

    else:
        raise ValueError(f'unknown dataset {dataset_name}')

    train = MultilabelledCollection(Xtr, ytr)
    test = MultilabelledCollection(Xte, yte)

    if dopickle:
        os.makedirs(datadir, exist_ok=True)
        pickle.dump((train, test), open(datapath, 'wb'), pickle.HIGHEST_PROTOCOL)

    return train, test


def already_run(result_path):
    if os.path.exists(result_path):
        print(f'{result_path} already computed. Skipping')
        return True
    return False


def print_info(train, test):
    # print((np.abs(np.corrcoef(ytr, rowvar=False))>0.1).sum())
    # sys.exit(0)

    print(f'Tr documents {len(train)}')
    print(f'Te documents {len(test)}')
    print(f'#features {train.instances.shape[1]}')
    print(f'#classes {train.labels.shape[1]}')

    # print(f'Train-prev: {train.prevalence()[:,1]}')
    print(f'Train-counts: {train.counts()}')
    # print(f'Test-prev: {test.prevalence()[:,1]}')
    print(f'Test-counts: {test.counts()}')
    print(f'MLPE: {qp.error.mae(train.prevalence(), test.prevalence()):.5f}')


def save_results(npp_results, app_results, result_path):
    # results are lists of tuples of (true_prevs, estim_prevs)
    # each true_prevs is an ndarray of ndim=2, but the second dimension is constrained
    def _prepare_result_lot(lot_results):
        true_prevs, estim_prevs = lot_results
        return {
            'true_prevs': [true_i[:, 0].flatten() for true_i in true_prevs],  # removes the constrained prevalence
            'estim_prevs': [estim_i[:, 0].flatten() for estim_i in estim_prevs]  # removes the constrained prevalence
        }
    results = {
        'npp': _prepare_result_lot(npp_results),
        'app': _prepare_result_lot(app_results),
    }
    pickle.dump(results, open(result_path, 'wb'), pickle.HIGHEST_PROTOCOL)


def load_results(result_path):
    def _unpack_result_lot(lot_result):
        true_prevs = lot_result['true_prevs']
        true_prevs = [np.vstack([true_i, 1 - true_i]).T for true_i in true_prevs]  # add the constrained prevalence
        estim_prevs = lot_result['estim_prevs']
        estim_prevs = [np.vstack([estim_i, 1 - estim_i]).T for estim_i in estim_prevs]  # add the constrained prevalence
        return true_prevs, estim_prevs
    results = pickle.load(open(result_path, 'rb'))
    results = {
        'npp': _unpack_result_lot(results['npp']),
        'app': _unpack_result_lot(results['app']),
    }
    return results
    # results_npp = _unpack_result_lot(results['npp'])
    # results_app = _unpack_result_lot(results['app'])
    # return results_npp, results_app


def run_experiment(dataset_name, model_name, model):
    result_path = f'{opt.results}/{dataset_name}_{model_name}.pkl'
    if already_run(result_path):
        return

    print(f'running experiment {dataset_name} x {model_name}')
    train, test = get_dataset(dataset_name)
    # if train.n_classes>100:
    #     return

    print_info(train, test)

    model.fit(train)

    results_npp = ml_natural_prevalence_prediction(model, test, sample_size, repeats=100)
    results_app = ml_artificial_prevalence_prediction(model, test, sample_size, n_prevalences=11, repeats=5)
    save_results(results_npp, results_app, result_path)


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Experiments for multi-label quantification')
    parser.add_argument('--results', type=str, default='./results', metavar='str',
                        help='path where to store the results')
    opt = parser.parse_args()

    os.makedirs(opt.results, exist_ok=True)

    for datasetname, (modelname, model) in itertools.product(DATASETS, models()):
        run_experiment(datasetname, modelname, model)
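# Hedged usage note (this module is imported elsewhere as MultiLabel.main):
# running it as a script launches the full dataset x model grid, e.g.
#   python main.py --results ./results
# and pickles each experiment's (npp, app) predictions under ./results.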
@ -0,0 +1,110 @@
from copy import deepcopy

from sklearn.calibration import CalibratedClassifierCV
from sklearn.linear_model import LogisticRegression, Ridge
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import StandardScaler
from skmultilearn.adapt import MLTSVM

from skmultilearn.ensemble import LabelSpacePartitioningClassifier
from skmultilearn.problem_transform import LabelPowerset
from skmultilearn.cluster import NetworkXLabelGraphClusterer, LabelCooccurrenceGraphBuilder

from skmultilearn.embedding import SKLearnEmbedder, EmbeddingClassifier
from sklearn.manifold import SpectralEmbedding
from sklearn.ensemble import RandomForestRegressor
from skmultilearn.adapt import MLkNN


class MLStackedClassifier:  # aka Funnelling Monolingual
    def __init__(self, base_estimator=LogisticRegression()):
        if not hasattr(base_estimator, 'predict_proba'):
            print('the estimator does not seem to be probabilistic: calibrating')
            base_estimator = CalibratedClassifierCV(base_estimator)
        self.base = deepcopy(OneVsRestClassifier(base_estimator))
        self.meta = deepcopy(OneVsRestClassifier(base_estimator))
        self.norm = StandardScaler()

    def fit(self, X, y):
        assert y.ndim == 2, 'the dataset does not seem to be multi-label'
        self.base.fit(X, y)
        P = self.base.predict_proba(X)
        P = self.norm.fit_transform(P)
        self.meta.fit(P, y)
        return self

    def predict(self, X):
        P = self.base.predict_proba(X)
        P = self.norm.transform(P)
        return self.meta.predict(P)

    def predict_proba(self, X):
        P = self.base.predict_proba(X)
        P = self.norm.transform(P)
        return self.meta.predict_proba(P)


class MLStackedRegressor:
    def __init__(self, base_regressor=Ridge(normalize=True)):
        self.base = deepcopy(base_regressor)
        self.meta = deepcopy(base_regressor)

    def fit(self, X, y):
        assert y.ndim == 2, 'the dataset does not seem to be multi-label'
        self.base.fit(X, y)
        R = self.base.predict(X)
        # R = self.norm.fit_transform(R)
        self.meta.fit(R, y)
        return self

    def predict(self, X):
        R = self.base.predict(X)
        # R = self.norm.transform(R)
        return self.meta.predict(R)


class LabelSpacePartion:
    def __init__(self, base_estimator=LogisticRegression()):
        graph_builder = LabelCooccurrenceGraphBuilder(weighted=True, include_self_edges=False)
        self.classifier = LabelSpacePartitioningClassifier(
            classifier=LabelPowerset(classifier=base_estimator),
            clusterer=NetworkXLabelGraphClusterer(graph_builder, method='louvain')
        )

    def fit(self, X, y):
        return self.classifier.fit(X, y)

    def predict(self, X):
        return self.classifier.predict(X).todense().getA()


class MLTwinSVM:
    def __init__(self):
        self.classifier = MLTSVM()

    def fit(self, X, y):
        return self.classifier.fit(X, y)

    def predict(self, X):
        return self.classifier.predict(X).todense().getA()


class MLknn:
    # http://scikit.ml/api/skmultilearn.embedding.classifier.html#skmultilearn.embedding.EmbeddingClassifier
    # note: requires the openne package to be installed
    def __init__(self):
        self.classifier = EmbeddingClassifier(
            SKLearnEmbedder(SpectralEmbedding(n_components=10)),
            RandomForestRegressor(n_estimators=10),
            MLkNN(k=5)
        )

    def fit(self, X, y):
        return self.classifier.fit(X, y)

    def predict(self, X):
        return self.classifier.predict(X).todense().getA()

    def predict_proba(self, X):
        return self.classifier.predict_proba(X)
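if __name__ == '__main__':
    # Hedged demo sketch (not part of the original module): the base OneVsRest
    # layer maps documents to per-class posterior probabilities, and the meta
    # layer re-classifies in that posterior space. The synthetic dataset below
    # is only meant to exercise the API.
    from sklearn.datasets import make_multilabel_classification

    X, y = make_multilabel_classification(n_samples=200, n_classes=5, random_state=0)
    stacked = MLStackedClassifier()  # defaults to LogisticRegression
    stacked.fit(X, y)
    print(stacked.predict_proba(X).shape)  # one posterior per class: (200, 5)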
@ -0,0 +1,209 @@
from typing import List, Union

import numpy as np
from scipy.sparse import issparse, vstack
from sklearn.model_selection import train_test_split

from quapy.data import LabelledCollection
from quapy.functional import artificial_prevalence_sampling

from skmultilearn.model_selection import iterative_train_test_split


class MultilabelledCollection:
    def __init__(self, instances, labels):
        assert labels.ndim == 2, f'data does not seem to be multilabel {labels}'
        self.instances = instances
        self.labels = labels
        self.classes_ = np.arange(labels.shape[1])

    @classmethod
    def load(cls, path: str, loader_func: callable):
        return MultilabelledCollection(*loader_func(path))

    def __len__(self):
        return self.instances.shape[0]

    def prevalence(self):
        # return self.labels.mean(axis=0)
        pos = self.labels.mean(axis=0)
        neg = 1 - pos
        return np.asarray([neg, pos]).T

    def counts(self):
        return self.labels.sum(axis=0)

    @property
    def n_classes(self):
        return len(self.classes_)

    @property
    def n_features(self):
        return self.instances.shape[1]

    @property
    def binary(self):
        return False

    def __gen_index(self):
        return np.arange(len(self))

    def sampling_multi_index(self, size, cat, prev=None):
        if prev is None:  # no prevalence was indicated; returns an index for uniform sampling
            return np.random.choice(len(self), size, replace=size > len(self))
        aux = LabelledCollection(self.__gen_index(), self.labels[:, cat])
        return aux.sampling_index(size, *[1 - prev, prev])

    def uniform_sampling_multi_index(self, size):
        return np.random.choice(len(self), size, replace=size > len(self))

    def uniform_sampling(self, size):
        unif_index = self.uniform_sampling_multi_index(size)
        return self.sampling_from_index(unif_index)

    def sampling(self, size, category, prev=None):
        prev_index = self.sampling_multi_index(size, category, prev)
        return self.sampling_from_index(prev_index)

    def sampling_from_index(self, index):
        documents = self.instances[index]
        labels = self.labels[index]
        return MultilabelledCollection(documents, labels)

    def train_test_split(self, train_prop=0.6, random_state=None, iterative=False):
        if iterative:
            tr_docs, tr_labels, te_docs, te_labels = \
                iterative_train_test_split(self.instances, self.labels, test_size=1 - train_prop)
        else:
            tr_docs, te_docs, tr_labels, te_labels = \
                train_test_split(self.instances, self.labels, train_size=train_prop, random_state=random_state)
        return MultilabelledCollection(tr_docs, tr_labels), MultilabelledCollection(te_docs, te_labels)

    def artificial_sampling_generator(self, sample_size, category, n_prevalences=101, repeats=1):
        dimensions = 2
        for prevs in artificial_prevalence_sampling(dimensions, n_prevalences, repeats).flatten():
            yield self.sampling(sample_size, category, prevs)

    def artificial_sampling_index_generator(self, sample_size, category, n_prevalences=101, repeats=1):
        dimensions = 2
        for prevs in artificial_prevalence_sampling(dimensions, n_prevalences, repeats).flatten():
            yield self.sampling_multi_index(sample_size, category, prevs)

    def natural_sampling_generator(self, sample_size, repeats=100):
        for _ in range(repeats):
            yield self.uniform_sampling(sample_size)

    def natural_sampling_index_generator(self, sample_size, repeats=100):
        for _ in range(repeats):
            yield self.uniform_sampling_multi_index(sample_size)

    def asLabelledCollection(self, category):
        return LabelledCollection(self.instances, self.labels[:, category])

    def genLabelledCollections(self):
        for c in self.classes_:
            yield self.asLabelledCollection(c)

    # @property
    # def label_cardinality(self):
    #     return self.labels.sum()/len(self)

    @property
    def Xy(self):
        return self.instances, self.labels


class MultilingualLabelledCollection:
    def __init__(self, langs: List[str], labelledCollections: List[Union[LabelledCollection, MultilabelledCollection]]):
        assert len(langs) == len(labelledCollections), 'length mismatch for langs and labelledCollection lists'
        assert all(isinstance(lc, (LabelledCollection, MultilabelledCollection)) for lc in labelledCollections), \
            'unexpected type for labelledCollections'
        assert all(labelledCollections[0].classes_ == lc_i.classes_ for lc_i in labelledCollections[1:]), \
            'inconsistent classes found for some labelled collections'
        self.llc = {l: lc for l, lc in zip(langs, labelledCollections)}
        self.classes_ = labelledCollections[0].classes_

    @classmethod
    def fromLangDict(cls, lang_labelledCollection: dict):
        return MultilingualLabelledCollection(*list(zip(*list(lang_labelledCollection.items()))))

    def langs(self):
        return list(sorted(self.llc.keys()))

    def __getitem__(self, lang) -> LabelledCollection:
        return self.llc[lang]

    @classmethod
    def load(cls, path: str, loader_func: callable):
        return MultilingualLabelledCollection(*loader_func(path))

    def __len__(self):
        return sum(map(len, self.llc.values()))

    def prevalence(self):
        prev = np.asarray([lc.prevalence() * len(lc) for lc in self.llc.values()]).sum(axis=0)
        return prev / prev.sum()

    def language_prevalence(self):
        lang_count = np.asarray([len(self.llc[l]) for l in self.langs()])
        return lang_count / lang_count.sum()

    def counts(self):
        return np.asarray([lc.counts() for lc in self.llc.values()]).sum(axis=0)

    @property
    def n_classes(self):
        return len(self.classes_)

    @property
    def binary(self):
        return self.n_classes == 2

    def __check_langs(self, l_dict: dict):
        assert len(l_dict) == len(self.langs()), 'wrong number of languages'
        assert all(l in l_dict for l in self.langs()), 'missing languages in l_sizes'

    def __check_sizes(self, l_sizes: Union[int, dict]):
        assert isinstance(l_sizes, (int, dict)), 'unexpected type for l_sizes'
        if isinstance(l_sizes, int):
            return {l: l_sizes for l in self.langs()}
        self.__check_langs(l_sizes)
        return l_sizes

    def sampling_index(self, l_sizes: Union[int, dict], *prevs, shuffle=True):
        l_sizes = self.__check_sizes(l_sizes)
        return {l: lc.sampling_index(l_sizes[l], *prevs, shuffle=shuffle) for l, lc in self.llc.items()}

    def uniform_sampling_index(self, l_sizes: Union[int, dict]):
        l_sizes = self.__check_sizes(l_sizes)
        return {l: lc.uniform_sampling_index(l_sizes[l]) for l, lc in self.llc.items()}

    def uniform_sampling(self, l_sizes: Union[int, dict]):
        l_sizes = self.__check_sizes(l_sizes)
        return MultilingualLabelledCollection.fromLangDict(
            {l: lc.uniform_sampling(l_sizes[l]) for l, lc in self.llc.items()}
        )

    def sampling(self, l_sizes: Union[int, dict], *prevs, shuffle=True):
        l_sizes = self.__check_sizes(l_sizes)
        return MultilingualLabelledCollection.fromLangDict(
            {l: lc.sampling(l_sizes[l], *prevs, shuffle=shuffle) for l, lc in self.llc.items()}
        )

    def sampling_from_index(self, l_index: dict):
        self.__check_langs(l_index)
        return MultilingualLabelledCollection.fromLangDict(
            {l: lc.sampling_from_index(l_index[l]) for l, lc in self.llc.items()}
        )

    def split_stratified(self, train_prop=0.6, random_state=None):
        train, test = list(zip(*[self[l].split_stratified(train_prop, random_state) for l in self.langs()]))
        return MultilingualLabelledCollection(self.langs(), train), MultilingualLabelledCollection(self.langs(), test)

    def asLabelledCollection(self, return_langs=False):
        lXy_list = [([l] * len(lc), *lc.Xy) for l, lc in self.llc.items()]  # a list with (lang_i, Xi, yi)
        ls, Xs, ys = list(zip(*lXy_list))
        ls = np.concatenate(ls)
        vertstack = vstack if issparse(Xs[0]) else np.vstack
        Xs = vertstack(Xs)
        ys = np.concatenate(ys)
        lc = LabelledCollection(Xs, ys, classes_=self.classes_)
        return (lc, ls) if return_langs else lc
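if __name__ == '__main__':
    # Hedged demo sketch (not part of the original module): prevalence() stores
    # one (negative, positive) pair per class, i.e. an ndarray of shape
    # (n_classes, 2) whose rows sum to 1.
    labels = np.array([[1, 0, 1],
                       [0, 0, 1],
                       [1, 1, 1],
                       [0, 0, 0]])
    coll = MultilabelledCollection(np.random.rand(4, 5), labels)
    print(coll.prevalence())
    # -> [[0.5  0.5 ]
    #     [0.75 0.25]
    #     [0.25 0.75]]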
@ -0,0 +1,117 @@
from typing import Union, Callable

import numpy as np
import quapy as qp
from MultiLabel.mlquantification import MLAggregativeQuantifier
from mldata import MultilabelledCollection
import itertools
from tqdm import tqdm


def check_error_str(error_metric):
    if isinstance(error_metric, str):
        error_metric = qp.error.from_name(error_metric)

    assert hasattr(error_metric, '__call__'), 'invalid error function'
    return error_metric


def _ml_prevalence_predictions(model,
                               test: MultilabelledCollection,
                               test_indexes):

    predict_batch_fn = _predict_quantification_batch
    if isinstance(model, MLAggregativeQuantifier):
        test = MultilabelledCollection(model.preclassify(test.instances), test.labels)
        predict_batch_fn = _predict_aggregative_batch

    args = tuple([model, test, test_indexes])
    true_prevs, estim_prevs = predict_batch_fn(args)
    return true_prevs, estim_prevs


def ml_natural_prevalence_prediction(model,
                                     test: MultilabelledCollection,
                                     sample_size,
                                     repeats=100,
                                     random_seed=42):

    with qp.util.temp_seed(random_seed):
        test_indexes = list(test.natural_sampling_index_generator(sample_size=sample_size, repeats=repeats))

    return _ml_prevalence_predictions(model, test, test_indexes)


def ml_natural_prevalence_evaluation(model,
                                     test: MultilabelledCollection,
                                     sample_size,
                                     repeats=100,
                                     error_metric: Union[str, Callable] = 'mae',
                                     random_seed=42):

    error_metric = check_error_str(error_metric)

    true_prevs, estim_prevs = ml_natural_prevalence_prediction(model, test, sample_size, repeats, random_seed)

    errs = [error_metric(true_prev_i, estim_prev_i) for true_prev_i, estim_prev_i in zip(true_prevs, estim_prevs)]
    return np.mean(errs)


def ml_artificial_prevalence_prediction(model,
                                        test: MultilabelledCollection,
                                        sample_size,
                                        n_prevalences=21,
                                        repeats=10,
                                        random_seed=42):

    nested_test_indexes = []
    with qp.util.temp_seed(random_seed):
        for cat in test.classes_:
            nested_test_indexes.append(list(test.artificial_sampling_index_generator(sample_size=sample_size,
                                                                                     category=cat,
                                                                                     n_prevalences=n_prevalences,
                                                                                     repeats=repeats)))

    def _predict_batch(test_indexes):
        return _ml_prevalence_predictions(model, test, test_indexes)

    predictions = qp.util.parallel(_predict_batch, nested_test_indexes, n_jobs=-1)
    true_prevs = list(itertools.chain.from_iterable(trues for trues, estims in predictions))
    estim_prevs = list(itertools.chain.from_iterable(estims for trues, estims in predictions))
    return true_prevs, estim_prevs


def ml_artificial_prevalence_evaluation(model,
                                        test: MultilabelledCollection,
                                        sample_size,
                                        n_prevalences=21,
                                        repeats=10,
                                        error_metric: Union[str, Callable] = 'mae',
                                        random_seed=42):

    error_metric = check_error_str(error_metric)

    true_prevs, estim_prevs = ml_artificial_prevalence_prediction(model, test, sample_size, n_prevalences, repeats, random_seed)

    errs = [error_metric(true_prev_i, estim_prev_i) for true_prev_i, estim_prev_i in zip(true_prevs, estim_prevs)]
    return np.mean(errs)


def _predict_quantification_batch(args):
    model, test, indexes = args
    return __predict_batch_fn(args, model.quantify)


def _predict_aggregative_batch(args):
    model, test, indexes = args
    return __predict_batch_fn(args, model.aggregate)


def __predict_batch_fn(args, quant_fn):
    model, test, indexes = args
    trues, estims = [], []
    for index in indexes:
        sample = test.sampling_from_index(index)
        estims.append(quant_fn(sample.instances))
        trues.append(sample.prevalence())
    return trues, estims
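# Hedged usage sketch (not part of the original module): this is how
# MultiLabel.main.run_experiment drives these functions, assuming a fitted
# multi-label quantifier `model` and a MultilabelledCollection `test`:
#   true_prevs, estim_prevs = ml_natural_prevalence_prediction(model, test, sample_size=100, repeats=100)
#   mae = ml_natural_prevalence_evaluation(model, test, sample_size=100, repeats=100)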
@ -0,0 +1,361 @@
import numpy as np
from copy import deepcopy

import sklearn.preprocessing
from sklearn.ensemble import StackingRegressor
from sklearn.metrics import confusion_matrix
from sklearn.multioutput import MultiOutputRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC, LinearSVR
from sklearn.linear_model import LogisticRegression, Ridge, Lasso, LassoCV, MultiTaskLassoCV, LassoLars, LassoLarsCV, \
    ElasticNet, MultiTaskElasticNetCV, MultiTaskElasticNet, LinearRegression, ARDRegression, BayesianRidge, SGDRegressor

import quapy as qp
from MultiLabel.mlclassification import MLStackedClassifier, MLStackedRegressor
from MultiLabel.mldata import MultilabelledCollection
from method.aggregative import CC, ACC, PACC, AggregativeQuantifier
from method.base import BaseQuantifier

from abc import abstractmethod


class MLQuantifier:
    @abstractmethod
    def fit(self, data: MultilabelledCollection): ...

    @abstractmethod
    def quantify(self, instances): ...


class MLMLPE(MLQuantifier):
    def fit(self, data: MultilabelledCollection):
        self.tr_prev = data.prevalence()
        return self

    def quantify(self, instances):
        return self.tr_prev


class MLAggregativeQuantifier(MLQuantifier):
    def __init__(self, mlcls):
        self.learner = mlcls

    def fit(self, data: MultilabelledCollection):
        self.learner.fit(*data.Xy)
        return self

    @abstractmethod
    def preclassify(self, instances): ...

    @abstractmethod
    def aggregate(self, predictions): ...

    def quantify(self, instances):
        predictions = self.preclassify(instances)
        return self.aggregate(predictions)


class MLCC(MLAggregativeQuantifier):
    def preclassify(self, instances):
        return self.learner.predict(instances)

    def aggregate(self, predictions):
        pos_prev = predictions.mean(axis=0)
        neg_prev = 1 - pos_prev
        return np.asarray([neg_prev, pos_prev]).T


class MLPCC(MLCC):
    def preclassify(self, instances):
        return self.learner.predict_proba(instances)


class MLACC(MLCC):

    def fit(self, data: MultilabelledCollection, train_prop=0.6):
        self.classes_ = data.classes_
        train, val = data.train_test_split(train_prop=train_prop)
        self.learner.fit(*train.Xy)
        val_predictions = self.preclassify(val.instances)
        self.Pte_cond_estim_ = []
        for c in data.classes_:
            pos_c = val.labels[:, c].sum()
            neg_c = len(val) - pos_c
            self.Pte_cond_estim_.append(confusion_matrix(val.labels[:, c], val_predictions[:, c]).T / np.array([neg_c, pos_c]))
        return self

    def preclassify(self, instances):
        return self.learner.predict(instances)

    def aggregate(self, predictions):
        cc_prevs = super(MLACC, self).aggregate(predictions)
        acc_prevs = np.asarray([ACC.solve_adjustment(self.Pte_cond_estim_[c], cc_prevs[c]) for c in self.classes_])
        return acc_prevs


class MLPACC(MLPCC):

    def fit(self, data: MultilabelledCollection, train_prop=0.6):
        self.classes_ = data.classes_
        train, val = data.train_test_split(train_prop=train_prop)
        self.learner.fit(*train.Xy)
        val_posteriors = self.preclassify(val.instances)
        self.Pte_cond_estim_ = []
        for c in data.classes_:
            pos_posteriors = val_posteriors[:, c]
            c_posteriors = np.asarray([1 - pos_posteriors, pos_posteriors]).T
            self.Pte_cond_estim_.append(PACC.getPteCondEstim([0, 1], val.labels[:, c], c_posteriors))
        return self

    def aggregate(self, posteriors):
        pcc_prevs = super(MLPACC, self).aggregate(posteriors)
        pacc_prevs = np.asarray([ACC.solve_adjustment(self.Pte_cond_estim_[c], pcc_prevs[c]) for c in self.classes_])
        return pacc_prevs


class MLNaiveQuantifier(MLQuantifier):
    def __init__(self, q: BaseQuantifier, n_jobs=-1):
        self.q = q
        self.estimators = None
        self.n_jobs = n_jobs

    def fit(self, data: MultilabelledCollection):
        self.classes_ = data.classes_

        def cat_job(lc):
            return deepcopy(self.q).fit(lc)

        self.estimators = qp.util.parallel(cat_job, data.genLabelledCollections(), n_jobs=self.n_jobs)
        return self

    def quantify(self, instances):
        pos_prevs = np.zeros(len(self.classes_), dtype=float)
        for c in self.classes_:
            pos_prevs[c] = self.estimators[c].quantify(instances)[1]
        neg_prevs = 1 - pos_prevs
        return np.asarray([neg_prevs, pos_prevs]).T


class MLNaiveAggregativeQuantifier(MLNaiveQuantifier, MLAggregativeQuantifier):
    def __init__(self, q: AggregativeQuantifier, n_jobs=-1):
        assert isinstance(q, AggregativeQuantifier), 'the quantifier is not of type aggregative!'
        self.q = q
        self.estimators = None
        self.n_jobs = n_jobs

    def preclassify(self, instances):
        return np.asarray([q.preclassify(instances) for q in self.estimators]).swapaxes(0, 1)

    def aggregate(self, predictions):
        pos_prevs = np.zeros(len(self.classes_), dtype=float)
        for c in self.classes_:
            pos_prevs[c] = self.estimators[c].aggregate(predictions[:, c])[1]
        neg_prevs = 1 - pos_prevs
        return np.asarray([neg_prevs, pos_prevs]).T

    def quantify(self, instances):
        predictions = self.preclassify(instances)
        return self.aggregate(predictions)


class MLRegressionQuantification:
    def __init__(self,
                 mlquantifier=MLNaiveQuantifier(CC(LinearSVC())),
                 regression='ridge',
                 protocol='npp',
                 n_samples=500,
                 sample_size=500,
                 norm=True,
                 means=True,
                 stds=True):

        assert protocol in ['npp', 'app'], 'unknown protocol'
        self.estimator = mlquantifier
        if isinstance(regression, str):
            assert regression in ['ridge', 'svr'], 'unknown regression model'
            if regression == 'ridge':
                self.reg = Ridge(normalize=norm)
            elif regression == 'svr':
                self.reg = MultiOutputRegressor(LinearSVR())
        else:
            self.reg = regression
        self.protocol = protocol
        # self.reg = MultiTaskLassoCV(normalize=norm)
        # self.reg = KernelRidge(kernel='rbf')
        # self.reg = LassoLarsCV(normalize=norm)
        # self.reg = MultiTaskElasticNetCV(normalize=norm)  # <- good
        # self.reg = LinearRegression(normalize=norm)  # <- good
        # self.reg = MultiOutputRegressor(ARDRegression(normalize=norm))  # <- quite good, even without norm
        # self.reg = MultiOutputRegressor(BayesianRidge(normalize=False))  # <- quite good, even without norm
        # self.reg = MultiOutputRegressor(SGDRegressor())  # slow, does not work
        self.regression = regression
        self.n_samples = n_samples
        self.sample_size = sample_size
        # self.norm = StandardScaler()
        self.means = means
        self.stds = stds
        # self.covs = covs

    def _prepare_arrays(self, Xs, ys, samples_mean, samples_std):
        Xs = np.asarray(Xs)
        ys = np.asarray(ys)
        if self.means:
            samples_mean = np.asarray(samples_mean)
            Xs = np.hstack([Xs, samples_mean])
        if self.stds:
            samples_std = np.asarray(samples_std)
            Xs = np.hstack([Xs, samples_std])
        # if self.covs:

        return Xs, ys

    def _extract_features(self, sample, Xs, ys, samples_mean, samples_std):
        ys.append(sample.prevalence()[:, 1])
        Xs.append(self.estimator.quantify(sample.instances)[:, 1])
        if self.means:
            samples_mean.append(sample.instances.mean(axis=0).getA().flatten())
        if self.stds:
            samples_std.append(sample.instances.todense().std(axis=0).getA().flatten())

    def generate_samples_npp(self, val):
        Xs, ys = [], []
        samples_mean, samples_std = [], []
        for sample in val.natural_sampling_generator(sample_size=self.sample_size, repeats=self.n_samples):
            self._extract_features(sample, Xs, ys, samples_mean, samples_std)
        return self._prepare_arrays(Xs, ys, samples_mean, samples_std)

    def generate_samples_app(self, val):
        Xs, ys = [], []
        samples_mean, samples_std = [], []
        ncats = len(self.classes_)
        nprevs = 21
        repeats = max(self.n_samples // (ncats * nprevs), 1)
        for cat in self.classes_:
            for sample in val.artificial_sampling_generator(sample_size=self.sample_size, category=cat, n_prevalences=nprevs, repeats=repeats):
                self._extract_features(sample, Xs, ys, samples_mean, samples_std)
        return self._prepare_arrays(Xs, ys, samples_mean, samples_std)

    def fit(self, data: MultilabelledCollection):
        self.classes_ = data.classes_
        tr, val = data.train_test_split()
        self.estimator.fit(tr)
        if self.protocol == 'npp':
            Xs, ys = self.generate_samples_npp(val)
        elif self.protocol == 'app':
            Xs, ys = self.generate_samples_app(val)
        # Xs = self.norm.fit_transform(Xs)
        self.reg.fit(Xs, ys)
        return self

    def quantify(self, instances):
        Xs = self.estimator.quantify(instances)[:, 1].reshape(1, -1)
        if self.means:
            sample_mean = instances.mean(axis=0).getA()
            Xs = np.hstack([Xs, sample_mean])
        if self.stds:
            sample_std = instances.todense().std(axis=0).getA()
            Xs = np.hstack([Xs, sample_std])
        # Xs = self.norm.transform(Xs)
        Xs = self.reg.predict(Xs)
        # Xs = self.norm.inverse_transform(Xs)
        adjusted = np.clip(Xs, 0, 1)
        adjusted = adjusted.flatten()
        neg_prevs = 1 - adjusted
        return np.asarray([neg_prevs, adjusted]).T


class StackMLRQuantifier:
    def __init__(self,
                 mlquantifier=MLNaiveQuantifier(CC(LinearSVC())),
                 regression='ridge',
                 protocol='npp',
                 n_samples=500,
                 sample_size=500,
                 norm=True,
                 means=True,
                 stds=True):
        if regression == 'ridge':
            reg = MLStackedRegressor(Ridge(normalize=True))
        elif regression == 'svr':
            reg = MLStackedRegressor(MultiOutputRegressor(LinearSVR()))
        else:
            raise ValueError(f'unknown regressor {regression}')

        self.base = MLRegressionQuantification(
            mlquantifier=mlquantifier,
            regression=reg,
            protocol=protocol,
            n_samples=n_samples,
            sample_size=sample_size,
            norm=norm,
            means=means,
            stds=stds)

    def fit(self, data: MultilabelledCollection):
        self.classes_ = data.classes_
        self.base.fit(data)
        return self

    def quantify(self, instances):
        return self.base.quantify(instances)


class MLadjustedCount(MLAggregativeQuantifier):
    def __init__(self, learner):
        self.learner = learner

    def preclassify(self, instances):
        return self.learner.predict(instances)

    def fit(self, data: MultilabelledCollection, train_prop=0.6):
        self.classes_ = data.classes_
        train, val = data.train_test_split(train_prop=train_prop)
        self.learner.fit(*train.Xy)
        val_predictions = self.preclassify(val.instances)
        val_true = val.labels

        N = len(val)
        C = val_predictions.T.dot(val_true) / N  # joint probabilities [[P(y1,\hat{y}1), P(y2,\hat{y}1)], ... ]
        priorP = val_predictions.mean(axis=0).reshape(-1, 1)  # priors [P(\hat{y}1), P(\hat{y}2), ...]
        self.Pte_cond_estim_ = np.true_divide(C, priorP, where=priorP > 0)  # conditional probabilities [[P(y1|\hat{y}1), P(y2|\hat{y}1)], ... ]

        return self

    def aggregate(self, predictions):
        P = sklearn.preprocessing.normalize(predictions, norm='l1')
        correction = P.dot(self.Pte_cond_estim_)
        adjusted = correction.mean(axis=0)
        return np.asarray([1 - adjusted, adjusted]).T


class MLprobAdjustedCount(MLAggregativeQuantifier):
    def __init__(self, learner):
        self.learner = learner

    def preclassify(self, instances):
        return self.learner.predict_proba(instances)

    def fit(self, data: MultilabelledCollection, train_prop=0.6):
        self.classes_ = data.classes_
        train, val = data.train_test_split(train_prop=train_prop)
        self.learner.fit(*train.Xy)
        val_predictions = self.preclassify(val.instances)
        val_true = val.labels

        N = len(val)

        C = (val_predictions > 0.5).T.dot(val_true) / N  # joint probabilities [[P(y1,\hat{y}1), P(y2,\hat{y}1)], ... ]
        # not sure...

        priorP = val_predictions.mean(axis=0).reshape(-1, 1)  # priors [P(\hat{y}1), P(\hat{y}2), ...]
        self.Pte_cond_estim_ = np.true_divide(C, priorP, where=priorP > 0)  # conditional probabilities [[P(y1|\hat{y}1), P(y2|\hat{y}1)], ... ]

        return self

    def aggregate(self, predictions):
        P = sklearn.preprocessing.normalize(predictions, norm='l1')
        correction = P.dot(self.Pte_cond_estim_)
        adjusted = correction.mean(axis=0)
        return np.asarray([1 - adjusted, adjusted]).T
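if __name__ == '__main__':
    # Hedged numeric sketch (not part of the original module): MLCC.aggregate
    # turns a (n_docs, n_classes) binary prediction matrix into per-class
    # (negative, positive) prevalence pairs by column-wise averaging.
    predictions = np.array([[1, 0],
                            [1, 1],
                            [0, 0],
                            [1, 0]])
    mlcc = MLCC(mlcls=None)  # aggregate() does not touch the classifier
    print(mlcc.aggregate(predictions))
    # -> [[0.25 0.75]
    #     [0.75 0.25]]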
@ -0,0 +1,79 @@
num categories = 10
Train-counts: [1650  181  389 2877  433  347  538  197  369  212]
Test-counts:  [ 719   56  189 1087  149  131  179   89  117   71]
MLPE: 0.01101

NPP:
NaiveCC           mae=0.01718
NaivePCC          mae=0.00898
NaiveACC          mae=0.01560
NaivePACC         mae=0.01062

StackCC           mae=0.00790
StackPCC          mae=0.00659 **
StackACC          mae=0.00913
StackPACC         mae=0.00771

ChainCC           mae=0.01644
ChainPCC          mae=0.00924
ChainACC          mae=0.01767
ChainPACC         mae=0.01140

MRQ-CC            mae=0.01130
MRQ-PCC           mae=0.00941
MRQ-ACC           mae=0.01153
MRQ-PACC          mae=0.01000

MRQ-StackCC       mae=0.00757
MRQ-StackPCC      mae=0.00652 **
MRQ-StackACC      mae=0.00799
MRQ-StackPACC     mae=0.00763

MRQ-StackCC-app   mae=0.00791
MRQ-StackPCC-app  mae=0.00840
MRQ-StackACC-app  mae=0.00910
MRQ-StackPACC-app mae=0.00941

MRQ-ChainCC       mae=0.00989
MRQ-ChainPCC      mae=0.00916
MRQ-ChainACC      mae=0.01251
MRQ-ChainPACC     mae=0.00954

APP:
NaiveCC           mae=0.04120
NaivePCC          mae=0.03741
NaiveACC          mae=0.03202
NaivePACC         mae=0.02293

StackCC           mae=0.01969
StackPCC          mae=0.01871
StackACC          mae=0.01386 **
StackPACC         mae=0.01267 **

ChainCC           mae=0.04136
ChainPCC          mae=0.03571
ChainACC          mae=0.03622
ChainPACC         mae=0.02659

MRQ-CC            mae=0.04356
MRQ-PCC           mae=0.02532
MRQ-ACC           mae=0.05716
MRQ-PACC          mae=0.02936

MRQ-StackCC       mae=0.02448
MRQ-StackPCC      mae=0.02090
MRQ-StackACC      mae=0.02579
MRQ-StackPACC     mae=0.02388

MRQ-StackCC-app   mae=0.01535
MRQ-StackPCC-app  mae=0.01457
MRQ-StackACC-app  mae=0.01441
MRQ-StackPACC-app mae=0.01633

MRQ-ChainCC       mae=0.04874
MRQ-ChainPCC      mae=0.02537
MRQ-ChainACC      mae=0.06262
MRQ-ChainPACC     mae=0.02906
@ -0,0 +1,347 @@
import numpy as np
import itertools
from scipy.stats import ttest_ind_from_stats, wilcoxon


class Table:
    VALID_TESTS = [None, "wilcoxon", "ttest"]

    def __init__(self, benchmarks, methods, lower_is_better=True, significance_test='ttest', prec_mean=3,
                 clean_zero=False, show_std=False, prec_std=3, average=True, missing=None, missing_str='--',
                 color=True):
        assert significance_test in self.VALID_TESTS, f'unknown test, valid ones are {self.VALID_TESTS}'

        self.benchmarks = np.asarray(benchmarks)
        self.benchmark_index = {row: i for i, row in enumerate(benchmarks)}

        self.methods = np.asarray(methods)
        self.method_index = {col: j for j, col in enumerate(methods)}

        self.map = {}
        # keyed (#rows,#cols)-ndarrays holding computations from self.map['values']
        self._addmap('values', dtype=object)
        self.lower_is_better = lower_is_better
        self.ttest = significance_test
        self.prec_mean = prec_mean
        self.clean_zero = clean_zero
        self.show_std = show_std
        self.prec_std = prec_std
        self.add_average = average
        self.missing = missing
        self.missing_str = missing_str
        self.color = color

        self.touch()

    @property
    def nbenchmarks(self):
        return len(self.benchmarks)

    @property
    def nmethods(self):
        return len(self.methods)

    def touch(self):
        self._modif = True

    def update(self):
        if self._modif:
            self.compute()

    def _getfilled(self):
        return np.argwhere(self.map['fill'])

    @property
    def values(self):
        return self.map['values']

    def _indexes(self):
        return itertools.product(range(self.nbenchmarks), range(self.nmethods))

    def _addmap(self, map, dtype, func=None):
        self.map[map] = np.empty((self.nbenchmarks, self.nmethods), dtype=dtype)
        if func is None:
            return
        m = self.map[map]
        f = func
        indexes = self._indexes() if map == 'fill' else self._getfilled()
        for i, j in indexes:
            m[i, j] = f(self.values[i, j])

    def _addrank(self):
        for i in range(self.nbenchmarks):
            filled_cols_idx = np.argwhere(self.map['fill'][i]).flatten()
            col_means = [self.map['mean'][i, j] for j in filled_cols_idx]
            ranked_cols_idx = filled_cols_idx[np.argsort(col_means)]
            if not self.lower_is_better:
                ranked_cols_idx = ranked_cols_idx[::-1]
            self.map['rank'][i, ranked_cols_idx] = np.arange(1, len(filled_cols_idx) + 1)

    def _addcolor(self):
        for i in range(self.nbenchmarks):
            filled_cols_idx = np.argwhere(self.map['fill'][i]).flatten()
            if filled_cols_idx.size == 0:
                continue
            col_means = [self.map['mean'][i, j] for j in filled_cols_idx]
            minval = min(col_means)
            maxval = max(col_means)
            for col_idx in filled_cols_idx:
                val = self.map['mean'][i, col_idx]
                norm = (maxval - minval)
                if norm > 0:
                    normval = (val - minval) / norm
                else:
                    normval = 0.5
                if self.lower_is_better:
                    normval = 1 - normval
                self.map['color'][i, col_idx] = color_red2green_01(normval)

    def _run_ttest(self, row, col1, col2):
        mean1 = self.map['mean'][row, col1]
        std1 = self.map['std'][row, col1]
        nobs1 = self.map['nobs'][row, col1]
        mean2 = self.map['mean'][row, col2]
        std2 = self.map['std'][row, col2]
        nobs2 = self.map['nobs'][row, col2]
        _, p_val = ttest_ind_from_stats(mean1, std1, nobs1, mean2, std2, nobs2)
        return p_val

    def _run_wilcoxon(self, row, col1, col2):
        values1 = self.map['values'][row, col1]
        values2 = self.map['values'][row, col2]
        _, p_val = wilcoxon(values1, values2)
        return p_val

    def _add_statistical_test(self):
        if self.ttest is None:
            return
        self.some_similar = [False] * self.nmethods
        for i in range(self.nbenchmarks):
            filled_cols_idx = np.argwhere(self.map['fill'][i]).flatten()
            if len(filled_cols_idx) <= 1:
                continue
            col_means = [self.map['mean'][i, j] for j in filled_cols_idx]
            best_pos = filled_cols_idx[np.argmin(col_means)]

            for j in filled_cols_idx:
                if j == best_pos:
                    continue
                if self.ttest == 'ttest':
                    p_val = self._run_ttest(i, best_pos, j)
                else:
                    p_val = self._run_wilcoxon(i, best_pos, j)

                pval_outcome = pval_interpretation(p_val)
                self.map['ttest'][i, j] = pval_outcome
                if pval_outcome != 'Diff':
                    self.some_similar[j] = True

    def compute(self):
        self._addmap('fill', dtype=bool, func=lambda x: x is not None)
        self._addmap('mean', dtype=float, func=np.mean)
        self._addmap('std', dtype=float, func=np.std)
        self._addmap('nobs', dtype=float, func=len)
        self._addmap('rank', dtype=int, func=None)
        self._addmap('color', dtype=object, func=None)
        self._addmap('ttest', dtype=object, func=None)
        self._addmap('latex', dtype=object, func=None)
        self._addrank()
        self._addcolor()
        self._add_statistical_test()
        if self.add_average:
            self._addave()
        self._modif = False

    def _is_column_full(self, col):
        return all(self.map['fill'][:, self.method_index[col]])

    def _addave(self):
        ave = Table(['ave'], self.methods, lower_is_better=self.lower_is_better, significance_test=self.ttest, average=False,
                    missing=self.missing, missing_str=self.missing_str, prec_mean=self.prec_mean, prec_std=self.prec_std,
                    show_std=self.show_std)
        for col in self.methods:
            values = None
            if self._is_column_full(col):
                if self.ttest == 'ttest':
                    values = np.asarray(self.map['mean'][:, self.method_index[col]])
                else:  # wilcoxon
                    values = np.concatenate(self.values[:, self.method_index[col]])
            ave.add('ave', col, values)
        self.average = ave

    def add(self, benchmark, method, values):
        if values is not None:
            values = np.asarray(values)
            if values.ndim == 0:
                values = values.flatten()
        rid, cid = self._coordinates(benchmark, method)
        if self.map['values'][rid, cid] is None:
            self.map['values'][rid, cid] = values
        elif values is not None:
            self.map['values'][rid, cid] = np.concatenate([self.map['values'][rid, cid], values])
        self.touch()

    def get(self, benchmark, method, attr='mean'):
        self.update()
        assert attr in self.map, f'unknown attribute {attr}'
        rid, cid = self._coordinates(benchmark, method)
        if self.map['fill'][rid, cid]:
            v = self.map[attr][rid, cid]
            if v is None or (isinstance(v, float) and np.isnan(v)):
                return self.missing
            return v
        else:
            return self.missing

    def _coordinates(self, benchmark, method):
        assert benchmark in self.benchmark_index, f'benchmark {benchmark} out of range'
|
||||
assert method in self.method_index, f'method {method} out of range'
|
||||
rid = self.benchmark_index[benchmark]
|
||||
cid = self.method_index[method]
|
||||
return rid, cid
|
||||
|
||||
def get_average(self, method, attr='mean'):
|
||||
self.update()
|
||||
if self.add_average:
|
||||
return self.average.get('ave', method, attr=attr)
|
||||
return None
|
||||
|
||||
def get_color(self, benchmark, method):
|
||||
color = self.get(benchmark, method, attr='color')
|
||||
if color is None:
|
||||
return ''
|
||||
return color
|
||||
|
||||
def latexCell(self, benchmark, method):
|
||||
self.update()
|
||||
i, j = self._coordinates(benchmark, method)
|
||||
if self.map['fill'][i, j] == False:
|
||||
return self.missing_str
|
||||
|
||||
mean = self.map['mean'][i, j]
|
||||
l = f" {mean:.{self.prec_mean}f}"
|
||||
if self.clean_zero:
|
||||
l = l.replace(' 0.', '.')
|
||||
|
||||
isbest = self.map['rank'][i, j] == 1
|
||||
if isbest:
|
||||
l = "\\textbf{" + l.strip() + "}"
|
||||
|
||||
stat = ''
|
||||
if self.ttest is not None and self.some_similar[j]:
|
||||
test_label = self.map['ttest'][i, j]
|
||||
if test_label == 'Sim':
|
||||
stat = '^{\dag\phantom{\dag}}'
|
||||
elif test_label == 'Same':
|
||||
stat = '^{\ddag}'
|
||||
elif isbest or test_label == 'Diff':
|
||||
stat = '^{\phantom{\ddag}}'
|
||||
|
||||
std = ''
|
||||
if self.show_std:
|
||||
std = self.map['std'][i, j]
|
||||
std = f" {std:.{self.prec_std}f}"
|
||||
if self.clean_zero:
|
||||
std = std.replace(' 0.', '.')
|
||||
std = f" \pm {std:{self.prec_std}}"
|
||||
|
||||
if stat != '' or std != '':
|
||||
l = f'{l}${stat}{std}$'
|
||||
|
||||
if self.color:
|
||||
l += ' ' + self.map['color'][i, j]
|
||||
|
||||
return l
|
||||
|
||||
def latexTabular(self, benchmark_replace={}, method_replace={}, average=True):
|
||||
tab = ' & '
|
||||
tab += ' & '.join([method_replace.get(col, col) for col in self.methods])
|
||||
tab += ' \\\\\hline\n'
|
||||
for row in self.benchmarks:
|
||||
rowname = benchmark_replace.get(row, row)
|
||||
tab += rowname + ' & '
|
||||
tab += self.latexRow(row)
|
||||
|
||||
if average:
|
||||
tab += '\hline\n'
|
||||
tab += 'Average & '
|
||||
tab += self.latexAverage()
|
||||
return tab
|
||||
|
||||
def latexTabularT(self, benchmark_replace={}, method_replace={}, average=True, side=False):
|
||||
def withside(label):
|
||||
return '\side{'+label+'}' if side else label
|
||||
|
||||
tab = ' & '
|
||||
tab += ' & '.join([withside(benchmark_replace.get(col, col)) for col in self.benchmarks])
|
||||
if average:
|
||||
tab += ' & ' + withside('Ave')
|
||||
tab += ' \\\\\hline\n'
|
||||
for row in self.methods:
|
||||
rowname = method_replace.get(row, row)
|
||||
tab += rowname + ' & '
|
||||
tab += self.latexRowT(row, endl='')
|
||||
if average:
|
||||
tab += ' & '
|
||||
tab += self.average.latexCell('ave', row)
|
||||
tab += '\\\\\hline\n'
|
||||
return tab
|
||||
|
||||
def latexRow(self, benchmark, endl='\\\\\hline\n'):
|
||||
s = [self.latexCell(benchmark, col) for col in self.methods]
|
||||
s = ' & '.join(s)
|
||||
s += ' ' + endl
|
||||
return s
|
||||
|
||||
def latexRowT(self, method, endl='\\\\\hline\n'):
|
||||
s = [self.latexCell(benchmark, method) for benchmark in self.benchmarks]
|
||||
s = ' & '.join(s)
|
||||
s += ' ' + endl
|
||||
return s
|
||||
|
||||
def latexAverage(self, endl='\\\\\hline\n'):
|
||||
if self.add_average:
|
||||
return self.average.latexRow('ave', endl=endl)
|
||||
|
||||
def getRankTable(self):
|
||||
t = Table(benchmarks=self.benchmarks, methods=self.methods, prec_mean=0, average=True)
|
||||
for rid, cid in self._getfilled():
|
||||
row = self.benchmarks[rid]
|
||||
col = self.methods[cid]
|
||||
t.add(row, col, self.get(row, col, 'rank'))
|
||||
t.compute()
|
||||
return t
|
||||
|
||||
def dropMethods(self, methods):
|
||||
drop_index = [self.method_index[m] for m in methods]
|
||||
new_methods = np.delete(self.methods, drop_index)
|
||||
new_index = {col: j for j, col in enumerate(new_methods)}
|
||||
|
||||
self.map['values'] = self.values[:, np.asarray([self.method_index[m] for m in new_methods], dtype=int)]
|
||||
self.methods = new_methods
|
||||
self.method_index = new_index
|
||||
self.touch()
|
||||
|
||||
|
||||
def pval_interpretation(p_val):
|
||||
if 0.005 >= p_val:
|
||||
return 'Diff'
|
||||
elif 0.05 >= p_val > 0.005:
|
||||
return 'Sim'
|
||||
elif p_val > 0.05:
|
||||
return 'Same'
|
||||
|
||||
|
||||
def color_red2green_01(val, maxtone=50):
|
||||
if np.isnan(val): return None
|
||||
assert 0 <= val <= 1, f'val {val} out of range [0,1]'
|
||||
|
||||
# rescale to [-1,1]
|
||||
val = val * 2 - 1
|
||||
if val < 0:
|
||||
color = 'red'
|
||||
tone = maxtone * (-val)
|
||||
else:
|
||||
color = 'green'
|
||||
tone = maxtone * val
|
||||
return '\cellcolor{' + color + f'!{int(tone)}' + '}'
|
|
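A minimal usage sketch of the Table class above (the benchmark names, method names, and scores are fabricated for illustration): each cell accumulates one array of per-sample scores, and the ranking, significance marks, and LaTeX rendering are computed lazily on access.

import numpy as np

table = Table(benchmarks=['rcv1', 'ohsumed'], methods=['CC', 'PACC'],
              lower_is_better=True, significance_test='ttest', show_std=True)
rng = np.random.default_rng(0)
for b in ['rcv1', 'ohsumed']:
    for m in ['CC', 'PACC']:
        table.add(b, m, rng.random(10))     # 10 per-sample error values (made up)

print(table.latexTabular())                 # benchmarks as rows, methods as columns
print(table.getRankTable().latexTabular())  # same layout, with ranks instead of means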
@ -0,0 +1,145 @@
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
import numpy as np
from tqdm import tqdm
import torch
from scipy.sparse import vstack, issparse
from joblib import Parallel, delayed
import multiprocessing
import itertools


def index(data, vocab, known_words, analyzer, unk_index, out_of_vocabulary):
    """
    Indexes (i.e., replaces word strings with numerical indexes) a list of string documents
    :param data: list of string documents
    :param vocab: a fixed mapping [str]->[int] of words to indexes
    :param known_words: a set of known words (e.g., words that, despite not being included in the vocab, can be retained
    because they are anyway contained in a pre-trained embedding set that we know in advance)
    :param analyzer: the preprocessor in charge of transforming the document string into a chain of string words
    :param unk_index: the index of the 'unknown token', i.e., a symbol that characterizes all words that we cannot keep
    :param out_of_vocabulary: an incremental mapping [str]->[int] of words to indexes that will index all those words that
    are not in the original vocab but that are in the known_words
    :return: the indexed documents, as a list of lists of ints
    """
    indexes = []
    vocabsize = len(vocab)
    unk_count = 0
    knw_count = 0
    out_count = 0
    pbar = tqdm(data, desc='indexing documents')
    for text in pbar:
        words = analyzer(text)
        index = []
        for word in words:
            if word in vocab:
                idx = vocab[word]
            else:
                if word in known_words:
                    if word not in out_of_vocabulary:
                        out_of_vocabulary[word] = vocabsize + len(out_of_vocabulary)
                    idx = out_of_vocabulary[word]
                    out_count += 1
                else:
                    idx = unk_index
                    unk_count += 1
            index.append(idx)
        indexes.append(index)
        knw_count += len(index)
        pbar.set_description(f'[unk = {unk_count}/{knw_count}={(100.*unk_count/knw_count):.2f}%]'
                             f'[out = {out_count}/{knw_count}={(100.*out_count/knw_count):.2f}%]')
    return indexes


def define_pad_length(index_list):
    lengths = [len(index) for index in index_list]
    return int(np.mean(lengths) + np.std(lengths))


def pad(index_list, pad_index, max_pad_length=None):
    pad_length = np.max([len(index) for index in index_list])
    if max_pad_length is not None:
        pad_length = min(pad_length, max_pad_length)
    for i, indexes in enumerate(index_list):
        index_list[i] = [pad_index] * (pad_length - len(indexes)) + indexes[:pad_length]
    return index_list


def get_word_list(word2index1, word2index2=None):  # TODO: redo
    def extract_word_list(word2index):
        return [w for w, i in sorted(word2index.items(), key=lambda x: x[1])]

    word_list = extract_word_list(word2index1)
    if word2index2 is not None:
        word_list += extract_word_list(word2index2)
    return word_list


def batchify(index_list, labels, batchsize, pad_index, device, target_long=False, max_pad_length=500):
    nsamples = len(index_list)
    nbatches = nsamples // batchsize + 1 * (nsamples % batchsize > 0)
    for b in range(nbatches):
        batch = index_list[b*batchsize:(b+1)*batchsize]
        batch_labels = labels[b*batchsize:(b+1)*batchsize]
        if issparse(batch_labels):
            batch_labels = batch_labels.toarray()
        batch = pad(batch, pad_index=pad_index, max_pad_length=max_pad_length)
        batch = torch.LongTensor(batch)
        totype = torch.LongTensor if target_long else torch.FloatTensor
        target = totype(batch_labels)
        yield batch.to(device), target.to(device)


def batchify_unlabelled(index_list, batchsize, pad_index, device, max_pad_length=500):
    nsamples = len(index_list)
    nbatches = nsamples // batchsize + 1 * (nsamples % batchsize > 0)
    for b in range(nbatches):
        batch = index_list[b*batchsize:(b+1)*batchsize]
        batch = pad(batch, pad_index=pad_index, max_pad_length=max_pad_length)
        batch = torch.LongTensor(batch)
        yield batch.to(device)


def clip_gradient(model, clip_value=1e-1):
    params = list(filter(lambda p: p.grad is not None, model.parameters()))
    for p in params:
        p.grad.data.clamp_(-clip_value, clip_value)


def predict(logits, classification_type='singlelabel'):
    if classification_type == 'multilabel':
        prediction = torch.sigmoid(logits) > 0.5
    elif classification_type == 'singlelabel':
        prediction = torch.argmax(logits, dim=1).view(-1, 1)
    else:
        raise ValueError(f'unknown classification type {classification_type}')

    return prediction.detach().cpu().numpy()


def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)


def get_parallel_slices(n_tasks, n_jobs=-1):
    if n_jobs == -1:
        n_jobs = multiprocessing.cpu_count()
    batch = int(n_tasks / n_jobs)
    remainder = n_tasks % n_jobs
    return [slice(job*batch, (job+1)*batch + (remainder if job == n_jobs - 1 else 0)) for job in range(n_jobs)]


def tokenize_job(documents, tokenizer, max_tokens, job):
    return [tokenizer(d)[:max_tokens] for d in tqdm(documents, desc=f'tokenizing [job: {job}]')]


def tokenize_parallel(documents, tokenizer, max_tokens, n_jobs=-1):
    slices = get_parallel_slices(n_tasks=len(documents), n_jobs=n_jobs)
    tokens = Parallel(n_jobs=n_jobs)(
        delayed(tokenize_job)(
            documents[slice_i], tokenizer, max_tokens, job
        )
        for job, slice_i in enumerate(slices)
    )
    return list(itertools.chain.from_iterable(tokens))
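A quick sketch of how pad and batchify compose (the token ids and labels below are made up): each batch is left-padded to its longest document, capped at max_pad_length.

import numpy as np
import torch

docs = [[4, 8, 15], [16, 23], [42, 4, 8, 15, 16]]           # pre-indexed documents (fabricated ids)
labels = np.asarray([[1, 0], [0, 1], [1, 1]], dtype=float)  # multi-label targets

for X, y in batchify(docs, labels, batchsize=2, pad_index=0, device='cpu'):
    print(X.shape, y.shape)  # LongTensor batches with FloatTensor targets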
@ -0,0 +1,60 @@
import os
import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)


class CSVLog:

    def __init__(self, file, columns=None, autoflush=True, verbose=False, overwrite=False):
        self.file = file
        self.autoflush = autoflush
        self.verbose = verbose
        if os.path.exists(file) and not overwrite:
            self.tell('Loading existing file from {}'.format(file))
            self.df = pd.read_csv(file, sep='\t')
            self.columns = sorted(self.df.columns.values.tolist())
        else:
            self.tell('File {} does not exist or overwrite=True. Creating new frame.'.format(file))
            assert columns is not None, 'columns cannot be None'
            self.columns = sorted(columns)
            dir = os.path.dirname(self.file)
            if dir and not os.path.exists(dir): os.makedirs(dir)
            self.df = pd.DataFrame(columns=self.columns)
        self.defaults = {}

    def already_calculated(self, **kwargs):
        df = self.df
        if df.shape[0] == 0:
            return False
        if len(kwargs) == 0:
            kwargs = self.defaults
        for key, val in kwargs.items():
            df = df.loc[df[key] == val]
            if df.shape[0] == 0:
                return False
        return True

    def set_default(self, param, value):
        self.defaults[param] = value

    def add_row(self, **kwargs):
        for key in self.defaults.keys():
            if key not in kwargs:
                kwargs[key] = self.defaults[key]
        columns = sorted(list(kwargs.keys()))
        values = [kwargs[col_i] for col_i in columns]
        s = pd.Series(values, index=columns)
        # note: DataFrame.append was removed in pandas 2.0; with newer pandas use
        # self.df = pd.concat([self.df, s.to_frame().T], ignore_index=True)
        self.df = self.df.append(s, ignore_index=True)
        if self.autoflush: self.flush()
        self.tell(kwargs)

    def flush(self):
        self.df.to_csv(self.file, index=False, sep='\t')

    def tell(self, msg):
        if self.verbose: print(msg)
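A usage sketch of CSVLog (the file path, column names, and values are fabricated): defaults fill in recurring fields, and already_calculated allows resuming an interrupted experiment grid.

log = CSVLog('results/log.csv', columns=['dataset', 'method', 'macrof1'], verbose=True)
log.set_default('dataset', 'reuters21578')

if not log.already_calculated(dataset='reuters21578', method='MLSVC'):
    # ... run the experiment, then record its outcome
    log.add_row(method='MLSVC', macrof1=0.85)  # 'dataset' is taken from the default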
@ -0,0 +1,33 @@
from data.dataset import Dataset
from tqdm import tqdm
import os
import numpy as np


def write_data(documents, labels, fout):
    print(f'there are {len(documents)} documents')
    written, empty = 0, 0
    with open(fout, 'wt') as foo:
        for doc, label in tqdm(list(zip(documents, labels))):
            doc = doc.replace('\t', ' ').replace('\n', ' ').strip()
            label = np.squeeze(np.asarray(label.todense()))
            label = ' '.join([f'{x}' for x in label])
            if doc:
                foo.write(f'{label}\t{doc}\n')
                written += 1
            else:
                foo.write(f'{label}\tempty document\n')
                empty += 1
    print(f'written = {written}')
    print(f'empty = {empty}')


for dataset_name in ['reuters21578', 'ohsumed', 'jrcall', 'rcv1', 'wipo-sl-sc']:  # '20newsgroups'

    dataset = Dataset.load(dataset_name=dataset_name, pickle_path=f'../pickles/{dataset_name}.pickle').show()

    os.makedirs(f'../leam/{dataset_name}', exist_ok=True)
    write_data(dataset.devel_raw, dataset.devel_labelmatrix, f'../leam/{dataset_name}/train.csv')
    # write_data(dataset.test_raw, dataset.test_labelmatrix, f'../leam/{dataset_name}/test.csv')
    print('done')
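Each line written above has the form <space-separated 0/1 label vector>\t<document text>. A minimal sketch of parsing such a file back (the path is one of the outputs generated above):

with open('../leam/reuters21578/train.csv') as fin:
    for line in fin:
        labels_str, text = line.rstrip('\n').split('\t')
        label_vector = [int(float(x)) for x in labels_str.split()]
        # label_vector is the binary class-indicator row; text is the cleaned document
        break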
@ -0,0 +1,3 @@
# silence all warnings (e.g., sklearn convergence warnings) by monkey-patching warnings.warn
def warn(*args, **kwargs): pass
import warnings
warnings.warn = warn
@ -0,0 +1,54 @@
# adapted from https://github.com/Bjarten/early-stopping-pytorch/blob/master/pytorchtools.py
import torch
from time import time
from util.file import create_if_not_exist


class EarlyStopping:

    def __init__(self, model, patience=20, verbose=True, checkpoint='./checkpoint.pt'):
        # set patience to 0 or -1 to avoid stopping, while still keeping track of the best value and model parameters
        self.patience_limit = patience
        self.patience = patience
        self.verbose = verbose
        self.best_score = None
        self.best_epoch = None
        self.stop_time = None
        self.checkpoint = checkpoint
        self.model = model
        self.STOP = False

    def __call__(self, watch_score, epoch):
        if self.STOP:
            return  # done

        if self.best_score is None or watch_score >= self.best_score:
            self.best_score = watch_score
            self.best_epoch = epoch
            self.stop_time = time()
            if self.checkpoint:
                self.print(f'[early-stop] improved, saving model in {self.checkpoint}')
                torch.save(self.model, self.checkpoint)
            else:
                self.print('[early-stop] improved')
            self.patience = self.patience_limit
        else:
            self.patience -= 1
            if self.patience == 0:
                self.STOP = True
                self.print('[early-stop] patience exhausted')
            else:
                if self.patience > 0:  # if negative, then early-stop is ignored
                    self.print(f'[early-stop] patience={self.patience}')

    def reinit_counter(self):
        self.STOP = False
        self.patience = self.patience_limit

    def restore_checkpoint(self):
        return torch.load(self.checkpoint)

    def print(self, msg):
        if self.verbose:
            print(msg)
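A self-contained sketch of the stopping logic above (the validation scores are fabricated; note that watch_score is treated as higher-is-better, and each improvement saves a full model checkpoint):

import torch

model = torch.nn.Linear(4, 2)  # stand-in model
early_stop = EarlyStopping(model, patience=3, verbose=True, checkpoint='./checkpoint.pt')

for epoch, f1 in enumerate([.50, .61, .60, .59, .58], start=1):
    early_stop(f1, epoch)
    if early_stop.STOP:
        print(f'stopped at epoch {epoch}; best={early_stop.best_score} (epoch {early_stop.best_epoch})')
        break

model = early_stop.restore_checkpoint()  # reload the best-scoring snapshot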
@ -0,0 +1,38 @@
import urllib.request
from os import listdir, makedirs
from os.path import isdir, isfile, join, exists, dirname


def download_file(url, archive_filename):
    def progress(blocknum, bs, size):
        total_sz_mb = '%.2f MB' % (size / 1e6)
        current_sz_mb = '%.2f MB' % ((blocknum * bs) / 1e6)
        print('\rdownloaded %s / %s' % (current_sz_mb, total_sz_mb), end='')
    print("Downloading %s" % url)
    urllib.request.urlretrieve(url, filename=archive_filename, reporthook=progress)
    print("")


def download_file_if_not_exists(url, archive_path):
    if exists(archive_path): return
    create_if_not_exist(dirname(archive_path))
    download_file(url, archive_path)


def ls(dir, typecheck):
    el = [f for f in listdir(dir) if typecheck(join(dir, f))]
    el.sort()
    return el


def list_dirs(dir):
    return ls(dir, typecheck=isdir)


def list_files(dir):
    return ls(dir, typecheck=isfile)


def create_if_not_exist(path):
    if not exists(path): makedirs(path)
@ -0,0 +1,86 @@
import numpy as np
from scipy.sparse import lil_matrix, issparse
from sklearn.metrics import f1_score, accuracy_score


"""
Scikit-learn provides a full set of evaluation metrics, but they treat special cases differently.
I.e., when the number of true positives, false positives, and false negatives amounts to 0, all
affected metrics (precision, recall, and thus f1) output 0 in scikit-learn.
We adhere to the common practice of outputting 1 in this case, since the classifier has correctly
classified all examples as negatives.
"""


def evaluation(y_true, y_pred, classification_type):
    if classification_type == 'multilabel':
        eval_function = multilabel_eval
    elif classification_type == 'singlelabel':
        eval_function = singlelabel_eval
    else:
        raise ValueError(f'unknown classification type {classification_type}')

    Mf1, mf1, accuracy = eval_function(y_true, y_pred)

    return Mf1, mf1, accuracy


def multilabel_eval(y, y_):

    tp = y.multiply(y_)

    fn = lil_matrix(y.shape)
    true_ones = y == 1
    fn[true_ones] = 1 - tp[true_ones]

    fp = lil_matrix(y.shape)
    pred_ones = y_ == 1
    if pred_ones.nnz > 0:
        fp[pred_ones] = 1 - tp[pred_ones]

    # macro-f1
    tp_macro = np.asarray(tp.sum(axis=0), dtype=int).flatten()
    fn_macro = np.asarray(fn.sum(axis=0), dtype=int).flatten()
    fp_macro = np.asarray(fp.sum(axis=0), dtype=int).flatten()

    pos_pred = tp_macro + fp_macro
    pos_true = tp_macro + fn_macro
    prec = np.zeros(shape=tp_macro.shape, dtype=float)
    rec = np.zeros(shape=tp_macro.shape, dtype=float)
    np.divide(tp_macro, pos_pred, out=prec, where=pos_pred > 0)
    np.divide(tp_macro, pos_true, out=rec, where=pos_true > 0)
    den = prec + rec

    macrof1 = np.zeros(shape=tp_macro.shape, dtype=float)
    np.divide(np.multiply(prec, rec), den, out=macrof1, where=den > 0)
    macrof1 *= 2

    macrof1[(pos_pred == 0) * (pos_true == 0)] = 1
    macrof1 = np.mean(macrof1)

    # micro-f1
    tp_micro = tp_macro.sum()
    fn_micro = fn_macro.sum()
    fp_micro = fp_macro.sum()
    pos_pred = tp_micro + fp_micro
    pos_true = tp_micro + fn_micro
    prec = (tp_micro / pos_pred) if pos_pred > 0 else 0
    rec = (tp_micro / pos_true) if pos_true > 0 else 0
    den = prec + rec
    microf1 = 2 * prec * rec / den if den > 0 else 0
    if pos_pred == pos_true == 0:
        microf1 = 1

    # accuracy
    ndecisions = np.multiply(*y.shape)
    tn = ndecisions - (tp_micro + fn_micro + fp_micro)
    acc = (tp_micro + tn) / ndecisions

    return macrof1, microf1, acc


def singlelabel_eval(y, y_):
    if issparse(y_): y_ = y_.toarray().flatten()
    macrof1 = f1_score(y, y_, average='macro')
    microf1 = f1_score(y, y_, average='micro')
    acc = accuracy_score(y, y_)
    return macrof1, microf1, acc
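A small worked check of the convention stated above (the 3x2 label matrices are fabricated; class 2 has no true or predicted positives, so its F1 is 1 rather than scikit-learn's 0):

import numpy as np
from scipy.sparse import csr_matrix

y_true = csr_matrix(np.array([[1, 0], [1, 0], [0, 0]]))
y_pred = csr_matrix(np.array([[1, 0], [0, 0], [0, 0]]))

Mf1, mf1, acc = evaluation(y_true, y_pred, 'multilabel')
# class 1: prec=1, rec=1/2 -> F1=2/3; class 2: F1=1 by convention
print(Mf1, mf1, acc)  # ~0.833, ~0.667, ~0.833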
@ -0,0 +1,65 @@
from sklearn.svm import LinearSVC
from sklearn.model_selection import GridSearchCV
import numpy as np
from joblib import Parallel, delayed
from time import time


class MLSVC:
    """
    Multi-Label Support Vector Machine, with individual optimizations per binary problem.
    """

    def __init__(self, n_jobs=1, estimator=LinearSVC, *args, **kwargs):
        self.n_jobs = n_jobs
        self.args = args
        self.kwargs = kwargs
        self.verbose = False if 'verbose' not in self.kwargs else self.kwargs['verbose']
        self.estimator = estimator

    def fit(self, X, y, **grid_search_params):
        tini = time()
        assert len(y.shape) == 2 and set(np.unique(y).tolist()) == {0, 1}, 'data format is not multi-label'
        nD, nC = y.shape
        prevalence = np.sum(y, axis=0)
        self.svms = np.array([self.estimator(*self.args, **self.kwargs) for _ in range(nC)])
        if grid_search_params and grid_search_params['param_grid']:
            self._print('grid_search activated with: {}'.format(grid_search_params))
            # Grid search cannot be performed if the category prevalence is less than the parameter cv.
            # In those cases we place an svm instead of a gridsearchcv
            cv = 5 if 'cv' not in grid_search_params else grid_search_params['cv']
            assert isinstance(cv, int), 'cv must be an int (other policies are not supported yet)'
            self.svms = [GridSearchCV(svm_i, refit=True, **grid_search_params) if prevalence[i] >= cv else svm_i
                         for i, svm_i in enumerate(self.svms)]
        for i in np.argwhere(prevalence == 0).flatten():
            self.svms[i] = TrivialRejector()

        self.svms = Parallel(n_jobs=self.n_jobs)(
            delayed(svm.fit)(X, y[:, c]) for c, svm in enumerate(self.svms)
        )
        self.training_time = time() - tini

    def predict(self, X):
        return np.vstack(list(map(lambda svmi: svmi.predict(X), self.svms))).T

    def predict_proba(self, X):
        return np.vstack(list(map(lambda svmi: svmi.predict_proba(X)[:, np.argwhere(svmi.classes_ == 1)[0, 0]], self.svms))).T

    def _print(self, msg):
        if self.verbose > 0:
            print(msg)

    def best_params(self):
        return [svmi.best_params_ if isinstance(svmi, GridSearchCV) else None for svmi in self.svms]


class TrivialRejector:
    """Fallback for classes with no positive training examples: always predicts the negative class."""
    classes_ = np.array([0, 1])

    def fit(self, *args, **kwargs): return self

    def predict(self, X): return np.zeros(X.shape[0])

    def predict_proba(self, X):
        # P(negative)=1, P(positive)=0 for every instance
        proba = np.zeros((X.shape[0], 2))
        proba[:, 0] = 1.
        return proba
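A usage sketch of MLSVC on synthetic data (the dataset and grid are fabricated; the param_grid keys follow the wrapped LinearSVC estimator):

import numpy as np
from sklearn.datasets import make_multilabel_classification

X, y = make_multilabel_classification(n_samples=200, n_classes=5, random_state=0)

clf = MLSVC(n_jobs=-1)
clf.fit(X, y, param_grid={'C': [1, 10, 100]}, cv=3)
predictions = clf.predict(X)  # (n_samples, n_classes) binary matrix
print(clf.best_params())      # per-class best C (None where grid search was skipped)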
@ -176,104 +176,6 @@ class LabelledCollection:
         yield train, test
 
 
-class MultilingualLabelledCollection:
-    def __init__(self, langs:List[str], labelledCollections:List[LabelledCollection]):
-        assert len(langs) == len(labelledCollections), 'length mismatch for langs and labelledCollection lists'
-        assert all(isinstance(lc, LabelledCollection) for lc in labelledCollections), 'unexpected type for labelledCollections'
-        assert all(labelledCollections[0].classes_ == lc_i.classes_ for lc_i in labelledCollections[1:]), \
-            'inconsistent classes found for some labelled collections'
-        self.llc = {l: lc for l, lc in zip(langs, labelledCollections)}
-        self.classes_=labelledCollections[0].classes_
-
-    @classmethod
-    def fromLangDict(cls, lang_labelledCollection:dict):
-        return MultilingualLabelledCollection(*list(zip(*list(lang_labelledCollection.items()))))
-
-    def langs(self):
-        return list(sorted(self.llc.keys()))
-
-    def __getitem__(self, lang)->LabelledCollection:
-        return self.llc[lang]
-
-    @classmethod
-    def load(cls, path: str, loader_func: callable):
-        return MultilingualLabelledCollection(*loader_func(path))
-
-    def __len__(self):
-        return sum(map(len, self.llc.values()))
-
-    def prevalence(self):
-        prev = np.asarray([lc.prevalence() * len(lc) for lc in self.llc.values()]).sum(axis=0)
-        return prev / prev.sum()
-
-    def language_prevalence(self):
-        lang_count = np.asarray([len(self.llc[l]) for l in self.langs()])
-        return lang_count / lang_count.sum()
-
-    def counts(self):
-        return np.asarray([lc.counts() for lc in self.llc.values()]).sum(axis=0)
-
-    @property
-    def n_classes(self):
-        return len(self.classes_)
-
-    @property
-    def binary(self):
-        return self.n_classes == 2
-
-    def __check_langs(self, l_dict:dict):
-        assert len(l_dict)==len(self.langs()), 'wrong number of languages'
-        assert all(l in l_dict for l in self.langs()), 'missing languages in l_sizes'
-
-    def __check_sizes(self, l_sizes: Union[int,dict]):
-        assert isinstance(l_sizes, int) or isinstance(l_sizes, dict), 'unexpected type for l_sizes'
-        if isinstance(l_sizes, int):
-            return {l:l_sizes for l in self.langs()}
-        self.__check_langs(l_sizes)
-        return l_sizes
-
-    def sampling_index(self, l_sizes: Union[int,dict], *prevs, shuffle=True):
-        l_sizes = self.__check_sizes(l_sizes)
-        return {l:lc.sampling_index(l_sizes[l], *prevs, shuffle=shuffle) for l,lc in self.llc.items()}
-
-    def uniform_sampling_index(self, l_sizes: Union[int, dict]):
-        l_sizes = self.__check_sizes(l_sizes)
-        return {l: lc.uniform_sampling_index(l_sizes[l]) for l,lc in self.llc.items()}
-
-    def uniform_sampling(self, l_sizes: Union[int, dict]):
-        l_sizes = self.__check_sizes(l_sizes)
-        return MultilingualLabelledCollection.fromLangDict(
-            {l: lc.uniform_sampling(l_sizes[l]) for l,lc in self.llc.items()}
-        )
-
-    def sampling(self, l_sizes: Union[int, dict], *prevs, shuffle=True):
-        l_sizes = self.__check_sizes(l_sizes)
-        return MultilingualLabelledCollection.fromLangDict(
-            {l: lc.sampling(l_sizes[l], *prevs, shuffle=shuffle) for l,lc in self.llc.items()}
-        )
-
-    def sampling_from_index(self, l_index:dict):
-        self.__check_langs(l_index)
-        return MultilingualLabelledCollection.fromLangDict(
-            {l: lc.sampling_from_index(l_index[l]) for l,lc in self.llc.items()}
-        )
-
-    def split_stratified(self, train_prop=0.6, random_state=None):
-        train, test = list(zip(*[self[l].split_stratified(train_prop, random_state) for l in self.langs()]))
-        return MultilingualLabelledCollection(self.langs(), train), MultilingualLabelledCollection(self.langs(), test)
-
-    def asLabelledCollection(self, return_langs=False):
-        lXy_list = [([l]*len(lc),*lc.Xy) for l, lc in self.llc.items()]  # a list with (lang_i, Xi, yi)
-        ls,Xs,ys = list(zip(*lXy_list))
-        ls = np.concatenate(ls)
-        vertstack = vstack if issparse(Xs[0]) else np.vstack
-        Xs = vertstack(Xs)
-        ys = np.concatenate(ys)
-        lc = LabelledCollection(Xs, ys, classes_=self.classes_)
-        # return lc, ls if return_langs else lc
-#
-#
-#
 class Dataset:
 
     def __init__(self, training: LabelledCollection, test: LabelledCollection, vocabulary: dict = None, name=''):
@ -3,6 +3,13 @@ from scipy.sparse import dok_matrix
 from tqdm import tqdm
 
 
+def from_rcv2_lang_file(path, encoding='utf-8'):
+    lines = open(path, 'rt', encoding=encoding).readlines()
+    parts = [l.split('\t') for l in lines]
+    docs, cats = list(zip(*[(parts_i[1], parts_i[2]) for parts_i in parts]))
+    return docs, cats
+
+
 def from_text(path, encoding='utf-8'):
     """
     Reads a labelled collection of documents.
@ -105,7 +105,7 @@ def _predict_from_indexes(
         estim_prevalence = quantification_func(sample.instances)
         return true_prevalence, estim_prevalence
 
-    pbar = tqdm(indexes, desc='[artificial sampling protocol] generating predictions') if verbose else indexes
+    pbar = tqdm(indexes, desc='[sampling protocol] generating predictions') if verbose else indexes
     results = qp.util.parallel(_predict_prevalences, pbar, n_jobs=n_jobs)
 
     true_prevalences, estim_prevalences = zip(*results)
@ -37,6 +37,9 @@ class AggregativeQuantifier(BaseQuantifier):
     def learner(self, value):
         self.learner_ = value
 
+    def preclassify(self, instances):
+        return self.classify(instances)
+
     def classify(self, instances):
         return self.learner.predict(instances)
 
@ -74,6 +77,9 @@ class AggregativeProbabilisticQuantifier(AggregativeQuantifier):
     probabilities.
     """
 
+    def preclassify(self, instances):
+        return self.predict_proba(instances)
+
     def posterior_probabilities(self, instances):
         return self.learner.predict_proba(instances)
 
@ -316,6 +322,12 @@ class PACC(AggregativeProbabilisticQuantifier):
 
         self.pcc = PCC(self.learner)
 
+        self.Pte_cond_estim_ = self.getPteCondEstim(classes, y, y_)
+
+        return self
+
+    @classmethod
+    def getPteCondEstim(cls, classes, y, y_):
         # estimate the matrix with entry (i,j) being the estimate of P(yi|yj), that is, the probability that a
         # document that belongs to yj ends up being classified as belonging to yi
         n_classes = len(classes)
@ -323,9 +335,7 @@ class PACC(AggregativeProbabilisticQuantifier):
         for i, class_ in enumerate(classes):
             confusion[i] = y_[y == class_].mean(axis=0)
 
-        self.Pte_cond_estim_ = confusion.T
-
-        return self
+        return confusion.T
 
     def aggregate(self, classif_posteriors):
         prevs_estim = self.pcc.aggregate(classif_posteriors)
@ -785,7 +795,7 @@ class OneVsAll(AggregativeQuantifier):
         return self.binary_quantifier.get_params()
 
     def _delayed_binary_classification(self, c, X):
-        return self.dict_binary_quantifiers[c].classify(X)
+        return self.dict_binary_quantifiers[c].preclassify(X)
 
     def _delayed_binary_posteriors(self, c, X):
         return self.dict_binary_quantifiers[c].posterior_probabilities(X)
@ -27,7 +27,7 @@ class BaseQuantifier(metaclass=ABCMeta):
     # based on class structure
     @property
     def binary(self):
-        return False
+        return len(self.classes_)==2
 
     @property
     def aggregative(self):
@ -227,7 +227,7 @@ def _delayed_new_instance(args):
     if val_split is not None:
         if isinstance(val_split, float):
             assert 0 < val_split < 1, 'val_split should be in (0,1)'
-            data, val_split = data.split_stratified(train_prop=1 - val_split)
+            data, val_split = data.train_test_split(train_prop=1 - val_split)
 
     sample_index = data.sampling_index(sample_size, *prev)
     sample = data.sampling_from_index(sample_index)
@ -73,7 +73,7 @@ class QuaNetTrainer(BaseQuantifier):
 
         if fit_learner:
             classifier_data, unused_data = data.split_stratified(0.4)
-            train_data, valid_data = unused_data.split_stratified(0.66)  # 0.66 split of 60% makes 40% and 20%
+            train_data, valid_data = unused_data.train_test_split(0.66)  # 0.66 split of 60% makes 40% and 20%
             self.learner.fit(*classifier_data.Xy)
         else:
             classifier_data = None
@ -87,8 +87,9 @@ class QuaNetTrainer(BaseQuantifier):
         train_posteriors = self.learner.predict_proba(train_data.instances)
 
         # turn instances' original representations into embeddings
-        valid_data.instances = self.learner.transform(valid_data.instances)
-        train_data.instances = self.learner.transform(train_data.instances)
+        valid_data_embed = LabelledCollection(self.learner.transform(valid_data.instances), valid_data.labels, self._classes_)
+        train_data_embed = LabelledCollection(self.learner.transform(train_data.instances), train_data.labels, self._classes_)
 
         self.quantifiers = {
             'cc': CC(self.learner).fit(None, fit_learner=False),
@ -110,9 +111,9 @@ class QuaNetTrainer(BaseQuantifier):
         nQ = len(self.quantifiers)
         nC = data.n_classes
         self.quanet = QuaNetModule(
-            doc_embedding_size=train_data.instances.shape[1],
+            doc_embedding_size=train_data_embed.instances.shape[1],
             n_classes=data.n_classes,
-            stats_size=nQ*nC, #+ 2*nC*nC,
+            stats_size=nQ*nC,
             order_by=0 if data.binary else None,
             **self.quanet_params
         ).to(self.device)
@ -124,8 +125,8 @@ class QuaNetTrainer(BaseQuantifier):
         checkpoint = self.checkpoint
 
         for epoch_i in range(1, self.n_epochs):
-            self.epoch(train_data, train_posteriors, self.tr_iter, epoch_i, early_stop, train=True)
-            self.epoch(valid_data, valid_posteriors, self.va_iter, epoch_i, early_stop, train=False)
+            self.epoch(train_data_embed, train_posteriors, self.tr_iter, epoch_i, early_stop, train=True)
+            self.epoch(valid_data_embed, valid_posteriors, self.va_iter, epoch_i, early_stop, train=False)
 
             early_stop(self.status['va-loss'], epoch_i)
             if early_stop.IMPROVED:
@ -97,7 +97,7 @@ class GridSearchQ(BaseQuantifier):
             return training, validation
         elif isinstance(validation, float):
             assert 0. < validation < 1., 'validation proportion should be in (0,1)'
-            training, validation = training.split_stratified(train_prop=1 - validation)
+            training, validation = training.train_test_split(train_prop=1 - validation)
             return training, validation
         else:
             raise ValueError(f'"validation" must either be a LabelledCollection or a float in (0,1) indicating the'
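Several of the hunks above apply the same rename, split_stratified to train_test_split, at call sites that carve out a validation split. A minimal sketch of the renamed call, assuming `data` is a QuaPy LabelledCollection (the 0.7 proportion is made up):

# hold out 30% of a LabelledCollection for validation, via the renamed API
training, validation = data.train_test_split(train_prop=0.7)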