diff --git a/MultiLabel/data/__init__.py b/MultiLabel/data/__init__.py new file mode 100755 index 0000000..e69de29 diff --git a/MultiLabel/data/dataset.py b/MultiLabel/data/dataset.py new file mode 100755 index 0000000..98b43fc --- /dev/null +++ b/MultiLabel/data/dataset.py @@ -0,0 +1,229 @@ +import os,sys +from sklearn.datasets import get_data_home, fetch_20newsgroups +from sklearn.feature_extraction.text import TfidfVectorizer +from sklearn.preprocessing import MultiLabelBinarizer +from jrcacquis_reader import fetch_jrcacquis, JRCAcquis_Document +from ohsumed_reader import fetch_ohsumed50k +from reuters21578_reader import fetch_reuters21578 +from rcv_reader import fetch_RCV1 +from wipo_reader import fetch_WIPOgamma, WipoGammaDocument +import pickle +import numpy as np +from tqdm import tqdm +from os.path import join +import re + + +def init_vectorizer(): + return TfidfVectorizer(min_df=5, sublinear_tf=True) + + +class Dataset: + + dataset_available = {'reuters21578', '20newsgroups', 'ohsumed', 'rcv1', 'ohsumed', 'jrcall', + 'wipo-sl-mg','wipo-ml-mg','wipo-sl-sc','wipo-ml-sc'} + + def __init__(self, name): + assert name in Dataset.dataset_available, f'dataset {name} is not available' + if name=='reuters21578': + self._load_reuters() + elif name == '20newsgroups': + self._load_20news() + elif name == 'rcv1': + self._load_rcv1() + elif name == 'ohsumed': + self._load_ohsumed() + elif name == 'jrcall': + self._load_jrc(version='all') + elif name == 'wipo-sl-mg': + self._load_wipo('singlelabel', 'maingroup') + elif name == 'wipo-ml-mg': + self._load_wipo('multilabel', 'maingroup') + elif name == 'wipo-sl-sc': + self._load_wipo('singlelabel', 'subclass') + elif name == 'wipo-ml-sc': + self._load_wipo('multilabel', 'subclass') + + self.nC = self.devel_labelmatrix.shape[1] + self._vectorizer = init_vectorizer() + self._vectorizer.fit(self.devel_raw) + self.vocabulary = self._vectorizer.vocabulary_ + + def show(self): + nTr_docs = len(self.devel_raw) + nTe_docs = len(self.test_raw) + nfeats = len(self._vectorizer.vocabulary_) + nC = self.devel_labelmatrix.shape[1] + nD=nTr_docs+nTe_docs + print(f'{self.classification_type}, nD={nD}=({nTr_docs}+{nTe_docs}), nF={nfeats}, nC={nC}') + return self + + def _load_reuters(self): + data_path = os.path.join(get_data_home(), 'reuters21578') + devel = fetch_reuters21578(subset='train', data_path=data_path) + test = fetch_reuters21578(subset='test', data_path=data_path) + + self.classification_type = 'multilabel' + self.devel_raw, self.test_raw = mask_numbers(devel.data), mask_numbers(test.data) + self.devel_labelmatrix, self.test_labelmatrix = _label_matrix(devel.target, test.target) + self.devel_target, self.test_target = self.devel_labelmatrix, self.test_labelmatrix + + def _load_rcv1(self): + data_path = '../datasets/RCV1-v2/unprocessed_corpus' #TODO: check when missing + devel = fetch_RCV1(subset='train', data_path=data_path) + test = fetch_RCV1(subset='test', data_path=data_path) + + self.classification_type = 'multilabel' + self.devel_raw, self.test_raw = mask_numbers(devel.data), mask_numbers(test.data) + self.devel_labelmatrix, self.test_labelmatrix = _label_matrix(devel.target, test.target) + self.devel_target, self.test_target = self.devel_labelmatrix, self.test_labelmatrix + + def _load_jrc(self, version): + assert version in ['300','all'], 'allowed versions are "300" or "all"' + data_path = "../datasets/JRC_Acquis_v3" + tr_years=list(range(1986, 2006)) + te_years=[2006] + if version=='300': + training_docs, tr_cats = 
fetch_jrcacquis(data_path=data_path, years=tr_years, cat_threshold=1,most_frequent=300) + test_docs, te_cats = fetch_jrcacquis(data_path=data_path, years=te_years, cat_filter=tr_cats) + else: + training_docs, tr_cats = fetch_jrcacquis(data_path=data_path, years=tr_years, cat_threshold=1) + test_docs, te_cats = fetch_jrcacquis(data_path=data_path, years=te_years, cat_filter=tr_cats) + print(f'load jrc-acquis (English) with {len(tr_cats)} tr categories ({len(te_cats)} te categories)') + + devel_data = JRCAcquis_Document.get_text(training_docs) + test_data = JRCAcquis_Document.get_text(test_docs) + devel_target = JRCAcquis_Document.get_target(training_docs) + test_target = JRCAcquis_Document.get_target(test_docs) + + self.classification_type = 'multilabel' + self.devel_raw, self.test_raw = mask_numbers(devel_data), mask_numbers(test_data) + self.devel_labelmatrix, self.test_labelmatrix = _label_matrix(devel_target, test_target) + self.devel_target, self.test_target = self.devel_labelmatrix, self.test_labelmatrix + + def _load_ohsumed(self): + data_path = os.path.join(get_data_home(), 'ohsumed50k') + devel = fetch_ohsumed50k(subset='train', data_path=data_path) + test = fetch_ohsumed50k(subset='test', data_path=data_path) + + self.classification_type = 'multilabel' + self.devel_raw, self.test_raw = mask_numbers(devel.data), mask_numbers(test.data) + self.devel_labelmatrix, self.test_labelmatrix = _label_matrix(devel.target, test.target) + self.devel_target, self.test_target = self.devel_labelmatrix, self.test_labelmatrix + + def _load_20news(self): + metadata = ('headers', 'footers', 'quotes') + devel = fetch_20newsgroups(subset='train', remove=metadata) + test = fetch_20newsgroups(subset='test', remove=metadata) + self.classification_type = 'singlelabel' + self.devel_raw, self.test_raw = mask_numbers(devel.data), mask_numbers(test.data) + self.devel_target, self.test_target = devel.target, test.target + self.devel_labelmatrix, self.test_labelmatrix = _label_matrix(self.devel_target.reshape(-1,1), self.test_target.reshape(-1,1)) + + def _load_fasttext_data(self,name): + data_path='../datasets/fastText' + self.classification_type = 'singlelabel' + name=name.replace('-','_') + train_file = join(data_path,f'{name}.train') + assert os.path.exists(train_file), f'file {name} not found, please place the fasttext data in {data_path}' #' or specify the path' #todo + self.devel_raw, self.devel_target = load_fasttext_format(train_file) + self.test_raw, self.test_target = load_fasttext_format(join(data_path, f'{name}.test')) + self.devel_raw = mask_numbers(self.devel_raw) + self.test_raw = mask_numbers(self.test_raw) + self.devel_labelmatrix, self.test_labelmatrix = _label_matrix(self.devel_target.reshape(-1, 1), self.test_target.reshape(-1, 1)) + + def _load_wipo(self, classmode, classlevel): + assert classmode in {'singlelabel', 'multilabel'}, 'available class_mode are sl (single-label) or ml (multi-label)' + data_path = '../datasets/WIPO/wipo-gamma/en' + data_proc = '../datasets/WIPO-extracted' + + devel = fetch_WIPOgamma(subset='train', classification_level=classlevel, data_home=data_path, extracted_path=data_proc, text_fields=['abstract']) + test = fetch_WIPOgamma(subset='test', classification_level=classlevel, data_home=data_path, extracted_path=data_proc, text_fields=['abstract']) + + devel_data = [d.text for d in devel] + test_data = [d.text for d in test] + self.devel_raw, self.test_raw = mask_numbers(devel_data), mask_numbers(test_data) + + self.classification_type = classmode + if classmode== 
'multilabel': + devel_target = [d.all_labels for d in devel] + test_target = [d.all_labels for d in test] + self.devel_labelmatrix, self.test_labelmatrix = _label_matrix(devel_target, test_target) + self.devel_target, self.test_target = self.devel_labelmatrix, self.test_labelmatrix + else: + devel_target = [d.main_label for d in devel] + test_target = [d.main_label for d in test] + # only for labels with at least one training document + class_id = {labelname:index for index,labelname in enumerate(sorted(set(devel_target)))} + devel_target = np.array([class_id[id] for id in devel_target]).astype(int) + test_target = np.array([class_id.get(id,None) for id in test_target]) + if None in test_target: + print(f'deleting {(test_target==None).sum()} test documents without valid categories') + keep_pos = test_target!=None + self.test_raw = (np.asarray(self.test_raw)[keep_pos]).tolist() + test_target = test_target[keep_pos] + test_target=test_target.astype(int) + self.devel_target, self.test_target = devel_target, test_target + self.devel_labelmatrix, self.test_labelmatrix = _label_matrix(self.devel_target.reshape(-1, 1), self.test_target.reshape(-1, 1)) + + def vectorize(self): + if not hasattr(self, 'Xtr') or not hasattr(self, 'Xte'): + self.Xtr = self._vectorizer.transform(self.devel_raw) + self.Xte = self._vectorizer.transform(self.test_raw) + self.Xtr.sort_indices() + self.Xte.sort_indices() + return self.Xtr, self.Xte + + def analyzer(self): + return self._vectorizer.build_analyzer() + + @classmethod + def load(cls, dataset_name, pickle_path=None): + + if pickle_path: + if os.path.exists(pickle_path): + print(f'loading pickled dataset from {pickle_path}') + dataset = pickle.load(open(pickle_path, 'rb')) + else: + print(f'fetching dataset and dumping it into {pickle_path}') + dataset = Dataset(name=dataset_name) + print('vectorizing for faster processing') + dataset.vectorize() + print('dumping') + pickle.dump(dataset, open(pickle_path, 'wb', pickle.HIGHEST_PROTOCOL)) + else: + print(f'loading dataset {dataset_name}') + dataset = Dataset(name=dataset_name) + + print('[Done]') + return dataset + + +def _label_matrix(tr_target, te_target): + mlb = MultiLabelBinarizer(sparse_output=True) + ytr = mlb.fit_transform(tr_target) + yte = mlb.transform(te_target) + print(mlb.classes_) + return ytr, yte + + +def load_fasttext_format(path): + print(f'loading {path}') + labels,docs=[],[] + for line in tqdm(open(path, 'rt').readlines()): + space = line.strip().find(' ') + label = int(line[:space].replace('__label__',''))-1 + labels.append(label) + docs.append(line[space+1:]) + labels=np.asarray(labels,dtype=int) + return docs,labels + + +def mask_numbers(data, number_mask='numbermask'): + mask = re.compile(r'\b[0-9][0-9.,-]*\b') + masked = [] + for text in tqdm(data, desc='masking numbers'): + masked.append(mask.sub(number_mask, text)) + return masked + + diff --git a/MultiLabel/data/jrcacquis_reader.py b/MultiLabel/data/jrcacquis_reader.py new file mode 100755 index 0000000..28d753a --- /dev/null +++ b/MultiLabel/data/jrcacquis_reader.py @@ -0,0 +1,263 @@ +import os, sys +from os.path import join +import tarfile +import xml.etree.ElementTree as ET +from sklearn.datasets import get_data_home +import pickle +import rdflib +from rdflib.namespace import RDF, SKOS +from rdflib import URIRef +import zipfile +from collections import Counter +from tqdm import tqdm +from random import shuffle +from util.file import * + + +class JRCAcquis_Document: + def __init__(self, id, name, lang, year, head, body, categories): + 
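+        # container for one parsed JRC-Acquis document; the head (title), when present, is prepended to the body text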
self.id = id + self.parallel_id = name + self.lang = lang + self.year = year + self.text = body if not head else head + "\n" + body + self.categories = categories + + @classmethod + def get_text(cls, jrc_documents): + return [d.text for d in jrc_documents] + + @classmethod + def get_target(cls, jrc_documents): + return [d.categories for d in jrc_documents] + + +# this is a workaround... for some reason, acutes are codified in a non-standard manner in titles +# however, it seems that the title is often appearing as the first paragraph in the text/body (with +# standard codification), so it might be preferable not to read the header after all (as here by default) +def _proc_acute(text): + for ch in ['a','e','i','o','u']: + text = text.replace('%'+ch+'acute%',ch) + return text + +def parse_document(file, year, head=False): + root = ET.parse(file).getroot() + + doc_name = root.attrib['n'] # e.g., '22006A0211(01)' + doc_lang = root.attrib['lang'] # e.g., 'es' + doc_id = root.attrib['id'] # e.g., 'jrc22006A0211_01-es' + doc_categories = [cat.text for cat in root.findall('.//teiHeader/profileDesc/textClass/classCode[@scheme="eurovoc"]')] + doc_head = _proc_acute(root.find('.//text/body/head').text) if head else '' + doc_body = '\n'.join([p.text for p in root.findall('.//text/body/div[@type="body"]/p')]) + + def raise_if_empty(field, from_file): + if isinstance(field, str): + if not field.strip(): + raise ValueError("Empty field in file %s" % from_file) + + raise_if_empty(doc_name, file) + raise_if_empty(doc_lang, file) + raise_if_empty(doc_id, file) + if head: raise_if_empty(doc_head, file) + raise_if_empty(doc_body, file) + + return JRCAcquis_Document(id=doc_id, name=doc_name, lang=doc_lang, year=year, head=doc_head, body=doc_body, categories=doc_categories) + +#filters out documents which do not contain any category in the cat_filter list, and filter all labels not in cat_filter +def _filter_by_category(doclist, cat_filter): + if not isinstance(cat_filter, frozenset): + cat_filter = frozenset(cat_filter) + filtered = [] + for doc in doclist: + doc.categories = list(cat_filter & set(doc.categories)) + if doc.categories: + doc.categories.sort() + filtered.append(doc) + print("filtered %d documents out without categories in the filter list" % (len(doclist) - len(filtered))) + return filtered + +#filters out categories with less than cat_threshold documents (and filters documents containing those categories) +def _filter_by_frequency(doclist, cat_threshold): + cat_count = Counter() + for d in doclist: + cat_count.update(d.categories) + + freq_categories = [cat for cat,count in cat_count.items() if count>cat_threshold] + freq_categories.sort() + return _filter_by_category(doclist, freq_categories), freq_categories + +#select top most_frequent categories (and filters documents containing those categories) +def _most_common(doclist, most_frequent): + cat_count = Counter() + for d in doclist: + cat_count.update(d.categories) + + freq_categories = [cat for cat,count in cat_count.most_common(most_frequent)] + freq_categories.sort() + return _filter_by_category(doclist, freq_categories), freq_categories + +def _get_categories(request): + final_cats = set() + for d in request: + final_cats.update(d.categories) + return list(final_cats) + +def fetch_jrcacquis(lang='en', data_path=None, years=None, ignore_unclassified=True, + cat_filter=None, cat_threshold=0, most_frequent=-1, + DOWNLOAD_URL_BASE ='http://optima.jrc.it/Acquis/JRC-Acquis.3.0/corpus/'): + + if not data_path: + data_path = get_data_home() + + 
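+    # create the data directory if needed and, on first use, download and untar the language-specific JRC-Acquis archive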
if not os.path.exists(data_path): + os.mkdir(data_path) + + request = [] + total_read = 0 + file_name = 'jrc-' + lang + '.tgz' + archive_path = join(data_path, file_name) + + if not os.path.exists(archive_path): + print("downloading language-specific dataset (once and for all) into %s" % data_path) + DOWNLOAD_URL = join(DOWNLOAD_URL_BASE, file_name) + download_file(DOWNLOAD_URL, archive_path) + print("untarring dataset...") + tarfile.open(archive_path, 'r:gz').extractall(data_path) + + documents_dir = join(data_path, lang) + + print("Reading documents...") + read = 0 + for dir in list_dirs(documents_dir): + year = int(dir) + if years==None or year in years: + year_dir = join(documents_dir,dir) + l_y_documents = [] + all_documents = list_files(year_dir) + empty = 0 + pbar = tqdm(enumerate(all_documents)) + for i,doc_file in pbar: + try: + jrc_doc = parse_document(join(year_dir, doc_file), year) + except ValueError: + jrc_doc = None + + if jrc_doc and (not ignore_unclassified or jrc_doc.categories): + l_y_documents.append(jrc_doc) + else: empty += 1 + read+=1 + pbar.set_description(f'from {year_dir}: discarded {empty} without categories or empty fields') + request += l_y_documents + print("Read %d documents for language %s\n" % (read, lang)) + total_read += read + + final_cats = _get_categories(request) + + if cat_filter: + request = _filter_by_category(request, cat_filter) + final_cats = _get_categories(request) + if cat_threshold > 0: + request, final_cats = _filter_by_frequency(request, cat_threshold) + if most_frequent != -1 and len(final_cats) > most_frequent: + request, final_cats = _most_common(request, most_frequent) + + return request, final_cats + +def print_cat_analysis(request): + cat_count = Counter() + for d in request: + cat_count.update(d.categories) + print("Number of active categories: {}".format(len(cat_count))) + print(cat_count.most_common()) + +# inspects the Eurovoc thesaurus in order to select a subset of categories +# currently, only 'broadest' policy (i.e., take all categories with no parent category), and 'all' is implemented +def inspect_eurovoc(data_path, eurovoc_skos_core_concepts_filename='eurovoc_in_skos_core_concepts.rdf', + eurovoc_url="http://publications.europa.eu/mdr/resource/thesaurus/eurovoc-20160630-0/skos/eurovoc_in_skos_core_concepts.zip", + select="broadest"): + + fullpath_pickle = join(data_path, select+'_concepts.pickle') + if os.path.exists(fullpath_pickle): + print("Pickled object found in %s. Loading it." % fullpath_pickle) + return pickle.load(open(fullpath_pickle,'rb')) + + fullpath = join(data_path, eurovoc_skos_core_concepts_filename) + if not os.path.exists(fullpath): + print("Path %s does not exist. 
Trying to download the skos EuroVoc file from %s" % (data_path, eurovoc_url)) + download_file(eurovoc_url, fullpath) + print("Unzipping file...") + zipped = zipfile.ZipFile(data_path + '.zip', 'r') + zipped.extract("eurovoc_in_skos_core_concepts.rdf", data_path) + zipped.close() + + print("Parsing %s" %fullpath) + g = rdflib.Graph() + g.parse(location=fullpath, format="application/rdf+xml") + + if select == "all": + print("Selecting all concepts") + all_concepts = list(g.subjects(RDF.type, SKOS.Concept)) + all_concepts = [c.toPython().split('/')[-1] for c in all_concepts] + all_concepts.sort() + selected_concepts = all_concepts + elif select=="broadest": + print("Selecting broadest concepts (those without any other broader concept linked to it)") + all_concepts = set(g.subjects(RDF.type, SKOS.Concept)) + narrower_concepts = set(g.subjects(SKOS.broader, None)) + broadest_concepts = [c.toPython().split('/')[-1] for c in (all_concepts - narrower_concepts)] + broadest_concepts.sort() + selected_concepts = broadest_concepts + elif select=="leaves": + print("Selecting leaves concepts (those not linked as broader of any other concept)") + all_concepts = set(g.subjects(RDF.type, SKOS.Concept)) + broad_concepts = set(g.objects(None, SKOS.broader)) + leave_concepts = [c.toPython().split('/')[-1] for c in (all_concepts - broad_concepts)] + leave_concepts.sort() + selected_concepts = leave_concepts + else: + raise ValueError("Selection policy %s is not currently supported" % select) + + print("%d %s concepts found" % (len(selected_concepts), leave_concepts)) + print("Pickling concept list for faster further requests in %s" % fullpath_pickle) + pickle.dump(selected_concepts, open(fullpath_pickle, 'wb'), pickle.HIGHEST_PROTOCOL) + + return selected_concepts + + + +if __name__ == '__main__': + + # example code + + train_years = list(range(1986, 2006)) + test_years = [2006] + cat_policy = 'all' #'leaves' + most_common_cat = 300 + JRC_DATAPATH = "../datasets/JRC_Acquis_v3" + cat_list = inspect_eurovoc(JRC_DATAPATH, select=cat_policy) + + training_docs, tr_cats = fetch_jrcacquis(lang='en', data_path=JRC_DATAPATH, years=train_years, + cat_filter=None, cat_threshold=1, + most_frequent=most_common_cat) + test_docs, te_cats = fetch_jrcacquis(lang='en', data_path=JRC_DATAPATH, years=test_years, + cat_filter=tr_cats, cat_threshold=1) + # training_cats = jrc_get_categories(training_docs) + # test_cats = jrc_get_categories(test_docs) + # intersection_cats = [c for c in training_cats if c in test_cats] + + # training_docs = jrc_filter_by_category(training_docs, intersection_cats) + # test_docs = jrc_filter_by_category(test_docs, intersection_cats) + + + print(f'JRC-train: {len(training_docs)} documents') + print(f'JRC-test: {len(test_docs)} documents') + + print_cat_analysis(training_docs) + print_cat_analysis(test_docs) + + """ + JRC-train: 12615 documents, 300 cats + JRC-test: 7055 documents, 300 cats + """ + + diff --git a/MultiLabel/data/labeled.py b/MultiLabel/data/labeled.py new file mode 100755 index 0000000..a89b93d --- /dev/null +++ b/MultiLabel/data/labeled.py @@ -0,0 +1,5 @@ +class LabelledDocuments: + def __init__(self, data, target, target_names): + self.data=data + self.target=target + self.target_names=target_names \ No newline at end of file diff --git a/MultiLabel/data/ohsumed_reader.py b/MultiLabel/data/ohsumed_reader.py new file mode 100755 index 0000000..8547482 --- /dev/null +++ b/MultiLabel/data/ohsumed_reader.py @@ -0,0 +1,63 @@ +import os +import pickle +import tarfile +from os.path import 
join +import urllib.request +from data.labeled import LabelledDocuments +from util.file import create_if_not_exist, download_file_if_not_exists +import math + + +def fetch_ohsumed50k(data_path=None, subset='train', train_test_split=0.7): + _dataname = 'ohsumed50k' + if data_path is None: + data_path = join(os.path.expanduser('~'), _dataname) + create_if_not_exist(data_path) + + pickle_file = join(data_path, _dataname + '.' + subset + str(train_test_split) + '.pickle') + if not os.path.exists(pickle_file): + DOWNLOAD_URL = ('http://disi.unitn.it/moschitti/corpora/ohsumed-all-docs.tar.gz') + archive_path = os.path.join(data_path, 'ohsumed-all-docs.tar.gz') + download_file_if_not_exists(DOWNLOAD_URL, archive_path) + untardir = 'ohsumed-all' + if not os.path.exists(os.path.join(data_path, untardir)): + print("untarring ohsumed...") + tarfile.open(archive_path, 'r:gz').extractall(data_path) + + target_names = [] + doc_classes = dict() + class_docs = dict() + content = dict() + doc_ids = set() + for cat_id in os.listdir(join(data_path, untardir)): + target_names.append(cat_id) + class_docs[cat_id] = [] + for doc_id in os.listdir(join(data_path, untardir, cat_id)): + doc_ids.add(doc_id) + text_content = open(join(data_path, untardir, cat_id, doc_id), 'r').read() + if doc_id not in doc_classes: doc_classes[doc_id] = [] + doc_classes[doc_id].append(cat_id) + if doc_id not in content: content[doc_id] = text_content + class_docs[cat_id].append(doc_id) + target_names.sort() + print('Read %d different documents' % len(doc_ids)) + + splitdata = dict({'train': [], 'test': []}) + for cat_id in target_names: + free_docs = [d for d in class_docs[cat_id] if (d not in splitdata['train'] and d not in splitdata['test'])] + if len(free_docs) > 0: + split_point = int(math.floor(len(free_docs) * train_test_split)) + splitdata['train'].extend(free_docs[:split_point]) + splitdata['test'].extend(free_docs[split_point:]) + for split in ['train', 'test']: + dataset = LabelledDocuments([], [], target_names) + for doc_id in splitdata[split]: + dataset.data.append(content[doc_id]) + dataset.target.append([target_names.index(cat_id) for cat_id in doc_classes[doc_id]]) + pickle.dump(dataset, + open(join(data_path, _dataname + '.' 
+ split + str(train_test_split) + '.pickle'), 'wb'), + protocol=pickle.HIGHEST_PROTOCOL) + + print(pickle_file) + return pickle.load(open(pickle_file, 'rb')) + diff --git a/MultiLabel/data/rcv_reader.py b/MultiLabel/data/rcv_reader.py new file mode 100755 index 0000000..f19b981 --- /dev/null +++ b/MultiLabel/data/rcv_reader.py @@ -0,0 +1,152 @@ +from zipfile import ZipFile +import xml.etree.ElementTree as ET +from data.labeled import LabelledDocuments +from util.file import list_files +from os.path import join, exists +from util.file import download_file_if_not_exists +import re +from collections import Counter + +RCV1_TOPICHIER_URL = "http://www.ai.mit.edu/projects/jmlr/papers/volume5/lewis04a/a02-orig-topics-hierarchy/rcv1.topics.hier.orig" +RCV1_BASE_URL = "http://www.daviddlewis.com/resources/testcollections/rcv1/" + +rcv1_test_data_gz = ['lyrl2004_tokens_test_pt0.dat.gz', + 'lyrl2004_tokens_test_pt1.dat.gz', + 'lyrl2004_tokens_test_pt2.dat.gz', + 'lyrl2004_tokens_test_pt3.dat.gz'] + +rcv1_train_data_gz = ['lyrl2004_tokens_train.dat.gz'] + +rcv1_doc_cats_data_gz = 'rcv1-v2.topics.qrels.gz' + +class RCV_Document: + def __init__(self, id, text, categories, date=''): + self.id = id + self.date = date + self.text = text + self.categories = categories + +class IDRangeException(Exception): pass + +nwords = [] + +def parse_document(xml_content, valid_id_range=None): + root = ET.fromstring(xml_content) + + doc_id = root.attrib['itemid'] + if valid_id_range is not None: + if not valid_id_range[0] <= int(doc_id) <= valid_id_range[1]: + raise IDRangeException + + doc_categories = [cat.attrib['code'] for cat in + root.findall('.//metadata/codes[@class="bip:topics:1.0"]/code')] + + doc_date = root.attrib['date'] + doc_title = root.find('.//title').text + doc_headline = root.find('.//headline').text + doc_body = '\n'.join([p.text for p in root.findall('.//text/p')]) + + if not doc_body: + raise ValueError('Empty document') + + if doc_title is None: doc_title = '' + if doc_headline is None or doc_headline in doc_title: doc_headline = '' + text = '\n'.join([doc_title, doc_headline, doc_body]).strip() + + return RCV_Document(id=doc_id, text=text, categories=doc_categories, date=doc_date) + + +def fetch_RCV1(data_path, subset='all'): + + assert subset in ['train', 'test', 'all'], 'split should either be "train", "test", or "all"' + + request = [] + labels = set() + read_documents = 0 + + training_documents = 23149 + test_documents = 781265 + + if subset == 'all': + split_range = (2286, 810596) + expected = training_documents+test_documents + elif subset == 'train': + split_range = (2286, 26150) + expected = training_documents + else: + split_range = (26151, 810596) + expected = test_documents + + # global nwords + # nwords=[] + for part in list_files(data_path): + if not re.match('\d+\.zip', part): continue + target_file = join(data_path, part) + assert exists(target_file), \ + "You don't seem to have the file "+part+" in " + data_path + ", and the RCV1 corpus can not be downloaded"+\ + " w/o a formal permission. Please, refer to " + RCV1_BASE_URL + " for more information." 
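+        # scan every XML news item inside this zip part, keeping only documents whose itemid falls in the id range of
+        # the requested split; out-of-range ids and empty documents are silently skipped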
+ zipfile = ZipFile(target_file) + for xmlfile in zipfile.namelist(): + xmlcontent = zipfile.open(xmlfile).read() + try: + doc = parse_document(xmlcontent, valid_id_range=split_range) + labels.update(doc.categories) + request.append(doc) + read_documents += 1 + except (IDRangeException,ValueError) as e: + pass + print('\r[{}] read {} documents'.format(part, len(request)), end='') + if read_documents == expected: break + if read_documents == expected: break + + print() + # print('ave:{} std {} min {} max {}'.format(np.mean(nwords), np.std(nwords), np.min(nwords), np.max(nwords))) + + return LabelledDocuments(data=[d.text for d in request], target=[d.categories for d in request], target_names=list(labels)) + + + +def fetch_topic_hierarchy(path, topics='all'): + assert topics in ['all', 'leaves'] + + download_file_if_not_exists(RCV1_TOPICHIER_URL, path) + hierarchy = {} + for line in open(path, 'rt'): + parts = line.strip().split() + parent,child = parts[1],parts[3] + if parent not in hierarchy: + hierarchy[parent]=[] + hierarchy[parent].append(child) + + del hierarchy['None'] + del hierarchy['Root'] + print(hierarchy) + + if topics=='all': + topics = set(hierarchy.keys()) + for parent in hierarchy.keys(): + topics.update(hierarchy[parent]) + return list(topics) + elif topics=='leaves': + parents = set(hierarchy.keys()) + childs = set() + for parent in hierarchy.keys(): + childs.update(hierarchy[parent]) + return list(childs.difference(parents)) + + +if __name__=='__main__': + + # example + + RCV1_PATH = '../../datasets/RCV1-v2/unprocessed_corpus' + + rcv1_train = fetch_RCV1(RCV1_PATH, subset='train') + rcv1_test = fetch_RCV1(RCV1_PATH, subset='test') + + print('read {} documents in rcv1-train, and {} labels'.format(len(rcv1_train.data), len(rcv1_train.target_names))) + print('read {} documents in rcv1-test, and {} labels'.format(len(rcv1_test.data), len(rcv1_test.target_names))) + + cats = Counter() + for cats in rcv1_train.target: cats.update(cats) + print('RCV1', cats) diff --git a/MultiLabel/data/reuters21578_reader.py b/MultiLabel/data/reuters21578_reader.py new file mode 100755 index 0000000..1197965 --- /dev/null +++ b/MultiLabel/data/reuters21578_reader.py @@ -0,0 +1,189 @@ +# Modified version of the code originally implemented by Eustache Diemert +# @FedericoV +# with License: BSD 3 clause + +import os.path +import re +import tarfile +from sklearn.datasets import get_data_home +from six.moves import html_parser +from six.moves import urllib +import pickle +from glob import glob +import numpy as np +from data.labeled import LabelledDocuments + + +def _not_in_sphinx(): + # Hack to detect whether we are running by the sphinx builder + return '__file__' in globals() + + +class ReutersParser(html_parser.HTMLParser): + """Utility class to parse a SGML file and yield documents one at a time.""" + + def __init__(self, encoding='latin-1', data_path=None): + self.data_path = data_path + self.download_if_not_exist() + self.tr_docs = [] + self.te_docs = [] + html_parser.HTMLParser.__init__(self) + self._reset() + self.encoding = encoding + self.empty_docs = 0 + + def handle_starttag(self, tag, attrs): + method = 'start_' + tag + getattr(self, method, lambda x: None)(attrs) + + def handle_endtag(self, tag): + method = 'end_' + tag + getattr(self, method, lambda: None)() + + def _reset(self): + self.in_title = 0 + self.in_body = 0 + self.in_topics = 0 + self.in_topic_d = 0 + self.in_unproc_text = 0 + self.title = "" + self.body = "" + self.topics = [] + self.topic_d = "" + self.text = "" + + def 
parse(self, fd): + for chunk in fd: + self.feed(chunk.decode(self.encoding)) + self.close() + + def handle_data(self, data): + if self.in_body: + self.body += data + elif self.in_title: + self.title += data + elif self.in_topic_d: + self.topic_d += data + elif self.in_unproc_text: + self.text += data + + def start_reuters(self, attributes): + topic_attr = attributes[0][1] + lewissplit_attr = attributes[1][1] + self.lewissplit = u'unused' + if topic_attr==u'YES': + if lewissplit_attr == u'TRAIN': + self.lewissplit = 'train' + elif lewissplit_attr == u'TEST': + self.lewissplit = 'test' + pass + + def end_reuters(self): + self.body = re.sub(r'\s+', r' ', self.body) + if self.lewissplit != u'unused': + parsed_doc = {'title': self.title, 'body': self.body, 'unproc':self.text, 'topics': self.topics} + if (self.title+self.body+self.text).strip() == '': + self.empty_docs += 1 + if self.lewissplit == u'train': + self.tr_docs.append(parsed_doc) + elif self.lewissplit == u'test': + self.te_docs.append(parsed_doc) + self._reset() + + def start_title(self, attributes): + self.in_title = 1 + + def end_title(self): + self.in_title = 0 + + def start_body(self, attributes): + self.in_body = 1 + + def end_body(self): + self.in_body = 0 + + def start_topics(self, attributes): + self.in_topics = 1 + + def end_topics(self): + self.in_topics = 0 + + def start_text(self, attributes): + if len(attributes)>0 and attributes[0][1] == u'UNPROC': + self.in_unproc_text = 1 + + def end_text(self): + self.in_unproc_text = 0 + + def start_d(self, attributes): + self.in_topic_d = 1 + + def end_d(self): + if self.in_topics: + self.topics.append(self.topic_d) + self.in_topic_d = 0 + self.topic_d = "" + + def download_if_not_exist(self): + DOWNLOAD_URL = ('http://archive.ics.uci.edu/ml/machine-learning-databases/' + 'reuters21578-mld/reuters21578.tar.gz') + ARCHIVE_FILENAME = 'reuters21578.tar.gz' + + if self.data_path is None: + self.data_path = os.path.join(get_data_home(), "reuters") + if not os.path.exists(self.data_path): + """Download the dataset.""" + print("downloading dataset (once and for all) into %s" % self.data_path) + os.mkdir(self.data_path) + + def progress(blocknum, bs, size): + total_sz_mb = '%.2f MB' % (size / 1e6) + current_sz_mb = '%.2f MB' % ((blocknum * bs) / 1e6) + if _not_in_sphinx(): + print('\rdownloaded %s / %s' % (current_sz_mb, total_sz_mb), end='') + + archive_path = os.path.join(self.data_path, ARCHIVE_FILENAME) + urllib.request.urlretrieve(DOWNLOAD_URL, filename=archive_path, + reporthook=progress) + if _not_in_sphinx(): + print('\r', end='') + print("untarring Reuters dataset...") + tarfile.open(archive_path, 'r:gz').extractall(self.data_path) + print("done.") + + +def fetch_reuters21578(data_path=None, subset='train'): + if data_path is None: + data_path = os.path.join(get_data_home(), 'reuters21578') + reuters_pickle_path = os.path.join(data_path, "reuters." 
+ subset + ".pickle") + if not os.path.exists(reuters_pickle_path): + parser = ReutersParser(data_path=data_path) + for filename in glob(os.path.join(data_path, "*.sgm")): + parser.parse(open(filename, 'rb')) + # index category names with a unique numerical code (only considering categories with training examples) + tr_categories = np.unique(np.concatenate([doc['topics'] for doc in parser.tr_docs])).tolist() + + def pickle_documents(docs, subset): + for doc in docs: + doc['topics'] = [tr_categories.index(t) for t in doc['topics'] if t in tr_categories] + pickle_docs = {'categories': tr_categories, 'documents': docs} + pickle.dump(pickle_docs, open(os.path.join(data_path, "reuters." + subset + ".pickle"), 'wb'), + protocol=pickle.HIGHEST_PROTOCOL) + return pickle_docs + + pickle_tr = pickle_documents(parser.tr_docs, "train") + pickle_te = pickle_documents(parser.te_docs, "test") + # self.sout('Empty docs %d' % parser.empty_docs) + requested_subset = pickle_tr if subset == 'train' else pickle_te + else: + requested_subset = pickle.load(open(reuters_pickle_path, 'rb')) + + data = [(u'{title}\n{body}\n{unproc}'.format(**doc), doc['topics']) for doc in requested_subset['documents']] + text_data, topics = zip(*data) + return LabelledDocuments(data=text_data, target=topics, target_names=requested_subset['categories']) + + + +if __name__=='__main__': + reuters_train = fetch_reuters21578(subset='train') + print(reuters_train.data) \ No newline at end of file diff --git a/MultiLabel/data/tsr_function__.py b/MultiLabel/data/tsr_function__.py new file mode 100755 index 0000000..9f827ca --- /dev/null +++ b/MultiLabel/data/tsr_function__.py @@ -0,0 +1,280 @@ +import math +import numpy as np +from scipy.stats import t +from scipy.stats import norm +from joblib import Parallel, delayed +import time +from scipy.sparse import csr_matrix, csc_matrix + + +STWFUNCTIONS = ['dotn', 'ppmi', 'ig', 'chi2', 'cw', 'wp'] + + +def get_probs(tpr, fpr, pc): + # tpr = p(t|c) = p(tp)/p(c) = p(tp)/(p(tp)+p(fn)) + # fpr = p(t|_c) = p(fp)/p(_c) = p(fp)/(p(fp)+p(tn)) + pnc = 1.0 - pc + tp = tpr * pc + fn = pc - tp + fp = fpr * pnc + tn = pnc - fp + return ContTable(tp=tp, fn=fn, fp=fp, tn=tn) + + +def apply_tsr(tpr, fpr, pc, tsr): + cell = get_probs(tpr, fpr, pc) + return tsr(cell) + + +def positive_information_gain(cell): + if cell.tpr() < cell.fpr(): + return 0.0 + else: + return information_gain(cell) + + +def posneg_information_gain(cell): + ig = information_gain(cell) + if cell.tpr() < cell.fpr(): + return -ig + else: + return ig + + +def __ig_factor(p_tc, p_t, p_c): + den = p_t * p_c + if den != 0.0 and p_tc != 0: + return p_tc * math.log(p_tc / den, 2) + else: + return 0.0 + + +def information_gain(cell): + return __ig_factor(cell.p_tp(), cell.p_f(), cell.p_c()) + \ + __ig_factor(cell.p_fp(), cell.p_f(), cell.p_not_c()) +\ + __ig_factor(cell.p_fn(), cell.p_not_f(), cell.p_c()) + \ + __ig_factor(cell.p_tn(), cell.p_not_f(), cell.p_not_c()) + + +def information_gain_mod(cell): + return (__ig_factor(cell.p_tp(), cell.p_f(), cell.p_c()) + __ig_factor(cell.p_tn(), cell.p_not_f(), cell.p_not_c())) \ + - (__ig_factor(cell.p_fp(), cell.p_f(), cell.p_not_c()) + __ig_factor(cell.p_fn(), cell.p_not_f(), cell.p_c())) + + +def pointwise_mutual_information(cell): + return __ig_factor(cell.p_tp(), cell.p_f(), cell.p_c()) + + +def gain_ratio(cell): + pc = cell.p_c() + pnc = 1.0 - pc + norm = pc * math.log(pc, 2) + pnc * math.log(pnc, 2) + return information_gain(cell) / (-norm) + + +def chi_square(cell): + den = cell.p_f() * 
cell.p_not_f() * cell.p_c() * cell.p_not_c() + if den==0.0: return 0.0 + num = gss(cell)**2 + return num / den + + +def relevance_frequency(cell): + a = cell.tp + c = cell.fp + if c == 0: c = 1 + return math.log(2.0 + (a * 1.0 / c), 2) + + +def idf(cell): + if cell.p_f()>0: + return math.log(1.0 / cell.p_f()) + return 0.0 + + +def gss(cell): + return cell.p_tp()*cell.p_tn() - cell.p_fp()*cell.p_fn() + + +def conf_interval(xt, n): + if n>30: + z2 = 3.84145882069 # norm.ppf(0.5+0.95/2.0)**2 + else: + z2 = t.ppf(0.5 + 0.95 / 2.0, df=max(n-1,1)) ** 2 + p = (xt + 0.5 * z2) / (n + z2) + amplitude = 0.5 * z2 * math.sqrt((p * (1.0 - p)) / (n + z2)) + return p, amplitude + + +def strength(minPosRelFreq, minPos, maxNeg): + if minPos > maxNeg: + return math.log(2.0 * minPosRelFreq, 2.0) + else: + return 0.0 + + +#set cancel_features=True to allow some features to be weighted as 0 (as in the original article) +#however, for some extremely imbalanced dataset caused all documents to be 0 +def conf_weight(cell, cancel_features=False): + c = cell.get_c() + not_c = cell.get_not_c() + tp = cell.tp + fp = cell.fp + + pos_p, pos_amp = conf_interval(tp, c) + neg_p, neg_amp = conf_interval(fp, not_c) + + min_pos = pos_p-pos_amp + max_neg = neg_p+neg_amp + den = (min_pos + max_neg) + minpos_relfreq = min_pos / (den if den != 0 else 1) + + str_tplus = strength(minpos_relfreq, min_pos, max_neg); + + if str_tplus == 0 and not cancel_features: + return 1e-20 + + return str_tplus; + + +def word_prob(cell): + return cell.tpr() + + +class ContTable: + + def __init__(self, tp=0, tn=0, fp=0, fn=0): + self.tp=tp + self.tn=tn + self.fp=fp + self.fn=fn + + def get_d(self): return self.tp + self.tn + self.fp + self.fn + + def get_c(self): return self.tp + self.fn + + def get_not_c(self): return self.tn + self.fp + + def get_f(self): return self.tp + self.fp + + def get_not_f(self): return self.tn + self.fn + + def p_c(self): return (1.0*self.get_c())/self.get_d() + + def p_not_c(self): return 1.0-self.p_c() + + def p_f(self): return (1.0*self.get_f())/self.get_d() + + def p_not_f(self): return 1.0-self.p_f() + + def p_tp(self): return (1.0*self.tp) / self.get_d() + + def p_tn(self): return (1.0*self.tn) / self.get_d() + + def p_fp(self): return (1.0*self.fp) / self.get_d() + + def p_fn(self): return (1.0*self.fn) / self.get_d() + + def tpr(self): + c = 1.0*self.get_c() + return self.tp / c if c > 0.0 else 0.0 + + def fpr(self): + _c = 1.0*self.get_not_c() + return self.fp / _c if _c > 0.0 else 0.0 + + +def round_robin_selection(X, Y, k, tsr_function=positive_information_gain): + print(f'[selectiong {k} terms]') + nC = Y.shape[1] + FC = get_tsr_matrix(get_supervised_matrix(X, Y), tsr_function).T + best_features_idx = np.argsort(-FC, axis=0).flatten() + tsr_values = FC.flatten() + selected_indexes_set = set() + selected_indexes = list() + selected_value = list() + from_category = list() + round_robin = iter(best_features_idx) + values_iter = iter(tsr_values) + round=0 + while len(selected_indexes) < k: + term_idx = next(round_robin) + term_val = next(values_iter) + if term_idx not in selected_indexes_set: + selected_indexes_set.add(term_idx) + selected_indexes.append(term_idx) + selected_value.append(term_val) + from_category.append(round) + round = (round + 1) % nC + return np.asarray(selected_indexes, dtype=int), np.asarray(selected_value, dtype=float), np.asarray(from_category) + + +def feature_label_contingency_table(positive_document_indexes, feature_document_indexes, nD): + tp_ = len(positive_document_indexes & 
feature_document_indexes) + fp_ = len(feature_document_indexes - positive_document_indexes) + fn_ = len(positive_document_indexes - feature_document_indexes) + tn_ = nD - (tp_ + fp_ + fn_) + return ContTable(tp=tp_, tn=tn_, fp=fp_, fn=fn_) + + +def category_tables(feature_sets, category_sets, c, nD, nF): + return [feature_label_contingency_table(category_sets[c], feature_sets[f], nD) for f in range(nF)] + + +""" +Computes the nC x nF supervised matrix M where Mcf is the 4-cell contingency table for feature f and class c. +Efficiency O(nF x nC x log(S)) where S is the sparse factor +""" +def get_supervised_matrix(coocurrence_matrix, label_matrix, n_jobs=-1): + nD, nF = coocurrence_matrix.shape + nD2, nC = label_matrix.shape + + if nD != nD2: + raise ValueError('Number of rows in coocurrence matrix shape %s and label matrix shape %s is not consistent' % + (coocurrence_matrix.shape,label_matrix.shape)) + + def nonzero_set(matrix, col): + return set(matrix[:, col].nonzero()[0]) + + if isinstance(coocurrence_matrix, csr_matrix): + coocurrence_matrix = csc_matrix(coocurrence_matrix) + feature_sets = [nonzero_set(coocurrence_matrix, f) for f in range(nF)] + category_sets = [nonzero_set(label_matrix, c) for c in range(nC)] + cell_matrix = Parallel(n_jobs=n_jobs, backend="threading")(delayed(category_tables)(feature_sets, category_sets, c, nD, nF) for c in range(nC)) + return np.array(cell_matrix) + +# obtains the matrix T where Tcf=tsr(f,c) is the tsr score for category c and feature f +def get_tsr_matrix(cell_matrix, tsr_score_funtion): + nC,nF = cell_matrix.shape + tsr_matrix = [[tsr_score_funtion(cell_matrix[c,f]) for f in range(nF)] for c in range(nC)] + return np.array(tsr_matrix) + + +""" The Fisher-score [1] is not computed on the 4-cell contingency table, but can +take as input any real-valued feature column (e.g., tf-idf weights). +feat is the feature vector, and c is a binary classification vector. +This implementation covers only the binary case, while the formula is defined for multiclass +single-label scenarios, for which the version [2] might be preferred. +[1] R.O. Duda, P.E. Hart, and D.G. Stork. Pattern classification. Wiley-interscience, 2012. +[2] Gu, Q., Li, Z., & Han, J. (2012). Generalized fisher score for feature selection. arXiv preprint arXiv:1202.3725. 
+""" +def fisher_score_binary(feat, c): + neg = np.ones_like(c) - c + + npos = np.sum(c) + nneg = np.sum(neg) + + mupos = np.mean(feat[c == 1]) + muneg = np.mean(feat[neg == 1]) + mu = np.mean(feat) + + stdpos = np.std(feat[c == 1]) + stdneg = np.std(feat[neg == 1]) + + num = npos * ((mupos - mu) ** 2) + nneg * ((muneg - mu) ** 2) + den = npos * (stdpos ** 2) + nneg * (stdneg ** 2) + + if den>0: + return num / den + else: + return num diff --git a/MultiLabel/data/wipo_reader.py b/MultiLabel/data/wipo_reader.py new file mode 100755 index 0000000..2867da4 --- /dev/null +++ b/MultiLabel/data/wipo_reader.py @@ -0,0 +1,212 @@ +#https://www.wipo.int/classifications/ipc/en/ITsupport/Categorization/dataset/ +import os, sys +from os.path import exists, join +from util.file import * +from zipfile import ZipFile +import xml.etree.ElementTree as ET +from tqdm import tqdm +import numpy as np +import pickle +from joblib import Parallel, delayed + +WIPO_URL= 'https://www.wipo.int/classifications/ipc/en/ITsupport/Categorization/dataset/' + + +class WipoGammaDocument: + def __init__(self, id, text, main_label, all_labels): + self.id = id + self.text = text + self.main_label = main_label + self.all_labels = all_labels + + +def remove_nested_claimtext_tags(xmlcontent): + from_pos = xmlcontent.find(b'') + if from_pos > -1 and to_pos > -1: + in_between = xmlcontent[from_pos:to_pos].replace(b'',b'').replace(b'',b'') + xmlcontent = (xmlcontent[:from_pos]+in_between+xmlcontent[to_pos:]).strip() + return xmlcontent + + +def parse_document(xml_content, text_fields, limit_description): + root = ET.fromstring(remove_nested_claimtext_tags(xml_content)) + + doc_id = root.attrib['ucid'] + lang = root.attrib['lang'] + + #take categories from the categorization up the "sub-class" level + main_group = set(t.text[:6] for t in root.findall('.//bibliographic-data/technical-data/classifications-ipcr/classification-ipcr[@computed="from_ecla_to_ipc_SG"][@generated_main_IPC="true"]')) + sec_groups = set(t.text[:6] for t in root.findall('.//bibliographic-data/technical-data/classifications-ipcr/classification-ipcr[@computed="from_ecla_to_ipc_SG"][@generated_main_IPC="false"]')) + sec_groups.update(main_group) + + assert len(main_group) == 1, 'more than one main groups' + main_group = list(main_group)[0] + sec_groups = sorted(list(sec_groups)) + + assert lang == 'EN', f'only English documents allowed (doc {doc_id})' + + doc_text_fields=[] + if 'abstract' in text_fields: + abstract = '\n'.join(filter(None, [t.text for t in root.findall('.//abstract[@lang="EN"]/p')])) + doc_text_fields.append(abstract) + if 'description' in text_fields: + description = '\n'.join(filter(None, [t.text for t in root.findall('.//description[@lang="EN"]/p')])) + if limit_description>-1: + description=' '.join(description.split()[:limit_description]) + doc_text_fields.append(description) + if 'claims' in text_fields: + claims = '\n'.join(filter(None, [t.text for t in root.findall('.//claims[@lang="EN"]/claim')])) + doc_text_fields.append(claims) + + text = '\n'.join(doc_text_fields) + if text: + return WipoGammaDocument(doc_id, text, main_group, sec_groups) + else: + return None + + +def extract(fin, fout, text_fields, limit_description): + zipfile = ZipFile(fin) + ndocs=0 + with open(fout, 'wt') as out: + for xmlfile in tqdm(zipfile.namelist()): + if xmlfile.endswith('.xml'): + xmlcontent = zipfile.open(xmlfile).read() + document = parse_document(xmlcontent, text_fields, limit_description) + if document: + line_text = document.text.replace('\n', ' 
').replace('\t', ' ').strip() + assert line_text, f'empty document in {xmlfile}' + all_labels = ' '.join(document.all_labels) + out.write('\t'.join([document.id, document.main_label, all_labels, line_text])) + out.write('\n') + ndocs+=1 + out.flush() + + + +def read_classification_file(data_path, classification_level): + assert classification_level in ['subclass', 'maingroup'], 'wrong classification requested' + z = ZipFile(join(data_path,'EnglishWipoGamma1.zip')) + inpath='Wipo_Gamma/English/TrainTestSpits' + document_labels = dict() + train_ids, test_ids = set(), set() + labelcut = LabelCut(classification_level) + for subset in tqdm(['train', 'test'], desc='loading classification file'): + target_subset = train_ids if subset=='train' else test_ids + if classification_level == 'subclass': + file = f'{subset}set_en_sc.parts' #sub-class level + else: + file = f'{subset}set_en_mg.parts' #main-group level + + for line in z.open(f'{inpath}/{file}').readlines(): + line = line.decode().strip().split(',') + id = line[0] + id = id[id.rfind('/')+1:].replace('.xml','') + labels = labelcut.trim(line[1:]) + document_labels[id]=labels + target_subset.add(id) + + return document_labels, train_ids, test_ids + + +class LabelCut: + """ + Labels consists of 1 char for section, 2 chars for class, 1 class for subclass, 2 chars for maingroup and so on. + This class cuts the label at a desired level (4 for subclass, or 6 for maingroup) + """ + def __init__(self, classification_level): + assert classification_level in {'subclass','maingroup'}, 'unknown classification level' + if classification_level == 'subclass': self.cut = 4 + else: self.cut = 6 + + def trim(self, label): + if isinstance(label, list): + return sorted(set([l[:self.cut] for l in label])) + else: + return label[:self.cut] + + + +def fetch_WIPOgamma(subset, classification_level, data_home, extracted_path, text_fields = ['abstract', 'description'], limit_description=300): + """ + Fetchs the WIPO-gamma dataset + :param subset: 'train' or 'test' split + :param classification_level: the classification level, either 'subclass' or 'maingroup' + :param data_home: directory containing the original 11 English zips + :param extracted_path: directory used to extract and process the original files + :param text_fields: indicates the fields to extract, in 'abstract', 'description', 'claims' + :param limit_description: the maximum number of words to take from the description field (default 300); set to -1 for all + :return: + """ + assert subset in {"train", "test"}, 'unknown target request (valid ones are "train" or "test")' + assert len(text_fields)>0, 'at least some text field should be indicated' + if not exists(data_home): + raise ValueError(f'{data_home} does not exist, and the dataset cannot be automatically download, ' + f'since you need to request for permission. Please refer to {WIPO_URL}') + + create_if_not_exist(extracted_path) + config = f'{"-".join(text_fields)}' + if 'description' in text_fields: config+='-{limit_description}' + pickle_path=join(extracted_path, f'wipo-{subset}-{classification_level}-{config}.pickle') + if exists(pickle_path): + print(f'loading pickled file in {pickle_path}') + return pickle.load(open(pickle_path,'rb')) + + print('pickle file not found, processing...(this will take some minutes)') + extracted = sum([exists(f'{extracted_path}/EnglishWipoGamma{(i+1)}-{config}.txt') for i in range(11)])==11 + if not extracted: + print(f'extraction files not found, extracting files in {data_home}... 
(this will take some additional minutes)') + Parallel(n_jobs=-1)( + delayed(extract)( + join(data_home, file), join(extracted_path, file.replace('.zip', f'-{config}.txt')), text_fields, limit_description + ) + for file in list_files(data_home) + ) + doc_labels, train_ids, test_ids = read_classification_file(data_home, classification_level=classification_level) # or maingroup + print(f'{len(doc_labels)} documents classified split in {len(train_ids)} train and {len(test_ids)} test documents') + + train_request = [] + test_request = [] + pbar = tqdm([filename for filename in list_files(extracted_path) if filename.endswith(f'-{config}.txt')]) + labelcut = LabelCut(classification_level) + errors=0 + for proc_file in pbar: + pbar.set_description(f'processing {proc_file} [errors={errors}]') + if not proc_file.endswith(f'-{config}.txt'): continue + lines = open(f'{extracted_path}/{proc_file}', 'rt').readlines() + for lineno,line in enumerate(lines): + parts = line.split('\t') + assert len(parts)==4, f'wrong format in {extracted_path}/{proc_file} line {lineno}' + id,mainlabel,alllabels,text=parts + mainlabel = labelcut.trim(mainlabel) + alllabels = labelcut.trim(alllabels.split()) + + # assert id in train_ids or id in test_ids, f'id {id} out of scope' + if id not in train_ids and id not in test_ids: + errors+=1 + else: + # assert mainlabel == doc_labels[id][0], 'main label not consistent' + request = train_request if id in train_ids else test_request + request.append(WipoGammaDocument(id, text, mainlabel, alllabels)) + + print('pickling requests for faster subsequent runs') + pickle.dump(train_request, open(join(extracted_path,f'wipo-train-{classification_level}-{config}.pickle'), 'wb', pickle.HIGHEST_PROTOCOL)) + pickle.dump(test_request, open(join(extracted_path, f'wipo-test-{classification_level}-{config}.pickle'), 'wb', pickle.HIGHEST_PROTOCOL)) + + if subset== 'train': + return train_request + else: + return test_request + + +if __name__=='__main__': + data_home = '../../datasets/WIPO/wipo-gamma/en' + extracted_path = '../../datasets/WIPO-extracted' + + train = fetch_WIPOgamma(subset='train', classification_level='subclass', data_home=data_home, extracted_path=extracted_path, text_fields=('abstract')) + test = fetch_WIPOgamma(subset='test', classification_level='subclass', data_home=data_home, extracted_path=extracted_path, text_fields=('abstract')) + # train = fetch_WIPOgamma(subset='train', classification_level='maingroup', data_home=data_home, extracted_path=extracted_path) + # test = fetch_WIPOgamma(subset='test', classification_level='maingroup', data_home=data_home, extracted_path=extracted_path) + + print('Done') diff --git a/MultiLabel/multi_label.py b/MultiLabel/multi_label.py new file mode 100644 index 0000000..ae76814 --- /dev/null +++ b/MultiLabel/multi_label.py @@ -0,0 +1,334 @@ +from copy import deepcopy + +from sklearn.calibration import CalibratedClassifierCV +from sklearn.feature_extraction.text import TfidfVectorizer +from sklearn.kernel_ridge import KernelRidge +from sklearn.linear_model import LogisticRegression, Ridge, Lasso, LassoCV, MultiTaskLassoCV, LassoLars, LassoLarsCV, \ + ElasticNet, MultiTaskElasticNetCV, MultiTaskElasticNet, LinearRegression, ARDRegression, BayesianRidge, SGDRegressor +from sklearn.metrics import f1_score +from sklearn.multiclass import OneVsRestClassifier +from sklearn.multioutput import MultiOutputRegressor +from sklearn.svm import LinearSVC +from tqdm import tqdm + +import quapy as qp +from functional import artificial_prevalence_sampling 
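+# artificial_prevalence_sampling generates the prevalence grid used by the APP evaluation protocol further below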
+from method.aggregative import PACC, CC, EMQ, PCC, ACC, HDy +from method.base import BaseQuantifier +from quapy.data import from_rcv2_lang_file, LabelledCollection +from sklearn.model_selection import train_test_split +from sklearn.preprocessing import MultiLabelBinarizer, StandardScaler +import numpy as np +from data.dataset import Dataset + + + + +def cls(): + # return LinearSVC() + return LogisticRegression(max_iter=1000, solver='lbfgs', n_jobs=-1) + + +def calibratedCls(): + return CalibratedClassifierCV(cls()) + + +class MultilabelledCollection: + def __init__(self, instances, labels): + assert labels.ndim==2, 'data does not seem to be multilabel' + self.instances = instances + self.labels = labels + self.classes_ = np.arange(labels.shape[1]) + + @classmethod + def load(cls, path: str, loader_func: callable): + return MultilabelledCollection(*loader_func(path)) + + def __len__(self): + return self.instances.shape[0] + + def prevalence(self): + # return self.labels.mean(axis=0) + pos = self.labels.mean(axis=0) + neg = 1-pos + return np.asarray([neg, pos]).T + + def counts(self): + return self.labels.sum(axis=0) + + @property + def n_classes(self): + return len(self.classes_) + + @property + def binary(self): + return False + + def __gen_index(self): + return np.arange(len(self)) + + def sampling_multi_index(self, size, cat, prev=None): + if prev is None: # no prevalence was indicated; returns an index for uniform sampling + return np.random.choice(len(self), size, replace=size>len(self)) + aux = LabelledCollection(self.__gen_index(), self.labels[:,cat]) + return aux.sampling_index(size, *[1-prev, prev]) + + def uniform_sampling_multi_index(self, size): + return np.random.choice(len(self), size, replace=size>len(self)) + + def uniform_sampling(self, size): + unif_index = self.uniform_sampling_multi_index(size) + return self.sampling_from_index(unif_index) + + def sampling(self, size, category, prev=None): + prev_index = self.sampling_multi_index(size, category, prev) + return self.sampling_from_index(prev_index) + + def sampling_from_index(self, index): + documents = self.instances[index] + labels = self.labels[index, :] + return MultilabelledCollection(documents, labels) + + def train_test_split(self, train_prop=0.6, random_state=None): + tr_docs, te_docs, tr_labels, te_labels = \ + train_test_split(self.instances, self.labels, train_size=train_prop, random_state=random_state) + return MultilabelledCollection(tr_docs, tr_labels), MultilabelledCollection(te_docs, te_labels) + + def artificial_sampling_generator(self, sample_size, category, n_prevalences=101, repeats=1): + dimensions = 2 + for prevs in artificial_prevalence_sampling(dimensions, n_prevalences, repeats).flatten(): + yield self.sampling(sample_size, category, prevs) + + def artificial_sampling_index_generator(self, sample_size, category, n_prevalences=101, repeats=1): + dimensions = 2 + for prevs in artificial_prevalence_sampling(dimensions, n_prevalences, repeats).flatten(): + yield self.sampling_multi_index(sample_size, category, prevs) + + def natural_sampling_generator(self, sample_size, repeats=100): + for _ in range(repeats): + yield self.uniform_sampling(sample_size) + + def natural_sampling_index_generator(self, sample_size, repeats=100): + for _ in range(repeats): + yield self.uniform_sampling_multi_index(sample_size) + + def asLabelledCollection(self, category): + return LabelledCollection(self.instances, self.labels[:,category]) + + def genLabelledCollections(self): + for c in self.classes_: + yield 
self.asLabelledCollection(c) + + @property + def Xy(self): + return self.instances, self.labels + + +class MultilabelClassifier: # aka Funnelling Monolingual + def __init__(self, base_estimator=LogisticRegression()): + if not hasattr(base_estimator, 'predict_proba'): + print('the estimator does not seem to be probabilistic: calibrating') + base_estimator = CalibratedClassifierCV(base_estimator) + self.base = deepcopy(OneVsRestClassifier(base_estimator)) + self.meta = deepcopy(OneVsRestClassifier(base_estimator)) + self.norm = StandardScaler() + + def fit(self, X, y): + assert y.ndim==2, 'the dataset does not seem to be multi-label' + self.base.fit(X, y) + P = self.base.predict_proba(X) + P = self.norm.fit_transform(P) + self.meta.fit(P, y) + return self + + def predict(self, X): + P = self.base.predict_proba(X) + P = self.norm.transform(P) + return self.meta.predict(P) + + def predict_proba(self, X): + P = self.base.predict_proba(X) + P = self.norm.transform(P) + return self.meta.predict_proba(P) + +class MLCC: + def __init__(self, mlcls:MultilabelClassifier): + self.mlcls = mlcls + + def fit(self, data:MultilabelledCollection): + self.mlcls.fit(*data.Xy) + + def quantify(self, instances): + pred = self.mlcls.predict(instances) + pos_prev = pred.mean(axis=0) + neg_prev = 1-pos_prev + return np.asarray([neg_prev, pos_prev]).T + + +class MLPCC: + def __init__(self, mlcls: MultilabelClassifier): + self.mlcls = mlcls + + def fit(self, data: MultilabelledCollection): + self.mlcls.fit(*data.Xy) + + def quantify(self, instances): + pred = self.mlcls.predict_proba(instances) + pos_prev = pred.mean(axis=0) + neg_prev = 1 - pos_prev + return np.asarray([neg_prev, pos_prev]).T + + +class MultilabelQuantifier: + def __init__(self, q:BaseQuantifier, n_jobs=-1): + self.q = q + self.estimators = None + self.n_jobs = n_jobs + + def fit(self, data:MultilabelledCollection): + self.classes_ = data.classes_ + + def cat_job(lc): + return deepcopy(self.q).fit(lc) + + self.estimators = qp.util.parallel(cat_job, data.genLabelledCollections(), n_jobs=self.n_jobs) + return self + + def quantify(self, instances): + pos_prevs = np.zeros(len(self.classes_), dtype=float) + for c in self.classes_: + pos_prevs[c] = self.estimators[c].quantify(instances)[1] + neg_prevs = 1-pos_prevs + return np.asarray([neg_prevs, pos_prevs]).T + + +class MultilabelRegressionQuantification: + def __init__(self, base_quantifier=CC(LinearSVC()), regression='ridge', n_samples=500, sample_size=500, norm=True, + means=True, stds=True): + assert regression in ['ridge'], 'unknown regression model' + self.estimator = MultilabelQuantifier(base_quantifier) + if regression == 'ridge': + self.reg = Ridge(normalize=norm) + # self.reg = MultiTaskLassoCV(normalize=norm) + # self.reg = KernelRidge(kernel='rbf') + # self.reg = LassoLarsCV(normalize=norm) + # self.reg = MultiTaskElasticNetCV(normalize=norm) <- bien + #self.reg = LinearRegression(normalize=norm) # <- bien + # self.reg = MultiOutputRegressor(ARDRegression(normalize=norm)) # <- bastante bien, incluso sin norm + # self.reg = MultiOutputRegressor(BayesianRidge(normalize=False)) # <- bastante bien, incluso sin norm + # self.reg = MultiOutputRegressor(SGDRegressor()) # lento, no va + self.regression = regression + self.n_samples = n_samples + self.sample_size = sample_size + # self.norm = StandardScaler() + self.means = means + self.stds = stds + + def fit(self, data:MultilabelledCollection): + self.classes_ = data.classes_ + tr, te = data.train_test_split() + self.estimator.fit(tr) + 
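+        # build the regression training set: each random sample drawn from the held-out split contributes one row of
+        # per-class prevalence estimates from the base quantifier (optionally extended with feature means/stds), with
+        # the true class prevalences of the sample as the regression target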
samples_mean = [] + samples_std = [] + Xs = [] + ys = [] + for sample in te.natural_sampling_generator(sample_size=self.sample_size, repeats=self.n_samples): + ys.append(sample.prevalence()[:,1]) + Xs.append(self.estimator.quantify(sample.instances)[:,1]) + if self.means: + samples_mean.append(sample.instances.mean(axis=0).getA().flatten()) + if self.stds: + samples_std.append(sample.instances.todense().std(axis=0).getA().flatten()) + Xs = np.asarray(Xs) + ys = np.asarray(ys) + if self.means: + samples_mean = np.asarray(samples_mean) + Xs = np.hstack([Xs, samples_mean]) + if self.stds: + samples_std = np.asarray(samples_std) + Xs = np.hstack([Xs, samples_std]) + # Xs = self.norm.fit_transform(Xs) + self.reg.fit(Xs, ys) + return self + + def quantify(self, instances): + Xs = self.estimator.quantify(instances)[:,1].reshape(1,-1) + if self.means: + sample_mean = instances.mean(axis=0).getA() + Xs = np.hstack([Xs, sample_mean]) + if self.stds: + sample_std = instances.todense().std(axis=0).getA() + Xs = np.hstack([Xs, sample_std]) + # Xs = self.norm.transform(Xs) + adjusted = self.reg.predict(Xs) + adjusted = np.clip(adjusted, 0, 1) + adjusted = adjusted.flatten() + neg_prevs = 1-adjusted + return np.asarray([neg_prevs, adjusted]).T + +sample_size = 250 +n_samples = 1000 + +def models(): + yield 'CC', MultilabelQuantifier(CC(cls())) + yield 'PCC', MultilabelQuantifier(PCC(cls())) + yield 'MLCC', MLCC(MultilabelClassifier(cls())) + yield 'MLPCC', MLPCC(MultilabelClassifier(cls())) + # yield 'PACC', MultilabelQuantifier(PACC(cls())) + # yield 'EMQ', MultilabelQuantifier(EMQ(calibratedCls())) + common={'sample_size':sample_size, 'n_samples': n_samples, 'norm': True} + # yield 'MRQ-CC', MultilabelRegressionQuantification(base_quantifier=CC(cls()), **common) + yield 'MRQ-PCC', MultilabelRegressionQuantification(base_quantifier=PCC(cls()), **common) + yield 'MRQ-PACC', MultilabelRegressionQuantification(base_quantifier=PACC(cls()), **common) + + +dataset = 'reuters21578' +data = Dataset.load(dataset, pickle_path=f'./pickles/{dataset}.pickle') + +Xtr, Xte = data.vectorize() +ytr = data.devel_labelmatrix.todense().getA() +yte = data.test_labelmatrix.todense().getA() + +most_populated = np.argsort(ytr.sum(axis=0))[-25:] +ytr = ytr[:, most_populated] +yte = yte[:, most_populated] + +train = MultilabelledCollection(Xtr, ytr) +test = MultilabelledCollection(Xte, yte) + +print(f'Train-prev: {train.prevalence()[:,1]}') +print(f'Test-prev: {test.prevalence()[:,1]}') +print(f'MLPE: {qp.error.mae(train.prevalence(), test.prevalence()):.5f}') + +# print('NPP:') +# test_indexes = list(test.natural_sampling_index_generator(sample_size=sample_size, repeats=100)) +# for model_name, model in models(): +# model.fit(train) +# errs = [] +# for index in test_indexes: +# sample = test.sampling_from_index(index) +# estim_prevs = model.quantify(sample.instances) +# true_prevs = sample.prevalence() +# errs.append(qp.error.mae(true_prevs, estim_prevs)) +# print(f'{model_name:10s}\tmae={np.mean(errs):.5f}') + +print('APP:') +test_indexes = [] +for cat in train.classes_: + test_indexes.append(list(test.artificial_sampling_index_generator(sample_size=sample_size, category=cat, n_prevalences=21, repeats=10))) + +for model_name, model in models(): + model.fit(train) + macro_errs = [] + for cat_indexes in test_indexes: + errs = [] + for index in cat_indexes: + sample = test.sampling_from_index(index) + estim_prevs = model.quantify(sample.instances) + true_prevs = sample.prevalence() + errs.append(qp.error.mae(true_prevs, 
estim_prevs)) + macro_errs.append(np.mean(errs)) + print(f'{model_name:10s}\tmae={np.mean(macro_errs):.5f}') + + + diff --git a/MultiLabel/util/__init__.py b/MultiLabel/util/__init__.py new file mode 100755 index 0000000..e69de29 diff --git a/MultiLabel/util/common.py b/MultiLabel/util/common.py new file mode 100755 index 0000000..285f46c --- /dev/null +++ b/MultiLabel/util/common.py @@ -0,0 +1,145 @@ +import warnings +warnings.filterwarnings("ignore", category=DeprecationWarning) +import numpy as np +from tqdm import tqdm +import torch +from scipy.sparse import vstack, issparse +from joblib import Parallel, delayed +import multiprocessing +import itertools + + +def index(data, vocab, known_words, analyzer, unk_index, out_of_vocabulary): + """ + Index (i.e., replaces word strings with numerical indexes) a list of string documents + :param data: list of string documents + :param vocab: a fixed mapping [str]->[int] of words to indexes + :param known_words: a set of known words (e.g., words that, despite not being included in the vocab, can be retained + because they are anyway contained in a pre-trained embedding set that we know in advance) + :param analyzer: the preprocessor in charge of transforming the document string into a chain of string words + :param unk_index: the index of the 'unknown token', i.e., a symbol that characterizes all words that we cannot keep + :param out_of_vocabulary: an incremental mapping [str]->[int] of words to indexes that will index all those words that + are not in the original vocab but that are in the known_words + :return: + """ + indexes=[] + vocabsize = len(vocab) + unk_count = 0 + knw_count = 0 + out_count = 0 + pbar = tqdm(data, desc=f'indexing documents') + for text in pbar: + words = analyzer(text) + index = [] + for word in words: + if word in vocab: + idx = vocab[word] + else: + if word in known_words: + if word not in out_of_vocabulary: + out_of_vocabulary[word] = vocabsize+len(out_of_vocabulary) + idx = out_of_vocabulary[word] + out_count += 1 + else: + idx = unk_index + unk_count += 1 + index.append(idx) + indexes.append(index) + knw_count += len(index) + pbar.set_description(f'[unk = {unk_count}/{knw_count}={(100.*unk_count/knw_count):.2f}%]' + f'[out = {out_count}/{knw_count}={(100.*out_count/knw_count):.2f}%]') + return indexes + + +def define_pad_length(index_list): + lengths = [len(index) for index in index_list] + return int(np.mean(lengths)+np.std(lengths)) + + +def pad(index_list, pad_index, max_pad_length=None): + pad_length = np.max([len(index) for index in index_list]) + if max_pad_length is not None: + pad_length = min(pad_length, max_pad_length) + for i,indexes in enumerate(index_list): + index_list[i] = [pad_index]*(pad_length-len(indexes)) + indexes[:pad_length] + return index_list + + +def get_word_list(word2index1, word2index2=None): #TODO: redo + def extract_word_list(word2index): + return [w for w,i in sorted(word2index.items(), key=lambda x: x[1])] + word_list = extract_word_list(word2index1) + if word2index2 is not None: + word_list += extract_word_list(word2index2) + return word_list + + +def batchify(index_list, labels, batchsize, pad_index, device, target_long=False, max_pad_length=500): + nsamples = len(index_list) + nbatches = nsamples // batchsize + 1*(nsamples%batchsize>0) + for b in range(nbatches): + batch = index_list[b*batchsize:(b+1)*batchsize] + batch_labels = labels[b*batchsize:(b+1)*batchsize] + if issparse(batch_labels): + batch_labels = batch_labels.toarray() + batch = pad(batch, pad_index=pad_index, 
max_pad_length=max_pad_length) + batch = torch.LongTensor(batch) + totype = torch.LongTensor if target_long else torch.FloatTensor + target = totype(batch_labels) + yield batch.to(device), target.to(device) + + +def batchify_unlabelled(index_list, batchsize, pad_index, device, max_pad_length=500): + nsamples = len(index_list) + nbatches = nsamples // batchsize + 1*(nsamples%batchsize>0) + for b in range(nbatches): + batch = index_list[b*batchsize:(b+1)*batchsize] + batch = pad(batch, pad_index=pad_index, max_pad_length=max_pad_length) + batch = torch.LongTensor(batch) + yield batch.to(device) + + +def clip_gradient(model, clip_value=1e-1): + params = list(filter(lambda p: p.grad is not None, model.parameters())) + for p in params: + p.grad.data.clamp_(-clip_value, clip_value) + + +def predict(logits, classification_type='singlelabel'): + if classification_type == 'multilabel': + prediction = torch.sigmoid(logits) > 0.5 + elif classification_type == 'singlelabel': + prediction = torch.argmax(logits, dim=1).view(-1, 1) + else: + print('unknown classification type') + + return prediction.detach().cpu().numpy() + + +def count_parameters(model): + return sum(p.numel() for p in model.parameters() if p.requires_grad) + + +def get_parallel_slices(n_tasks, n_jobs=-1): + if n_jobs==-1: + n_jobs = multiprocessing.cpu_count() + batch = int(n_tasks / n_jobs) + remainder = n_tasks % n_jobs + return [slice(job*batch, (job+1)*batch+ (remainder if job == n_jobs - 1 else 0)) for job in range(n_jobs)] + + +def tokenize_job(documents, tokenizer, max_tokens, job): + return [tokenizer(d)[:max_tokens] for d in tqdm(documents, desc=f'tokenizing [job: {job}]')] + + +def tokenize_parallel(documents, tokenizer, max_tokens, n_jobs=-1): + slices = get_parallel_slices(n_tasks=len(documents), n_jobs=n_jobs) + tokens = Parallel(n_jobs=n_jobs)( + delayed(tokenize_job)( + documents[slice_i], tokenizer, max_tokens, job + ) + for job, slice_i in enumerate(slices) + ) + return list(itertools.chain.from_iterable(tokens)) + + diff --git a/MultiLabel/util/csv_log.py b/MultiLabel/util/csv_log.py new file mode 100755 index 0000000..eea83f7 --- /dev/null +++ b/MultiLabel/util/csv_log.py @@ -0,0 +1,60 @@ +import os +import pandas as pd +pd.set_option('display.max_rows', 500) +pd.set_option('display.max_columns', 500) +pd.set_option('display.width', 1000) + + +class CSVLog: + + def __init__(self, file, columns=None, autoflush=True, verbose=False, overwrite=False): + self.file = file + self.autoflush = autoflush + self.verbose = verbose + if os.path.exists(file) and not overwrite: + self.tell('Loading existing file from {}'.format(file)) + self.df = pd.read_csv(file, sep='\t') + self.columns = sorted(self.df.columns.values.tolist()) + else: + self.tell('File {} does not exist or overwrite=True. 
Creating new frame.'.format(file)) + assert columns is not None, 'columns cannot be None' + self.columns = sorted(columns) + dir = os.path.dirname(self.file) + if dir and not os.path.exists(dir): os.makedirs(dir) + self.df = pd.DataFrame(columns=self.columns) + self.defaults = {} + + def already_calculated(self, **kwargs): + df = self.df + if df.shape[0] == 0: + return False + if len(kwargs) == 0: + kwargs = self.defaults + for key,val in kwargs.items(): + df = df.loc[df[key] == val] + if df.shape[0] == 0: + return False + return True + + def set_default(self, param, value): + self.defaults[param] = value + + def add_row(self, **kwargs): + for key in self.defaults.keys(): + if key not in kwargs: + kwargs[key]=self.defaults[key] + colums = sorted(list(kwargs.keys())) + values = [kwargs[col_i] for col_i in colums] + s = pd.Series(values, index=self.columns) + self.df = self.df.append(s, ignore_index=True) + if self.autoflush: self.flush() + self.tell(kwargs) + + def flush(self): + self.df.to_csv(self.file, index=False, sep='\t') + + def tell(self, msg): + if self.verbose: print(msg) + + + diff --git a/MultiLabel/util/dataset2leam_format.py b/MultiLabel/util/dataset2leam_format.py new file mode 100644 index 0000000..0679935 --- /dev/null +++ b/MultiLabel/util/dataset2leam_format.py @@ -0,0 +1,33 @@ +from data.dataset import Dataset +from tqdm import tqdm +import os +import numpy as np + + +def write_data(documents, labels, fout): + print(f'there are {len(documents)} documents') + written, empty = 0, 0 + with open(fout, 'wt') as foo: + for doc, label in tqdm(list(zip(documents, labels))): + doc = doc.replace('\t', ' ').replace('\n', ' ').strip() + label = np.squeeze(np.asarray(label.todense())) + label = ' '.join([f'{x}' for x in label]) + if doc: + foo.write(f'{label}\t{doc}\n') + written += 1 + else: + foo.write(f'{label}\tempty document\n') + empty += 1 + print(f'written = {written}') + print(f'empty = {empty}') + + +for dataset_name in ['reuters21578', 'ohsumed', 'jrcall', 'rcv1', 'wipo-sl-sc']: #'20newsgroups' + + dataset = Dataset.load(dataset_name=dataset_name, pickle_path=f'../pickles/{dataset_name}.pickle').show() + + os.makedirs(f'../leam/{dataset_name}', exist_ok=True) + write_data(dataset.devel_raw, dataset.devel_labelmatrix, f'../leam/{dataset_name}/train.csv') + #write_data(dataset.test_raw, dataset.test_labelmatrix, f'../leam/{dataset_name}/test.csv') + print('done') + diff --git a/MultiLabel/util/disable_sklearn_warnings.py b/MultiLabel/util/disable_sklearn_warnings.py new file mode 100755 index 0000000..e669983 --- /dev/null +++ b/MultiLabel/util/disable_sklearn_warnings.py @@ -0,0 +1,3 @@ +def warn(*args, **kwargs): pass +import warnings +warnings.warn = warn diff --git a/MultiLabel/util/early_stop.py b/MultiLabel/util/early_stop.py new file mode 100755 index 0000000..29c7991 --- /dev/null +++ b/MultiLabel/util/early_stop.py @@ -0,0 +1,54 @@ +#adapted from https://github.com/Bjarten/early-stopping-pytorch/blob/master/pytorchtools.py +import torch +from time import time +from util.file import create_if_not_exist + + +class EarlyStopping: + + def __init__(self, model, patience=20, verbose=True, checkpoint='./checkpoint.pt'): + # set patience to 0 or -1 to avoid stopping, but still keeping track of the best value and model parameters + self.patience_limit = patience + self.patience = patience + self.verbose = verbose + self.best_score = None + self.best_epoch = None + self.stop_time = None + self.checkpoint = checkpoint + self.model = model + self.STOP = False + + def 
__call__(self, watch_score, epoch): + + if self.STOP: + return #done + + if self.best_score is None or watch_score >= self.best_score: + self.best_score = watch_score + self.best_epoch = epoch + self.stop_time = time() + if self.checkpoint: + self.print(f'[early-stop] improved, saving model in {self.checkpoint}') + torch.save(self.model, self.checkpoint) + else: + self.print(f'[early-stop] improved') + self.patience = self.patience_limit + else: + self.patience -= 1 + if self.patience == 0: + self.STOP = True + self.print(f'[early-stop] patience exhausted') + else: + if self.patience>0: # if negative, then early-stop is ignored + self.print(f'[early-stop] patience={self.patience}') + + def reinit_counter(self): + self.STOP = False + self.patience=self.patience_limit + + def restore_checkpoint(self): + return torch.load(self.checkpoint) + + def print(self, msg): + if self.verbose: + print(msg) diff --git a/MultiLabel/util/file.py b/MultiLabel/util/file.py new file mode 100755 index 0000000..0b7e669 --- /dev/null +++ b/MultiLabel/util/file.py @@ -0,0 +1,38 @@ +import urllib.request +from os import listdir, makedirs +from os.path import isdir, isfile, join, exists, dirname + + +def download_file(url, archive_filename): + def progress(blocknum, bs, size): + total_sz_mb = '%.2f MB' % (size / 1e6) + current_sz_mb = '%.2f MB' % ((blocknum * bs) / 1e6) + print('\rdownloaded %s / %s' % (current_sz_mb, total_sz_mb), end='') + print("Downloading %s" % url) + urllib.request.urlretrieve(url, filename=archive_filename, reporthook=progress) + print("") + + +def download_file_if_not_exists(url, archive_path): + if exists(archive_path): return + create_if_not_exist(dirname(archive_path)) + download_file(url,archive_path) + + +def ls(dir, typecheck): + el = [f for f in listdir(dir) if typecheck(join(dir, f))] + el.sort() + return el + + +def list_dirs(dir): + return ls(dir, typecheck=isdir) + + +def list_files(dir): + return ls(dir, typecheck=isfile) + + +def create_if_not_exist(path): + if not exists(path): makedirs(path) + diff --git a/MultiLabel/util/metrics.py b/MultiLabel/util/metrics.py new file mode 100755 index 0000000..0e3dda5 --- /dev/null +++ b/MultiLabel/util/metrics.py @@ -0,0 +1,86 @@ +import numpy as np +from scipy.sparse import lil_matrix, issparse +from sklearn.metrics import f1_score, accuracy_score + + +""" +scikit-learn provides a full set of evaluation metrics, but they treat special cases differently. +I.e., when the number of true positives, false positives, and false negatives amounts to 0, all +affected metrics (precision, recall, and thus f1) output 0 in scikit-learn. +We adhere to the common practice of outputting 1 in this case since the classifier has correctly +classified all examples as negatives. 
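+For example, a category with no positive documents in the gold standard for which the classifier also predicts no positives gets an F1 of 1 (rather than 0) in the macro average; likewise, micro-F1 is set to 1 when there are no positive decisions and no positive documents at all.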
+""" + +def evaluation(y_true, y_pred, classification_type): + + if classification_type == 'multilabel': + eval_function = multilabel_eval + elif classification_type == 'singlelabel': + eval_function = singlelabel_eval + + Mf1, mf1, accuracy = eval_function(y_true, y_pred) + + return Mf1, mf1, accuracy + + +def multilabel_eval(y, y_): + + tp = y.multiply(y_) + + fn = lil_matrix(y.shape) + true_ones = y==1 + fn[true_ones]=1-tp[true_ones] + + fp = lil_matrix(y.shape) + pred_ones = y_==1 + if pred_ones.nnz>0: + fp[pred_ones]=1-tp[pred_ones] + + #macro-f1 + tp_macro = np.asarray(tp.sum(axis=0), dtype=int).flatten() + fn_macro = np.asarray(fn.sum(axis=0), dtype=int).flatten() + fp_macro = np.asarray(fp.sum(axis=0), dtype=int).flatten() + + pos_pred = tp_macro+fp_macro + pos_true = tp_macro+fn_macro + prec=np.zeros(shape=tp_macro.shape,dtype=float) + rec=np.zeros(shape=tp_macro.shape,dtype=float) + np.divide(tp_macro, pos_pred, out=prec, where=pos_pred>0) + np.divide(tp_macro, pos_true, out=rec, where=pos_true>0) + den=prec+rec + + macrof1=np.zeros(shape=tp_macro.shape,dtype=float) + np.divide(np.multiply(prec,rec),den,out=macrof1,where=den>0) + macrof1 *=2 + + macrof1[(pos_pred==0)*(pos_true==0)]=1 + macrof1 = np.mean(macrof1) + + #micro-f1 + tp_micro = tp_macro.sum() + fn_micro = fn_macro.sum() + fp_micro = fp_macro.sum() + pos_pred = tp_micro + fp_micro + pos_true = tp_micro + fn_micro + prec = (tp_micro / pos_pred) if pos_pred>0 else 0 + rec = (tp_micro / pos_true) if pos_true>0 else 0 + den = prec+rec + microf1 = 2*prec*rec/den if den>0 else 0 + if pos_pred==pos_true==0: + microf1=1 + + #accuracy + ndecisions = np.multiply(*y.shape) + tn = ndecisions - (tp_micro+fn_micro+fp_micro) + acc = (tp_micro+tn)/ndecisions + + return macrof1,microf1,acc + + +def singlelabel_eval(y, y_): + if issparse(y_): y_ = y_.toarray().flatten() + macrof1 = f1_score(y, y_, average='macro') + microf1 = f1_score(y, y_, average='micro') + acc = accuracy_score(y, y_) + return macrof1,microf1,acc + diff --git a/MultiLabel/util/multilabelsvm.py b/MultiLabel/util/multilabelsvm.py new file mode 100755 index 0000000..c95530a --- /dev/null +++ b/MultiLabel/util/multilabelsvm.py @@ -0,0 +1,65 @@ +from sklearn.svm import LinearSVC +from sklearn.model_selection import GridSearchCV +import numpy as np +from joblib import Parallel, delayed +from time import time + + +class MLSVC: + """ + Multi-Label Support Vector Machine, with individual optimizations per binary problem. + """ + + def __init__(self, n_jobs=1, estimator=LinearSVC, *args, **kwargs): + self.n_jobs = n_jobs + self.args = args + self.kwargs = kwargs + self.verbose = False if 'verbose' not in self.kwargs else self.kwargs['verbose'] + self.estimator = estimator + + + def fit(self, X, y, **grid_search_params): + tini = time() + assert len(y.shape)==2 and set(np.unique(y).tolist()) == {0,1}, 'data format is not multi-label' + nD,nC = y.shape + prevalence = np.sum(y, axis=0) + self.svms = np.array([self.estimator(*self.args, **self.kwargs) for _ in range(nC)]) + if grid_search_params and grid_search_params['param_grid']: + self._print('grid_search activated with: {}'.format(grid_search_params)) + # Grid search cannot be performed if the category prevalence is less than the parameter cv. 
+ # In those cases we place a svm instead of a gridsearchcv + cv = 5 if 'cv' not in grid_search_params else grid_search_params['cv'] + assert isinstance(cv, int), 'cv must be an int (other policies are not supported yet)' + self.svms = [GridSearchCV(svm_i, refit=True, **grid_search_params) if prevalence[i]>=cv else svm_i + for i,svm_i in enumerate(self.svms)] + for i in np.argwhere(prevalence==0).flatten(): + self.svms[i] = TrivialRejector() + + self.svms = Parallel(n_jobs=self.n_jobs)( + delayed(self.svms[c].fit)(X,y[:,c]) for c,svm in enumerate(self.svms) + ) + self.training_time = time() - tini + + + def predict(self, X): + return np.vstack(list(map(lambda svmi: svmi.predict(X), self.svms))).T + + + def predict_proba(self, X): + return np.vstack(map(lambda svmi: svmi.predict_proba(X)[:,np.argwhere(svmi.classes_==1)[0,0]], self.svms)).T + + + def _print(self, msg): + if self.verbose>0: + print(msg) + + + def best_params(self): + return [svmi.best_params_ if isinstance(svmi, GridSearchCV) else None for svmi in self.svms] + + +class TrivialRejector: + def fit(self,*args,**kwargs): return self + def predict(self, X): return np.zeros(X.shape[0]) + def predict_proba(self, X): return np.zeros(X.shape[0]) + diff --git a/multi_label.py b/multi_label.py deleted file mode 100644 index 28a5c38..0000000 --- a/multi_label.py +++ /dev/null @@ -1,224 +0,0 @@ -from copy import deepcopy - -from sklearn.feature_extraction.text import TfidfVectorizer -from sklearn.linear_model import LogisticRegression, Ridge -from sklearn.metrics import f1_score -from sklearn.multiclass import OneVsRestClassifier -from sklearn.svm import LinearSVC - -import quapy as qp -from functional import artificial_prevalence_sampling -from method.aggregative import PACC, CC, EMQ -from method.base import BaseQuantifier -from quapy.data import from_rcv2_lang_file, LabelledCollection -from sklearn.model_selection import train_test_split -from sklearn.preprocessing import MultiLabelBinarizer -import numpy as np - - -class MultilabelledCollection: - def __init__(self, instances, labels): - assert labels.ndim==2, 'data does not seem to be multilabel' - self.instances = instances - self.labels = labels - self.classes_ = np.arange(labels.shape[1]) - - @classmethod - def load(cls, path: str, loader_func: callable): - return MultilabelledCollection(*loader_func(path)) - - def __len__(self): - return self.instances.shape[0] - - def prevalence(self): - # return self.labels.mean(axis=0) - pos = self.labels.mean(axis=0) - neg = 1-pos - return np.asarray([neg, pos]).T - - def counts(self): - return self.labels.sum(axis=0) - - @property - def n_classes(self): - return len(self.classes_) - - @property - def binary(self): - return False - - def __gen_index(self): - return np.arange(len(self)) - - def sampling_multi_index(self, size, cat, prev=None): - if prev is None: # no prevalence was indicated; returns an index for uniform sampling - return np.random.choice(len(self), size, replace=size>len(self)) - aux = LabelledCollection(self.__gen_index(), self.instances[:,cat]) - return aux.sampling_index(size, *[1-prev, prev]) - - def uniform_sampling_multi_index(self, size): - return np.random.choice(len(self), size, replace=size>len(self)) - - def uniform_sampling(self, size): - unif_index = self.uniform_sampling_multi_index(size) - return self.sampling_from_index(unif_index) - - def sampling(self, size, category, prev=None): - prev_index = self.sampling_multi_index(size, category, prev) - return self.sampling_from_index(prev_index) - - def 
sampling_from_index(self, index): - documents = self.instances[index] - labels = self.labels[index, :] - return MultilabelledCollection(documents, labels) - - def train_test_split(self, train_prop=0.6, random_state=None): - tr_docs, te_docs, tr_labels, te_labels = \ - train_test_split(self.instances, self.labels, train_size=train_prop, random_state=random_state) - return MultilabelledCollection(tr_docs, tr_labels), MultilabelledCollection(te_docs, te_labels) - - def artificial_sampling_generator(self, sample_size, category, n_prevalences=101, repeats=1): - dimensions = 2 - for prevs in artificial_prevalence_sampling(dimensions, n_prevalences, repeats): - yield self.sampling(sample_size, category, prevs[1]) - - def artificial_sampling_index_generator(self, sample_size, category, n_prevalences=101, repeats=1): - dimensions = 2 - for prevs in artificial_prevalence_sampling(dimensions, n_prevalences, repeats): - yield self.sampling_multi_index(sample_size, category, prevs[1]) - - def natural_sampling_generator(self, sample_size, repeats=100): - for _ in range(repeats): - yield self.uniform_sampling(sample_size) - - def natural_sampling_index_generator(self, sample_size, repeats=100): - for _ in range(repeats): - yield self.uniform_sampling_multi_index(sample_size) - - def asLabelledCollection(self, category): - return LabelledCollection(self.instances, self.labels[:,category]) - - def genLabelledCollections(self): - for c in self.classes_: - yield self.asLabelledCollection(c) - - @property - def Xy(self): - return self.instances, self.labels - - -class MultilabelQuantifier: - def __init__(self, q:BaseQuantifier): - self.q = q - self.estimators = {} - - def fit(self, data:MultilabelledCollection): - self.classes_ = data.classes_ - for cat, lc in enumerate(data.genLabelledCollections()): - self.estimators[cat] = deepcopy(self.q).fit(lc) - return self - - def quantify(self, instances): - pos_prevs = np.zeros(len(self.classes_), dtype=float) - for c in self.classes_: - pos_prevs[c] = self.estimators[c].quantify(instances)[1] - neg_prevs = 1-pos_prevs - return np.asarray([neg_prevs, pos_prevs]).T - - -class MultilabelRegressionQuantification: - def __init__(self, base_quantifier=CC(LinearSVC()), regression='ridge', n_samples=500, sample_size=500): - self.estimator = MultilabelQuantifier(base_quantifier) - self.regression = regression - self.n_samples = n_samples - self.sample_size = sample_size - - def fit(self, data:MultilabelledCollection): - self.classes_ = data.classes_ - tr, te = data.train_test_split() - self.estimator.fit(tr) - Xs = [] - ys = [] - for sample in te.natural_sampling_generator(sample_size=self.sample_size, repeats=self.n_samples): - ys.append(sample.prevalence()[:,1]) - Xs.append(self.estimator.quantify(sample.instances)[:,1]) - Xs = np.asarray(Xs) - ys = np.asarray(ys) - print(f'Xs in {Xs.shape}') - print(f'ys in {ys.shape}') - self.reg = Ridge().fit(Xs, ys) #normalize? 
- return self - - def quantify(self, instances): - Xs = self.estimator.quantify(instances)[:,1].reshape(1,-1) - adjusted = self.reg.predict(Xs) - adjusted = np.clip(adjusted, 0, 1) - adjusted = adjusted.flatten() - neg_prevs = 1-adjusted - return np.asarray([neg_prevs, adjusted]).T - - - -# read documents -path = f'./crosslingual_data/rcv12/en.small.txt' -docs, cats = from_rcv2_lang_file(path) - -# split train-test -tr_docs, te_docs, tr_cats, te_cats = train_test_split(docs, cats, test_size=0.2, random_state=42) - -# generate Y matrices -mlb = MultiLabelBinarizer() -ytr = mlb.fit_transform([cats.split(' ') for cats in tr_cats]) -yte = mlb.transform([cats.split(' ') for cats in te_cats]) -# retain 10 most populated categories -most_populated = np.argsort(ytr.sum(axis=0))[-10:] -ytr = ytr[:,most_populated] -yte = yte[:,most_populated] - -tfidf = TfidfVectorizer(min_df=5) -Xtr = tfidf.fit_transform(tr_docs) -Xte = tfidf.transform(te_docs) - -train = MultilabelledCollection(Xtr, ytr) -test = MultilabelledCollection(Xte, yte) - -model = MultilabelQuantifier(PACC(LogisticRegression())) -model.fit(train) -estim_prevs = model.quantify(test.instances) -true_prevs = test.prevalence() -print('PACC:') -print(estim_prevs) -print(true_prevs) - - -model = MultilabelQuantifier(CC(LogisticRegression())) -model.fit(train) -estim_prevs = model.quantify(test.instances) -true_prevs = test.prevalence() -print('CC:') -print(estim_prevs) -print(true_prevs) - - -# model = MultilabelQuantifier(EMQ(LogisticRegression())) -# model.fit(train) -# estim_prevs = model.quantify(test.instances) -# true_prevs = test.prevalence() -# print('EMQ:') -# print(estim_prevs) -# print(true_prevs) - -model = MultilabelRegressionQuantification(sample_size=200, n_samples=500) -model.fit(train) -estim_prevs = model.quantify(test.instances) -true_prevs = test.prevalence() -print('MRQ:') -print(estim_prevs) -print(true_prevs) - -qp.environ['SAMPLE_SIZE']=100 -mae = qp.error.mae(true_prevs, estim_prevs) -print(mae) - - - diff --git a/quapy/data/reader.py b/quapy/data/reader.py index 5b4d115..4e44fbb 100644 --- a/quapy/data/reader.py +++ b/quapy/data/reader.py @@ -3,6 +3,13 @@ from scipy.sparse import dok_matrix from tqdm import tqdm +def from_rcv2_lang_file(path, encoding='utf-8'): + lines = open(path, 'rt', encoding=encoding).readlines() + parts = [l.split('\t') for l in lines] + docs, cats = list(zip(*[(parts_i[1], parts_i[2]) for parts_i in parts])) + return docs, cats + + def from_text(path, encoding='utf-8'): """ Reas a labelled colletion of documents. 
diff --git a/quapy/evaluation.py b/quapy/evaluation.py index ebdb537..8a68de4 100644 --- a/quapy/evaluation.py +++ b/quapy/evaluation.py @@ -105,7 +105,7 @@ def _predict_from_indexes( estim_prevalence = quantification_func(sample.instances) return true_prevalence, estim_prevalence - pbar = tqdm(indexes, desc='[artificial sampling protocol] generating predictions') if verbose else indexes + pbar = tqdm(indexes, desc='[sampling protocol] generating predictions') if verbose else indexes results = qp.util.parallel(_predict_prevalences, pbar, n_jobs=n_jobs) true_prevalences, estim_prevalences = zip(*results) diff --git a/quapy/method/meta.py b/quapy/method/meta.py index fc3efe3..e164f75 100644 --- a/quapy/method/meta.py +++ b/quapy/method/meta.py @@ -227,7 +227,7 @@ def _delayed_new_instance(args): if val_split is not None: if isinstance(val_split, float): assert 0 < val_split < 1, 'val_split should be in (0,1)' - data, val_split = data.split_stratified(train_prop=1 - val_split) + data, val_split = data.train_test_split(train_prop=1 - val_split) sample_index = data.sampling_index(sample_size, *prev) sample = data.sampling_from_index(sample_index) diff --git a/quapy/method/neural.py b/quapy/method/neural.py index 5b85291..c8884c6 100644 --- a/quapy/method/neural.py +++ b/quapy/method/neural.py @@ -73,7 +73,7 @@ class QuaNetTrainer(BaseQuantifier): if fit_learner: classifier_data, unused_data = data.split_stratified(0.4) - train_data, valid_data = unused_data.split_stratified(0.66) # 0.66 split of 60% makes 40% and 20% + train_data, valid_data = unused_data.train_test_split(0.66) # 0.66 split of 60% makes 40% and 20% self.learner.fit(*classifier_data.Xy) else: classifier_data = None diff --git a/quapy/model_selection.py b/quapy/model_selection.py index 1080db0..f05b249 100644 --- a/quapy/model_selection.py +++ b/quapy/model_selection.py @@ -97,7 +97,7 @@ class GridSearchQ(BaseQuantifier): return training, validation elif isinstance(validation, float): assert 0. < validation < 1., 'validation proportion should be in (0,1)' - training, validation = training.split_stratified(train_prop=1 - validation) + training, validation = training.train_test_split(train_prop=1 - validation) return training, validation else: raise ValueError(f'"validation" must either be a LabelledCollection or a float in (0,1) indicating the'