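"""
Reader for the JRC-Acquis v3 corpus.

Downloads the language-specific archive, parses the TEI-XML documents into
JRCAcquis_Document objects labelled with EuroVoc category codes, and provides helpers to
filter documents and categories by frequency. inspect_eurovoc selects subsets of EuroVoc
concepts ('all', 'broadest', or 'leaves') from the SKOS thesaurus.
"""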
import os, sys
from os.path import join
import tarfile
import xml.etree.ElementTree as ET
from sklearn.datasets import get_data_home
import pickle
import rdflib
from rdflib.namespace import RDF, SKOS
from rdflib import URIRef
import zipfile
from collections import Counter
from tqdm import tqdm
from random import shuffle
from util.file import *


class JRCAcquis_Document:
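    """
    A single JRC-Acquis document: id, parallel name, language, year, text (the body,
    optionally preceded by the head), and the list of EuroVoc category codes assigned to it.
    The get_text and get_target class methods turn a list of documents into parallel lists
    of texts and label sets.
    """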
    def __init__(self, id, name, lang, year, head, body, categories):
        self.id = id
        self.parallel_id = name
        self.lang = lang
        self.year = year
        self.text = body if not head else head + "\n" + body
        self.categories = categories

    @classmethod
    def get_text(cls, jrc_documents):
        return [d.text for d in jrc_documents]

    @classmethod
    def get_target(cls, jrc_documents):
        return [d.categories for d in jrc_documents]


# this is a workaround... for some reason, acutes are codified in a non-standard manner in titles;
# however, the title often appears as the first paragraph of the text/body (with standard
# codification), so it might be preferable not to read the header at all (as done here by default)
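# e.g., _proc_acute maps '%eacute%' back to 'e', so 'r%eacute%glement' becomes 'reglement'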
def _proc_acute(text):
    for ch in ['a', 'e', 'i', 'o', 'u']:
        text = text.replace('%' + ch + 'acute%', ch)
    return text


def parse_document(file, year, head=False):
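    """
    Parses a TEI-encoded JRC-Acquis XML file into a JRCAcquis_Document for the given year.
    The id, parallel name, language, EuroVoc category codes, and body text are read from the
    XML; the head is only read when head=True (see the note on acute encoding above).
    Raises ValueError if a mandatory field is empty.
    """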
    root = ET.parse(file).getroot()

    doc_name = root.attrib['n']  # e.g., '22006A0211(01)'
    doc_lang = root.attrib['lang']  # e.g., 'es'
    doc_id = root.attrib['id']  # e.g., 'jrc22006A0211_01-es'
    doc_categories = [cat.text for cat in root.findall('.//teiHeader/profileDesc/textClass/classCode[@scheme="eurovoc"]')]
    doc_head = _proc_acute(root.find('.//text/body/head').text) if head else ''
    doc_body = '\n'.join([p.text for p in root.findall('.//text/body/div[@type="body"]/p')])

    def raise_if_empty(field, from_file):
        if isinstance(field, str):
            if not field.strip():
                raise ValueError("Empty field in file %s" % from_file)

    raise_if_empty(doc_name, file)
    raise_if_empty(doc_lang, file)
    raise_if_empty(doc_id, file)
    if head: raise_if_empty(doc_head, file)
    raise_if_empty(doc_body, file)

    return JRCAcquis_Document(id=doc_id, name=doc_name, lang=doc_lang, year=year, head=doc_head, body=doc_body, categories=doc_categories)


# filters out documents that do not contain any category in the cat_filter list, and removes
# from the remaining documents all labels not in cat_filter
def _filter_by_category(doclist, cat_filter):
    if not isinstance(cat_filter, frozenset):
        cat_filter = frozenset(cat_filter)
    filtered = []
    for doc in doclist:
        doc.categories = list(cat_filter & set(doc.categories))
        if doc.categories:
            doc.categories.sort()
            filtered.append(doc)
    print("filtered out %d documents without categories in the filter list" % (len(doclist) - len(filtered)))
    return filtered
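# e.g., _filter_by_category(docs, ['1309', '2771']) keeps only the documents labelled with at
# least one of those (purely illustrative) EuroVoc codes, restricting their label sets to that pair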


# keeps only the categories appearing in more than cat_threshold documents (documents left
# without any remaining category are discarded)
def _filter_by_frequency(doclist, cat_threshold):
    cat_count = Counter()
    for d in doclist:
        cat_count.update(d.categories)

    freq_categories = [cat for cat, count in cat_count.items() if count > cat_threshold]
    freq_categories.sort()
    return _filter_by_category(doclist, freq_categories), freq_categories


# keeps only the most_frequent most common categories (documents left without any remaining
# category are discarded)
def _most_common(doclist, most_frequent):
    cat_count = Counter()
    for d in doclist:
        cat_count.update(d.categories)

    freq_categories = [cat for cat, count in cat_count.most_common(most_frequent)]
    freq_categories.sort()
    return _filter_by_category(doclist, freq_categories), freq_categories


def _get_categories(request):
    final_cats = set()
    for d in request:
        final_cats.update(d.categories)
    return list(final_cats)


def fetch_jrcacquis(lang='en', data_path=None, years=None, ignore_unclassified=True,
                    cat_filter=None, cat_threshold=0, most_frequent=-1,
                    DOWNLOAD_URL_BASE='http://optima.jrc.it/Acquis/JRC-Acquis.3.0/corpus/'):
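    """
    Fetches the JRC-Acquis v3 documents for the given language, downloading and untarring the
    archive into data_path (sklearn's data home by default) the first time it is invoked.
    Returns the tuple (documents, categories). Optionally restricts the documents to the given
    years, ignores unclassified documents, keeps only the labels in cat_filter, drops
    categories appearing in at most cat_threshold documents, and/or keeps only the
    most_frequent most common categories.
    """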

    if not data_path:
        data_path = get_data_home()

    if not os.path.exists(data_path):
        os.mkdir(data_path)

    request = []
    total_read = 0
    file_name = 'jrc-' + lang + '.tgz'
    archive_path = join(data_path, file_name)

    if not os.path.exists(archive_path):
        print("downloading language-specific dataset (once and for all) into %s" % data_path)
        DOWNLOAD_URL = join(DOWNLOAD_URL_BASE, file_name)
        download_file(DOWNLOAD_URL, archive_path)
        print("untarring dataset...")
        tarfile.open(archive_path, 'r:gz').extractall(data_path)

    documents_dir = join(data_path, lang)

    print("Reading documents...")
    read = 0
    for dir in list_dirs(documents_dir):
        year = int(dir)
        if years is None or year in years:
            year_dir = join(documents_dir, dir)
            l_y_documents = []
            all_documents = list_files(year_dir)
            empty = 0
            pbar = tqdm(enumerate(all_documents))
            for i, doc_file in pbar:
                try:
                    jrc_doc = parse_document(join(year_dir, doc_file), year)
                except ValueError:
                    jrc_doc = None

                if jrc_doc and (not ignore_unclassified or jrc_doc.categories):
                    l_y_documents.append(jrc_doc)
                else:
                    empty += 1
                read += 1
                pbar.set_description(f'from {year_dir}: discarded {empty} without categories or empty fields')
            request += l_y_documents
    print("Read %d documents for language %s\n" % (read, lang))
    total_read += read

    final_cats = _get_categories(request)

    if cat_filter:
        request = _filter_by_category(request, cat_filter)
        final_cats = _get_categories(request)
    if cat_threshold > 0:
        request, final_cats = _filter_by_frequency(request, cat_threshold)
    if most_frequent != -1 and len(final_cats) > most_frequent:
        request, final_cats = _most_common(request, most_frequent)

    return request, final_cats


def print_cat_analysis(request):
    cat_count = Counter()
    for d in request:
        cat_count.update(d.categories)
    print("Number of active categories: {}".format(len(cat_count)))
    print(cat_count.most_common())


# inspects the EuroVoc thesaurus in order to select a subset of categories;
# the 'broadest' policy (i.e., take all categories with no parent category), 'leaves', and
# 'all' are currently implemented
def inspect_eurovoc(data_path, eurovoc_skos_core_concepts_filename='eurovoc_in_skos_core_concepts.rdf',
                    eurovoc_url="http://publications.europa.eu/mdr/resource/thesaurus/eurovoc-20160630-0/skos/eurovoc_in_skos_core_concepts.zip",
                    select="broadest"):
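    """
    Parses the EuroVoc SKOS thesaurus (downloading it into data_path if not already there) and
    returns the list of concept identifiers selected by the given policy ('all', 'broadest',
    or 'leaves'). The selection is pickled in data_path so later calls are served from disk.
    """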

    fullpath_pickle = join(data_path, select + '_concepts.pickle')
    if os.path.exists(fullpath_pickle):
        print("Pickled object found in %s. Loading it." % fullpath_pickle)
        return pickle.load(open(fullpath_pickle, 'rb'))

    fullpath = join(data_path, eurovoc_skos_core_concepts_filename)
    if not os.path.exists(fullpath):
        print("Path %s does not exist. Trying to download the skos EuroVoc file from %s" % (data_path, eurovoc_url))
        # the thesaurus is distributed as a zip archive: download it and extract the RDF file
        zip_path = join(data_path, 'eurovoc_in_skos_core_concepts.zip')
        download_file(eurovoc_url, zip_path)
        print("Unzipping file...")
        zipped = zipfile.ZipFile(zip_path, 'r')
        zipped.extract("eurovoc_in_skos_core_concepts.rdf", data_path)
        zipped.close()

    print("Parsing %s" % fullpath)
    g = rdflib.Graph()
    g.parse(location=fullpath, format="application/rdf+xml")

    if select == "all":
        print("Selecting all concepts")
        all_concepts = list(g.subjects(RDF.type, SKOS.Concept))
        all_concepts = [c.toPython().split('/')[-1] for c in all_concepts]
        all_concepts.sort()
        selected_concepts = all_concepts
    elif select == "broadest":
        print("Selecting broadest concepts (those that do not have any broader concept)")
        all_concepts = set(g.subjects(RDF.type, SKOS.Concept))
        narrower_concepts = set(g.subjects(SKOS.broader, None))
        broadest_concepts = [c.toPython().split('/')[-1] for c in (all_concepts - narrower_concepts)]
        broadest_concepts.sort()
        selected_concepts = broadest_concepts
    elif select == "leaves":
        print("Selecting leaf concepts (those not linked as broader of any other concept)")
        all_concepts = set(g.subjects(RDF.type, SKOS.Concept))
        broad_concepts = set(g.objects(None, SKOS.broader))
        leave_concepts = [c.toPython().split('/')[-1] for c in (all_concepts - broad_concepts)]
        leave_concepts.sort()
        selected_concepts = leave_concepts
    else:
        raise ValueError("Selection policy %s is not currently supported" % select)

    print("%d %s concepts found" % (len(selected_concepts), select))
    print("Pickling concept list for faster further requests in %s" % fullpath_pickle)
    pickle.dump(selected_concepts, open(fullpath_pickle, 'wb'), pickle.HIGHEST_PROTOCOL)

    return selected_concepts


if __name__ == '__main__':

    # example code

    train_years = list(range(1986, 2006))
    test_years = [2006]
    cat_policy = 'all'  # 'leaves'
    most_common_cat = 300
    JRC_DATAPATH = "../datasets/JRC_Acquis_v3"
    cat_list = inspect_eurovoc(JRC_DATAPATH, select=cat_policy)

    training_docs, tr_cats = fetch_jrcacquis(lang='en', data_path=JRC_DATAPATH, years=train_years,
                                             cat_filter=None, cat_threshold=1,
                                             most_frequent=most_common_cat)
    test_docs, te_cats = fetch_jrcacquis(lang='en', data_path=JRC_DATAPATH, years=test_years,
                                         cat_filter=tr_cats, cat_threshold=1)

    # training_cats = jrc_get_categories(training_docs)
    # test_cats = jrc_get_categories(test_docs)
    # intersection_cats = [c for c in training_cats if c in test_cats]

    # training_docs = jrc_filter_by_category(training_docs, intersection_cats)
    # test_docs = jrc_filter_by_category(test_docs, intersection_cats)

    print(f'JRC-train: {len(training_docs)} documents')
    print(f'JRC-test: {len(test_docs)} documents')

    print_cat_analysis(training_docs)
    print_cat_analysis(test_docs)

    """
    JRC-train: 12615 documents, 300 cats
    JRC-test: 7055 documents, 300 cats
    """