import os, sys
from os.path import join
import tarfile
import xml.etree.ElementTree as ET
from sklearn.datasets import get_data_home
import pickle
import rdflib
from rdflib.namespace import RDF, SKOS
from rdflib import URIRef
import zipfile
from collections import Counter
from tqdm import tqdm
from random import shuffle
from util.file import *


class JRCAcquis_Document:
    def __init__(self, id, name, lang, year, head, body, categories):
        self.id = id
        self.parallel_id = name
        self.lang = lang
        self.year = year
        self.text = body if not head else head + "\n" + body
        self.categories = categories

    @classmethod
    def get_text(cls, jrc_documents):
        return [d.text for d in jrc_documents]

    @classmethod
    def get_target(cls, jrc_documents):
        return [d.categories for d in jrc_documents]


# this is a workaround... for some reason, acutes are codified in a non-standard manner in titles;
# however, the title often appears as the first paragraph of the text/body (with standard codification),
# so it might be preferable not to read the header at all (as here by default)
def _proc_acute(text):
    for ch in ['a', 'e', 'i', 'o', 'u']:
        text = text.replace('%' + ch + 'acute%', ch)
    return text


def parse_document(file, year, head=False):
    root = ET.parse(file).getroot()
    doc_name = root.attrib['n']     # e.g., '22006A0211(01)'
    doc_lang = root.attrib['lang']  # e.g., 'es'
    doc_id = root.attrib['id']      # e.g., 'jrc22006A0211_01-es'
    doc_categories = [cat.text for cat in root.findall('.//teiHeader/profileDesc/textClass/classCode[@scheme="eurovoc"]')]
    doc_head = _proc_acute(root.find('.//text/body/head').text) if head else ''
    doc_body = '\n'.join([p.text for p in root.findall('.//text/body/div[@type="body"]/p')])

    def raise_if_empty(field, from_file):
        if isinstance(field, str):
            if not field.strip():
                raise ValueError("Empty field in file %s" % from_file)

    raise_if_empty(doc_name, file)
    raise_if_empty(doc_lang, file)
    raise_if_empty(doc_id, file)
    if head:
        raise_if_empty(doc_head, file)
    raise_if_empty(doc_body, file)

    return JRCAcquis_Document(id=doc_id, name=doc_name, lang=doc_lang, year=year,
                              head=doc_head, body=doc_body, categories=doc_categories)


# filters out documents that do not contain any category in cat_filter, and removes all labels not in cat_filter
def _filter_by_category(doclist, cat_filter):
    if not isinstance(cat_filter, frozenset):
        cat_filter = frozenset(cat_filter)
    filtered = []
    for doc in doclist:
        doc.categories = list(cat_filter & set(doc.categories))
        if doc.categories:
            doc.categories.sort()
            filtered.append(doc)
    print("filtered out %d documents without categories in the filter list" % (len(doclist) - len(filtered)))
    return filtered


# filters out categories occurring in cat_threshold documents or fewer (and removes documents left without categories)
def _filter_by_frequency(doclist, cat_threshold):
    cat_count = Counter()
    for d in doclist:
        cat_count.update(d.categories)

    freq_categories = [cat for cat, count in cat_count.items() if count > cat_threshold]
    freq_categories.sort()
    return _filter_by_category(doclist, freq_categories), freq_categories


# selects the most_frequent most common categories (and removes documents left without categories)
def _most_common(doclist, most_frequent):
    cat_count = Counter()
    for d in doclist:
        cat_count.update(d.categories)

    freq_categories = [cat for cat, count in cat_count.most_common(most_frequent)]
    freq_categories.sort()
    return _filter_by_category(doclist, freq_categories), freq_categories


def _get_categories(request):
    final_cats = set()
    for d in request:
        final_cats.update(d.categories)
    return list(final_cats)

def fetch_jrcacquis(lang='en', data_path=None, years=None, ignore_unclassified=True, cat_filter=None,
                    cat_threshold=0, most_frequent=-1,
                    DOWNLOAD_URL_BASE='http://optima.jrc.it/Acquis/JRC-Acquis.3.0/corpus/'):

    if not data_path:
        data_path = get_data_home()

    if not os.path.exists(data_path):
        os.mkdir(data_path)

    request = []
    total_read = 0

    file_name = 'jrc-' + lang + '.tgz'
    archive_path = join(data_path, file_name)

    if not os.path.exists(archive_path):
        print("downloading language-specific dataset (once and for all) into %s" % data_path)
        DOWNLOAD_URL = join(DOWNLOAD_URL_BASE, file_name)
        download_file(DOWNLOAD_URL, archive_path)
        print("untarring dataset...")
        tarfile.open(archive_path, 'r:gz').extractall(data_path)

    documents_dir = join(data_path, lang)

    print("Reading documents...")
    read = 0
    for dir in list_dirs(documents_dir):
        year = int(dir)
        if years is None or year in years:
            year_dir = join(documents_dir, dir)
            l_y_documents = []
            all_documents = list_files(year_dir)
            empty = 0
            pbar = tqdm(enumerate(all_documents))
            for i, doc_file in pbar:
                try:
                    jrc_doc = parse_document(join(year_dir, doc_file), year)
                except ValueError:
                    jrc_doc = None

                if jrc_doc and (not ignore_unclassified or jrc_doc.categories):
                    l_y_documents.append(jrc_doc)
                else:
                    empty += 1
                read += 1
                pbar.set_description(f'from {year_dir}: discarded {empty} without categories or empty fields')
            request += l_y_documents
    print("Read %d documents for language %s\n" % (read, lang))
    total_read += read

    final_cats = _get_categories(request)

    if cat_filter:
        request = _filter_by_category(request, cat_filter)
        final_cats = _get_categories(request)
    if cat_threshold > 0:
        request, final_cats = _filter_by_frequency(request, cat_threshold)
    if most_frequent != -1 and len(final_cats) > most_frequent:
        request, final_cats = _most_common(request, most_frequent)

    return request, final_cats


def print_cat_analysis(request):
    cat_count = Counter()
    for d in request:
        cat_count.update(d.categories)
    print("Number of active categories: {}".format(len(cat_count)))
    print(cat_count.most_common())


# inspects the EuroVoc thesaurus in order to select a subset of categories;
# the 'broadest' policy (categories with no broader concept), the 'leaves' policy
# (categories that are not broader than any other concept), and 'all' are implemented
def inspect_eurovoc(data_path, eurovoc_skos_core_concepts_filename='eurovoc_in_skos_core_concepts.rdf',
                    eurovoc_url="http://publications.europa.eu/mdr/resource/thesaurus/eurovoc-20160630-0/skos/eurovoc_in_skos_core_concepts.zip",
                    select="broadest"):

    fullpath_pickle = join(data_path, select + '_concepts.pickle')
    if os.path.exists(fullpath_pickle):
        print("Pickled object found in %s. Loading it." % fullpath_pickle)
        return pickle.load(open(fullpath_pickle, 'rb'))

    fullpath = join(data_path, eurovoc_skos_core_concepts_filename)
    if not os.path.exists(fullpath):
        print("Path %s does not exist. Trying to download the skos EuroVoc file from %s" % (fullpath, eurovoc_url))
        zip_path = fullpath + '.zip'
        download_file(eurovoc_url, zip_path)
        print("Unzipping file...")
        zipped = zipfile.ZipFile(zip_path, 'r')
        zipped.extract("eurovoc_in_skos_core_concepts.rdf", data_path)
        zipped.close()

    print("Parsing %s" % fullpath)
    g = rdflib.Graph()
    g.parse(location=fullpath, format="application/rdf+xml")

    if select == "all":
        print("Selecting all concepts")
        all_concepts = list(g.subjects(RDF.type, SKOS.Concept))
        all_concepts = [c.toPython().split('/')[-1] for c in all_concepts]
        all_concepts.sort()
        selected_concepts = all_concepts
    elif select == "broadest":
        print("Selecting broadest concepts (those without any broader concept linked to them)")
        all_concepts = set(g.subjects(RDF.type, SKOS.Concept))
        narrower_concepts = set(g.subjects(SKOS.broader, None))
        broadest_concepts = [c.toPython().split('/')[-1] for c in (all_concepts - narrower_concepts)]
        broadest_concepts.sort()
        selected_concepts = broadest_concepts
    elif select == "leaves":
        print("Selecting leaf concepts (those not linked as broader of any other concept)")
        all_concepts = set(g.subjects(RDF.type, SKOS.Concept))
        broad_concepts = set(g.objects(None, SKOS.broader))
        leave_concepts = [c.toPython().split('/')[-1] for c in (all_concepts - broad_concepts)]
        leave_concepts.sort()
        selected_concepts = leave_concepts
    else:
        raise ValueError("Selection policy %s is not currently supported" % select)

    print("%d %s concepts found" % (len(selected_concepts), select))
    print("Pickling concept list for faster further requests in %s" % fullpath_pickle)
    pickle.dump(selected_concepts, open(fullpath_pickle, 'wb'), pickle.HIGHEST_PROTOCOL)

    return selected_concepts


if __name__ == '__main__':
    # example code
    train_years = list(range(1986, 2006))
    test_years = [2006]
    cat_policy = 'all'  # 'leaves'
    most_common_cat = 300
    JRC_DATAPATH = "../datasets/JRC_Acquis_v3"
    cat_list = inspect_eurovoc(JRC_DATAPATH, select=cat_policy)

    training_docs, tr_cats = fetch_jrcacquis(lang='en', data_path=JRC_DATAPATH, years=train_years,
                                             cat_filter=None, cat_threshold=1, most_frequent=most_common_cat)
    test_docs, te_cats = fetch_jrcacquis(lang='en', data_path=JRC_DATAPATH, years=test_years,
                                         cat_filter=tr_cats, cat_threshold=1)

    # training_cats = jrc_get_categories(training_docs)
    # test_cats = jrc_get_categories(test_docs)
    # intersection_cats = [c for c in training_cats if c in test_cats]
    # training_docs = jrc_filter_by_category(training_docs, intersection_cats)
    # test_docs = jrc_filter_by_category(test_docs, intersection_cats)

    print(f'JRC-train: {len(training_docs)} documents')
    print(f'JRC-test: {len(test_docs)} documents')

    print_cat_analysis(training_docs)
    print_cat_analysis(test_docs)

"""
JRC-train: 12615 documents, 300 cats
JRC-test: 7055 documents, 300 cats
"""
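
# --- illustrative sketch (not part of the original module) ---
# The snippet below shows one way the fetched documents could be turned into a tf-idf matrix
# and a binary label matrix for multilabel classification, using the JRCAcquis_Document helpers
# defined above. It assumes the example fetch in the __main__ block has run and that
# scikit-learn is available; names such as Xtr/ytr are hypothetical.
if __name__ == '__main__':
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.preprocessing import MultiLabelBinarizer

    # raw texts and per-document label lists
    tr_texts = JRCAcquis_Document.get_text(training_docs)
    te_texts = JRCAcquis_Document.get_text(test_docs)

    # binarize the label sets against the training categories
    mlb = MultiLabelBinarizer(classes=sorted(tr_cats))
    ytr = mlb.fit_transform(JRCAcquis_Document.get_target(training_docs))
    yte = mlb.transform(JRCAcquis_Document.get_target(test_docs))

    # tf-idf weighting fitted on the training texts only
    vectorizer = TfidfVectorizer(sublinear_tf=True, min_df=3)
    Xtr = vectorizer.fit_transform(tr_texts)
    Xte = vectorizer.transform(te_texts)

    print(f'Xtr={Xtr.shape} ytr={ytr.shape}; Xte={Xte.shape} yte={yte.shape}')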