{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Test per Parsing e generazione IRI"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import ast\n",
    "import sys\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import matplotlib.pyplot as plt\n",
    "# importing useful Python utility libraries we'll need\n",
    "from collections import Counter, defaultdict\n",
    "import itertools"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "import xml.etree.ElementTree as ET"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "#tree = ET.parse('/Users/cesare/Projects/hdn/triple/DanteTriple/xml/DanteSearch/grammaticale/inferno_forparsing.xml')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "#root = tree.getroot()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "from bs4 import BeautifulSoup"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "def read_tei(tei_file, encoding='utf-8'):\n",
    "    \"\"\"Parse a TEI/XML file into a BeautifulSoup tree.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    tei_file : str\n",
    "        Path of the file to read.\n",
    "    encoding : str, optional\n",
    "        Text encoding used to decode the file. It is passed explicitly so the\n",
    "        result does not depend on the platform default encoding.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    bs4.BeautifulSoup\n",
    "        The tree built with the 'lxml' parser.\n",
    "\n",
    "    Errors raised by open() or by the parser propagate to the caller.\n",
    "    \"\"\"\n",
    "    with open(tei_file, 'r', encoding=encoding) as tei:\n",
    "        return BeautifulSoup(tei, 'lxml')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "def elem_to_text(elem, default=''):\n",
    "    \"\"\"Return the text of *elem* with its fragments joined by single spaces.\n",
    "\n",
    "    *default* is returned when *elem* is falsy (e.g. ``None``, which is what a\n",
    "    failed ``find`` yields).\n",
    "    \"\"\"\n",
    "    if not elem:\n",
    "        return default\n",
    "    return elem.getText(separator=' ', strip=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "from dataclasses import dataclass\n",
    "\n",
    "@dataclass\n",
    "class Person:\n",
    "    \"\"\"Name parts of an author, as collected by ``TEIFile.authors``.\"\"\"\n",
    "    firstname: str\n",
    "    middlename: str\n",
    "    surname: str"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Parser\n",
    "\n",
    "Provo a creare un parser.\n",
    "\n",
    "Un estratto dal file inferno.xml:\n",
    "\n",
    "~~~~\n",
    " Canto 1\n",
    "\n",
    " \n",
    " Nel\n",
    " 
mezzo\n",
    " del\n",
    " cammin\n",
    " di\n",
    " nostra\n",
    " vita\n",
    " \n",
    " ...\n",
    " ...\n",
    " \n",
    " che\n",
    " \n",
    " \t nel \n",
    " \t nel\n",
    " \n",
    " pensier\n",
    " rinova\n",
    " la\n",
    " paura!\n",
    " \n",
    " \n",
    " ...\n",
    "~~~~\n",
    "\n",
    " \n",
    "Il tag `<div1>` individua la porzione di file di un *Canto*, il tag `<l>` individua un verso, il tag `<lm>` individua una *forma flessa*, ciascuna forma flessa ha 1 o 2 attributi.\n",
    "All'interno di un verso può essere presente il tag `<lm1>` che ha come content più elementi `<lm>`, ciascuno di essi contiene la stessa forma flessa ma differenti valori per gli attributi 'catg' e 'lemma'.\n",
    "\n",
    "per questa implementazione uso la libreria Python [Beautiful Soup](https://www.crummy.com/software/BeautifulSoup/bs4/doc/)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [],
   "source": [
    "class TEIFile(object):\n",
    "    \"\"\"Read-only view over a TEI/XML file parsed with BeautifulSoup.\n",
    "\n",
    "    The text is read through three elements: ``div1`` (a canto), ``l`` (a\n",
    "    verse) and ``lm`` (an inflected form with its ``catg`` and ``lemma``\n",
    "    attributes). An ``lm1`` element groups alternative ``lm`` analyses of the\n",
    "    same inflected form.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    filename : str\n",
    "        Path of the TEI/XML file.\n",
    "    idres : int, optional\n",
    "        Identifier of the cantica; used as a path segment of the verse IRIs.\n",
    "    \"\"\"\n",
    "\n",
    "    # Placeholder used when an <lm> has no 'catg' / 'lemma' attribute.\n",
    "    MISSING = 'non_spec'\n",
    "\n",
    "    def __init__(self, filename, idres=0):\n",
    "        self.filename = filename\n",
    "        self.soup = read_tei(filename)\n",
    "        self._text = None   # cache filled by the `text` property\n",
    "        self.idres = idres\n",
    "        self._title = ''    # cache filled by the `title` property\n",
    "        self._abstract = ''\n",
    "\n",
    "    @staticmethod\n",
    "    def _squash(value):\n",
    "        \"\"\"Collapse every run of whitespace in *value* into one space.\"\"\"\n",
    "        return ' '.join(value.split())\n",
    "\n",
    "    def _attr(self, elem, name):\n",
    "        \"\"\"Return attribute *name* of *elem*, or MISSING when it is absent.\"\"\"\n",
    "        value = elem.get(name)\n",
    "        return self.MISSING if value is None else value\n",
    "\n",
    "    @property\n",
    "    def title(self):\n",
    "        \"\"\"Text of the <title> element, or 'na' when the file has none.\"\"\"\n",
    "        if not self._title:\n",
    "            if not self.soup.title:\n",
    "                self._title = 'na'\n",
    "            else:\n",
    "                self._title = self.soup.title.getText().replace('\\n', '').strip()\n",
    "        return self._title\n",
    "\n",
    "    @property\n",
    "    def authors(self):\n",
    "        \"\"\"List of ``Person`` built from every <author> that has a <persname>.\"\"\"\n",
    "        result = []\n",
    "        for author in self.soup.find_all('author'):\n",
    "            persname = author.persname\n",
    "            if not persname:\n",
    "                continue\n",
    "            # first <forename> found, whatever its type\n",
    "            firstname = elem_to_text(persname.find('forename'))\n",
    "            middlename = elem_to_text(persname.find('forename', type='middle'))\n",
    "            surname = elem_to_text(persname.surname)\n",
    "            result.append(Person(firstname, middlename, surname))\n",
    "        return result\n",
    "\n",
    "    @property\n",
    "    def bibliography(self):\n",
    "        \"\"\"Whitespace-normalised text of every <bibl> element.\"\"\"\n",
    "        result = []\n",
    "        for bibl in self.soup.find_all('bibl'):\n",
    "            entry = elem_to_text(bibl).replace('\\n', '').strip()\n",
    "            entry = entry.replace(' .', '.')\n",
    "            result.append(self._squash(entry))\n",
    "        return result\n",
    "\n",
    "    @property\n",
    "    def text(self):\n",
    "        \"\"\"Plain text of every <div1> without a 'type' attribute (cached).\"\"\"\n",
    "        if not self._text:\n",
    "            divs_text = []\n",
    "            for div in self.soup.body.find_all('div1'):\n",
    "                # a div with a 'type' is skipped: only plain text divs are kept\n",
    "                if not div.get('type'):\n",
    "                    divs_text.append(div.get_text(separator=' ', strip=True))\n",
    "            self._text = ' '.join(divs_text)\n",
    "        return self._text\n",
    "\n",
    "    @property\n",
    "    def orderedlemma(self):\n",
    "        \"\"\"Inflected forms in reading order.\n",
    "\n",
    "        Returns a list of tuples\n",
    "        ``(form, [categories], [lemmas], canto, verse_number, position)``.\n",
    "        The alternative <lm> analyses grouped by one <lm1> are merged into a\n",
    "        single row (latest analysis first), so they share one position.\n",
    "        \"\"\"\n",
    "        rows = []\n",
    "        last_group = None  # <lm1> element that produced the last row, if any\n",
    "        # NOTE(review): the verse counter runs over the whole file and is never\n",
    "        # reset at a new canto - confirm this is the intended numbering.\n",
    "        verse_no = 0\n",
    "        for div in self.soup.body.find_all('div1'):\n",
    "            for verso in div.find_all('l'):\n",
    "                verse_no += 1\n",
    "                position = 0\n",
    "                for lm in verso.find_all('lm'):\n",
    "                    form = self._squash(elem_to_text(lm).strip())\n",
    "                    categories = [self._squash(self._attr(lm, 'catg'))]\n",
    "                    lemmas = [self._squash(self._attr(lm, 'lemma'))]\n",
    "                    canto = ''\n",
    "                    group = None\n",
    "                    for parent in lm.parents:\n",
    "                        if parent.name == 'div1':\n",
    "                            canto = parent.contents[0]\n",
    "                        elif parent.name == 'lm1' and group is None:\n",
    "                            group = parent\n",
    "                    # Merge only with a row coming from the very same <lm1>:\n",
    "                    # an equal form just before the group (or at the end of the\n",
    "                    # previous verse) is a different word and must be kept.\n",
    "                    if group is not None and group is last_group and rows[-1][0] == form:\n",
    "                        previous = rows.pop()\n",
    "                        categories += previous[1]\n",
    "                        lemmas += previous[2]\n",
    "                    else:\n",
    "                        position += 1\n",
    "                    rows.append((form, categories, lemmas,\n",
    "                                 canto.replace('\\n', '').strip(), verse_no, position))\n",
    "                    last_group = group\n",
    "        return rows\n",
    "\n",
    "    # IRI of the verse\n",
    "    @property\n",
    "    def IRIverso(self):\n",
    "        \"\"\"Verses with the RDF description built for each of them.\n",
    "\n",
    "        Returns a list of tuples ``(verse_number, verse_text, description)``.\n",
    "        \"\"\"\n",
    "        iris = []\n",
    "        verse_no = 0  # running count over the whole file, as in `orderedlemma`\n",
    "        for div in self.soup.body.find_all('div1'):\n",
    "            for verso in div.find_all('l'):\n",
    "                verse_no += 1\n",
    "                verso_text = elem_to_text(verso).strip()\n",
    "                # An <lm1> repeats its form once per analysis (e.g. \"nel nel\"):\n",
    "                # keep only the text up to the first space of each group.\n",
    "                for child in verso.children:\n",
    "                    if child.name == 'lm1':\n",
    "                        group_text = elem_to_text(child).strip()\n",
    "                        verso_text = verso_text.replace(group_text, group_text.partition(' ')[0])\n",
    "\n",
    "                canto = ''\n",
    "                for vparent in verso.parents:\n",
    "                    if vparent.name == 'div1':\n",
    "                        canto = vparent.contents[0]\n",
    "\n",
    "                # re-attach punctuation and the elided article to the next word\n",
    "                for old, new in ((' ,', ','), (' .', '.'), (' !', '!'), (' ?', '?'), (\"l' \", \"l'\")):\n",
    "                    verso_text = verso_text.replace(old, new)\n",
    "\n",
    "                subject = ('http://hdn.dantenetwork.it/resource/work/commedia/cantica/'\n",
    "                           + str(self.idres) + '/' + '/'.join(canto.lower().split())\n",
    "                           + '/verso/' + str(verse_no))\n",
    "                iri_verso = (subject\n",
    "                             + '\\n a efrbroo:F2_Expression ,\\n rdfs:Resource ; \\nhttp://erlangen-crm.org/current/P190_has_symbolic_content \"'\n",
    "                             + verso_text.strip()\n",
    "                             + '\"^^xsd:string ;\\n http://erlangen-crm.org/current/P3_has_note'\n",
    "                             + ' \"' + str(verse_no)\n",
    "                             + '\"^^xsd:int ;\\n http://hdn.dantenetwork.it/resource/has_number \"'\n",
    "                             + str(verse_no) + '\"^^xsd:int .')\n",
    "\n",
    "                iris.append((verse_no, verso_text.strip(), iri_verso))\n",
    "        return iris\n",
    "\n",
    "    # test\n",
    "    @property\n",
    "    def ff_ea(self):\n",
    "        \"\"\"Flat list of ``(form, category, lemma, canto)``, one per <lm>.\n",
    "\n",
    "        A missing 'catg' or 'lemma' becomes MISSING; the canto is '' for an\n",
    "        <lm> that is not inside a <div1>.\n",
    "        \"\"\"\n",
    "        rows = []\n",
    "        for lm in self.soup.body.find_all('lm'):\n",
    "            form = elem_to_text(lm).strip()\n",
    "            ctg = self._attr(lm, 'catg')\n",
    "            lemma = self._attr(lm, 'lemma')\n",
    "            canto = ''\n",
    "            for parent in lm.parents:\n",
    "                if parent.name == 'div1':\n",
    "                    canto = parent.contents[0]\n",
    "                    break\n",
    "            rows.append((self._squash(form), self._squash(ctg), self._squash(lemma),\n",
    "                         canto.replace('\\n', '').strip()))\n",
    "        return rows\n",
    "\n",
    "    @property\n",
    "    def categ_lemma(self):\n",
    "        \"\"\"'catg' value of every <lm>, MISSING where the attribute is absent.\"\"\"\n",
    "        return [self._squash(self._attr(lm, 'catg').strip())\n",
    "                for lm in self.soup.body.find_all('lm')]\n",
    "\n",
    "    @property\n",
    "    def lemma_lemma(self):\n",
    "        \"\"\"'lemma' value of every <lm>, MISSING where it is absent or empty.\"\"\"\n",
    "        return [self._squash((lm.get('lemma') or self.MISSING).strip())\n",
    "                for lm in self.soup.body.find_all('lm')]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [],
   "source": [
    "def tei_to_csv_entry(tei_file, idres=0):\n",
    "    \"\"\"Parse *tei_file* and return the tables extracted from it.\n",
    "\n",
    "    Returns the tuple ``(orderedlemma, IRIverso, categ_lemma, lemma_lemma)``\n",
    "    of the ``TEIFile`` built with *idres* as cantica identifier.\n",
    "    \"\"\"\n",
    "    tei = TEIFile(tei_file, idres)\n",
    "    print(f\"Handled {tei_file}\")\n",
    "    return tei.orderedlemma, tei.IRIverso, tei.categ_lemma, tei.lemma_lemma"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Provo a vedere se il parser funziona\n",
    "Dovrebbe arrivare sino al termine 'oscura', controllare!"
] }, { "cell_type": "code", "execution_count": 30, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "('Nel', 'rdms', 'il', 'Canto 1')\n", "\n", "('mezzo', 'eilaksl', 'in mezzo di', 'Canto 1')\n", "\n", "('del', 'rdms', 'il', 'Canto 1')\n", "\n", "('cammin', 'sm2ms', 'cammino', 'Canto 1')\n", "\n", "('di', 'epskg', 'di', 'Canto 1')\n", "\n", "('nostra', 'as1fs', 'nostro', 'Canto 1')\n", "\n", "('vita', 'sf1fs', 'vita', 'Canto 1')\n", "\n", "('mi', 'pf1sypr', 'mi', 'Canto 1')\n", "\n", "('ritrovai', 'vta+1irs1', 'ritrovare', 'Canto 1')\n", "\n", "('per', 'epskpl', 'per', 'Canto 1')\n", "\n", "('una', 'rifs', 'una', 'Canto 1')\n", "\n", "('selva', 'sf1fs', 'selva', 'Canto 1')\n", "\n", "('oscura', 'a1fs', 'oscuro', 'Canto 1')\n", "\n", "...\n" ] } ], "source": [ "tei = TEIFile('/Users/cesare/Projects/hdn/triple/DanteTriple/xml/DanteSearch/grammaticale/inferno_forparsing.xml', 1)\n", "bbs=tei.ff_ea\n", "for re in bbs:\n", " print (re, end=\"\\n\"*2)\n", " if (re[0].startswith('oscura')):\n", " print('...')\n", " break" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Elaboro il file *inferno.xml*\n", "Eseguo il parsing del testo presente nel file e creo una tabella con le seguenti colonne: *forma flessa, categoria, lemma, canto, verso, pposizione forma flessa nel verso*" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Handled /Users/cesare/Projects/hdn/triple/DanteTriple/xml/DanteSearch/grammaticale/inferno_forparsing.xml\n" ] } ], "source": [ "mytesto=tei_to_csv_entry('/Users/cesare/Projects/hdn/triple/DanteTriple/xml/DanteSearch/grammaticale/inferno_forparsing.xml', 1)" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "FormaFlessa 33400\n", "Categoria 33400\n", "Lemma 33400\n", "Canto 33400\n", "Verso 33400\n", "PosizioneFFNelVerso 33400\n", "dtype: int64" ] }, 
"execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data = [mytesto[0]]\n", "#data[0]\n", "dfObj = pd.DataFrame(data[0]) \n", "testo_tabella=pd.DataFrame(data[0], columns = ['FormaFlessa' , 'Categoria', 'Lemma', 'Canto', 'Verso', 'PosizioneFFNelVerso']) \n", "testo_tabella.count()" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
FormaFlessaCategoriaLemmaCantoVersoPosizioneFFNelVerso
0Nel[rdms][il]Canto 111
1mezzo[eilaksl][in mezzo di]Canto 112
2del[rdms][il]Canto 113
3cammin[sm2ms][cammino]Canto 114
4di[epskg][di]Canto 115
5nostra[as1fs][nostro]Canto 116
6vita[sf1fs][vita]Canto 117
7mi[pf1sypr][mi]Canto 121
8ritrovai[vta+1irs1][ritrovare]Canto 122
9per[epskpl][per]Canto 123
\n", "
" ], "text/plain": [ " FormaFlessa Categoria Lemma Canto Verso PosizioneFFNelVerso\n", "0 Nel [rdms] [il] Canto 1 1 1\n", "1 mezzo [eilaksl] [in mezzo di] Canto 1 1 2\n", "2 del [rdms] [il] Canto 1 1 3\n", "3 cammin [sm2ms] [cammino] Canto 1 1 4\n", "4 di [epskg] [di] Canto 1 1 5\n", "5 nostra [as1fs] [nostro] Canto 1 1 6\n", "6 vita [sf1fs] [vita] Canto 1 1 7\n", "7 mi [pf1sypr] [mi] Canto 1 2 1\n", "8 ritrovai [vta+1irs1] [ritrovare] Canto 1 2 2\n", "9 per [epskpl] [per] Canto 1 2 3" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "testo_tabella.head(10)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Creo una tabella con gli IRI dei versi per la cantica *Inferno*\n", "\n", "La abella contiene il numero del verso, il verso e l'IRI del verso. \n", "Per l'IRI del verso mi son basato su quanto riportato nel file *Commedia.rdf*, un esempio è il seguente: \n", "\n", "> \n", "> a efrbroo:F2_Expression , rdfs:Resource ; \n", "> \n", "> \"Per li tre gradi sù di buona voglia\"^^xsd:string ; \n", "> \n", "> \"106\"^^xsd:int ; \n", "> \n", "> \"106\"^^xsd:int . \n", "\n" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "NumeroVerso 4721\n", "Verso 4721\n", "IRIVerso 4721\n", "dtype: int64" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data_IRI_versi_inf = [mytesto[1]]\n", "#data_IRI_versi\n", "df_IRI_versi_inf=pd.DataFrame(data_IRI_versi_inf[0], columns = ['NumeroVerso', 'Verso' , 'IRIVerso']) \n", "df_IRI_versi_inf.count()" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
NumeroVerso Verso IRIVerso
01Nel mezzo del cammin di nostra vitahttp://hdn.dantenetwork.it/resource/work/commedia/cantica/1/canto/1/verso/1\n", " a efrbroo:F2_Expression ,\n", " rdfs:Resource ; \n", "http://erlangen-crm.org/current/P190_has_symbolic_content \"Nel mezzo del cammin di nostra vita\"^^xsd:string ;\n", " http://erlangen-crm.org/current/P3_has_note \"1\"^^xsd:int ;\n", " http://hdn.dantenetwork.it/resource/has_number \"1\"^^xsd:int .
12mi ritrovai per una selva oscurahttp://hdn.dantenetwork.it/resource/work/commedia/cantica/1/canto/1/verso/2\n", " a efrbroo:F2_Expression ,\n", " rdfs:Resource ; \n", "http://erlangen-crm.org/current/P190_has_symbolic_content \"mi ritrovai per una selva oscura\"^^xsd:string ;\n", " http://erlangen-crm.org/current/P3_has_note \"2\"^^xsd:int ;\n", " http://hdn.dantenetwork.it/resource/has_number \"2\"^^xsd:int .
23ché la diritta via era smarrita.http://hdn.dantenetwork.it/resource/work/commedia/cantica/1/canto/1/verso/3\n", " a efrbroo:F2_Expression ,\n", " rdfs:Resource ; \n", "http://erlangen-crm.org/current/P190_has_symbolic_content \"ché la diritta via era smarrita.\"^^xsd:string ;\n", " http://erlangen-crm.org/current/P3_has_note \"3\"^^xsd:int ;\n", " http://hdn.dantenetwork.it/resource/has_number \"3\"^^xsd:int .
34Ahi quanto a dir qual era è cosa durahttp://hdn.dantenetwork.it/resource/work/commedia/cantica/1/canto/1/verso/4\n", " a efrbroo:F2_Expression ,\n", " rdfs:Resource ; \n", "http://erlangen-crm.org/current/P190_has_symbolic_content \"Ahi quanto a dir qual era è cosa dura\"^^xsd:string ;\n", " http://erlangen-crm.org/current/P3_has_note \"4\"^^xsd:int ;\n", " http://hdn.dantenetwork.it/resource/has_number \"4\"^^xsd:int .
45esta selva selvaggia e aspra e fortehttp://hdn.dantenetwork.it/resource/work/commedia/cantica/1/canto/1/verso/5\n", " a efrbroo:F2_Expression ,\n", " rdfs:Resource ; \n", "http://erlangen-crm.org/current/P190_has_symbolic_content \"esta selva selvaggia e aspra e forte\"^^xsd:string ;\n", " http://erlangen-crm.org/current/P3_has_note \"5\"^^xsd:int ;\n", " http://hdn.dantenetwork.it/resource/has_number \"5\"^^xsd:int .
" ], "text/plain": [ "" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_IRI_versi_inf.head().style.set_properties(subset=['IRIVerso'], **{'width': '400px'})" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Elaborazione del file *purgatorio.xml*\n", "Eseguo il parsing del testo presente nel file e creo una tabella simile alla precedente" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [], "source": [ "#TEST IGNORARE\n", "#tei_purgatorio = TEIFile('/Users/cesare/Projects/hdn/triple/DanteTriple/xml/DanteSearch/grammaticale/purgatorio.xml', 2)\n", "#bbs_pu=tei_purgatorio.IRIverso\n", "#for repu in bbs_pu:\n", "# print (repu, end=\"\\n\"*2)\n", "# if (repu[0].startswith('che')):\n", "# print('...')\n", "# break" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Handled /Users/cesare/Projects/hdn/triple/DanteTriple/xml/DanteSearch/grammaticale/purgatorio.xml\n" ] } ], "source": [ "parsed_purgatorio=tei_to_csv_entry('/Users/cesare/Projects/hdn/triple/DanteTriple/xml/DanteSearch/grammaticale/purgatorio.xml', 2)" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "FormaFlessa 33245\n", "Categoria 33245\n", "Lemma 33245\n", "Canto 33245\n", "Verso 33245\n", "PosizioneFFNelVerso 33245\n", "dtype: int64" ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data_purgatorio = [parsed_purgatorio[0]]\n", "#dfObj_purgatorio = pd.DataFrame(data_purgatorio[0]) \n", "testo_purgatorio_tabella=pd.DataFrame(data_purgatorio[0], columns = ['FormaFlessa' , 'Categoria', 'Lemma', 'Canto', 'Verso', 'PosizioneFFNelVerso']) \n", "testo_purgatorio_tabella.count()" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
FormaFlessaCategoriaLemmaCantoVersoPosizioneFFNelVerso
33240disposto[vtp2pra1ms][disporre]Canto 3347553
33241a[epsb][a]Canto 3347554
33242salire[vi3fp][salire]Canto 3347555
33243alle[rdfp, epakml][la, a]Canto 3347556
33244stelle[sf1fp][stella]Canto 3347557
\n", "
" ], "text/plain": [ " FormaFlessa Categoria Lemma Canto Verso \\\n", "33240 disposto [vtp2pra1ms] [disporre] Canto 33 4755 \n", "33241 a [epsb] [a] Canto 33 4755 \n", "33242 salire [vi3fp] [salire] Canto 33 4755 \n", "33243 alle [rdfp, epakml] [la, a] Canto 33 4755 \n", "33244 stelle [sf1fp] [stella] Canto 33 4755 \n", "\n", " PosizioneFFNelVerso \n", "33240 3 \n", "33241 4 \n", "33242 5 \n", "33243 6 \n", "33244 7 " ] }, "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [ "testo_purgatorio_tabella.tail()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Creazione di una tabella con gli IRI dei versi per la cantica *Purgatorio*\n", "\n", "La tabella contiene il numero del verso, il verso e l'IRI del verso. " ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "NumeroVerso 4755\n", "Verso 4755\n", "IRIVerso 4755\n", "dtype: int64" ] }, "execution_count": 21, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data_IRI_versi_pur = [parsed_purgatorio[1]]\n", "#data_IRI_versi\n", "df_IRI_versi_pur=pd.DataFrame(data_IRI_versi_pur[0], columns = ['NumeroVerso', 'Verso' , 'IRIVerso']) \n", "df_IRI_versi_pur.count()" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
NumeroVerso Verso IRIVerso
01Per correr miglior acque alza le velehttp://hdn.dantenetwork.it/resource/work/commedia/cantica/2/canto/1/verso/1\n", " a efrbroo:F2_Expression ,\n", " rdfs:Resource ; \n", "http://erlangen-crm.org/current/P190_has_symbolic_content \"Per correr miglior acque alza le vele\"^^xsd:string ;\n", " http://erlangen-crm.org/current/P3_has_note \"1\"^^xsd:int ;\n", " http://hdn.dantenetwork.it/resource/has_number \"1\"^^xsd:int .
12omai la navicella del mio ingegno,http://hdn.dantenetwork.it/resource/work/commedia/cantica/2/canto/1/verso/2\n", " a efrbroo:F2_Expression ,\n", " rdfs:Resource ; \n", "http://erlangen-crm.org/current/P190_has_symbolic_content \"omai la navicella del mio ingegno,\"^^xsd:string ;\n", " http://erlangen-crm.org/current/P3_has_note \"2\"^^xsd:int ;\n", " http://hdn.dantenetwork.it/resource/has_number \"2\"^^xsd:int .
23che lascia dietro a sé mar sì crudele;http://hdn.dantenetwork.it/resource/work/commedia/cantica/2/canto/1/verso/3\n", " a efrbroo:F2_Expression ,\n", " rdfs:Resource ; \n", "http://erlangen-crm.org/current/P190_has_symbolic_content \"che lascia dietro a sé mar sì crudele;\"^^xsd:string ;\n", " http://erlangen-crm.org/current/P3_has_note \"3\"^^xsd:int ;\n", " http://hdn.dantenetwork.it/resource/has_number \"3\"^^xsd:int .
34e canterò di quel secondo regnohttp://hdn.dantenetwork.it/resource/work/commedia/cantica/2/canto/1/verso/4\n", " a efrbroo:F2_Expression ,\n", " rdfs:Resource ; \n", "http://erlangen-crm.org/current/P190_has_symbolic_content \"e canterò di quel secondo regno\"^^xsd:string ;\n", " http://erlangen-crm.org/current/P3_has_note \"4\"^^xsd:int ;\n", " http://hdn.dantenetwork.it/resource/has_number \"4\"^^xsd:int .
45dove l' umano spirito si purgahttp://hdn.dantenetwork.it/resource/work/commedia/cantica/2/canto/1/verso/5\n", " a efrbroo:F2_Expression ,\n", " rdfs:Resource ; \n", "http://erlangen-crm.org/current/P190_has_symbolic_content \"dove l' umano spirito si purga\"^^xsd:string ;\n", " http://erlangen-crm.org/current/P3_has_note \"5\"^^xsd:int ;\n", " http://hdn.dantenetwork.it/resource/has_number \"5\"^^xsd:int .
" ], "text/plain": [ "" ] }, "execution_count": 22, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_IRI_versi_pur.head().style.set_properties(subset=['IRIVerso'], **{'width': '400px'})" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Elaborazione del file paradiso.xml\n", "Eseguo il parsing del testo presente nel file e creo una tabella simile alle precedenti" ] }, { "cell_type": "code", "execution_count": 37, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Handled /Users/cesare/Projects/hdn/triple/DanteTriple/xml/DanteSearch/grammaticale/paradiso.xml\n" ] } ], "source": [ "parsed_paradiso=tei_to_csv_entry('/Users/cesare/Projects/hdn/triple/DanteTriple/xml/DanteSearch/grammaticale/paradiso.xml', 3)" ] }, { "cell_type": "code", "execution_count": 38, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "FormaFlessa 32747\n", "Categoria 32747\n", "Lemma 32747\n", "Canto 32747\n", "Verso 32747\n", "PosizioneFFNelVerso 32747\n", "dtype: int64" ] }, "execution_count": 38, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data_paradiso = [parsed_paradiso[0]]\n", "testo_paradiso_tabella=pd.DataFrame(data_paradiso[0], columns = ['FormaFlessa' , 'Categoria', 'Lemma', 'Canto', 'Verso', 'PosizioneFFNelVerso']) \n", "testo_paradiso_tabella.count()" ] }, { "cell_type": "code", "execution_count": 39, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
FormaFlessaCategoriaLemmaCantoVersoPosizioneFFNelVerso
0La[rdfs][la]Canto 111
1gloria[sf1fs][gloria]Canto 112
2di[epskg][di]Canto 113
3colui[pdms][colui]Canto 114
4che[pr][che]Canto 115
5tutto[pi1ms][tutto]Canto 116
6move[vta2ips3][muovere]Canto 117
7per[epskpl][per]Canto 121
8l'[rdms][lo]Canto 122
9universo[sm2ms][universo]Canto 123
10penetra[vi1ips3][penetrare]Canto 124
11e[cscc][e]Canto 125
12risplende[vi2ips3][risplendere]Canto 126
13in[epsksl][in]Canto 131
14una[rifs][una]Canto 132
15parte[sf3fs][parte]Canto 133
16più[b][più]Canto 134
17e[cscc][e]Canto 135
18meno[b][meno]Canto 136
19altrove[b][altrove]Canto 137
20Nel[epaksl, rdms][in, il]Canto 141
\n", "
" ], "text/plain": [ " FormaFlessa Categoria Lemma Canto Verso \\\n", "0 La [rdfs] [la] Canto 1 1 \n", "1 gloria [sf1fs] [gloria] Canto 1 1 \n", "2 di [epskg] [di] Canto 1 1 \n", "3 colui [pdms] [colui] Canto 1 1 \n", "4 che [pr] [che] Canto 1 1 \n", "5 tutto [pi1ms] [tutto] Canto 1 1 \n", "6 move [vta2ips3] [muovere] Canto 1 1 \n", "7 per [epskpl] [per] Canto 1 2 \n", "8 l' [rdms] [lo] Canto 1 2 \n", "9 universo [sm2ms] [universo] Canto 1 2 \n", "10 penetra [vi1ips3] [penetrare] Canto 1 2 \n", "11 e [cscc] [e] Canto 1 2 \n", "12 risplende [vi2ips3] [risplendere] Canto 1 2 \n", "13 in [epsksl] [in] Canto 1 3 \n", "14 una [rifs] [una] Canto 1 3 \n", "15 parte [sf3fs] [parte] Canto 1 3 \n", "16 più [b] [più] Canto 1 3 \n", "17 e [cscc] [e] Canto 1 3 \n", "18 meno [b] [meno] Canto 1 3 \n", "19 altrove [b] [altrove] Canto 1 3 \n", "20 Nel [epaksl, rdms] [in, il] Canto 1 4 \n", "\n", " PosizioneFFNelVerso \n", "0 1 \n", "1 2 \n", "2 3 \n", "3 4 \n", "4 5 \n", "5 6 \n", "6 7 \n", "7 1 \n", "8 2 \n", "9 3 \n", "10 4 \n", "11 5 \n", "12 6 \n", "13 1 \n", "14 2 \n", "15 3 \n", "16 4 \n", "17 5 \n", "18 6 \n", "19 7 \n", "20 1 " ] }, "execution_count": 39, "metadata": {}, "output_type": "execute_result" } ], "source": [ "testo_paradiso_tabella.head(21)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Creazione di una tabella con gli IRI dei versi per la cantica Paradiso\n", "La tabella contiene il numero del verso, il verso e l'IRI del verso." 
] }, { "cell_type": "code", "execution_count": 40, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "NumeroVerso 4757\n", "Verso 4757\n", "IRIVerso 4757\n", "dtype: int64" ] }, "execution_count": 40, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data_IRI_versi_par = [parsed_paradiso[1]]\n", "#data_IRI_versi\n", "df_IRI_versi_par=pd.DataFrame(data_IRI_versi_par[0], columns = ['NumeroVerso', 'Verso' , 'IRIVerso']) \n", "df_IRI_versi_par.count()" ] }, { "cell_type": "code", "execution_count": 41, "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
NumeroVerso Verso IRIVerso
01La gloria di colui che tutto movehttp://hdn.dantenetwork.it/resource/work/commedia/cantica/3/canto/1/verso/1\n", " a efrbroo:F2_Expression ,\n", " rdfs:Resource ; \n", "http://erlangen-crm.org/current/P190_has_symbolic_content \"La gloria di colui che tutto move\"^^xsd:string ;\n", " http://erlangen-crm.org/current/P3_has_note \"1\"^^xsd:int ;\n", " http://hdn.dantenetwork.it/resource/has_number \"1\"^^xsd:int .
12per l'universo penetra, e risplendehttp://hdn.dantenetwork.it/resource/work/commedia/cantica/3/canto/1/verso/2\n", " a efrbroo:F2_Expression ,\n", " rdfs:Resource ; \n", "http://erlangen-crm.org/current/P190_has_symbolic_content \"per l'universo penetra, e risplende\"^^xsd:string ;\n", " http://erlangen-crm.org/current/P3_has_note \"2\"^^xsd:int ;\n", " http://hdn.dantenetwork.it/resource/has_number \"2\"^^xsd:int .
23in una parte più e meno altrove.http://hdn.dantenetwork.it/resource/work/commedia/cantica/3/canto/1/verso/3\n", " a efrbroo:F2_Expression ,\n", " rdfs:Resource ; \n", "http://erlangen-crm.org/current/P190_has_symbolic_content \"in una parte più e meno altrove.\"^^xsd:string ;\n", " http://erlangen-crm.org/current/P3_has_note \"3\"^^xsd:int ;\n", " http://hdn.dantenetwork.it/resource/has_number \"3\"^^xsd:int .
34Nel ciel che più de la sua luce prendehttp://hdn.dantenetwork.it/resource/work/commedia/cantica/3/canto/1/verso/4\n", " a efrbroo:F2_Expression ,\n", " rdfs:Resource ; \n", "http://erlangen-crm.org/current/P190_has_symbolic_content \"Nel ciel che più de la sua luce prende\"^^xsd:string ;\n", " http://erlangen-crm.org/current/P3_has_note \"4\"^^xsd:int ;\n", " http://hdn.dantenetwork.it/resource/has_number \"4\"^^xsd:int .
45fu' io, e vidi cose che ridirehttp://hdn.dantenetwork.it/resource/work/commedia/cantica/3/canto/1/verso/5\n", " a efrbroo:F2_Expression ,\n", " rdfs:Resource ; \n", "http://erlangen-crm.org/current/P190_has_symbolic_content \"fu' io, e vidi cose che ridire\"^^xsd:string ;\n", " http://erlangen-crm.org/current/P3_has_note \"5\"^^xsd:int ;\n", " http://hdn.dantenetwork.it/resource/has_number \"5\"^^xsd:int .
" ], "text/plain": [ "" ] }, "execution_count": 41, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_IRI_versi_par.head().style.set_properties(subset=['IRIVerso'], **{'width': '400px'})" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.7" } }, "nbformat": 4, "nbformat_minor": 4 }