sshoc-skosmapping/Progetto_Lett.ipynb

1071 lines
41 KiB
Plaintext

{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Test per Parsing e generazione IRI"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"import ast\n",
"import sys\n",
"import numpy as np\n",
"import pandas as pd\n",
"import matplotlib.pyplot as plt\n",
"# importing useful Python utility libraries we'll need\n",
"from collections import Counter, defaultdict\n",
"import itertools"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"import xml.etree.ElementTree as ET"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"#tree = ET.parse('/Users/cesare/Projects/hdn/triple/DanteTriple/xml/DanteSearch/grammaticale/inferno_forparsing.xml')"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"#root = tree.getroot()"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"from bs4 import BeautifulSoup"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"def read_tei(tei_file, encoding=None):\n",
"    \"\"\"Parse a TEI/XML file into a BeautifulSoup tree.\n",
"\n",
"    Args:\n",
"        tei_file: Path of the file to read.\n",
"        encoding: Text encoding handed to ``open``. ``None`` (the default)\n",
"            keeps the default encoding chosen by ``open``.\n",
"\n",
"    Returns:\n",
"        The ``BeautifulSoup`` object built with the ``'lxml'`` parser.\n",
"\n",
"    Raises:\n",
"        OSError: If the file cannot be opened.\n",
"    \"\"\"\n",
"    # The handle is only needed while the tree is being built; the ``with``\n",
"    # block closes it as soon as parsing returns (or fails).\n",
"    with open(tei_file, 'r', encoding=encoding) as tei:\n",
"        return BeautifulSoup(tei, 'lxml')"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"def elem_to_text(elem, default=''):\n",
"    \"\"\"Return the text of ``elem``, or ``default`` when ``elem`` is falsy.\n",
"\n",
"    The text is obtained with ``elem.getText(separator=' ', strip=True)``.\n",
"    \"\"\"\n",
"    if not elem:\n",
"        return default\n",
"    return elem.getText(separator=' ', strip=True)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"from dataclasses import dataclass\n",
"\n",
"@dataclass\n",
"class Person:\n",
"    \"\"\"Name parts of one author, as collected by ``TEIFile.authors``.\"\"\"\n",
"\n",
"    # Text of the first <forename> element found for the author ('' when absent).\n",
"    firstname: str\n",
"    # Text of the <forename type=\"middle\"> element ('' when absent).\n",
"    middlename: str\n",
"    # Text of the <surname> element ('' when absent).\n",
"    surname: str"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Parser\n",
"\n",
"Provo a creare un parser.\n",
"\n",
"Un estratto dal file inferno.xml:\n",
"\n",
"~~~~\n",
"<div1> <head>Canto 1</head>\n",
"<lg type=\"canto\">\n",
" <l>\n",
" <LM lemma=\"il\" catg=\"rdms\">Nel</LM>\n",
" <LM lemma=\"in mezzo di\" catg=\"eilaksl\">mezzo</LM>\n",
" <LM lemma=\"il\" catg=\"rdms\">del</LM>\n",
" <LM lemma=\"cammino\" catg=\"sm2ms\">cammin</LM>\n",
" <LM lemma=\"di\" catg=\"epskg\">di</LM>\n",
" <LM lemma=\"nostro\" catg=\"as1fs\">nostra</LM>\n",
" <LM lemma=\"vita\" catg=\"sf1fs\">vita</LM>\n",
" </l>\n",
" ...\n",
" ...\n",
" <l>\n",
" <LM lemma=\"che\" catg=\"pr\">che</LM>\n",
" <LM1>\n",
" \t <LM lemma=\"il\" catg=\"rdms\">nel</LM> \n",
" \t <LM lemma=\"in\" catg=\"epaksl\">nel</LM>\n",
" </LM1>\n",
" <LM lemma=\"pensiero\" catg=\"sm2ms\">pensier</LM>\n",
" <LM lemma=\"rinnovare\" catg=\"vta1ips3\">rinova</LM>\n",
" <LM lemma=\"la\" catg=\"rdfs\">la</LM>\n",
" <LM lemma=\"paura\" catg=\"sf1fs\">paura</LM>!\n",
" </l>\n",
" <l>\n",
" ...\n",
"~~~~\n",
"\n",
" \n",
"Il tag \\<div1\\> individua la porzione di file di un *Canto*, il tag \\<l\\> individua un verso, il tag \\<LM\\> individua una *forma flessa*, ciascuna forma flessa ha 1 o 2 attributi.\n",
"All'interno di un verso può essere presente il tag \\<LM1\\> che ha come content più elementi \\<LM\\>, ciascuno di essi contiene la stessa forma flessa ma differenti valori per gli attributi 'catg' e 'lemma'.\n",
"\n",
"Per questa implementazione uso la libreria Python [Beautiful Soup](https://www.crummy.com/software/BeautifulSoup/bs4/doc/)."
]
},
{
"cell_type": "code",
"execution_count": 269,
"metadata": {},
"outputs": [],
"source": [
"class TEIFile(object):\n",
"    \"\"\"Accessors over one TEI/XML file of the Commedia parsed with BeautifulSoup.\n",
"\n",
"    The tree comes from ``read_tei``; elements are looked up by the lower-case\n",
"    tag names ``div1`` (canto), ``l`` (verse), ``lm`` (inflected form) and\n",
"    ``lm1`` (group of alternative analyses of one inflected form).\n",
"\n",
"    Args:\n",
"        filename: Path of the TEI/XML file to parse.\n",
"        idres: Cantica identifier; used as a path segment of the IRIs built\n",
"            by ``IRIverso``.\n",
"    \"\"\"\n",
"\n",
"    # Placeholder stored when an <lm> lacks the 'catg' or 'lemma' attribute.\n",
"    _MISSING = \"non_spec\"\n",
"\n",
"    def __init__(self, filename, idres=0):\n",
"        self.filename = filename\n",
"        self.soup = read_tei(filename)\n",
"        self._text = None  # cache filled by the ``text`` property\n",
"        self.idres = idres\n",
"        self._title = ''  # cache filled by the ``title`` property\n",
"        self._abstract = ''\n",
"\n",
"    @staticmethod\n",
"    def _squash(value):\n",
"        \"\"\"Collapse every run of whitespace in ``value`` into a single space.\"\"\"\n",
"        return \" \".join(value.split())\n",
"\n",
"    @classmethod\n",
"    def _attr(cls, lm, name):\n",
"        \"\"\"Return attribute ``name`` of ``lm``, or ``_MISSING`` when it is absent.\"\"\"\n",
"        value = lm.get(name)\n",
"        return cls._MISSING if value is None else value\n",
"\n",
"    @staticmethod\n",
"    def _canto_heading(div):\n",
"        \"\"\"Return the first child of ``div`` as text ('' when ``div`` is empty).\n",
"\n",
"        Callers use this first child as the canto heading (e.g. 'Canto 1').\n",
"        \"\"\"\n",
"        if not div.contents:\n",
"            return ''\n",
"        first = div.contents[0]\n",
"        return first if isinstance(first, str) else first.get_text()\n",
"\n",
"    @property\n",
"    def title(self):\n",
"        \"\"\"Text of the document <title>, or 'na' when there is none (cached).\"\"\"\n",
"        if not self._title:\n",
"            if not self.soup.title:\n",
"                self._title = \"na\"\n",
"            else:\n",
"                self._title = self.soup.title.getText().replace('\\n', '').strip()\n",
"        return self._title\n",
"\n",
"    @property\n",
"    def authors(self):\n",
"        \"\"\"List of ``Person`` built from every <author> that has a <persname>.\"\"\"\n",
"        result = []\n",
"        for author in self.soup.find_all('author'):\n",
"            persname = author.persname\n",
"            if not persname:\n",
"                continue\n",
"            result.append(Person(\n",
"                elem_to_text(persname.find(\"forename\")),\n",
"                elem_to_text(persname.find(\"forename\", type=\"middle\")),\n",
"                elem_to_text(persname.surname),\n",
"            ))\n",
"        return result\n",
"\n",
"    @property\n",
"    def bibliography(self):\n",
"        \"\"\"Whitespace-normalised text of every <bibl> element.\"\"\"\n",
"        result = []\n",
"        for bibl in self.soup.find_all('bibl'):\n",
"            if not bibl:\n",
"                continue\n",
"            entry = elem_to_text(bibl).replace('\\n', '').strip()\n",
"            entry = entry.replace(' .', '.')\n",
"            result.append(self._squash(entry))\n",
"        return result\n",
"\n",
"    @property\n",
"    def text(self):\n",
"        \"\"\"Plain text of every <div1> that has no 'type' attribute (cached).\"\"\"\n",
"        if not self._text:\n",
"            self._text = \" \".join(\n",
"                div.get_text(separator=' ', strip=True)\n",
"                for div in self.soup.body.find_all(\"div1\")\n",
"                if not div.get(\"type\")\n",
"            )\n",
"        return self._text\n",
"\n",
"    @property\n",
"    def orderedlemma(self):\n",
"        \"\"\"One row per inflected form, in reading order.\n",
"\n",
"        Returns:\n",
"            A list of tuples ``(form, categories, lemmas, canto, verse, position)``.\n",
"            ``categories`` and ``lemmas`` are lists: they hold one value for a\n",
"            plain <lm>, and one value per alternative when consecutive <lm>\n",
"            elements of the same <lm1> group carry the same form. ``verse`` is\n",
"            counted across the whole file; ``position`` restarts at 1 in\n",
"            every verse.\n",
"        \"\"\"\n",
"        rows = []\n",
"        verse_no = 0\n",
"        for div in self.soup.body.find_all(\"div1\"):\n",
"            # The heading is the same for every form of the canto: read it once.\n",
"            canto = self._canto_heading(div).replace('\\n', '').strip()\n",
"            for verso in div.find_all('l'):\n",
"                verse_no += 1\n",
"                position = 0\n",
"                # <lm1> element that the most recent row of this verse came from.\n",
"                open_group = None\n",
"                for lm in verso.find_all(\"lm\"):\n",
"                    form = self._squash(elem_to_text(lm))\n",
"                    categories = [self._squash(self._attr(lm, 'catg'))]\n",
"                    lemmas = [self._squash(self._attr(lm, 'lemma'))]\n",
"                    group = lm.find_parent('lm1')\n",
"                    # Merge only with a row produced by the SAME <lm1> element, so\n",
"                    # an identical form that merely precedes the group (or the\n",
"                    # first form of the file) is never folded into it.\n",
"                    if group is not None and group is open_group and rows[-1][0] == form:\n",
"                        _, prev_categories, prev_lemmas = rows.pop()[:3]\n",
"                        categories += prev_categories\n",
"                        lemmas += prev_lemmas\n",
"                    else:\n",
"                        position += 1\n",
"                    open_group = group\n",
"                    rows.append((form, categories, lemmas, canto, verse_no, position))\n",
"        return rows\n",
"\n",
"    @property\n",
"    def IRIverso(self):\n",
"        \"\"\"One ``(verse number, verse text, IRI description)`` tuple per verse.\n",
"\n",
"        The verse number is counted across the whole file. The IRI description\n",
"        is a multi-line string that starts with the verse IRI and lists its\n",
"        type, text and number.\n",
"        \"\"\"\n",
"        # NOTE(review): IRIs are emitted without enclosing angle brackets and the\n",
"        # verse counter does not restart at each canto - confirm both against\n",
"        # the target RDF before using the output as Turtle.\n",
"        iris = []\n",
"        verse_no = 0\n",
"        for div in self.soup.body.find_all(\"div1\"):\n",
"            # e.g. heading 'Canto 1' becomes the path 'canto/1'\n",
"            canto_path = \"/\".join(self._canto_heading(div).lower().split())\n",
"            for verso in div.find_all('l'):\n",
"                verse_no += 1\n",
"                verse_text = self._squash(elem_to_text(verso))\n",
"                number = str(verse_no)\n",
"                iri_verso = (\n",
"                    \"http://hdn.dantenetwork.it/resource/work/commedia/cantica/\"\n",
"                    + str(self.idres) + \"/\" + canto_path + \"/verso/\" + number\n",
"                    + '\\n a efrbroo:F2_Expression ,\\n rdfs:Resource ; \\nhttp://erlangen-crm.org/current/P190_has_symbolic_content \"'\n",
"                    + verse_text\n",
"                    + '\"^^xsd:string ;\\n http://erlangen-crm.org/current/P3_has_note'\n",
"                    + ' \"' + number + '\"^^xsd:int ;\\n http://hdn.dantenetwork.it/resource/has_number \"'\n",
"                    + number + '\"^^xsd:int .'\n",
"                )\n",
"                iris.append((verse_no, verse_text, iri_verso))\n",
"        return iris\n",
"\n",
"    @property\n",
"    def ff_ea(self):\n",
"        \"\"\"One ``(form, category, lemma, canto)`` tuple per <lm> in the body.\n",
"\n",
"        A missing 'catg' or 'lemma' attribute, or a missing <div1> ancestor,\n",
"        is reported as 'non_spec'.\n",
"        \"\"\"\n",
"        rows = []\n",
"        for lm in self.soup.body.find_all(\"lm\"):\n",
"            div = lm.find_parent('div1')\n",
"            if div is None:\n",
"                canto = self._MISSING\n",
"            else:\n",
"                canto = self._canto_heading(div).replace('\\n', '').strip()\n",
"            rows.append((\n",
"                self._squash(elem_to_text(lm)),\n",
"                self._squash(self._attr(lm, 'catg')),\n",
"                self._squash(self._attr(lm, 'lemma')),\n",
"                canto,\n",
"            ))\n",
"        return rows\n",
"\n",
"    @property\n",
"    def categ_lemma(self):\n",
"        \"\"\"The 'catg' value of every <lm> in the body ('non_spec' when absent).\"\"\"\n",
"        return [\n",
"            self._squash(self._attr(lm, 'catg'))\n",
"            for lm in self.soup.body.find_all(\"lm\")\n",
"        ]\n",
"\n",
"    @property\n",
"    def lemma_lemma(self):\n",
"        \"\"\"The 'lemma' value of every <lm> in the body ('non_spec' when absent or empty).\"\"\"\n",
"        return [\n",
"            self._squash(lm.get('lemma') or self._MISSING)\n",
"            for lm in self.soup.body.find_all(\"lm\")\n",
"        ]"
]
},
{
"cell_type": "code",
"execution_count": 235,
"metadata": {},
"outputs": [],
"source": [
"def tei_to_csv_entry(tei_file, idres=0):\n",
"    \"\"\"Parse ``tei_file`` and return the tables extracted from it.\n",
"\n",
"    Args:\n",
"        tei_file: Path of the TEI/XML file.\n",
"        idres: Cantica identifier forwarded to ``TEIFile``.\n",
"\n",
"    Returns:\n",
"        The 4-tuple ``(orderedlemma, IRIverso, categ_lemma, lemma_lemma)`` with\n",
"        the values of the same-named ``TEIFile`` properties.\n",
"    \"\"\"\n",
"    tei = TEIFile(tei_file, idres)\n",
"    print(f\"Handled {tei_file}\")\n",
"    return tei.orderedlemma, tei.IRIverso, tei.categ_lemma, tei.lemma_lemma"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Provo a vedere se il parser funziona\n",
"Dovrebbe arrivare sino alla forma flessa 'oscura' (lemma 'oscuro'), controllare!"
]
},
{
"cell_type": "code",
"execution_count": 219,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"('Nel', 'rdms', 'il', 'Canto 1')\n",
"\n",
"('mezzo', 'eilaksl', 'in mezzo di', 'Canto 1')\n",
"\n",
"('del', 'rdms', 'il', 'Canto 1')\n",
"\n",
"('cammin', 'sm2ms', 'cammino', 'Canto 1')\n",
"\n",
"('di', 'epskg', 'di', 'Canto 1')\n",
"\n",
"('nostra', 'as1fs', 'nostro', 'Canto 1')\n",
"\n",
"('vita', 'sf1fs', 'vita', 'Canto 1')\n",
"\n",
"('mi', 'pf1sypr', 'mi', 'Canto 1')\n",
"\n",
"('ritrovai', 'vta+1irs1', 'ritrovare', 'Canto 1')\n",
"\n",
"('per', 'epskpl', 'per', 'Canto 1')\n",
"\n",
"('una', 'rifs', 'una', 'Canto 1')\n",
"\n",
"('selva', 'sf1fs', 'selva', 'Canto 1')\n",
"\n",
"('oscura', 'a1fs', 'oscuro', 'Canto 1')\n",
"\n",
"...\n"
]
}
],
"source": [
"tei = TEIFile('/Users/cesare/Projects/hdn/triple/DanteTriple/xml/DanteSearch/grammaticale/inferno_forparsing.xml', 1)\n",
"bbs = tei.ff_ea\n",
"# Print the parsed entries separated by a blank line and stop right after\n",
"# the first one whose form starts with 'oscura'.\n",
"for entry in bbs:\n",
"    print(entry, end=\"\\n\" * 2)\n",
"    if not entry[0].startswith('oscura'):\n",
"        continue\n",
"    print('...')\n",
"    break"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Carico il testo *inferno.xml* e creo una tabella\n",
"Eseguo il parsing del testo presente nel file e creo una tabella con le seguenti colonne: *forma flessa, categoria, lemma, canto, verso, posizione forma flessa nel verso*"
]
},
{
"cell_type": "code",
"execution_count": 270,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Handled /Users/cesare/Projects/hdn/triple/DanteTriple/xml/DanteSearch/grammaticale/inferno_forparsing.xml\n"
]
}
],
"source": [
"# Parse the Inferno file with cantica id 1; mytesto is the tuple returned by tei_to_csv_entry.\n",
"mytesto=tei_to_csv_entry('/Users/cesare/Projects/hdn/triple/DanteTriple/xml/DanteSearch/grammaticale/inferno_forparsing.xml', 1)"
]
},
{
"cell_type": "code",
"execution_count": 271,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"FormaFlessa 33400\n",
"Categoria 33400\n",
"Lemma 33400\n",
"Canto 33400\n",
"Verso 33400\n",
"PosizioneFFNelVerso 33400\n",
"dtype: int64"
]
},
"execution_count": 271,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# mytesto[0] holds the orderedlemma rows: one tuple per inflected form.\n",
"data = [mytesto[0]]\n",
"testo_tabella = pd.DataFrame(\n",
"    data[0],\n",
"    columns=['FormaFlessa', 'Categoria', 'Lemma', 'Canto', 'Verso', 'PosizioneFFNelVerso'],\n",
")\n",
"testo_tabella.count()"
]
},
{
"cell_type": "code",
"execution_count": 272,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>FormaFlessa</th>\n",
" <th>Categoria</th>\n",
" <th>Lemma</th>\n",
" <th>Canto</th>\n",
" <th>Verso</th>\n",
" <th>PosizioneFFNelVerso</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>Nel</td>\n",
" <td>[rdms]</td>\n",
" <td>[il]</td>\n",
" <td>Canto 1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>mezzo</td>\n",
" <td>[eilaksl]</td>\n",
" <td>[in mezzo di]</td>\n",
" <td>Canto 1</td>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>del</td>\n",
" <td>[rdms]</td>\n",
" <td>[il]</td>\n",
" <td>Canto 1</td>\n",
" <td>1</td>\n",
" <td>3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>cammin</td>\n",
" <td>[sm2ms]</td>\n",
" <td>[cammino]</td>\n",
" <td>Canto 1</td>\n",
" <td>1</td>\n",
" <td>4</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>di</td>\n",
" <td>[epskg]</td>\n",
" <td>[di]</td>\n",
" <td>Canto 1</td>\n",
" <td>1</td>\n",
" <td>5</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>nostra</td>\n",
" <td>[as1fs]</td>\n",
" <td>[nostro]</td>\n",
" <td>Canto 1</td>\n",
" <td>1</td>\n",
" <td>6</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>vita</td>\n",
" <td>[sf1fs]</td>\n",
" <td>[vita]</td>\n",
" <td>Canto 1</td>\n",
" <td>1</td>\n",
" <td>7</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>mi</td>\n",
" <td>[pf1sypr]</td>\n",
" <td>[mi]</td>\n",
" <td>Canto 1</td>\n",
" <td>2</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>ritrovai</td>\n",
" <td>[vta+1irs1]</td>\n",
" <td>[ritrovare]</td>\n",
" <td>Canto 1</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>per</td>\n",
" <td>[epskpl]</td>\n",
" <td>[per]</td>\n",
" <td>Canto 1</td>\n",
" <td>2</td>\n",
" <td>3</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" FormaFlessa Categoria Lemma Canto Verso PosizioneFFNelVerso\n",
"0 Nel [rdms] [il] Canto 1 1 1\n",
"1 mezzo [eilaksl] [in mezzo di] Canto 1 1 2\n",
"2 del [rdms] [il] Canto 1 1 3\n",
"3 cammin [sm2ms] [cammino] Canto 1 1 4\n",
"4 di [epskg] [di] Canto 1 1 5\n",
"5 nostra [as1fs] [nostro] Canto 1 1 6\n",
"6 vita [sf1fs] [vita] Canto 1 1 7\n",
"7 mi [pf1sypr] [mi] Canto 1 2 1\n",
"8 ritrovai [vta+1irs1] [ritrovare] Canto 1 2 2\n",
"9 per [epskpl] [per] Canto 1 2 3"
]
},
"execution_count": 272,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Show the first 10 rows of the table.\n",
"testo_tabella.head(10)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Generiamo una tabella con gli IRI dei versi per la cantica *Inferno*\n",
"\n",
"La tabella contiene il numero del verso, il verso e l'IRI del verso. \n",
"Per l'IRI del verso mi son basato su quanto riportato nel file *Commedia.rdf*, un esempio è il seguente: \n",
"\n",
"> <http://hdn.dantenetwork.it/resource/work/commedia/cantica/2/canto/9/verso/106> \n",
"> a efrbroo:F2_Expression , rdfs:Resource ; \n",
"> <http://erlangen-crm.org/current/P190_has_symbolic_content> \n",
"> \"Per li tre gradi sù di buona voglia\"^^xsd:string ; \n",
"> <http://erlangen-crm.org/current/P3_has_note> \n",
"> \"106\"^^xsd:int ; \n",
"> <http://hdn.dantenetwork.it/resource/has_number> \n",
"> \"106\"^^xsd:int . \n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 280,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"NumeroVerso 4721\n",
"Verso 4721\n",
"IRIVerso 4721\n",
"dtype: int64"
]
},
"execution_count": 280,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# mytesto[1] is the second value returned by tei_to_csv_entry (tei.IRIverso):\n",
"# one (verse number, verse text, IRI description) tuple per verse.\n",
"data_IRI_versi_inf = [mytesto[1]]\n",
"#data_IRI_versi\n",
"df_IRI_versi_inf=pd.DataFrame(data_IRI_versi_inf[0], columns = ['NumeroVerso', 'Verso' , 'IRIVerso']) \n",
"df_IRI_versi_inf.count()"
]
},
{
"cell_type": "code",
"execution_count": 282,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<style type=\"text/css\" >\n",
"#T_07283f2a_66d2_11eb_8bb0_60f81dca6224row0_col2,#T_07283f2a_66d2_11eb_8bb0_60f81dca6224row1_col2,#T_07283f2a_66d2_11eb_8bb0_60f81dca6224row2_col2,#T_07283f2a_66d2_11eb_8bb0_60f81dca6224row3_col2,#T_07283f2a_66d2_11eb_8bb0_60f81dca6224row4_col2{\n",
" width: 400px;\n",
" }</style><table id=\"T_07283f2a_66d2_11eb_8bb0_60f81dca6224\" ><thead> <tr> <th class=\"blank level0\" ></th> <th class=\"col_heading level0 col0\" >NumeroVerso</th> <th class=\"col_heading level0 col1\" >Verso</th> <th class=\"col_heading level0 col2\" >IRIVerso</th> </tr></thead><tbody>\n",
" <tr>\n",
" <th id=\"T_07283f2a_66d2_11eb_8bb0_60f81dca6224level0_row0\" class=\"row_heading level0 row0\" >0</th>\n",
" <td id=\"T_07283f2a_66d2_11eb_8bb0_60f81dca6224row0_col0\" class=\"data row0 col0\" >1</td>\n",
" <td id=\"T_07283f2a_66d2_11eb_8bb0_60f81dca6224row0_col1\" class=\"data row0 col1\" >Nel mezzo del cammin di nostra vita</td>\n",
" <td id=\"T_07283f2a_66d2_11eb_8bb0_60f81dca6224row0_col2\" class=\"data row0 col2\" >http://hdn.dantenetwork.it/resource/work/commedia/cantica/1/canto/1/verso/1\n",
" a efrbroo:F2_Expression ,\n",
" rdfs:Resource ; \n",
"http://erlangen-crm.org/current/P190_has_symbolic_content \"Nel mezzo del cammin di nostra vita\"^^xsd:string ;\n",
" http://erlangen-crm.org/current/P3_has_note \"1\"^^xsd:int ;\n",
" http://hdn.dantenetwork.it/resource/has_number \"1\"^^xsd:int .</td>\n",
" </tr>\n",
" <tr>\n",
" <th id=\"T_07283f2a_66d2_11eb_8bb0_60f81dca6224level0_row1\" class=\"row_heading level0 row1\" >1</th>\n",
" <td id=\"T_07283f2a_66d2_11eb_8bb0_60f81dca6224row1_col0\" class=\"data row1 col0\" >2</td>\n",
" <td id=\"T_07283f2a_66d2_11eb_8bb0_60f81dca6224row1_col1\" class=\"data row1 col1\" >mi ritrovai per una selva oscura</td>\n",
" <td id=\"T_07283f2a_66d2_11eb_8bb0_60f81dca6224row1_col2\" class=\"data row1 col2\" >http://hdn.dantenetwork.it/resource/work/commedia/cantica/1/canto/1/verso/2\n",
" a efrbroo:F2_Expression ,\n",
" rdfs:Resource ; \n",
"http://erlangen-crm.org/current/P190_has_symbolic_content \"mi ritrovai per una selva oscura\"^^xsd:string ;\n",
" http://erlangen-crm.org/current/P3_has_note \"2\"^^xsd:int ;\n",
" http://hdn.dantenetwork.it/resource/has_number \"2\"^^xsd:int .</td>\n",
" </tr>\n",
" <tr>\n",
" <th id=\"T_07283f2a_66d2_11eb_8bb0_60f81dca6224level0_row2\" class=\"row_heading level0 row2\" >2</th>\n",
" <td id=\"T_07283f2a_66d2_11eb_8bb0_60f81dca6224row2_col0\" class=\"data row2 col0\" >3</td>\n",
" <td id=\"T_07283f2a_66d2_11eb_8bb0_60f81dca6224row2_col1\" class=\"data row2 col1\" >ché la diritta via era smarrita .</td>\n",
" <td id=\"T_07283f2a_66d2_11eb_8bb0_60f81dca6224row2_col2\" class=\"data row2 col2\" >http://hdn.dantenetwork.it/resource/work/commedia/cantica/1/canto/1/verso/3\n",
" a efrbroo:F2_Expression ,\n",
" rdfs:Resource ; \n",
"http://erlangen-crm.org/current/P190_has_symbolic_content \"ché la diritta via era smarrita .\"^^xsd:string ;\n",
" http://erlangen-crm.org/current/P3_has_note \"3\"^^xsd:int ;\n",
" http://hdn.dantenetwork.it/resource/has_number \"3\"^^xsd:int .</td>\n",
" </tr>\n",
" <tr>\n",
" <th id=\"T_07283f2a_66d2_11eb_8bb0_60f81dca6224level0_row3\" class=\"row_heading level0 row3\" >3</th>\n",
" <td id=\"T_07283f2a_66d2_11eb_8bb0_60f81dca6224row3_col0\" class=\"data row3 col0\" >4</td>\n",
" <td id=\"T_07283f2a_66d2_11eb_8bb0_60f81dca6224row3_col1\" class=\"data row3 col1\" >Ahi quanto a dir qual era è cosa dura</td>\n",
" <td id=\"T_07283f2a_66d2_11eb_8bb0_60f81dca6224row3_col2\" class=\"data row3 col2\" >http://hdn.dantenetwork.it/resource/work/commedia/cantica/1/canto/1/verso/4\n",
" a efrbroo:F2_Expression ,\n",
" rdfs:Resource ; \n",
"http://erlangen-crm.org/current/P190_has_symbolic_content \"Ahi quanto a dir qual era è cosa dura\"^^xsd:string ;\n",
" http://erlangen-crm.org/current/P3_has_note \"4\"^^xsd:int ;\n",
" http://hdn.dantenetwork.it/resource/has_number \"4\"^^xsd:int .</td>\n",
" </tr>\n",
" <tr>\n",
" <th id=\"T_07283f2a_66d2_11eb_8bb0_60f81dca6224level0_row4\" class=\"row_heading level0 row4\" >4</th>\n",
" <td id=\"T_07283f2a_66d2_11eb_8bb0_60f81dca6224row4_col0\" class=\"data row4 col0\" >5</td>\n",
" <td id=\"T_07283f2a_66d2_11eb_8bb0_60f81dca6224row4_col1\" class=\"data row4 col1\" >esta selva selvaggia e aspra e forte</td>\n",
" <td id=\"T_07283f2a_66d2_11eb_8bb0_60f81dca6224row4_col2\" class=\"data row4 col2\" >http://hdn.dantenetwork.it/resource/work/commedia/cantica/1/canto/1/verso/5\n",
" a efrbroo:F2_Expression ,\n",
" rdfs:Resource ; \n",
"http://erlangen-crm.org/current/P190_has_symbolic_content \"esta selva selvaggia e aspra e forte\"^^xsd:string ;\n",
" http://erlangen-crm.org/current/P3_has_note \"5\"^^xsd:int ;\n",
" http://hdn.dantenetwork.it/resource/has_number \"5\"^^xsd:int .</td>\n",
" </tr>\n",
" </tbody></table>"
],
"text/plain": [
"<pandas.io.formats.style.Styler at 0x14efc2a10>"
]
},
"execution_count": 282,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Show the first rows, with the IRIVerso column styled to a width of 400px.\n",
"df_IRI_versi_inf.head().style.set_properties(subset=['IRIVerso'], **{'width': '400px'})"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### File *purgatorio.xml*\n",
"\n",
"Test, ignorare.\n"
]
},
{
"cell_type": "code",
"execution_count": 276,
"metadata": {},
"outputs": [],
"source": [
"#tei_purgatorio = TEIFile('/Users/cesare/Projects/hdn/triple/DanteTriple/xml/DanteSearch/grammaticale/purgatorio.xml', 2)\n",
"#bbs_pu=tei_purgatorio.IRIverso\n",
"#for repu in bbs_pu:\n",
"# print (repu, end=\"\\n\"*2)\n",
"# if (repu[0].startswith('che')):\n",
"# print('...')\n",
"# break"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Carico il testo *purgatorio.xml* e creo una tabella\n",
"Eseguo il parsing del testo presente nel file e creo una tabella simile alla precedente\n",
"\\<LM1\\> forma flessa con due lemmi, gestire nel parser"
]
},
{
"cell_type": "code",
"execution_count": 277,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Handled /Users/cesare/Projects/hdn/triple/DanteTriple/xml/DanteSearch/grammaticale/purgatorio.xml\n"
]
}
],
"source": [
"# Parse the Purgatorio file with cantica id 2; the result is the tuple returned by tei_to_csv_entry.\n",
"parsed_purgatorio=tei_to_csv_entry('/Users/cesare/Projects/hdn/triple/DanteTriple/xml/DanteSearch/grammaticale/purgatorio.xml', 2)"
]
},
{
"cell_type": "code",
"execution_count": 278,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"FormaFlessa 33245\n",
"Categoria 33245\n",
"Lemma 33245\n",
"Canto 33245\n",
"Verso 33245\n",
"PosizioneFFNelVerso 33245\n",
"dtype: int64"
]
},
"execution_count": 278,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# TODO: to be completed with the correct IRI!\n",
"\n",
"# parsed_purgatorio[0] holds the orderedlemma rows: one tuple per inflected form.\n",
"data_purgatorio = [parsed_purgatorio[0]]\n",
"#dfObj_purgatorio = pd.DataFrame(data_purgatorio[0]) \n",
"testo_purgatorio_tabella=pd.DataFrame(data_purgatorio[0], columns = ['FormaFlessa' , 'Categoria', 'Lemma', 'Canto', 'Verso', 'PosizioneFFNelVerso']) \n",
"testo_purgatorio_tabella.count()"
]
},
{
"cell_type": "code",
"execution_count": 279,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>FormaFlessa</th>\n",
" <th>Categoria</th>\n",
" <th>Lemma</th>\n",
" <th>Canto</th>\n",
" <th>Verso</th>\n",
" <th>PosizioneFFNelVerso</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>33240</th>\n",
" <td>disposto</td>\n",
" <td>[vtp2pra1ms]</td>\n",
" <td>[disporre]</td>\n",
" <td>Canto 33</td>\n",
" <td>4755</td>\n",
" <td>3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>33241</th>\n",
" <td>a</td>\n",
" <td>[epsb]</td>\n",
" <td>[a]</td>\n",
" <td>Canto 33</td>\n",
" <td>4755</td>\n",
" <td>4</td>\n",
" </tr>\n",
" <tr>\n",
" <th>33242</th>\n",
" <td>salire</td>\n",
" <td>[vi3fp]</td>\n",
" <td>[salire]</td>\n",
" <td>Canto 33</td>\n",
" <td>4755</td>\n",
" <td>5</td>\n",
" </tr>\n",
" <tr>\n",
" <th>33243</th>\n",
" <td>alle</td>\n",
" <td>[rdfp, epakml]</td>\n",
" <td>[la, a]</td>\n",
" <td>Canto 33</td>\n",
" <td>4755</td>\n",
" <td>6</td>\n",
" </tr>\n",
" <tr>\n",
" <th>33244</th>\n",
" <td>stelle</td>\n",
" <td>[sf1fp]</td>\n",
" <td>[stella]</td>\n",
" <td>Canto 33</td>\n",
" <td>4755</td>\n",
" <td>7</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" FormaFlessa Categoria Lemma Canto Verso \\\n",
"33240 disposto [vtp2pra1ms] [disporre] Canto 33 4755 \n",
"33241 a [epsb] [a] Canto 33 4755 \n",
"33242 salire [vi3fp] [salire] Canto 33 4755 \n",
"33243 alle [rdfp, epakml] [la, a] Canto 33 4755 \n",
"33244 stelle [sf1fp] [stella] Canto 33 4755 \n",
"\n",
" PosizioneFFNelVerso \n",
"33240 3 \n",
"33241 4 \n",
"33242 5 \n",
"33243 6 \n",
"33244 7 "
]
},
"execution_count": 279,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Show the last rows of the Purgatorio table.\n",
"testo_purgatorio_tabella.tail()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Generiamo una tabella con gli IRI dei versi per la cantica *Purgatorio*\n",
"\n",
"La tabella contiene il numero del verso, il verso e l'IRI del verso. "
]
},
{
"cell_type": "code",
"execution_count": 284,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"NumeroVerso 4755\n",
"Verso 4755\n",
"IRIVerso 4755\n",
"dtype: int64"
]
},
"execution_count": 284,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# parsed_purgatorio[1] is the second value returned by tei_to_csv_entry (tei.IRIverso):\n",
"# one (verse number, verse text, IRI description) tuple per verse.\n",
"data_IRI_versi_pur = [parsed_purgatorio[1]]\n",
"#data_IRI_versi\n",
"df_IRI_versi_pur=pd.DataFrame(data_IRI_versi_pur[0], columns = ['NumeroVerso', 'Verso' , 'IRIVerso']) \n",
"df_IRI_versi_pur.count()"
]
},
{
"cell_type": "code",
"execution_count": 285,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<style type=\"text/css\" >\n",
"#T_b043fcfc_66d2_11eb_8bb0_60f81dca6224row0_col2,#T_b043fcfc_66d2_11eb_8bb0_60f81dca6224row1_col2,#T_b043fcfc_66d2_11eb_8bb0_60f81dca6224row2_col2,#T_b043fcfc_66d2_11eb_8bb0_60f81dca6224row3_col2,#T_b043fcfc_66d2_11eb_8bb0_60f81dca6224row4_col2{\n",
" width: 400px;\n",
" }</style><table id=\"T_b043fcfc_66d2_11eb_8bb0_60f81dca6224\" ><thead> <tr> <th class=\"blank level0\" ></th> <th class=\"col_heading level0 col0\" >NumeroVerso</th> <th class=\"col_heading level0 col1\" >Verso</th> <th class=\"col_heading level0 col2\" >IRIVerso</th> </tr></thead><tbody>\n",
" <tr>\n",
" <th id=\"T_b043fcfc_66d2_11eb_8bb0_60f81dca6224level0_row0\" class=\"row_heading level0 row0\" >0</th>\n",
" <td id=\"T_b043fcfc_66d2_11eb_8bb0_60f81dca6224row0_col0\" class=\"data row0 col0\" >1</td>\n",
" <td id=\"T_b043fcfc_66d2_11eb_8bb0_60f81dca6224row0_col1\" class=\"data row0 col1\" >Per correr miglior acque alza le vele</td>\n",
" <td id=\"T_b043fcfc_66d2_11eb_8bb0_60f81dca6224row0_col2\" class=\"data row0 col2\" >http://hdn.dantenetwork.it/resource/work/commedia/cantica/2/canto/1/verso/1\n",
" a efrbroo:F2_Expression ,\n",
" rdfs:Resource ; \n",
"http://erlangen-crm.org/current/P190_has_symbolic_content \"Per correr miglior acque alza le vele\"^^xsd:string ;\n",
" http://erlangen-crm.org/current/P3_has_note \"1\"^^xsd:int ;\n",
" http://hdn.dantenetwork.it/resource/has_number \"1\"^^xsd:int .</td>\n",
" </tr>\n",
" <tr>\n",
" <th id=\"T_b043fcfc_66d2_11eb_8bb0_60f81dca6224level0_row1\" class=\"row_heading level0 row1\" >1</th>\n",
" <td id=\"T_b043fcfc_66d2_11eb_8bb0_60f81dca6224row1_col0\" class=\"data row1 col0\" >2</td>\n",
" <td id=\"T_b043fcfc_66d2_11eb_8bb0_60f81dca6224row1_col1\" class=\"data row1 col1\" >omai la navicella del del mio ingegno ,</td>\n",
" <td id=\"T_b043fcfc_66d2_11eb_8bb0_60f81dca6224row1_col2\" class=\"data row1 col2\" >http://hdn.dantenetwork.it/resource/work/commedia/cantica/2/canto/1/verso/2\n",
" a efrbroo:F2_Expression ,\n",
" rdfs:Resource ; \n",
"http://erlangen-crm.org/current/P190_has_symbolic_content \"omai la navicella del del mio ingegno ,\"^^xsd:string ;\n",
" http://erlangen-crm.org/current/P3_has_note \"2\"^^xsd:int ;\n",
" http://hdn.dantenetwork.it/resource/has_number \"2\"^^xsd:int .</td>\n",
" </tr>\n",
" <tr>\n",
" <th id=\"T_b043fcfc_66d2_11eb_8bb0_60f81dca6224level0_row2\" class=\"row_heading level0 row2\" >2</th>\n",
" <td id=\"T_b043fcfc_66d2_11eb_8bb0_60f81dca6224row2_col0\" class=\"data row2 col0\" >3</td>\n",
" <td id=\"T_b043fcfc_66d2_11eb_8bb0_60f81dca6224row2_col1\" class=\"data row2 col1\" >che lascia dietro a sé mar sì crudele;</td>\n",
" <td id=\"T_b043fcfc_66d2_11eb_8bb0_60f81dca6224row2_col2\" class=\"data row2 col2\" >http://hdn.dantenetwork.it/resource/work/commedia/cantica/2/canto/1/verso/3\n",
" a efrbroo:F2_Expression ,\n",
" rdfs:Resource ; \n",
"http://erlangen-crm.org/current/P190_has_symbolic_content \"che lascia dietro a sé mar sì crudele;\"^^xsd:string ;\n",
" http://erlangen-crm.org/current/P3_has_note \"3\"^^xsd:int ;\n",
" http://hdn.dantenetwork.it/resource/has_number \"3\"^^xsd:int .</td>\n",
" </tr>\n",
" <tr>\n",
" <th id=\"T_b043fcfc_66d2_11eb_8bb0_60f81dca6224level0_row3\" class=\"row_heading level0 row3\" >3</th>\n",
" <td id=\"T_b043fcfc_66d2_11eb_8bb0_60f81dca6224row3_col0\" class=\"data row3 col0\" >4</td>\n",
" <td id=\"T_b043fcfc_66d2_11eb_8bb0_60f81dca6224row3_col1\" class=\"data row3 col1\" >e canterò di quel secondo regno</td>\n",
" <td id=\"T_b043fcfc_66d2_11eb_8bb0_60f81dca6224row3_col2\" class=\"data row3 col2\" >http://hdn.dantenetwork.it/resource/work/commedia/cantica/2/canto/1/verso/4\n",
" a efrbroo:F2_Expression ,\n",
" rdfs:Resource ; \n",
"http://erlangen-crm.org/current/P190_has_symbolic_content \"e canterò di quel secondo regno\"^^xsd:string ;\n",
" http://erlangen-crm.org/current/P3_has_note \"4\"^^xsd:int ;\n",
" http://hdn.dantenetwork.it/resource/has_number \"4\"^^xsd:int .</td>\n",
" </tr>\n",
" <tr>\n",
" <th id=\"T_b043fcfc_66d2_11eb_8bb0_60f81dca6224level0_row4\" class=\"row_heading level0 row4\" >4</th>\n",
" <td id=\"T_b043fcfc_66d2_11eb_8bb0_60f81dca6224row4_col0\" class=\"data row4 col0\" >5</td>\n",
" <td id=\"T_b043fcfc_66d2_11eb_8bb0_60f81dca6224row4_col1\" class=\"data row4 col1\" >dove l' umano spirito si purga</td>\n",
" <td id=\"T_b043fcfc_66d2_11eb_8bb0_60f81dca6224row4_col2\" class=\"data row4 col2\" >http://hdn.dantenetwork.it/resource/work/commedia/cantica/2/canto/1/verso/5\n",
" a efrbroo:F2_Expression ,\n",
" rdfs:Resource ; \n",
"http://erlangen-crm.org/current/P190_has_symbolic_content \"dove l' umano spirito si purga\"^^xsd:string ;\n",
" http://erlangen-crm.org/current/P3_has_note \"5\"^^xsd:int ;\n",
" http://hdn.dantenetwork.it/resource/has_number \"5\"^^xsd:int .</td>\n",
" </tr>\n",
" </tbody></table>"
],
"text/plain": [
"<pandas.io.formats.style.Styler at 0x14cdefe10>"
]
},
"execution_count": 285,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Show the first rows, with the IRIVerso column styled to a width of 400px.\n",
"df_IRI_versi_pur.head().style.set_properties(subset=['IRIVerso'], **{'width': '400px'})"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.7"
}
},
"nbformat": 4,
"nbformat_minor": 4
}