diff --git a/TAPoRCheck.ipynb b/TAPoRCheck.ipynb new file mode 100644 index 0000000..5bbb3cf --- /dev/null +++ b/TAPoRCheck.ipynb @@ -0,0 +1,3408 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Review of data ingested from TAPoR (draft)\n", + "\n", + "This document checks the TAPoR dataset using the Python library pandas.\n", + "\n", + "Reference to ticket: https://gitlab.gwdg.de/sshoc/data-ingestion/-/issues/7\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Preamble" + ] + }, + { + "cell_type": "code", + "execution_count": 79, + "metadata": {}, + "outputs": [], + "source": [ + "import ast\n", + "import sys\n", + "import numpy as np\n", + "import pandas as pd\n", + "import matplotlib.pyplot as plt\n", + "\n", + "from bokeh.io import output_notebook, show\n", + "from bokeh.plotting import figure\n", + "\n", + "from im_tutorials.data import *\n", + "from im_tutorials.utilities import flatten_lists\n", + "from im_tutorials.features.text_preprocessing import *\n", + "from im_tutorials.features.document_vectors import document_vector\n", + "from im_tutorials.features.dim_reduction import WrapTSNE, GaussianMixtureEval\n", + "# for db\n", + "import sqlalchemy as db\n", + "from sqlalchemy import *" + ] + }, + { + "cell_type": "code", + "execution_count": 107, + "metadata": {}, + "outputs": [], + "source": [ + "engine = create_engine(\n", + " \"connection_string\")\n", + "connection = engine.connect()\n", + "metadata = db.MetaData()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Import data" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Query the DB to get TAPoR data\n", + "\n", + "The TAPoR dataset used in this document is the SQL dump published by the Education and Research Archive (ERA) of the University of Alberta: \n", + "\n", + "https://era.library.ualberta.ca/items/f2da0666-f523-44d4-a83c-fa06351a1e94 \n", + "\n", + "(creation date: 
2020-01-01).\n", + "The table *tools* contains 1504 records, each one describing a tool. \n", + "Records have been filtered according to the value of the field *tools.is_approved*; there are 1363 *approved* records.\n", + "In this document this dataset will be called the **TAPoR dataset**.\n", + "\n", + "*Note that the TAPoR dataset reviewed here is not the same one that was used for the MP ingestion; this document will be updated when that dataset becomes available.*\n" + ] + }, + { + "cell_type": "code", + "execution_count": 108, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "RangeIndex(start=0, stop=1363, step=1)" + ] + }, + "execution_count": 108, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_db_tools=pd.read_sql_query('SELECT * FROM TaPOR.tools where is_approved=1 order by last_updated', connection)\n", + "df_db_tools.index" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### An example of a TAPoR item\n", + "Let's take a look at a random TAPoR dataset entry.\n", + "(The database schema of the TAPoR dataset is described here: https://era.library.ualberta.ca/items/f2da0666-f523-44d4-a83c-fa06351a1e94/download/8057eae2-3fae-4afa-bc8e-6dcc2a257b6f.)" + ] + }, + { + "cell_type": "code", + "execution_count": 116, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "id 254\n", + "user_id NaN\n", + "name TextQuest\n", + "detail 
TextQuest is a text analysis program availa...\n", + "url http://www.textquest.de/pages/en/general-infor...\n", + "is_approved 1\n", + "creators_name Social Science Consulting\n", + "creators_email info@textquest.de\n", + "creators_url http://www.textquest.de/\n", + "image_url images/tools/0/254.png\n", + "star_average 0\n", + "is_hidden 0\n", + "last_updated 2013-05-13\n", + "documentation_url http://www.textquest.de/pages/en/analysis-of-t...\n", + "code None\n", + "repository \n", + "language NaN\n", + "nature 0\n", + "created_at 2013-05-13 18:57:27\n", + "updated_at 2017-10-31 14:25:28\n", + "recipes \n", + "Name: 500, dtype: object" + ] + }, + "execution_count": 116, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#df_db_tools.dtypes\n", + "df_db_tools.iloc[500]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The following table shows 5 records of the TAPoR dataset." + ] + }, + { + "cell_type": "code", + "execution_count": 111, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + " | id | \n", + "user_id | \n", + "name | \n", + "detail | \n", + "url | \n", + "is_approved | \n", + "creators_name | \n", + "creators_email | \n", + "creators_url | \n", + "image_url | \n", + "... | \n", + "is_hidden | \n", + "last_updated | \n", + "documentation_url | \n", + "code | \n", + "repository | \n", + "language | \n", + "nature | \n", + "created_at | \n", + "updated_at | \n", + "recipes | \n", + "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
906 | \n", + "937 | \n", + "1.0 | \n", + "140kit | \n", + "<p>140kit provides a management layer for twee... | \n", + "https://github.com/WebEcologyProject/140kit | \n", + "1 | \n", + "Ian Pearce, Devin Gaffney | \n", + "None | \n", + "None | \n", + "images/tools/1/937.png | \n", + "... | \n", + "0 | \n", + "2018-10-05 | \n", + "None | \n", + "None | \n", + "None | \n", + "NaN | \n", + "0 | \n", + "2015-05-24 00:00:00 | \n", + "2018-10-05 04:43:34 | \n", + "\n", + " |
334 | \n", + "1229 | \n", + "1.0 | \n", + "3DVIA Virtools | \n", + "<p>A software tool for the creation of 3D inte... | \n", + "None | \n", + "1 | \n", + "Dassault Systemes | \n", + "None | \n", + "None | \n", + "None | \n", + "... | \n", + "0 | \n", + "None | \n", + "None | \n", + "None | \n", + "None | \n", + "NaN | \n", + "0 | \n", + "2014-12-29 00:00:00 | \n", + "2014-12-29 00:00:00 | \n", + "\n", + " |
688 | \n", + "783 | \n", + "1.0 | \n", + "4th Dimension | \n", + "4th Dimension is a graphic environment for dev... | \n", + "http://www.4d.com/products/4d2004/4dstandarded... | \n", + "1 | \n", + "4D | \n", + "None | \n", + "http://www.4d.com/ | \n", + "images/tools/1/783.png | \n", + "... | \n", + "0 | \n", + "2018-09-18 | \n", + "None | \n", + "None | \n", + "None | \n", + "NaN | \n", + "0 | \n", + "2015-05-24 00:00:00 | \n", + "2018-09-18 20:39:31 | \n", + "\n", + " |
1156 | \n", + "648 | \n", + "937.0 | \n", + "80legs | \n", + "80legs is a web crawling service. You need to ... | \n", + "http://80legs.com/ | \n", + "1 | \n", + "80legs | \n", + "\n", + " | \n", + " | images/tools/1/648.png | \n", + "... | \n", + "0 | \n", + "2018-10-30 | \n", + "None | \n", + "None | \n", + "\n", + " | NaN | \n", + "0 | \n", + "2017-10-15 23:04:46 | \n", + "2018-10-30 16:03:45 | \n", + "\n", + " |
770 | \n", + "1454 | \n", + "1.0 | \n", + "960 Grid System | \n", + "<p>960 Grid System is a CSS template that come... | \n", + "https://960.gs/ | \n", + "1 | \n", + "Nathan Smith | \n", + "None | \n", + "http://sonspring.com/ | \n", + "images/tools/2/1454.png | \n", + "... | \n", + "0 | \n", + "2018-09-27 | \n", + "None | \n", + "None | \n", + "https://github.com/nathansmith/960-Grid-System | \n", + "NaN | \n", + "0 | \n", + "2014-12-29 00:00:00 | \n", + "2018-09-27 22:29:43 | \n", + "\n", + " |
5 rows × 21 columns
\n", + "140kit provides a management layer for twee... \n", + "334
A software tool for the creation of 3D inte... \n", + "688 4th Dimension is a graphic environment for dev... \n", + "1156 80legs is a web crawling service. You need to ... \n", + "770
960 Grid System is a CSS template that come... \n", + "\n", + " url is_approved \\\n", + "906 https://github.com/WebEcologyProject/140kit 1 \n", + "334 None 1 \n", + "688 http://www.4d.com/products/4d2004/4dstandarded... 1 \n", + "1156 http://80legs.com/ 1 \n", + "770 https://960.gs/ 1 \n", + "\n", + " creators_name creators_email creators_url \\\n", + "906 Ian Pearce, Devin Gaffney None None \n", + "334 Dassault Systemes None None \n", + "688 4D None http://www.4d.com/ \n", + "1156 80legs \n", + "770 Nathan Smith None http://sonspring.com/ \n", + "\n", + " image_url ... is_hidden last_updated documentation_url \\\n", + "906 images/tools/1/937.png ... 0 2018-10-05 None \n", + "334 None ... 0 None None \n", + "688 images/tools/1/783.png ... 0 2018-09-18 None \n", + "1156 images/tools/1/648.png ... 0 2018-10-30 None \n", + "770 images/tools/2/1454.png ... 0 2018-09-27 None \n", + "\n", + " code repository language nature \\\n", + "906 None None NaN 0 \n", + "334 None None NaN 0 \n", + "688 None None NaN 0 \n", + "1156 None NaN 0 \n", + "770 None https://github.com/nathansmith/960-Grid-System NaN 0 \n", + "\n", + " created_at updated_at recipes \n", + "906 2015-05-24 00:00:00 2018-10-05 04:43:34 \n", + "334 2014-12-29 00:00:00 2014-12-29 00:00:00 \n", + "688 2015-05-24 00:00:00 2018-09-18 20:39:31 \n", + "1156 2017-10-15 23:04:46 2018-10-30 16:03:45 \n", + "770 2014-12-29 00:00:00 2018-09-27 22:29:43 \n", + "\n", + "[5 rows x 21 columns]" + ] + }, + "execution_count": 111, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_db_tools.sort_values('name').head(5)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Check for duplicates in TAPoR dataset\n", + "Considering the values for 'name' and 'url', it appears that in the TAPoR dataset there are 4 duplicated descriptions" + ] + }, + { + "cell_type": "code", + "execution_count": 117, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + " | id | \n", + "user_id | \n", + "name | \n", + "detail | \n", + "url | \n", + "is_approved | \n", + "creators_name | \n", + "creators_email | \n", + "creators_url | \n", + "image_url | \n", + "... | \n", + "is_hidden | \n", + "last_updated | \n", + "documentation_url | \n", + "code | \n", + "repository | \n", + "language | \n", + "nature | \n", + "created_at | \n", + "updated_at | \n", + "recipes | \n", + "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
1358 | \n", + "148 | \n", + "NaN | \n", + "AntConc | \n", + "AntConc is free concordance software. It is mu... | \n", + "http://www.laurenceanthony.net/software/antconc/ | \n", + "1 | \n", + "Laurence Anthony | \n", + "anthony@waseda.jp | \n", + "http://www.antlab.sci.waseda.ac.jp/index.html | \n", + "images/tools/0/148.png | \n", + "... | \n", + "0 | \n", + "2019-08-19 | \n", + "http://www.laurenceanthony.net/software/antcon... | \n", + "None | \n", + "\n", + " | NaN | \n", + "0 | \n", + "2012-07-30 18:25:44 | \n", + "2019-08-19 00:37:45 | \n", + "\n", + " |
1362 | \n", + "1565 | \n", + "1201.0 | \n", + "SentiStrength | \n", + "SentiStrength is a sentiment analysis (opinion... | \n", + "http://sentistrength.wlv.ac.uk/ | \n", + "1 | \n", + "Mike Thelwall | \n", + "m.thelwall@wlv.ac.uk | \n", + "http://sentistrength.wlv.ac.uk | \n", + "images/tools/3/1565.png | \n", + "... | \n", + "0 | \n", + "2019-09-27 | \n", + "None | \n", + "None | \n", + "\n", + " | NaN | \n", + "0 | \n", + "2019-09-20 05:03:47 | \n", + "2019-09-27 10:03:35 | \n", + "\n", + " |
652 | \n", + "580 | \n", + "937.0 | \n", + "Voyant 2.0: Knots | \n", + "Voyant Knots is a visualization where a line i... | \n", + "http://voyant-tools.org/?view=knots | \n", + "1 | \n", + "Stéfan Sinclair and Geoffrey Rockwell | \n", + "stefan.sinclair@mcgill.ca | \n", + "http://stefansinclair.name/ | \n", + "images/tools/1/580.png | \n", + "... | \n", + "1 | \n", + "2016-04-29 | \n", + "None | \n", + "None | \n", + "\n", + " | NaN | \n", + "0 | \n", + "2016-04-29 16:08:28 | \n", + "2017-10-31 14:26:36 | \n", + "\n", + " |
653 | \n", + "581 | \n", + "937.0 | \n", + "Voyant 2.0: Knots | \n", + "Voyant Knots is a visualization where a line i... | \n", + "http://voyant-tools.org/?view=knots | \n", + "1 | \n", + "Stéfan Sinclair and Geoffrey Rockwell | \n", + "stefan.sinclair@mcgill.ca | \n", + "http://stefansinclair.name/ | \n", + "images/tools/1/581.png | \n", + "... | \n", + "0 | \n", + "2016-04-29 | \n", + "None | \n", + "None | \n", + "\n", + " | NaN | \n", + "0 | \n", + "2016-04-29 16:11:55 | \n", + "2017-10-31 14:26:36 | \n", + "\n", + " |
4 rows × 21 columns
\n", + "\n", + " | id | \n", + "category | \n", + "label | \n", + "version | \n", + "description | \n", + "licenses | \n", + "contributors | \n", + "properties | \n", + "accessibleAt | \n", + "sourceItemId | \n", + "... | \n", + "status | \n", + "comments | \n", + "olderVersions | \n", + "newerVersions | \n", + "repository | \n", + "source.id | \n", + "source.label | \n", + "source.url | \n", + "source.urlTemplate | \n", + "source | \n", + "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
326 | \n", + "335 | \n", + "tool | \n", + "EVI-LINHD | \n", + "None | \n", + "EVI-LINHD is a free and open-source cloud plat... | \n", + "[] | \n", + "[{'actor': {'id': 275, 'name': 'Elena González... | \n", + "[{'id': 2702, 'type': {'code': 'thumbnail', 'l... | \n", + "http://www.evilinhd.com/ | \n", + "594 | \n", + "... | \n", + "ingested | \n", + "[] | \n", + "[] | \n", + "[] | \n", + "None | \n", + "1.0 | \n", + "TAPoR | \n", + "http://tapor.ca | \n", + "http://tapor.ca/tools/{source-item-id} | \n", + "NaN | \n", + "
532 | \n", + "776 | \n", + "tool | \n", + "JSAN | \n", + "None | \n", + "The Integrated JStylo and Anonymouth Package. ... | \n", + "[] | \n", + "[{'actor': {'id': 493, 'name': '18th Connect',... | \n", + "[{'id': 7310, 'type': {'code': 'thumbnail', 'l... | \n", + "https://github.com/psal/jstylo | \n", + "1559 | \n", + "... | \n", + "ingested | \n", + "[] | \n", + "[] | \n", + "[] | \n", + "None | \n", + "1.0 | \n", + "TAPoR | \n", + "http://tapor.ca | \n", + "http://tapor.ca/tools/{source-item-id} | \n", + "NaN | \n", + "
533 | \n", + "451 | \n", + "tool | \n", + "JSAN | \n", + "None | \n", + "The Integrated JStylo and Anonymouth Package. ... | \n", + "[] | \n", + "[{'actor': {'id': 493, 'name': '18th Connect',... | \n", + "[{'id': 4037, 'type': {'code': 'keyword', 'lab... | \n", + "https://github.com/psal/jstylo | \n", + "1557 | \n", + "... | \n", + "ingested | \n", + "[] | \n", + "[] | \n", + "[] | \n", + "None | \n", + "1.0 | \n", + "TAPoR | \n", + "http://tapor.ca | \n", + "http://tapor.ca/tools/{source-item-id} | \n", + "NaN | \n", + "
697 | \n", + "1186 | \n", + "tool | \n", + "NodeXL | \n", + "None | \n", + "NodeXL is a free, open source tool for generat... | \n", + "[] | \n", + "[{'actor': {'id': 832, 'name': 'M. Smith, N. M... | \n", + "[{'id': 11766, 'type': {'code': 'license-type'... | \n", + "http://nodexl.codeplex.com/ | \n", + "482 | \n", + "... | \n", + "ingested | \n", + "[] | \n", + "[] | \n", + "[] | \n", + "None | \n", + "1.0 | \n", + "TAPoR | \n", + "http://tapor.ca | \n", + "http://tapor.ca/tools/{source-item-id} | \n", + "NaN | \n", + "
854 | \n", + "560 | \n", + "tool | \n", + "Python Tools for Text-Analysis | \n", + "None | \n", + "This is a set of simple, free tools for analyz... | \n", + "[] | \n", + "[{'actor': {'id': 424, 'name': 'David L. Hoove... | \n", + "[{'id': 5060, 'type': {'code': 'thumbnail', 'l... | \n", + "https://wp.nyu.edu/exceltextanalysis/python_to... | \n", + "1507 | \n", + "... | \n", + "ingested | \n", + "[] | \n", + "[] | \n", + "[] | \n", + "None | \n", + "1.0 | \n", + "TAPoR | \n", + "http://tapor.ca | \n", + "http://tapor.ca/tools/{source-item-id} | \n", + "NaN | \n", + "
947 | \n", + "1136 | \n", + "tool | \n", + "SentiStrength | \n", + "None | \n", + "SentiStrength is a tool for sentiment analysis... | \n", + "[] | \n", + "[{'actor': {'id': 799, 'name': 'Thelwall, M., ... | \n", + "[{'id': 11290, 'type': {'code': 'keyword', 'la... | \n", + "http://sentistrength.wlv.ac.uk/ | \n", + "453 | \n", + "... | \n", + "ingested | \n", + "[] | \n", + "[] | \n", + "[] | \n", + "None | \n", + "1.0 | \n", + "TAPoR | \n", + "http://tapor.ca | \n", + "http://tapor.ca/tools/{source-item-id} | \n", + "NaN | \n", + "
948 | \n", + "378 | \n", + "tool | \n", + "SentiStrength | \n", + "None | \n", + "It is a sentiment analysis program. Automatic ... | \n", + "[] | \n", + "[{'actor': {'id': 493, 'name': '18th Connect',... | \n", + "[{'id': 3210, 'type': {'code': 'thumbnail', 'l... | \n", + "http://sentistrength.wlv.ac.uk/ | \n", + "1564 | \n", + "... | \n", + "ingested | \n", + "[] | \n", + "[] | \n", + "[] | \n", + "None | \n", + "1.0 | \n", + "TAPoR | \n", + "http://tapor.ca | \n", + "http://tapor.ca/tools/{source-item-id} | \n", + "NaN | \n", + "
1187 | \n", + "607 | \n", + "tool | \n", + "UCINET | \n", + "None | \n", + "UCINET is a social media analysis set for soft... | \n", + "[] | \n", + "[{'actor': {'id': 459, 'name': 'Borgatti, S.P.... | \n", + "[{'id': 5501, 'type': {'code': 'tadirah-method... | \n", + "https://sites.google.com/site/ucinetsoftware/home | \n", + "576 | \n", + "... | \n", + "ingested | \n", + "[] | \n", + "[] | \n", + "[] | \n", + "None | \n", + "1.0 | \n", + "TAPoR | \n", + "http://tapor.ca | \n", + "http://tapor.ca/tools/{source-item-id} | \n", + "NaN | \n", + "
476 | \n", + "165 | \n", + "tool | \n", + "igraph | \n", + "None | \n", + "igraph is an open source collection of network... | \n", + "[] | \n", + "[{'actor': {'id': 147, 'name': 'Gábor Csárdi, ... | \n", + "[{'id': 771, 'type': {'code': 'tadirah-methods... | \n", + "http://igraph.org/ | \n", + "623 | \n", + "... | \n", + "ingested | \n", + "[] | \n", + "[] | \n", + "[] | \n", + "None | \n", + "1.0 | \n", + "TAPoR | \n", + "http://tapor.ca | \n", + "http://tapor.ca/tools/{source-item-id} | \n", + "NaN | \n", + "
9 rows × 23 columns
\n", + "\n", + " | name | \n", + "url | \n", + "_merge | \n", + "
---|---|---|---|
0 | \n", + "140kit | \n", + "https://github.com/WebEcologyProject/140kit | \n", + "both | \n", + "
1 | \n", + "3DVIA Virtools | \n", + "\n", + " | both | \n", + "
2 | \n", + "4th Dimension | \n", + "http://www.4d.com/products/4d2004/4dstandarded... | \n", + "both | \n", + "
3 | \n", + "80legs | \n", + "http://80legs.com/ | \n", + "both | \n", + "
4 | \n", + "960 Grid System | \n", + "https://960.gs/ | \n", + "both | \n", + "
\n", + " | name | \n", + "url | \n", + "_merge | \n", + "
---|---|---|---|
142 | \n", + "CONDOR | \n", + "http://www.ickn.org/ckntools.html | \n", + "left_only | \n", + "
144 | \n", + "CQPweb | \n", + "https://cqpweb.lancs.ac.uk/ | \n", + "left_only | \n", + "
146 | \n", + "CSV Sort | \n", + "https://bitbucket.org/richardpenman/csvsort | \n", + "left_only | \n", + "
156 | \n", + "CasualConc | \n", + "https://sites.google.com/site/casualconc/ | \n", + "left_only | \n", + "
161 | \n", + "Chartle | \n", + "\n", + " | left_only | \n", + "
163 | \n", + "Chorus | \n", + "http://chorusanalytics.co.uk/ | \n", + "left_only | \n", + "
165 | \n", + "Chronos Timeline | \n", + "http://hyperstudio.mit.edu/software/chronos-ti... | \n", + "left_only | \n", + "
180 | \n", + "Code Bubbles | \n", + "http://cs.brown.edu/~spr/codebubbles/ | \n", + "left_only | \n", + "
184 | \n", + "Colaboratory | \n", + "https://colab.research.google.com/notebooks/we... | \n", + "left_only | \n", + "
214 | \n", + "ContaWords | \n", + "http://contawords.iula.upf.edu/ | \n", + "left_only | \n", + "
215 | \n", + "Contropedia | \n", + "http://contropedia.net/ | \n", + "left_only | \n", + "
220 | \n", + "Cowo | \n", + "https://github.com/seinecle/Cowo/blob/master/R... | \n", + "left_only | \n", + "
223 | \n", + "Critic Markup | \n", + "http://criticmarkup.com/ | \n", + "left_only | \n", + "
228 | \n", + "Cytoscape | \n", + "http://www.cytoscape.org/ | \n", + "left_only | \n", + "
254 | \n", + "Density Design - Knot | \n", + "http://www.densitydesign.org/research/knot/ | \n", + "left_only | \n", + "
255 | \n", + "DfR Browser | \n", + "https://agoldst.github.io/dfr-browser/ | \n", + "left_only | \n", + "
300 | \n", + "EVI-LINHD | \n", + "http://www.evilinhd.com/ | \n", + "left_only | \n", + "
307 | \n", + "EgoWeb 2.0 | \n", + "http://www.rand.org/methods/egoweb.html | \n", + "left_only | \n", + "
332 | \n", + "Facepager | \n", + "https://github.com/strohne/Facepager | \n", + "left_only | \n", + "
342 | \n", + "Find Locations from A Text (Named-Entity Recog... | \n", + "\n", + " | left_only | \n", + "
\n", + " | name | \n", + "url | \n", + "_merge | \n", + "
---|---|---|---|
1343 | \n", + "ANNIS | \n", + "\n", + " | right_only | \n", + "
1344 | \n", + "Adobe Flash | \n", + "\n", + " | right_only | \n", + "
1345 | \n", + "Ainm.ie | \n", + "\n", + " | right_only | \n", + "
1346 | \n", + "Alpheios | \n", + "\n", + " | right_only | \n", + "
1347 | \n", + "Anastasia | \n", + "\n", + " | right_only | \n", + "
1348 | \n", + "ArcExplorer | \n", + "\n", + " | right_only | \n", + "
1349 | \n", + "AroniSmartIntelligence™ | \n", + "\n", + " | right_only | \n", + "
1350 | \n", + "Aruspix | \n", + "\n", + " | right_only | \n", + "
1351 | \n", + "BASE | \n", + "\n", + " | right_only | \n", + "
1352 | \n", + "Basement Waterproofing: Tips and Instructions | \n", + "\n", + " | right_only | \n", + "
1353 | \n", + "Berkeley Parser | \n", + "\n", + " | right_only | \n", + "
1354 | \n", + "CATMA (Computer Aided Textual Markup and Analy... | \n", + "http://www.catma.de/ | \n", + "right_only | \n", + "
1355 | \n", + "Canva \"The Amazingly Simple Graphic Design Sof... | \n", + "\n", + " | right_only | \n", + "
1356 | \n", + "Chicken | \n", + "\n", + " | right_only | \n", + "
1357 | \n", + "CloudConvert | \n", + "\n", + " | right_only | \n", + "
1358 | \n", + "Collocate | \n", + "http:// | \n", + "right_only | \n", + "
1359 | \n", + "Commentpress | \n", + "\n", + " | right_only | \n", + "
1360 | \n", + "CoolTool NeuroLab | \n", + "\n", + " | right_only | \n", + "
1361 | \n", + "Datapress | \n", + "\n", + " | right_only | \n", + "
1362 | \n", + "Delicious | \n", + "\n", + " | right_only | \n", + "
\n", + " | name | \n", + "url | \n", + "last_updated | \n", + "
---|---|---|---|
423 | \n", + "List Words - HTML (TAPoRware) | \n", + "http://taporware.ualberta.ca/~taporware/htmlTo... | \n", + "2011-11-27 | \n", + "
424 | \n", + "List Words - XML (TAPoRware) | \n", + "http://taporware.ualberta.ca/~taporware/xmlToo... | \n", + "2011-11-27 | \n", + "
425 | \n", + "List Words - Plain Text (TAPoRware) | \n", + "http://taporware.ualberta.ca/~taporware/textTo... | \n", + "2011-11-28 | \n", + "
426 | \n", + "List Tags - HTML (TAPoRware) | \n", + "http://taporware.ualberta.ca/~taporware/htmlTo... | \n", + "2011-11-28 | \n", + "
427 | \n", + "List XML Elements (TAPoRware) | \n", + "http://taporware.ualberta.ca/~taporware/xmlToo... | \n", + "2011-11-28 | \n", + "
\n", + " | url | \n", + "status | \n", + "
---|---|---|
0 | \n", + "test | \n", + "1.0 | \n", + "
1 | \n", + "http://taporware.ualberta.ca/~taporware/htmlTo... | \n", + "404.0 | \n", + "
2 | \n", + "http://taporware.ualberta.ca/~taporware/textTo... | \n", + "404.0 | \n", + "
3 | \n", + "http://taporware.ualberta.ca/~taporware/htmlTo... | \n", + "404.0 | \n", + "
4 | \n", + "http://taporware.ualberta.ca/~taporware/textTo... | \n", + "404.0 | \n", + "
\n", + " | name | \n", + "
---|---|
0 | \n", + "Type of analysis | \n", + "
1 | \n", + "Type of license | \n", + "
2 | \n", + "Background Processing | \n", + "
3 | \n", + "Web Usable | \n", + "
4 | \n", + "Ease of Use | \n", + "
5 | \n", + "Warning | \n", + "
6 | \n", + "Usage | \n", + "
7 | \n", + "Tool Family | \n", + "
8 | \n", + "Historic Tool (developed before 2005) | \n", + "
9 | \n", + "Compute Canada | \n", + "
10 | \n", + "Link to Recipe | \n", + "
11 | \n", + "TaDiRAH Goals | \n", + "
12 | \n", + "TaDiRAH Methods | \n", + "
\n", + " | id | \n", + "name | \n", + "creators_name | \n", + "url | \n", + "
---|---|---|---|---|
0 | \n", + "579 | \n", + "Voyant 2.0: Knots | \n", + "Stéfan Sinclair and Geoffrey Rockwell | \n", + "http://voyant-tools.org/?view=knots | \n", + "
1 | \n", + "591 | \n", + "Warc Extractor | \n", + "Ryan Chartier & Internet Archive | \n", + "https://github.com/recrm/ArchiveTools/blob/mas... | \n", + "
2 | \n", + "754 | \n", + "TAGS https://t.co/T007ezdZoA | \n", + "\n", + " | None | \n", + "
3 | \n", + "755 | \n", + "Multiple enhancements to DiRT Directory (tools... | \n", + "\n", + " | None | \n", + "
4 | \n", + "758 | \n", + "RT : Today's \"dirt\": DiRT now uses TaDiRAH ter... | \n", + "\n", + " | None | \n", + "
5 | \n", + "823 | \n", + "Basement Waterproofing: Tips and Instructions | \n", + "\n", + " | None | \n", + "
6 | \n", + "1017 | \n", + "Datapress | \n", + "MIT CSAIL | \n", + "None | \n", + "
7 | \n", + "1063 | \n", + "WordVenture | \n", + "WordNet | \n", + "None | \n", + "
8 | \n", + "1174 | \n", + "VoiceThread | \n", + "VoiceThread LLC | \n", + "None | \n", + "
9 | \n", + "1183 | \n", + "Purdue OWL | \n", + "Purdue University Writing Lab, Purdue Universi... | \n", + "None | \n", + "
10 | \n", + "1352 | \n", + "Aruspix | \n", + "\n", + " | None | \n", + "
11 | \n", + "1369 | \n", + "MMax2 | \n", + "\n", + " | None | \n", + "
12 | \n", + "1377 | \n", + "Lextek | \n", + "\n", + " | None | \n", + "
\n", + " | id | \n", + "name | \n", + "detail | \n", + "creators_name | \n", + "last_updated | \n", + "attributetype | \n", + "attribute | \n", + "
---|---|---|---|---|---|---|---|
43724 | \n", + "1499 | \n", + "iPhoto | \n", + "<p>iPhoto is a digital photograph manipulation... | \n", + "Apple | \n", + "2018-10-12 | \n", + "Type of analysis | \n", + "Organizing | \n", + "
43726 | \n", + "1499 | \n", + "iPhoto | \n", + "<p>iPhoto is a digital photograph manipulation... | \n", + "Apple | \n", + "2018-10-12 | \n", + "Type of analysis | \n", + "Storage | \n", + "
43748 | \n", + "1500 | \n", + "Google 3D Warehouse | \n", + "<p>A collection of free-to-download 3D models ... | \n", + "2018-11-06 | \n", + "Type of analysis | \n", + "Collaboration | \n", + "|
43749 | \n", + "1500 | \n", + "Google 3D Warehouse | \n", + "<p>A collection of free-to-download 3D models ... | \n", + "2018-11-06 | \n", + "Type of analysis | \n", + "Dissemination | \n", + "|
43750 | \n", + "1500 | \n", + "Google 3D Warehouse | \n", + "<p>A collection of free-to-download 3D models ... | \n", + "2018-11-06 | \n", + "Type of analysis | \n", + "Modeling | \n", + "|
43759 | \n", + "1501 | \n", + "SketchUp (Formerly Google SketchUp) | \n", + "<p>Google SketchUp is easy-to-use free 3D mode... | \n", + "2018-10-26 | \n", + "Type of analysis | \n", + "Creation | \n", + "|
43760 | \n", + "1501 | \n", + "SketchUp (Formerly Google SketchUp) | \n", + "<p>Google SketchUp is easy-to-use free 3D mode... | \n", + "2018-10-26 | \n", + "Type of analysis | \n", + "Interpretation | \n", + "|
43761 | \n", + "1501 | \n", + "SketchUp (Formerly Google SketchUp) | \n", + "<p>Google SketchUp is easy-to-use free 3D mode... | \n", + "2018-10-26 | \n", + "Type of analysis | \n", + "Modeling | \n", + "|
43790 | \n", + "1502 | \n", + "GIMP (GNU Image Manipulation Program) | \n", + "<p>GIMP is image editing software, much like P... | \n", + "GIMP Team | \n", + "None | \n", + "Type of analysis | \n", + "Creation | \n", + "
43818 | \n", + "1556 | \n", + "Reaper | \n", + "REAPER is a complete digital audio production ... | \n", + "Cockos | \n", + "2019-03-24 | \n", + "Type of analysis | \n", + "Creation | \n", + "
iPhoto is a digital photograph manipulation... Apple \n", + "43726
iPhoto is a digital photograph manipulation... Apple \n", + "43748
A collection of free-to-download 3D models ... Google \n", + "43749
A collection of free-to-download 3D models ... Google \n", + "43750
A collection of free-to-download 3D models ... Google \n", + "43759
Google SketchUp is easy-to-use free 3D mode... Google \n", + "43760
Google SketchUp is easy-to-use free 3D mode... Google \n", + "43761
Google SketchUp is easy-to-use free 3D mode... Google \n", + "43790
GIMP is image editing software, much like P... GIMP Team \n",
+ "43818 REAPER is a complete digital audio production ... Cockos \n",
+ "\n",
+ " last_updated attributetype attribute \n",
+ "43724 2018-10-12 Type of analysis Organizing \n",
+ "43726 2018-10-12 Type of analysis Storage \n",
+ "43748 2018-11-06 Type of analysis Collaboration \n",
+ "43749 2018-11-06 Type of analysis Dissemination \n",
+ "43750 2018-11-06 Type of analysis Modeling \n",
+ "43759 2018-10-26 Type of analysis Creation \n",
+ "43760 2018-10-26 Type of analysis Interpretation \n",
+ "43761 2018-10-26 Type of analysis Modeling \n",
+ "43790 None Type of analysis Creation \n",
+ "43818 2019-03-24 Type of analysis Creation "
+ ]
+ },
+ "execution_count": 119,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df_to_ta=df_db_sub[df_db_sub['attributetype'] == 'Type of analysis'].drop_duplicates()\n",
+ "df_to_ta.tail(10)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 47,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "Analysis 434\n",
+ "Visualization 236\n",
+ "Content Analysis 185\n",
+ "Search 139\n",
+ "Natural Language Processing 125\n",
+ "Discovering 124\n",
+ "Capture 113\n",
+ "Gathering 97\n",
+ "Publishing 92\n",
+ "Dissemination 91\n",
+ "Enrichment 90\n",
+ "Annotating 83\n",
+ "Collaboration 80\n",
+ "Organizing 71\n",
+ "Creation 52\n",
+ "Uncategorized 49\n",
+ "Storage 40\n",
+ "Web development 39\n",
+ "Modeling 25\n",
+ "Programming 22\n",
+ "Interpretation 18\n",
+ "RDF 12\n",
+ "Name: attribute, dtype: int64"
+ ]
+ },
+ "execution_count": 47,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df_db_a = df_to_ta['attribute'].value_counts()\n",
+ "df_db_a.head(25)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 48,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "image/png": "\n",
+ "text/plain": [
+ "