Merge pull request #8 from HeardLibrary/v1-4
v1.4 release
Steve Baskauf authored Aug 18, 2020
2 parents cc7609d + 3571972 commit 2086028
Showing 5 changed files with 317 additions and 46 deletions.
10 changes: 8 additions & 2 deletions vanderbot/README.md
@@ -32,7 +32,7 @@ Here are some queries that can be run to explore the data:

[Number of clinical trials at Vanderbilt by principal investigator](https://w.wiki/XKK)

The current release is [v1.3](https://github.com/HeardLibrary/linked-data/releases/tag/v1.3).
The current release is [v1.4](https://github.com/HeardLibrary/linked-data/releases/tag/v1.4).

## How it works

@@ -138,5 +138,11 @@ In the case where there are no reference properties, there also isn't any reference
If there are reference property combinations other than this, the `generate_statement_data()` function can't be used and custom code must be written for that statement.


## Release v1.4 (2020-08-17) notes

The changes in this release follow tests that used the `csv-metadata.json` mapping schema to emit RDF from the source CSV tables. To make it possible to create all of the kinds of statements present in the Wikidata data model, the `csv-metadata.json` file and the `vb6_upload_wikidata.py` script were changed to use properties in the `ps:` namespace (`http://www.wikidata.org/prop/statement/`) rather than properties in the `wdt:` namespace. The missing `wdt:` statements can then be constructed using SPARQL CONSTRUCT. [A new script](https://github.com/HeardLibrary/linked-data/blob/master/vanderbot/generate_direct_props.py) materializes those triples by sending a CONSTRUCT query to a SPARQL endpoint whose triplestore contains the triples generated from the schema, then loads the materialized triples back into the triplestore. The triplestore can then answer queries on any graph pattern that is supported at the Wikidata Query Service SPARQL endpoint.
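
As a sketch of the entailment involved (a hypothetical example; the Q-IDs, statement UUID, and property below stand in for real table values), each statement stored with the `p:`/`ps:` reification pattern implies one direct (`wdt:`) triple:

# Minimal sketch of the wdt: entailment; all identifiers are hypothetical.
# A reified statement consists of two triples:
#   item --p:P108--> statement node --ps:P108--> value
item = 'http://www.wikidata.org/entity/Q12345'
statement_node = 'http://www.wikidata.org/entity/statement/Q12345-01234567-89ab-cdef-0123-456789abcdef'
ps_property = 'http://www.wikidata.org/prop/statement/P108'
value = 'http://www.wikidata.org/entity/Q67890'

# The direct triple reuses the property ID (here P108) in the wdt: namespace,
# linking the item straight to the statement's value.
prop_id = ps_property[len('http://www.wikidata.org/prop/statement/'):]
direct_property = 'http://www.wikidata.org/prop/direct/' + prop_id
print(item, direct_property, value)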

The first five scripts were not changed in this release.

----
Revised 2020-04-23
Revised 2020-08-17
44 changes: 8 additions & 36 deletions vanderbot/csv-metadata.json
@@ -59,8 +59,8 @@
"titles": "orcid",
"name": "orcid",
"datatype": "string",
"aboutUrl": "http://www.wikidata.org/entity/{wikidataId}",
"propertyUrl": "http://www.wikidata.org/prop/direct/P496"
"aboutUrl": "http://www.wikidata.org/entity/statement/{wikidataId}-{orcidStatementUuid}",
"propertyUrl": "http://www.wikidata.org/prop/statement/P496"
},
{
"titles": "orcidReferenceHash",
@@ -89,8 +89,8 @@
"titles": "employer",
"name": "employer",
"datatype": "string",
"aboutUrl": "http://www.wikidata.org/entity/{wikidataId}",
"propertyUrl": "http://www.wikidata.org/prop/direct/P108",
"aboutUrl": "http://www.wikidata.org/entity/statement/{wikidataId}-{employerStatementUuid}",
"propertyUrl": "http://www.wikidata.org/prop/statement/P108",
"valueUrl": "http://www.wikidata.org/entity/{employer}"
},
{
@@ -128,8 +128,8 @@
"titles": "affiliation",
"name": "affiliation",
"datatype": "string",
"aboutUrl": "http://www.wikidata.org/entity/{wikidataId}",
"propertyUrl": "http://www.wikidata.org/prop/direct/P1416",
"aboutUrl": "http://www.wikidata.org/entity/statement/{wikidataId}-{affiliationStatementUuid}",
"propertyUrl": "http://www.wikidata.org/prop/statement/P1416",
"valueUrl": "http://www.wikidata.org/entity/{affiliation}"
},
{
@@ -167,8 +167,8 @@
"titles": "instanceOf",
"name": "instanceOf",
"datatype": "string",
"aboutUrl": "http://www.wikidata.org/entity/{wikidataId}",
"propertyUrl": "http://www.wikidata.org/prop/direct/P31",
"aboutUrl": "http://www.wikidata.org/entity/statement/{wikidataId}-{instanceOfUuid}",
"propertyUrl": "http://www.wikidata.org/prop/statement/P31",
"valueUrl": "http://www.wikidata.org/entity/{instanceOf}"
},
{
@@ -183,34 +183,6 @@
"titles": "sexOrGenderQId",
"name": "sexOrGenderQId",
"datatype": "string",
"aboutUrl": "http://www.wikidata.org/entity/{wikidataId}",
"propertyUrl": "http://www.wikidata.org/prop/direct/P21",
"valueUrl": "http://www.wikidata.org/entity/{sexOrGenderQId}"
},
{
"name": "employerPropertyStatement",
"virtual": true,
"aboutUrl": "http://www.wikidata.org/entity/statement/{wikidataId}-{employerStatementUuid}",
"propertyUrl": "http://www.wikidata.org/prop/statement/P108",
"valueUrl": "http://www.wikidata.org/entity/{employer}"
},
{
"name": "affiliationPropertyStatement",
"virtual": true,
"aboutUrl": "http://www.wikidata.org/entity/statement/{wikidataId}-{affiliationStatementUuid}",
"propertyUrl": "http://www.wikidata.org/prop/statement/P1416",
"valueUrl": "http://www.wikidata.org/entity/{affiliation}"
},
{
"name": "instanceOfPropertyStatement",
"virtual": true,
"aboutUrl": "http://www.wikidata.org/entity/statement/{wikidataId}-{instanceOfUuid}",
"propertyUrl": "http://www.wikidata.org/prop/statement/P31",
"valueUrl": "http://www.wikidata.org/entity/{instanceOf}"
},
{
"name": "sexOrGenderPropertyStatement",
"virtual": true,
"aboutUrl": "http://www.wikidata.org/entity/statement/{wikidataId}-{sexOrGenderUuid}",
"propertyUrl": "http://www.wikidata.org/prop/statement/P21",
"valueUrl": "http://www.wikidata.org/entity/{sexOrGenderQId}"
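
For reference, the revised mappings expand one CSV row into a statement-level triple roughly as follows (a sketch using the employer column; the row values are hypothetical):

# Sketch of the aboutUrl/propertyUrl/valueUrl template expansion for the employer column
# (hypothetical row values; real values come from the source CSV tables)
row = {'wikidataId': 'Q12345',
       'employerStatementUuid': '01234567-89ab-cdef-0123-456789abcdef',
       'employer': 'Q67890'}
subject = ('http://www.wikidata.org/entity/statement/'
           '{wikidataId}-{employerStatementUuid}').format(**row)
predicate = 'http://www.wikidata.org/prop/statement/P108'
obj = 'http://www.wikidata.org/entity/{employer}'.format(**row)
print(subject)    # statement-node IRI built from the item Q-ID and statement UUID
print(predicate)  # ps: property that replaced the former prop/direct/ (wdt:) property
print(obj)        # value IRI built from the employer Q-ID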
189 changes: 189 additions & 0 deletions vanderbot/generate_direct_props.ipynb
@@ -0,0 +1,189 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 43,
"metadata": {},
"outputs": [],
"source": [
"import requests # best library to manage HTTP transactions\n",
"from bs4 import BeautifulSoup # web-scraping library\n",
"import json\n",
"from time import sleep\n",
"import csv\n",
"import math\n",
"from fuzzywuzzy import fuzz # fuzzy logic matching\n",
"from fuzzywuzzy import process\n",
"import xml.etree.ElementTree as et # library to traverse XML tree\n",
"import urllib\n",
"import datetime\n",
"import string\n",
"from pathlib import Path\n",
"\n",
"# ---------------\n",
"# Configuration data\n",
"# ---------------\n",
"\n",
"graph_name = 'http://nursing'\n",
"accept_media_type = 'text/turtle'\n",
"sparql_endpoint = \"https://sparql.vanderbilt.edu/sparql\"\n",
"request_header_dictionary = {\n",
" #'Content-Type': 'application/sparql-query',\n",
" 'Accept' : accept_media_type\n",
"}\n",
"\n",
"# Load endpoint password from file in home directory\n",
"directory = 'home'\n",
"filename = 'sparql_vanderbilt_edu_password.txt'\n",
"pwd = load_credential(filename, directory)\n",
"\n",
"# ---------------\n",
"# Function definitions\n",
"# ---------------\n",
"\n",
"# Load password from local drive\n",
"# value of directory should be either 'home' or 'working'\n",
"def load_credential(filename, directory):\n",
" cred = ''\n",
" # to change the script to look for the credential in the working directory, change the value of home to empty string\n",
" if directory == 'home':\n",
" home = str(Path.home()) #gets path to home directory; supposed to work for Win and Mac\n",
" credential_path = home + '/' + filename\n",
" else:\n",
" directory = 'working'\n",
" credential_path = filename\n",
" try:\n",
" with open(credential_path, 'rt', encoding='utf-8') as file_object:\n",
" cred = file_object.read()\n",
" except:\n",
" print(filename + ' file not found - is it in your ' + directory + ' directory?')\n",
" exit()\n",
" return(cred)\n",
"\n",
"def retrieve_direct_statements(sparql_endpoint):\n",
" query = '''\n",
"construct {?item ?directProp ?value.}\n",
"from <''' + graph_name + '''>\n",
"where {\n",
" ?item ?p ?statement.\n",
" ?statement ?ps ?value.\n",
" filter(substr(str(?ps),1,39)=\"http://www.wikidata.org/prop/statement/\")\n",
" bind(substr(str(?ps),40) as ?id)\n",
" bind(substr(str(?p),30) as ?id)\n",
" bind(iri(concat(\"http://www.wikidata.org/prop/direct/\", ?id)) as ?directProp)\n",
" }\n",
"'''\n",
" results = []\n",
" r = requests.get(sparql_endpoint, params={'query' : query}, headers=request_header_dictionary)\n",
" return r.text\n",
"\n",
"def perform_sparql_update(sparql_endpoint, pwd, update_command):\n",
" # SPARQL Update requires HTTP POST\n",
" hdr = {'Content-Type' : 'application/sparql-update'}\n",
" r = requests.post(sparql_endpoint, auth=('admin', pwd), headers=hdr, data = update_command)\n",
" print(str(r.status_code) + ' ' + r.url)\n",
" print(r.text)\n"
]
},
{
"cell_type": "code",
"execution_count": 44,
"metadata": {},
"outputs": [],
"source": [
"# ---------------\n",
"# Construct the direct property statements entailed by the Wikibase model and retrieve from endpoint \n",
"# ---------------\n",
"\n",
"graph_text = retrieve_direct_statements(sparql_endpoint)\n",
"#print(graph_text)\n",
"print('constructed triples retrieved')"
]
},
{
"cell_type": "code",
"execution_count": 45,
"metadata": {},
"outputs": [],
"source": [
"# remove prefixes from response Turtle, which are not necessary since IRIs are unabbreviated\n",
"graph_text_list = graph_text.split('\\n')\n",
"# print(graph_text_list)\n",
"graph_text = ''\n",
"for line in graph_text_list:\n",
" try:\n",
" if line[0] != '@':\n",
" graph_text += line + '\\n'\n",
" except:\n",
" pass\n",
"#print()\n",
"#print(graph_text)"
]
},
{
"cell_type": "code",
"execution_count": 46,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"200 https://sparql.vanderbilt.edu/sparql\n",
"<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01 Transitional//EN\" \"http://www.w3.org/TR/html4/loose.dtd\"><html><head><meta http-equiv=\"Content-Type\" content=\"text&#47;html;charset=UTF-8\"><title>blazegraph&trade; by SYSTAP</title\n",
"></head\n",
"><body<p>totalElapsed=1ms, elapsed=1ms, connFlush=0ms, batchResolve=0, whereClause=0ms, deleteClause=0ms, insertClause=0ms</p\n",
"><hr><p>COMMIT: totalElapsed=356ms, commitTime=1596944443099, mutationCount=776</p\n",
"></html\n",
">\n",
"\n",
"done\n"
]
}
],
"source": [
"# Send SPARQL 1.1 UPDATE to endpoint to add the constructed triples into the graph\n",
"\n",
"update_command = '''INSERT DATA\n",
"{ GRAPH <''' + graph_name + '''> { \n",
"''' + graph_text + '''\n",
"}}'''\n",
"\n",
"#print(update_command)\n",
"\n",
"perform_sparql_update(sparql_endpoint, pwd, update_command)\n",
"\n",
"print()\n",
"print('done')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.1"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
101 changes: 101 additions & 0 deletions vanderbot/generate_direct_props.py
@@ -0,0 +1,101 @@
import requests # best library to manage HTTP transactions
from pathlib import Path # used to locate the credential file in the home directory

# ---------------
# Configuration data
# ---------------

graph_name = 'http://nursing'
accept_media_type = 'text/turtle'
sparql_endpoint = "https://sparql.vanderbilt.edu/sparql"
request_header_dictionary = {
#'Content-Type': 'application/sparql-query',
'Accept' : accept_media_type
}

# Load endpoint password from file in home directory
directory = 'home'
filename = 'sparql_vanderbilt_edu_password.txt'

# ---------------
# Function definitions
# ---------------

# Load password from local drive
# value of directory should be either 'home' or 'working'
def load_credential(filename, directory):
cred = ''
# to change the script to look for the credential in the working directory, change the value of home to empty string
if directory == 'home':
home = str(Path.home()) #gets path to home directory; supposed to work for Win and Mac
credential_path = home + '/' + filename
else:
directory = 'working'
credential_path = filename
try:
with open(credential_path, 'rt', encoding='utf-8') as file_object:
cred = file_object.read()
    except FileNotFoundError:
print(filename + ' file not found - is it in your ' + directory + ' directory?')
exit()
return(cred)

def retrieve_direct_statements(sparql_endpoint):
query = '''
construct {?item ?directProp ?value.}
from <''' + graph_name + '''>
where {
?item ?p ?statement.
?statement ?ps ?value.
filter(substr(str(?ps),1,39)="http://www.wikidata.org/prop/statement/")
    # everything after the 39-character ps: namespace prefix is the property ID
    bind(substr(str(?ps),40) as ?id)
bind(iri(concat("http://www.wikidata.org/prop/direct/", ?id)) as ?directProp)
}
'''
r = requests.get(sparql_endpoint, params={'query' : query}, headers=request_header_dictionary)
return r.text

def perform_sparql_update(sparql_endpoint, pwd, update_command):
# SPARQL Update requires HTTP POST
hdr = {'Content-Type' : 'application/sparql-update'}
r = requests.post(sparql_endpoint, auth=('admin', pwd), headers=hdr, data = update_command)
print(str(r.status_code) + ' ' + r.url)
print(r.text)

# ---------------
# Construct the direct property statements entailed by the Wikibase model and retrieve from endpoint
# ---------------

graph_text = retrieve_direct_statements(sparql_endpoint)
print('constructed triples retrieved')

# remove prefixes from response Turtle, which are not necessary since IRIs are unabbreviated
graph_text_list = graph_text.split('\n')
graph_text = ''
for line in graph_text_list:
    # blank lines are skipped; lines starting with '@' are prefix declarations
    if line and line[0] != '@':
        graph_text += line + '\n'

# Send SPARQL 1.1 UPDATE to endpoint to add the constructed triples into the graph

update_command = '''INSERT DATA
{ GRAPH <''' + graph_name + '''> {
''' + graph_text + '''
}}'''

pwd = load_credential(filename, directory)
perform_sparql_update(sparql_endpoint, pwd, update_command)

print()
print('done')
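
One way to spot-check the update (a sketch assuming the same endpoint and graph name as configured above, and that the endpoint allows anonymous reads) is to count the direct-property triples now present in the graph:

# Sketch: count the materialized wdt: (prop/direct/) triples in the named graph
import requests

count_query = '''select (count(*) as ?count)
from <http://nursing>
where {
    ?item ?directProp ?value.
    filter(substr(str(?directProp),1,36)="http://www.wikidata.org/prop/direct/")
}'''
r = requests.get('https://sparql.vanderbilt.edu/sparql',
                 params={'query': count_query},
                 headers={'Accept': 'application/sparql-results+json'})
print(r.json()['results']['bindings'][0]['count']['value'])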
