
Commit

Merge pull request #14 from HeardLibrary/vanderbot_v1-5
Vanderbot v1.5
Steve Baskauf authored Sep 8, 2020
2 parents 54bd94c + dd90b0a commit 73baf3d
Showing 12 changed files with 601 additions and 234 deletions.
10 changes: 8 additions & 2 deletions vanderbot/README.md
@@ -32,7 +32,7 @@ Here are some queries that can be run to explore the data:

[Number of clinical trials at Vanderbilt by principal investigator](https://w.wiki/XKK)

The current release is [v1.4](https://github.com/HeardLibrary/linked-data/releases/tag/v1.4).
The current release is [v1.5](https://github.com/HeardLibrary/linked-data/releases/tag/v1.5).

## How it works

@@ -147,5 +147,11 @@ The changes made in this release were made following tests that used the `csv-me

The first five scripts were not changed in this release.

## Release v1.5 (2020-09-08)

The major change to the code was to increase the number of table columns per date from one to three. Previously there was a single column for the date string, which did not allow for varying date precision. Now there is an additional column for the Wikibase date precision number (e.g. 9 for precision to the year, 11 for precision to the day). The third column is for a date value node identifier. This can be either the actual node identifier from Wikidata (a hash of unknown origin) or a random UUID generated by one of the scripts in this suite. It identifies the node to which both the date value and the date precision are attached, effectively serving as a blank node. In the future, it may be replaced with the actual date node identifier.
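
As a rough illustration (not part of the scripts in this release), the following Python sketch shows how the three columns for one date might be filled in. The column names follow the `csv-metadata.json` mapping in this release; the `make_value_node_id` helper is hypothetical.

```python
# Minimal sketch: filling the three date columns for one retrieved date.
# Column names follow csv-metadata.json in this release; make_value_node_id
# is a hypothetical stand-in for however a script chooses the identifier.
import uuid

def make_value_node_id():
    # A random UUID acts as a blank-node-like identifier until the actual
    # Wikidata value node hash is known.
    return str(uuid.uuid4())

row = {
    'orcidReferenceValue_nodeId': make_value_node_id(),  # date value node identifier
    'orcidReferenceValue_val': '2020-09-08T00:00:00Z',   # the date string
    'orcidReferenceValue_prec': 11                        # Wikibase precision: 11 = day, 9 = year
}
print(row)
```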

The other addition is a JavaScript script written by Jessie Baskauf that drives [this form](https://heardlibrary.github.io/digital-scholarship/script/wikidata/wikidata-csv2rdf-metadata.html), which can be used to generate a `csv-metadata.json` mapping schema. With such a mapping schema, any CSV file can be used as the source data for the **vb6_upload_wikidata.py** upload script.
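
For orientation only (this sketch is not part of the release), a mapping schema like the one the form generates pairs each CSV column with a property IRI. The Python below reads a `csv-metadata.json` file and lists those pairings; it assumes the file is in the working directory and handles either a single top-level table or a CSVW `tables` array.

```python
# Minimal sketch: listing how a csv-metadata.json mapping schema ties CSV
# columns to property IRIs. Assumes csv-metadata.json is in the working directory.
import json

with open('csv-metadata.json', 'r') as f:
    schema = json.load(f)

# CSVW metadata may describe one table at the top level or several in a "tables" array.
tables = schema['tables'] if 'tables' in schema else [schema]

for table in tables:
    print('Table:', table.get('url'))
    for column in table.get('tableSchema', {}).get('columns', []):
        print('  column:', column.get('name'), '->', column.get('propertyUrl'))
```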

----
Revised 2020-08-28
Revised 2020-09-08
69 changes: 57 additions & 12 deletions vanderbot/csv-metadata.json
@@ -71,11 +71,26 @@
"valueUrl": "http://www.wikidata.org/reference/{orcidReferenceHash}"
},
{
"titles": "orcidReferenceValue",
"name": "orcidReferenceValue",
"datatype": "dateTime",
"titles": "orcidReferenceValue_nodeId",
"name": "orcidReferenceValue_nodeId",
"datatype": "string",
"aboutUrl": "http://www.wikidata.org/reference/{orcidReferenceHash}",
"propertyUrl": "http://www.wikidata.org/prop/reference/P813"
"propertyUrl": "http://www.wikidata.org/prop/reference/value/P813",
"valueUrl": "http://example.com/.well-known/genid/{orcidReferenceValue_nodeId}"
},
{
"titles": "orcidReferenceValue_val",
"name": "orcidReferenceValue_val",
"datatype": "dateTime",
"aboutUrl": "http://example.com/.well-known/genid/{orcidReferenceValue_nodeId}",
"propertyUrl": "http://wikiba.se/ontology#timeValue"
},
{
"titles": "orcidReferenceValue_prec",
"name": "orcidReferenceValue_prec",
"datatype": "integer",
"aboutUrl": "http://example.com/.well-known/genid/{orcidReferenceValue_nodeId}",
"propertyUrl": "http://wikiba.se/ontology#timePrecision"
},
{
"titles": "employerStatementUuid",
@@ -110,11 +125,26 @@
"valueUrl": "{+employerReferenceSourceUrl}"
},
{
"titles": "employerReferenceRetrieved",
"name": "employerReferenceRetrieved",
"datatype": "dateTime",
"titles": "employerReferenceRetrieved_nodeId",
"name": "employerReferenceRetrieved_nodeId",
"datatype": "string",
"aboutUrl": "http://www.wikidata.org/reference/{employerReferenceHash}",
"propertyUrl": "http://www.wikidata.org/prop/reference/P813"
"propertyUrl": "http://www.wikidata.org/prop/reference/value/P813",
"valueUrl": "http://example.com/.well-known/genid/{employerReferenceRetrieved_nodeId}"
},
{
"titles": "employerReferenceRetrieved_val",
"name": "employerReferenceRetrieved_val",
"datatype": "dateTime",
"aboutUrl": "http://example.com/.well-known/genid/{employerReferenceRetrieved_nodeId}",
"propertyUrl": "http://wikiba.se/ontology#timeValue"
},
{
"titles": "employerReferenceRetrieved_prec",
"name": "employerReferenceRetrieved_prec",
"datatype": "integer",
"aboutUrl": "http://example.com/.well-known/genid/{employerReferenceRetrieved_nodeId}",
"propertyUrl": "http://wikiba.se/ontology#timePrecision"
},
{
"titles": "affiliationStatementUuid",
@@ -149,11 +179,26 @@
"valueUrl": "{+affiliationReferenceSourceUrl}"
},
{
"titles": "affiliationReferenceRetrieved",
"name": "affiliationReferenceRetrieved",
"datatype": "dateTime",
"titles": "affiliationReferenceRetrieved_nodeId",
"name": "affiliationReferenceRetrieved_nodeId",
"datatype": "string",
"aboutUrl": "http://www.wikidata.org/reference/{affiliationReferenceHash}",
"propertyUrl": "http://www.wikidata.org/prop/reference/P813"
"propertyUrl": "http://www.wikidata.org/prop/reference/value/P813",
"valueUrl": "http://example.com/.well-known/genid/{affiliationReferenceRetrieved_nodeId}"
},
{
"titles": "affiliationReferenceRetrieved_val",
"name": "affiliationReferenceRetrieved_val",
"datatype": "dateTime",
"aboutUrl": "http://example.com/.well-known/genid/{affiliationReferenceRetrieved_nodeId}",
"propertyUrl": "http://wikiba.se/ontology#timeValue"
},
{
"titles": "affiliationReferenceRetrieved_prec",
"name": "affiliationReferenceRetrieved_prec",
"datatype": "integer",
"aboutUrl": "http://example.com/.well-known/genid/{affiliationReferenceRetrieved_nodeId}",
"propertyUrl": "http://wikiba.se/ontology#timePrecision"
},
{
"titles": "instanceOfUuid",
128 changes: 61 additions & 67 deletions vanderbot/generate_direct_props.ipynb
@@ -2,7 +2,7 @@
"cells": [
{
"cell_type": "code",
"execution_count": 2,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
@@ -24,7 +24,7 @@
"# Configuration data\n",
"# ---------------\n",
"\n",
"graph_name = 'https://github.com/HeardLibrary/linked-data/blob/29e5d02aaf00cb890792d7dee73707603a506b3e/json_schema/bluffton_presidents.csv'\n",
"graph_name = 'https://raw.githubusercontent.com/HeardLibrary/linked-data/54bd94c609e9c5af6c558cd926939ded67cba2ae/json_schema/bluffton_presidents.csv'\n",
"accept_media_type = 'text/turtle'\n",
"sparql_endpoint = \"https://sparql.vanderbilt.edu/sparql\"\n",
"request_header_dictionary = {\n",
@@ -59,7 +59,7 @@
" exit()\n",
" return(cred)\n",
"\n",
"def retrieve_direct_statements(sparql_endpoint):\n",
"def retrieve_direct_statements(sparql_endpoint, graph_name):\n",
" query = '''\n",
"construct {?item ?directProp ?value.}\n",
"from <''' + graph_name + '''>\n",
@@ -76,90 +76,84 @@
" r = requests.get(sparql_endpoint, params={'query' : query}, headers=request_header_dictionary)\n",
" return r.text\n",
"\n",
"def retrieve_time_statements(sparql_endpoint, graph_name, subject_type):\n",
" # Happily, each subject type: \"statement\", \"reference\", and \"qualifier\" contains 9 characters.\n",
" # so the string extraction is the same for all.\n",
" query = '''\n",
"prefix wikibase: <http://wikiba.se/ontology#>\n",
"construct {?subject ?directProp ?timeValue.}\n",
"from <''' + graph_name + '''>\n",
"where {\n",
" ?subject ?valueProperty ?value.\n",
" ?value wikibase:timeValue ?timeValue.\n",
" filter(substr(str(?valueProperty),1,45)=\"http://www.wikidata.org/prop/''' + subject_type + '''/value/\")\n",
" bind(substr(str(?valueProperty),46) as ?id)\n",
" bind(iri(concat(\"http://www.wikidata.org/prop/''' + subject_type + '''/\", ?id)) as ?directProp)\n",
" }\n",
"'''\n",
" results = []\n",
" r = requests.get(sparql_endpoint, params={'query' : query}, headers=request_header_dictionary)\n",
" return r.text\n",
"\n",
"def perform_sparql_update(sparql_endpoint, pwd, update_command):\n",
" # SPARQL Update requires HTTP POST\n",
" hdr = {'Content-Type' : 'application/sparql-update'}\n",
" r = requests.post(sparql_endpoint, auth=('admin', pwd), headers=hdr, data = update_command)\n",
" print(str(r.status_code) + ' ' + r.url)\n",
" print(r.text)\n"
" print(r.text)\n",
"\n",
"def prep_and_update(sparql_endpoint, pwd, graph_name, graph_text):\n",
" # remove prefixes from response Turtle, which are not necessary since IRIs are unabbreviated\n",
" graph_text_list = graph_text.split('\\n')\n",
" # print(graph_text_list)\n",
" graph_text = ''\n",
" for line in graph_text_list:\n",
" try:\n",
" if line[0] != '@':\n",
" graph_text += line + '\\n'\n",
" except:\n",
" pass\n",
" #print()\n",
" #print(graph_text)\n",
"\n",
" if len(graph_text) != 0: # don't perform an update if there aren't any triples to add\n",
" # Send SPARQL 1.1 UPDATE to endpoint to add the constructed triples into the graph\n",
" update_command = '''INSERT DATA\n",
" { GRAPH <''' + graph_name + '''> { \n",
" ''' + graph_text + '''\n",
" }}'''\n",
"\n",
" #print(update_command)\n",
" perform_sparql_update(sparql_endpoint, pwd, update_command)\n",
" else:\n",
" print('no triples to write')"
]
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"constructed triples retrieved\n"
]
}
],
"outputs": [],
"source": [
"# ---------------\n",
"# Construct the direct property statements entailed by the Wikibase model and retrieve from endpoint \n",
"# ---------------\n",
"pwd = load_credential(filename, directory)\n",
"\n",
"graph_text = retrieve_direct_statements(sparql_endpoint)\n",
"graph_text = retrieve_direct_statements(sparql_endpoint, graph_name)\n",
"#print(graph_text)\n",
"print('constructed triples retrieved')"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"# remove prefixes from response Turtle, which are not necessary since IRIs are unabbreviated\n",
"graph_text_list = graph_text.split('\\n')\n",
"# print(graph_text_list)\n",
"graph_text = ''\n",
"for line in graph_text_list:\n",
" try:\n",
" if line[0] != '@':\n",
" graph_text += line + '\\n'\n",
" except:\n",
" pass\n",
"#print()\n",
"#print(graph_text)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"200 https://sparql.vanderbilt.edu/sparql\n",
"<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01 Transitional//EN\" \"http://www.w3.org/TR/html4/loose.dtd\"><html><head><meta http-equiv=\"Content-Type\" content=\"text&#47;html;charset=UTF-8\"><title>blazegraph&trade; by SYSTAP</title\n",
"></head\n",
"><body<p>totalElapsed=0ms, elapsed=0ms, connFlush=0ms, batchResolve=0, whereClause=0ms, deleteClause=0ms, insertClause=0ms</p\n",
"><hr><p>COMMIT: totalElapsed=251ms, commitTime=1598157003429, mutationCount=40</p\n",
"></html\n",
">\n",
"\n",
"done\n"
]
}
],
"source": [
"# Send SPARQL 1.1 UPDATE to endpoint to add the constructed triples into the graph\n",
"print('constructed direct triples retrieved')\n",
"\n",
"update_command = '''INSERT DATA\n",
"{ GRAPH <''' + graph_name + '''> { \n",
"''' + graph_text + '''\n",
"}}'''\n",
"prep_and_update(sparql_endpoint, pwd, graph_name, graph_text)\n",
"print()\n",
"\n",
"#print(update_command)\n",
"for subject_type in ['statement', 'reference', 'qualifier']:\n",
" graph_text = retrieve_time_statements(sparql_endpoint, graph_name, subject_type)\n",
" #print(graph_text)\n",
" print('constructed direct ' + subject_type + ' time triples retrieved')\n",
"\n",
"perform_sparql_update(sparql_endpoint, pwd, update_command)\n",
" prep_and_update(sparql_endpoint, pwd, graph_name, graph_text)\n",
" print()\n",
"\n",
"print()\n",
"print('done')"
