diff --git a/vanderbot/README.md b/vanderbot/README.md
index 0cdf6f80..7fa85215 100644
--- a/vanderbot/README.md
+++ b/vanderbot/README.md
@@ -32,7 +32,7 @@ Here are some queries that can be run to explore the data:
 
 [Number of clinical trials at Vanderbilt by principal investigator](https://w.wiki/XKK)
 
-The current release is [v1.3](https://github.com/HeardLibrary/linked-data/releases/tag/v1.3).
+The current release is [v1.4](https://github.com/HeardLibrary/linked-data/releases/tag/v1.4).
 
 ## How it works
 
@@ -138,5 +138,11 @@ In the case where there are no reference properties, there also isn't any refere
 
 If there are reference property combinations other than this, the `generate_statement_data()` function can't be used and custom code must be written for that statement.
 
+## Release v1.4 (2020-08-17) notes
+
+The changes in this release follow tests that used the `csv-metadata.json` mapping schema to emit RDF from the source CSV tables. To make it possible to create all of the kinds of statements present in the Wikidata data model, the `csv-metadata.json` file and the `vb6_upload_wikidata.py` script were changed to use properties in the `ps:` namespace (`http://www.wikidata.org/prop/statement/`) rather than in the `wdt:` namespace. The `wdt:` statements that the schema no longer emits can then be constructed with a SPARQL CONSTRUCT query (see the example below). [A new script](https://github.com/HeardLibrary/linked-data/blob/master/vanderbot/generate_direct_props.py) materializes those triples by sending a CONSTRUCT query to a SPARQL endpoint whose triplestore contains the triples generated by the schema. The materialized triples are then loaded into the triplestore, making it possible to perform queries with any graph pattern that can be used at the Wikidata Query Service SPARQL endpoint.
+
+The first five scripts were not changed in this release.
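+
+For example, a query along the following lines constructs a direct-property (`wdt:`) triple from each `ps:` statement (this is a sketch; the full query, which also restricts matching to a named graph, is in `generate_direct_props.py`):
+
+```sparql
+CONSTRUCT {?item ?directProp ?value.}
+WHERE {
+  ?item ?p ?statement.
+  ?statement ?ps ?value.
+  FILTER(SUBSTR(STR(?ps),1,39)="http://www.wikidata.org/prop/statement/")
+  BIND(SUBSTR(STR(?ps),40) AS ?id)
+  BIND(IRI(CONCAT("http://www.wikidata.org/prop/direct/", ?id)) AS ?directProp)
+}
+```
+
+The script then wraps the constructed triples in a SPARQL 1.1 Update command of the form `INSERT DATA { GRAPH <graph IRI> { ... } }` and POSTs it back to the endpoint.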
+ ---- -Revised 2020-04-23 +Revised 2020-08-17 diff --git a/vanderbot/csv-metadata.json b/vanderbot/csv-metadata.json index 1f8badbc..087db992 100644 --- a/vanderbot/csv-metadata.json +++ b/vanderbot/csv-metadata.json @@ -59,8 +59,8 @@ "titles": "orcid", "name": "orcid", "datatype": "string", - "aboutUrl": "http://www.wikidata.org/entity/{wikidataId}", - "propertyUrl": "http://www.wikidata.org/prop/direct/P496" + "aboutUrl": "http://www.wikidata.org/entity/statement/{wikidataId}-{orcidStatementUuid}", + "propertyUrl": "http://www.wikidata.org/prop/statement/P496" }, { "titles": "orcidReferenceHash", @@ -89,8 +89,8 @@ "titles": "employer", "name": "employer", "datatype": "string", - "aboutUrl": "http://www.wikidata.org/entity/{wikidataId}", - "propertyUrl": "http://www.wikidata.org/prop/direct/P108", + "aboutUrl": "http://www.wikidata.org/entity/statement/{wikidataId}-{employerStatementUuid}", + "propertyUrl": "http://www.wikidata.org/prop/statement/P108", "valueUrl": "http://www.wikidata.org/entity/{employer}" }, { @@ -128,8 +128,8 @@ "titles": "affiliation", "name": "affiliation", "datatype": "string", - "aboutUrl": "http://www.wikidata.org/entity/{wikidataId}", - "propertyUrl": "http://www.wikidata.org/prop/direct/P1416", + "aboutUrl": "http://www.wikidata.org/entity/statement/{wikidataId}-{affiliationStatementUuid}", + "propertyUrl": "http://www.wikidata.org/prop/statement/P1416", "valueUrl": "http://www.wikidata.org/entity/{affiliation}" }, { @@ -167,8 +167,8 @@ "titles": "instanceOf", "name": "instanceOf", "datatype": "string", - "aboutUrl": "http://www.wikidata.org/entity/{wikidataId}", - "propertyUrl": "http://www.wikidata.org/prop/direct/P31", + "aboutUrl": "http://www.wikidata.org/entity/statement/{wikidataId}-{instanceOfUuid}", + "propertyUrl": "http://www.wikidata.org/prop/statement/P31", "valueUrl": "http://www.wikidata.org/entity/{instanceOf}" }, { @@ -183,34 +183,6 @@ "titles": "sexOrGenderQId", "name": "sexOrGenderQId", "datatype": "string", - "aboutUrl": "http://www.wikidata.org/entity/{wikidataId}", - "propertyUrl": "http://www.wikidata.org/prop/direct/P21", - "valueUrl": "http://www.wikidata.org/entity/{sexOrGenderQId}" - }, - { - "name": "employerPropertyStatement", - "virtual": true, - "aboutUrl": "http://www.wikidata.org/entity/statement/{wikidataId}-{employerStatementUuid}", - "propertyUrl": "http://www.wikidata.org/prop/statement/P108", - "valueUrl": "http://www.wikidata.org/entity/{employer}" - }, - { - "name": "affiliationPropertyStatement", - "virtual": true, - "aboutUrl": "http://www.wikidata.org/entity/statement/{wikidataId}-{affiliationStatementUuid}", - "propertyUrl": "http://www.wikidata.org/prop/statement/P1416", - "valueUrl": "http://www.wikidata.org/entity/{affiliation}" - }, - { - "name": "instanceOfPropertyStatement", - "virtual": true, - "aboutUrl": "http://www.wikidata.org/entity/statement/{wikidataId}-{instanceOfUuid}", - "propertyUrl": "http://www.wikidata.org/prop/statement/P31", - "valueUrl": "http://www.wikidata.org/entity/{instanceOf}" - }, - { - "name": "sexOrGenderPropertyStatement", - "virtual": true, "aboutUrl": "http://www.wikidata.org/entity/statement/{wikidataId}-{sexOrGenderUuid}", "propertyUrl": "http://www.wikidata.org/prop/statement/P21", "valueUrl": "http://www.wikidata.org/entity/{sexOrGenderQId}" diff --git a/vanderbot/generate_direct_props.ipynb b/vanderbot/generate_direct_props.ipynb new file mode 100644 index 00000000..4082b4b5 --- /dev/null +++ b/vanderbot/generate_direct_props.ipynb @@ -0,0 +1,189 @@ +{ + "cells": [ + { + 
"cell_type": "code", + "execution_count": 43, + "metadata": {}, + "outputs": [], + "source": [ + "import requests # best library to manage HTTP transactions\n", + "from bs4 import BeautifulSoup # web-scraping library\n", + "import json\n", + "from time import sleep\n", + "import csv\n", + "import math\n", + "from fuzzywuzzy import fuzz # fuzzy logic matching\n", + "from fuzzywuzzy import process\n", + "import xml.etree.ElementTree as et # library to traverse XML tree\n", + "import urllib\n", + "import datetime\n", + "import string\n", + "from pathlib import Path\n", + "\n", + "# ---------------\n", + "# Configuration data\n", + "# ---------------\n", + "\n", + "graph_name = 'http://nursing'\n", + "accept_media_type = 'text/turtle'\n", + "sparql_endpoint = \"https://sparql.vanderbilt.edu/sparql\"\n", + "request_header_dictionary = {\n", + " #'Content-Type': 'application/sparql-query',\n", + " 'Accept' : accept_media_type\n", + "}\n", + "\n", + "# Load endpoint password from file in home directory\n", + "directory = 'home'\n", + "filename = 'sparql_vanderbilt_edu_password.txt'\n", + "pwd = load_credential(filename, directory)\n", + "\n", + "# ---------------\n", + "# Function definitions\n", + "# ---------------\n", + "\n", + "# Load password from local drive\n", + "# value of directory should be either 'home' or 'working'\n", + "def load_credential(filename, directory):\n", + " cred = ''\n", + " # to change the script to look for the credential in the working directory, change the value of home to empty string\n", + " if directory == 'home':\n", + " home = str(Path.home()) #gets path to home directory; supposed to work for Win and Mac\n", + " credential_path = home + '/' + filename\n", + " else:\n", + " directory = 'working'\n", + " credential_path = filename\n", + " try:\n", + " with open(credential_path, 'rt', encoding='utf-8') as file_object:\n", + " cred = file_object.read()\n", + " except:\n", + " print(filename + ' file not found - is it in your ' + directory + ' directory?')\n", + " exit()\n", + " return(cred)\n", + "\n", + "def retrieve_direct_statements(sparql_endpoint):\n", + " query = '''\n", + "construct {?item ?directProp ?value.}\n", + "from <''' + graph_name + '''>\n", + "where {\n", + " ?item ?p ?statement.\n", + " ?statement ?ps ?value.\n", + " filter(substr(str(?ps),1,39)=\"http://www.wikidata.org/prop/statement/\")\n", + " bind(substr(str(?ps),40) as ?id)\n", + " bind(substr(str(?p),30) as ?id)\n", + " bind(iri(concat(\"http://www.wikidata.org/prop/direct/\", ?id)) as ?directProp)\n", + " }\n", + "'''\n", + " results = []\n", + " r = requests.get(sparql_endpoint, params={'query' : query}, headers=request_header_dictionary)\n", + " return r.text\n", + "\n", + "def perform_sparql_update(sparql_endpoint, pwd, update_command):\n", + " # SPARQL Update requires HTTP POST\n", + " hdr = {'Content-Type' : 'application/sparql-update'}\n", + " r = requests.post(sparql_endpoint, auth=('admin', pwd), headers=hdr, data = update_command)\n", + " print(str(r.status_code) + ' ' + r.url)\n", + " print(r.text)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "metadata": {}, + "outputs": [], + "source": [ + "# ---------------\n", + "# Construct the direct property statements entailed by the Wikibase model and retrieve from endpoint \n", + "# ---------------\n", + "\n", + "graph_text = retrieve_direct_statements(sparql_endpoint)\n", + "#print(graph_text)\n", + "print('constructed triples retrieved')" + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "metadata": 
{}, + "outputs": [], + "source": [ + "# remove prefixes from response Turtle, which are not necessary since IRIs are unabbreviated\n", + "graph_text_list = graph_text.split('\\n')\n", + "# print(graph_text_list)\n", + "graph_text = ''\n", + "for line in graph_text_list:\n", + " try:\n", + " if line[0] != '@':\n", + " graph_text += line + '\\n'\n", + " except:\n", + " pass\n", + "#print()\n", + "#print(graph_text)" + ] + }, + { + "cell_type": "code", + "execution_count": 46, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "200 https://sparql.vanderbilt.edu/sparql\n", + "blazegraph™ by SYSTAP</title\n", + "></head\n", + "><body<p>totalElapsed=1ms, elapsed=1ms, connFlush=0ms, batchResolve=0, whereClause=0ms, deleteClause=0ms, insertClause=0ms</p\n", + "><hr><p>COMMIT: totalElapsed=356ms, commitTime=1596944443099, mutationCount=776</p\n", + "></html\n", + ">\n", + "\n", + "done\n" + ] + } + ], + "source": [ + "# Send SPARQL 1.1 UPDATE to endpoint to add the constructed triples into the graph\n", + "\n", + "update_command = '''INSERT DATA\n", + "{ GRAPH <''' + graph_name + '''> { \n", + "''' + graph_text + '''\n", + "}}'''\n", + "\n", + "#print(update_command)\n", + "\n", + "perform_sparql_update(sparql_endpoint, pwd, update_command)\n", + "\n", + "print()\n", + "print('done')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.1" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/vanderbot/generate_direct_props.py b/vanderbot/generate_direct_props.py new file mode 100644 index 00000000..c172ac6d --- /dev/null +++ b/vanderbot/generate_direct_props.py @@ -0,0 +1,101 @@ +import requests # best library to manage HTTP transactions +import json +from time import sleep +import csv +import math +import urllib +import datetime +import string +from pathlib import Path + +# --------------- +# Configuration data +# --------------- + +graph_name = 'http://nursing' +accept_media_type = 'text/turtle' +sparql_endpoint = "https://sparql.vanderbilt.edu/sparql" +request_header_dictionary = { + #'Content-Type': 'application/sparql-query', + 'Accept' : accept_media_type +} + +# Load endpoint password from file in home directory +directory = 'home' +filename = 'sparql_vanderbilt_edu_password.txt' + +# --------------- +# Function definitions +# --------------- + +# Load password from local drive +# value of directory should be either 'home' or 'working' +def load_credential(filename, directory): + cred = '' + # to change the script to look for the credential in the working directory, change the value of home to empty string + if directory == 'home': + home = str(Path.home()) #gets path to home directory; supposed to work for Win and Mac + credential_path = home + '/' + filename + else: + directory = 'working' + credential_path = filename + try: + with open(credential_path, 'rt', encoding='utf-8') as file_object: + cred = file_object.read() + except: + print(filename + ' file not found - is it in your ' + directory + ' directory?') + exit() + return(cred) + +def retrieve_direct_statements(sparql_endpoint): + query = ''' +construct 
{?item ?directProp ?value.}
+from <''' + graph_name + '''>
+where {
+  ?item ?p ?statement.
+  ?statement ?ps ?value.
+  filter(substr(str(?ps),1,39)="http://www.wikidata.org/prop/statement/")
+  bind(substr(str(?ps),40) as ?id)
+  bind(iri(concat("http://www.wikidata.org/prop/direct/", ?id)) as ?directProp)
+  }
+'''
+    r = requests.get(sparql_endpoint, params={'query' : query}, headers=request_header_dictionary)
+    return r.text
+
+def perform_sparql_update(sparql_endpoint, pwd, update_command):
+    # SPARQL Update requires HTTP POST
+    hdr = {'Content-Type' : 'application/sparql-update'}
+    r = requests.post(sparql_endpoint, auth=('admin', pwd), headers=hdr, data = update_command)
+    print(str(r.status_code) + ' ' + r.url)
+    print(r.text)
+
+# ---------------
+# Construct the direct property statements entailed by the Wikibase model and retrieve from endpoint
+# ---------------
+
+graph_text = retrieve_direct_statements(sparql_endpoint)
+print('constructed triples retrieved')
+
+# remove prefixes from response Turtle, which are not necessary since IRIs are unabbreviated
+graph_text_list = graph_text.split('\n')
+graph_text = ''
+for line in graph_text_list:
+    try:
+        if line[0] != '@':
+            graph_text += line + '\n'
+    except:
+        pass
+
+# Send SPARQL 1.1 UPDATE to endpoint to add the constructed triples into the graph
+
+update_command = '''INSERT DATA
+{ GRAPH <''' + graph_name + '''> { 
+''' + graph_text + '''
+}}'''
+
+pwd = load_credential(filename, directory)
+perform_sparql_update(sparql_endpoint, pwd, update_command)
+
+print()
+print('done')
\ No newline at end of file
diff --git a/vanderbot/vb6_upload_wikidata.py b/vanderbot/vb6_upload_wikidata.py
index d3e812ea..4778c118 100644
--- a/vanderbot/vb6_upload_wikidata.py
+++ b/vanderbot/vb6_upload_wikidata.py
@@ -57,7 +57,11 @@
 # - This requires adding the correct Content-Type header (application/sparql-query)
 # - Correct the form of the IRI for statements (add Q ID before UUID in IRI). This required a slight modification in the
 #   part of the script that searches the mapping template for statements (look for -} instead of just } )
-
+# -----------------------------------------
+# Version 1.4 change notes (2020-08-17):
+# - In csv-metadata.json, replace wdt: namespace properties with ps: properties,
+#   e.g. https://github.com/HeardLibrary/linked-data/blob/v1-4/vanderbot/csv-metadata.json#L187
+# - Modify vb6_upload_wikidata.py (this script) to find those ps: properties instead of the wdt: ones.
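+#   For example, the orcid column's propertyUrl in csv-metadata.json changed from
+#   http://www.wikidata.org/prop/direct/P496 to http://www.wikidata.org/prop/statement/P496,
+#   so the column-scanning code below now tests propertyUrl for the substring
+#   'prop/statement/' instead of 'prop/direct/' when it extracts property IDs.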
import json import requests @@ -225,7 +229,6 @@ def findPropertyUuid(propertyId, columns): # find the valueUrl in the column for which the value of the statement has the prop version of the property as its propertyUrl if 'prop/' + propertyId in column['propertyUrl']: temp = column['valueUrl'].partition('-{')[2] - print(temp) statementUuidColumn = temp.partition('}')[0] # in the event of two columns with the same property ID, the last one is used #print(statementUuidColumn) @@ -679,10 +682,10 @@ def attemptPost(apiUrl, parameters): # find columns that contain properties with entity values or literal values that are URLs elif 'valueUrl' in column: - # only add columns that have direct properties - if 'prop/direct/' in column['propertyUrl']: + # only add columns that have "statement" properties + if 'prop/statement/' in column['propertyUrl']: propColumnHeader = column['titles'] - propertyId = column['propertyUrl'].partition('prop/direct/')[2] + propertyId = column['propertyUrl'].partition('prop/statement/')[2] propertiesColumnList.append(propColumnHeader) propertiesIdList.append(propertyId) @@ -707,10 +710,10 @@ def attemptPost(apiUrl, parameters): # remaining columns should have properties with literal values else: - # only add columns that have direct properties - if 'prop/direct/' in column['propertyUrl']: + # only add columns that have "statement" properties + if 'prop/statement/' in column['propertyUrl']: propColumnHeader = column['titles'] - propertyId = column['propertyUrl'].partition('prop/direct/')[2] + propertyId = column['propertyUrl'].partition('prop/statement/')[2] print('Property column: ', propColumnHeader, ', Property ID: ', propertyId, ' Value datatype: ', column['datatype']) propertiesColumnList.append(propColumnHeader) propertiesIdList.append(propertyId)