Merge pull request #8 from HeardLibrary/v1-4
v1.4 release
Steve Baskauf authored Aug 18, 2020
2 parents cc7609d + 3571972 commit 2086028
Showing 5 changed files with 317 additions and 46 deletions.
10 changes: 8 additions & 2 deletions vanderbot/README.md
@@ -32,7 +32,7 @@ Here are some queries that can be run to explore the data:

[Number of clinical trials at Vanderbilt by principal investigator](https://w.wiki/XKK)

The current release is [v1.3](https://github.com/HeardLibrary/linked-data/releases/tag/v1.3).
The current release is [v1.4](https://github.com/HeardLibrary/linked-data/releases/tag/v1.4).

## How it works

@@ -138,5 +138,11 @@ In the case where there are no reference properties, there also isn't any reference
If there are reference property combinations other than this, the `generate_statement_data()` function can't be used and custom code must be written for that statement.


## Release v1.4 (2020-08-17) notes

The changes in this release follow tests that used the `csv-metadata.json` mapping schema to emit RDF from the source CSV tables. To make it possible to create all of the kinds of statements present in the Wikidata data model, the `csv-metadata.json` file and the `vb6_upload_wikidata.py` script were changed to use properties in the `ps:` namespace (`http://www.wikidata.org/prop/statement/`) rather than properties in the `wdt:` namespace. The missing `wdt:` statements can then be constructed using SPARQL CONSTRUCT. [A new script](https://github.com/HeardLibrary/linked-data/blob/master/vanderbot/generate_direct_props.py) materializes those triples by sending a CONSTRUCT query to a SPARQL endpoint whose triplestore contains the triples generated from the schema, then loads the materialized triples back into the triplestore. The triplestore can then answer queries on any graph pattern that is supported at the Wikidata Query Service SPARQL endpoint.
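
As a sketch of the entailment involved (a hypothetical example; the Q-IDs, statement UUID, and property below stand in for real table values), each statement stored with the `p:`/`ps:` reification pattern implies one direct (`wdt:`) triple:

# Minimal sketch of the wdt: entailment; all identifiers are hypothetical.
# A reified statement consists of two triples:
#   item --p:P108--> statement node --ps:P108--> value
item = 'http://www.wikidata.org/entity/Q12345'
statement_node = 'http://www.wikidata.org/entity/statement/Q12345-01234567-89ab-cdef-0123-456789abcdef'
ps_property = 'http://www.wikidata.org/prop/statement/P108'
value = 'http://www.wikidata.org/entity/Q67890'

# The direct triple reuses the property ID (here P108) in the wdt: namespace,
# linking the item straight to the statement's value.
prop_id = ps_property[len('http://www.wikidata.org/prop/statement/'):]
direct_property = 'http://www.wikidata.org/prop/direct/' + prop_id
print(item, direct_property, value)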

The first five scripts were not changed in this release.

----
Revised 2020-04-23
Revised 2020-08-17
44 changes: 8 additions & 36 deletions vanderbot/csv-metadata.json
@@ -59,8 +59,8 @@
"titles": "orcid",
"name": "orcid",
"datatype": "string",
"aboutUrl": "http://www.wikidata.org/entity/{wikidataId}",
"propertyUrl": "http://www.wikidata.org/prop/direct/P496"
"aboutUrl": "http://www.wikidata.org/entity/statement/{wikidataId}-{orcidStatementUuid}",
"propertyUrl": "http://www.wikidata.org/prop/statement/P496"
},
{
"titles": "orcidReferenceHash",
@@ -89,8 +89,8 @@
"titles": "employer",
"name": "employer",
"datatype": "string",
"aboutUrl": "http://www.wikidata.org/entity/{wikidataId}",
"propertyUrl": "http://www.wikidata.org/prop/direct/P108",
"aboutUrl": "http://www.wikidata.org/entity/statement/{wikidataId}-{employerStatementUuid}",
"propertyUrl": "http://www.wikidata.org/prop/statement/P108",
"valueUrl": "http://www.wikidata.org/entity/{employer}"
},
{
@@ -128,8 +128,8 @@
"titles": "affiliation",
"name": "affiliation",
"datatype": "string",
"aboutUrl": "http://www.wikidata.org/entity/{wikidataId}",
"propertyUrl": "http://www.wikidata.org/prop/direct/P1416",
"aboutUrl": "http://www.wikidata.org/entity/statement/{wikidataId}-{affiliationStatementUuid}",
"propertyUrl": "http://www.wikidata.org/prop/statement/P1416",
"valueUrl": "http://www.wikidata.org/entity/{affiliation}"
},
{
@@ -167,8 +167,8 @@
"titles": "instanceOf",
"name": "instanceOf",
"datatype": "string",
"aboutUrl": "http://www.wikidata.org/entity/{wikidataId}",
"propertyUrl": "http://www.wikidata.org/prop/direct/P31",
"aboutUrl": "http://www.wikidata.org/entity/statement/{wikidataId}-{instanceOfUuid}",
"propertyUrl": "http://www.wikidata.org/prop/statement/P31",
"valueUrl": "http://www.wikidata.org/entity/{instanceOf}"
},
{
@@ -183,34 +183,6 @@
"titles": "sexOrGenderQId",
"name": "sexOrGenderQId",
"datatype": "string",
"aboutUrl": "http://www.wikidata.org/entity/{wikidataId}",
"propertyUrl": "http://www.wikidata.org/prop/direct/P21",
"valueUrl": "http://www.wikidata.org/entity/{sexOrGenderQId}"
},
{
"name": "employerPropertyStatement",
"virtual": true,
"aboutUrl": "http://www.wikidata.org/entity/statement/{wikidataId}-{employerStatementUuid}",
"propertyUrl": "http://www.wikidata.org/prop/statement/P108",
"valueUrl": "http://www.wikidata.org/entity/{employer}"
},
{
"name": "affiliationPropertyStatement",
"virtual": true,
"aboutUrl": "http://www.wikidata.org/entity/statement/{wikidataId}-{affiliationStatementUuid}",
"propertyUrl": "http://www.wikidata.org/prop/statement/P1416",
"valueUrl": "http://www.wikidata.org/entity/{affiliation}"
},
{
"name": "instanceOfPropertyStatement",
"virtual": true,
"aboutUrl": "http://www.wikidata.org/entity/statement/{wikidataId}-{instanceOfUuid}",
"propertyUrl": "http://www.wikidata.org/prop/statement/P31",
"valueUrl": "http://www.wikidata.org/entity/{instanceOf}"
},
{
"name": "sexOrGenderPropertyStatement",
"virtual": true,
"aboutUrl": "http://www.wikidata.org/entity/statement/{wikidataId}-{sexOrGenderUuid}",
"propertyUrl": "http://www.wikidata.org/prop/statement/P21",
"valueUrl": "http://www.wikidata.org/entity/{sexOrGenderQId}"
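
For reference, the revised mappings expand one CSV row into a statement-level triple roughly as follows (a sketch using the employer column; the row values are hypothetical):

# Sketch of the aboutUrl/propertyUrl/valueUrl template expansion for the employer column
# (hypothetical row values; real values come from the source CSV tables)
row = {'wikidataId': 'Q12345',
       'employerStatementUuid': '01234567-89ab-cdef-0123-456789abcdef',
       'employer': 'Q67890'}
subject = ('http://www.wikidata.org/entity/statement/'
           '{wikidataId}-{employerStatementUuid}').format(**row)
predicate = 'http://www.wikidata.org/prop/statement/P108'
obj = 'http://www.wikidata.org/entity/{employer}'.format(**row)
print(subject)    # statement-node IRI built from the item Q-ID and statement UUID
print(predicate)  # ps: property that replaced the former prop/direct/ (wdt:) property
print(obj)        # value IRI built from the employer Q-ID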
189 changes: 189 additions & 0 deletions vanderbot/generate_direct_props.ipynb
@@ -0,0 +1,189 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 43,
"metadata": {},
"outputs": [],
"source": [
"import requests # best library to manage HTTP transactions\n",
"from bs4 import BeautifulSoup # web-scraping library\n",
"import json\n",
"from time import sleep\n",
"import csv\n",
"import math\n",
"from fuzzywuzzy import fuzz # fuzzy logic matching\n",
"from fuzzywuzzy import process\n",
"import xml.etree.ElementTree as et # library to traverse XML tree\n",
"import urllib\n",
"import datetime\n",
"import string\n",
"from pathlib import Path\n",
"\n",
"# ---------------\n",
"# Configuration data\n",
"# ---------------\n",
"\n",
"graph_name = 'http://nursing'\n",
"accept_media_type = 'text/turtle'\n",
"sparql_endpoint = \"https://sparql.vanderbilt.edu/sparql\"\n",
"request_header_dictionary = {\n",
" #'Content-Type': 'application/sparql-query',\n",
" 'Accept' : accept_media_type\n",
"}\n",
"\n",
"# Load endpoint password from file in home directory\n",
"directory = 'home'\n",
"filename = 'sparql_vanderbilt_edu_password.txt'\n",
"pwd = load_credential(filename, directory)\n",
"\n",
"# ---------------\n",
"# Function definitions\n",
"# ---------------\n",
"\n",
"# Load password from local drive\n",
"# value of directory should be either 'home' or 'working'\n",
"def load_credential(filename, directory):\n",
" cred = ''\n",
" # to change the script to look for the credential in the working directory, change the value of home to empty string\n",
" if directory == 'home':\n",
" home = str(Path.home()) #gets path to home directory; supposed to work for Win and Mac\n",
" credential_path = home + '/' + filename\n",
" else:\n",
" directory = 'working'\n",
" credential_path = filename\n",
" try:\n",
" with open(credential_path, 'rt', encoding='utf-8') as file_object:\n",
" cred = file_object.read()\n",
" except:\n",
" print(filename + ' file not found - is it in your ' + directory + ' directory?')\n",
" exit()\n",
" return(cred)\n",
"\n",
"def retrieve_direct_statements(sparql_endpoint):\n",
" query = '''\n",
"construct {?item ?directProp ?value.}\n",
"from <''' + graph_name + '''>\n",
"where {\n",
" ?item ?p ?statement.\n",
" ?statement ?ps ?value.\n",
" filter(substr(str(?ps),1,39)=\"http://www.wikidata.org/prop/statement/\")\n",
" bind(substr(str(?ps),40) as ?id)\n",
" bind(substr(str(?p),30) as ?id)\n",
" bind(iri(concat(\"http://www.wikidata.org/prop/direct/\", ?id)) as ?directProp)\n",
" }\n",
"'''\n",
" results = []\n",
" r = requests.get(sparql_endpoint, params={'query' : query}, headers=request_header_dictionary)\n",
" return r.text\n",
"\n",
"def perform_sparql_update(sparql_endpoint, pwd, update_command):\n",
" # SPARQL Update requires HTTP POST\n",
" hdr = {'Content-Type' : 'application/sparql-update'}\n",
" r = requests.post(sparql_endpoint, auth=('admin', pwd), headers=hdr, data = update_command)\n",
" print(str(r.status_code) + ' ' + r.url)\n",
" print(r.text)\n"
]
},
{
"cell_type": "code",
"execution_count": 44,
"metadata": {},
"outputs": [],
"source": [
"# ---------------\n",
"# Construct the direct property statements entailed by the Wikibase model and retrieve from endpoint \n",
"# ---------------\n",
"\n",
"graph_text = retrieve_direct_statements(sparql_endpoint)\n",
"#print(graph_text)\n",
"print('constructed triples retrieved')"
]
},
{
"cell_type": "code",
"execution_count": 45,
"metadata": {},
"outputs": [],
"source": [
"# remove prefixes from response Turtle, which are not necessary since IRIs are unabbreviated\n",
"graph_text_list = graph_text.split('\\n')\n",
"# print(graph_text_list)\n",
"graph_text = ''\n",
"for line in graph_text_list:\n",
" try:\n",
" if line[0] != '@':\n",
" graph_text += line + '\\n'\n",
" except:\n",
" pass\n",
"#print()\n",
"#print(graph_text)"
]
},
{
"cell_type": "code",
"execution_count": 46,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"200 https://sparql.vanderbilt.edu/sparql\n",
"<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01 Transitional//EN\" \"http://www.w3.org/TR/html4/loose.dtd\"><html><head><meta http-equiv=\"Content-Type\" content=\"text&#47;html;charset=UTF-8\"><title>blazegraph&trade; by SYSTAP</title\n",
"></head\n",
"><body<p>totalElapsed=1ms, elapsed=1ms, connFlush=0ms, batchResolve=0, whereClause=0ms, deleteClause=0ms, insertClause=0ms</p\n",
"><hr><p>COMMIT: totalElapsed=356ms, commitTime=1596944443099, mutationCount=776</p\n",
"></html\n",
">\n",
"\n",
"done\n"
]
}
],
"source": [
"# Send SPARQL 1.1 UPDATE to endpoint to add the constructed triples into the graph\n",
"\n",
"update_command = '''INSERT DATA\n",
"{ GRAPH <''' + graph_name + '''> { \n",
"''' + graph_text + '''\n",
"}}'''\n",
"\n",
"#print(update_command)\n",
"\n",
"perform_sparql_update(sparql_endpoint, pwd, update_command)\n",
"\n",
"print()\n",
"print('done')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.1"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
101 changes: 101 additions & 0 deletions vanderbot/generate_direct_props.py
@@ -0,0 +1,101 @@
import requests # best library to manage HTTP transactions
from pathlib import Path # used to locate the credential file in the home directory

# ---------------
# Configuration data
# ---------------

graph_name = 'http://nursing'
accept_media_type = 'text/turtle'
sparql_endpoint = "https://sparql.vanderbilt.edu/sparql"
request_header_dictionary = {
#'Content-Type': 'application/sparql-query',
'Accept' : accept_media_type
}

# Load endpoint password from file in home directory
directory = 'home'
filename = 'sparql_vanderbilt_edu_password.txt'

# ---------------
# Function definitions
# ---------------

# Load password from local drive
# value of directory should be either 'home' or 'working'
def load_credential(filename, directory):
cred = ''
# to change the script to look for the credential in the working directory, change the value of home to empty string
if directory == 'home':
home = str(Path.home()) #gets path to home directory; supposed to work for Win and Mac
credential_path = home + '/' + filename
else:
directory = 'working'
credential_path = filename
try:
with open(credential_path, 'rt', encoding='utf-8') as file_object:
cred = file_object.read()
    except FileNotFoundError:
print(filename + ' file not found - is it in your ' + directory + ' directory?')
exit()
return(cred)

def retrieve_direct_statements(sparql_endpoint):
query = '''
construct {?item ?directProp ?value.}
from <''' + graph_name + '''>
where {
?item ?p ?statement.
?statement ?ps ?value.
filter(substr(str(?ps),1,39)="http://www.wikidata.org/prop/statement/")
    # everything after the 39-character ps: namespace prefix is the property ID
    bind(substr(str(?ps),40) as ?id)
bind(iri(concat("http://www.wikidata.org/prop/direct/", ?id)) as ?directProp)
}
'''
r = requests.get(sparql_endpoint, params={'query' : query}, headers=request_header_dictionary)
return r.text

def perform_sparql_update(sparql_endpoint, pwd, update_command):
# SPARQL Update requires HTTP POST
hdr = {'Content-Type' : 'application/sparql-update'}
r = requests.post(sparql_endpoint, auth=('admin', pwd), headers=hdr, data = update_command)
print(str(r.status_code) + ' ' + r.url)
print(r.text)

# ---------------
# Construct the direct property statements entailed by the Wikibase model and retrieve from endpoint
# ---------------

graph_text = retrieve_direct_statements(sparql_endpoint)
print('constructed triples retrieved')

# remove prefixes from response Turtle, which are not necessary since IRIs are unabbreviated
graph_text_list = graph_text.split('\n')
graph_text = ''
for line in graph_text_list:
    # blank lines are skipped; lines starting with '@' are prefix declarations
    if line and line[0] != '@':
        graph_text += line + '\n'

# Send SPARQL 1.1 UPDATE to endpoint to add the constructed triples into the graph

update_command = '''INSERT DATA
{ GRAPH <''' + graph_name + '''> {
''' + graph_text + '''
}}'''

pwd = load_credential(filename, directory)
perform_sparql_update(sparql_endpoint, pwd, update_command)

print()
print('done')
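
One way to spot-check the update (a sketch assuming the same endpoint and graph name as configured above, and that the endpoint allows anonymous reads) is to count the direct-property triples now present in the graph:

# Sketch: count the materialized wdt: (prop/direct/) triples in the named graph
import requests

count_query = '''select (count(*) as ?count)
from <http://nursing>
where {
    ?item ?directProp ?value.
    filter(substr(str(?directProp),1,36)="http://www.wikidata.org/prop/direct/")
}'''
r = requests.get('https://sparql.vanderbilt.edu/sparql',
                 params={'query': count_query},
                 headers={'Accept': 'application/sparql-results+json'})
print(r.json()['results']['bindings'][0]['count']['value'])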
