Skip to content

Commit

Permalink
Fixes to RDFa parsing. Implement versa.util.uniquify(). Implement JSONLD writer.
Browse files Browse the repository at this point in the history
  • Loading branch information
uogbuji committed Apr 18, 2018
1 parent 749a7e4 commit 9d8420b
Show file tree
Hide file tree
Showing 7 changed files with 276 additions and 31 deletions.
3 changes: 3 additions & 0 deletions tools/py/driver/memory.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,10 +79,13 @@ def match(self, origin=None, rel=None, target=None, attrs=None, include_ids=Fals
matches = True
if origin and origin != curr_rel[ORIGIN]:
matches = False
continue
if rel and rel != curr_rel[RELATIONSHIP]:
matches = False
continue
if target and target != curr_rel[TARGET]:
matches = False
continue
if attrs:
for k, v in attrs.items():
if k not in curr_rel[ATTRIBUTES] or curr_rel[ATTRIBUTES].get(k) != v:
Expand Down
62 changes: 35 additions & 27 deletions tools/py/reader/rdfalite.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,22 +9,14 @@
#from versa.writer.rdfs import prep as statement_prep
from versa import I, VERSA_BASEIRI, ORIGIN, RELATIONSHIP, TARGET, ATTRIBUTES

from versa.reader import statement_prep, dumb_triples, rdfize, versalinks

try:
from rdflib import BNode as bnode
RDFLIB_AVAILABLE = True
except:
def bnode(object):
pass
RDFLIB_AVAILABLE = False
from . import statement_prep, dumb_triples, rdfize, versalinks
from versa.writer.rdf import mock_bnode, prep, RDF_TYPE, RDF_NS

from amara3 import iri
from amara3.uxml import tree
from amara3.uxml.treeutil import *
from amara3.uxml import html5

RDF_NS = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#'
#SCHEMAORG_NS is proper name. Deprecate the other
SCHEMAORG_NS = SCHEMA_NS = 'http://schema.org/'
FOAF_NS = 'http://xmlns.com/foaf/0.1/'
Expand All @@ -44,8 +36,6 @@ def bnode(object):
if verbose:
logger.setLevel(logging.DEBUG)

BNODE_ROOT = 'urn:amara-bnode:_'

def toversa(htmlsource, model, source_uri):
'''
>>> import urllib
Expand Down Expand Up @@ -85,9 +75,7 @@ def parse(htmlsource, statement_sink, source_uri):
'''
root = html5.parse(htmlsource)

g_bnode_counter = 1
def do_parse(elem, resource, vocab=None, prop=None, prefixes=None):
nonlocal g_bnode_counter
prefixes = prefixes or DEFAULT_PREFIXES.copy()
vocab = elem.xml_attributes.get('vocab', vocab)
#element_satisfied = False
Expand All @@ -114,24 +102,23 @@ def do_parse(elem, resource, vocab=None, prop=None, prefixes=None):
try:
resource = new_resource = I(iri.absolutize(new_resource, source_uri))
except ValueError:
warnings.warn('Invalid URL or anchor {} found in {}'.format(new_resource, source_uri))
warnings.warn('Invalid URL or anchor {} found in {}. Ignored.'.format(new_resource, source_uri))
new_resource = None

typeof_list = elem.xml_attributes.get('typeof')
if typeof_list:
if not new_resource: new_resource = mock_bnode('')
for typeof in typeof_list.split():
typeof = I(iri.absolutize(typeof, vocab))
try:
typeof = I(iri.absolutize(typeof, vocab))
except ValueError:
warnings.warn('Invalid URL or anchor {} found in {}. Ignored'.format(typeof, source_uri))
statement_sink.send((new_resource or resource, RDF_NS + 'type', typeof))

new_prop_list = elem.xml_attributes.get('property')
new_value = None
if new_prop_list:
#FIXME: Should this only be when about is used?
if typeof_list and not new_resource:
new_value = bnode()
#new_value = I(BNODE_ROOT + str(g_bnode_counter))
#g_bnode_counter += 1
elif new_resource:
if new_resource:
new_value = new_resource
for new_prop in new_prop_list.split():
if new_prop == 'about':
Expand All @@ -141,13 +128,34 @@ def do_parse(elem, resource, vocab=None, prop=None, prefixes=None):
if not p in prefixes:
#FIXME: Silent error for now
continue
prop = I(iri.absolutize(local, prefixes[p]))
try:
prop = I(iri.absolutize(local, prefixes[p]))
except ValueError:
warnings.warn('Invalid URL or anchor {} found in {}. Ignored'.format(local, source_uri))
continue
else:
prop = I(iri.absolutize(new_prop, vocab))
value = new_value or elem.xml_attributes.get('content') or elem.xml_attributes.get('href') or elem.xml_attributes.get('src') or elem.xml_value
try:
prop = I(iri.absolutize(new_prop, vocab))
except ValueError:
warnings.warn('Invalid URL or anchor {} found in {}. Ignored'.format(new_prop, source_uri))
continue
href_res = elem.xml_attributes.get('href')
if href_res:
try:
href_res = I(href_res)
except ValueError:
warnings.warn('Invalid URL or anchor {} found in {}. Ignored'.format(href_res, source_uri))
continue
href_src = elem.xml_attributes.get('src')
if href_src:
try:
href_src = I(href_src)
except ValueError:
warnings.warn('Invalid URL or anchor {} found in {}. Ignored'.format(href_src, source_uri))
continue
value = new_value or elem.xml_attributes.get('content') or href_res or href_src or elem.xml_value
statement_sink.send((resource, prop, value))
#print((resource, prop, value))
logging.debug('{}'.format((resource, prop, value)))
#logging.debug('{}'.format((resource, prop, value)))
#element_satisfied = True
if new_value: resource = new_value
for child in elem.xml_children:
Expand Down
4 changes: 2 additions & 2 deletions tools/py/terms.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,6 @@
RDF_NS = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#'
RDFS_NS = 'http://www.w3.org/2000/01/rdf-schema#'

VERSA_TYPE_REL = I(VERSA_BASEIRI + 'type')
RDF_TYPE_REL = I(RDF_NS + 'type')
VERSA_TYPE = VERSA_TYPE_REL = I(VERSA_BASEIRI + 'type')
RDF_TYPE = RDF_TYPE_REL = I(RDF_NS + 'type')

20 changes: 18 additions & 2 deletions tools/py/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -121,13 +121,29 @@ def duplicate_statements(model, oldorigin, neworigin, rfilter=None):
:param newres: origin resource IRI for duplication
:return: None
'''
for link in model.match(oldorigin):
o, r, t, a = link
for o, r, t, a in model.match(oldorigin):
if rfilter is None or rfilter(o, r, t, a):
model.add(I(neworigin), r, t, a)
return


def uniquify(model):
    '''
    Remove all duplicate relationships from a model, in place

    model - Versa model to deduplicate; iterating it yields
        (index, (origin, rel, target, attrs)) pairs, and duplicates are
        dropped via model.remove()
    '''
    encountered = set()
    duplicate_ixs = set()
    for ix, (orig, rel, targ, attrs) in model:
        # Attributes are a mapping, so normalize to a sorted tuple to make
        # the whole link hashable for set membership
        key = (orig, rel, targ) + tuple(sorted(attrs.items()))
        if key in encountered:
            duplicate_ixs.add(ix)
        else:
            encountered.add(key)
    model.remove(duplicate_ixs)
    return


def jsonload(model, fp):
'''
Load Versa model dumped into JSON form, either raw or canonical
Expand Down
29 changes: 29 additions & 0 deletions tools/py/writer/csv.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
"""

import logging
import operator

from amara3 import iri

Expand All @@ -21,6 +22,17 @@ def fromlist(l):
return '|'.join(l)


def omap(m):
    '''
    Create a nested mapping from origin to property to values/attributes
    covering an entire Versa model

    m - Versa model; m.match() yields (origin, property, value, attrs) links

    Returns a dict of the form {origin: {property: [(value, attrs), ...]}}
    '''
    mapping = {}
    for origin, prop, value, attrs in m.match():
        by_prop = mapping.setdefault(origin, {})
        by_prop.setdefault(prop, []).append((value, attrs))
    return mapping


def write(models, csvout, rulelist, write_header, base=None, logger=logging):
'''
models - one or more input Versa models from which output is generated.
Expand All @@ -35,6 +47,23 @@ def write(models, csvout, rulelist, write_header, base=None, logger=logging):

if not isinstance(models, list): models = [models]
for m in models:
mapped = omap(m)
for o, props in mapped.items():
rtypes = list(map(operator.itemgetter(0), props.get(RDF_TYPE_REL, [])))
if not rtypes: continue
#print('RES TYPES:', rtypes)
row = [o, fromlist(rtypes)] + [None] * numprops
for ix, p in enumerate(properties):
v = list(map(operator.itemgetter(0), props.get(p, [])))
if v:
row[ix + 2] = fromlist(v)
csvout.writerow(row)

return


def IGNORE():
if False:
for rid in all_origins(m):
#print(rid, list(m.match(rid, RDF_TYPE_REL)))
rtypes = list(lookup(m, rid, RDF_TYPE_REL))
Expand Down
102 changes: 102 additions & 0 deletions tools/py/writer/jsonld.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
#versa.writer.ntriples
"""
Render a Versa vocab model as JSON-LD
"""

import logging

from amara3 import iri

from versa import I, VERSA_BASEIRI, ORIGIN, RELATIONSHIP, TARGET
from versa.terms import VERSA_BASEIRI, RDF_NS, RDFS_NS, VERSA_TYPE, RDF_TYPE
from versa.driver import memory
from versa import VERSA_BASEIRI
from versa.util import all_origins, lookup

def bind(models, context=None, ignore_oftypes=None, logger=logging):
    '''
    Bind one or more Versa models into a single JSON-LD object structure

    models - Versa model, or list of models, to be rendered
    context - optional JSON-LD context mapping; its '@vocab' value, if present,
        is used to relativize IRIs in the rendered output
    ignore_oftypes - optional collection of type IRIs; resources of any of
        these types are dropped from the output and unlinked from referrers
    logger - logging facility

    Returns a dict of the form {'@context': context, '@graph': [top-level objects]}
    '''
    if not isinstance(models, list): models = [models]
    # FIX: context and ignore_oftypes default to None; guard before using them
    # as a mapping / container so the defaults don't raise
    vocab = context.get('@vocab') if context else None
    ignore_oftypes = ignore_oftypes or []
    non_top_ids = set()
    obj_pool = {} #Mapping from resource id to (object, list of referring ids)
    used_objects = set() #Track multiple instance of docs to prevent data structure recursion
    for m in models:
        #Everything with a type
        for origin in all_origins(m):
            typ = next(lookup(m, origin, RDF_TYPE), None)
            obj, referents = obj_pool.setdefault(origin, ({}, []))
            if vocab and typ:
                typ_rel = iri.relativize(typ, vocab)
                if typ_rel: typ = typ_rel
            if typ: obj['@type'] = typ
            if not origin.startswith('__VERSABLANKNODE__'): obj['@id'] = origin
            for o, r, t, a in m.match(origin):
                if r == RDF_TYPE: continue
                if isinstance(t, I) and o != t:
                    #Target is itself a resource: embed its object on first use,
                    #refer to it by id on subsequent uses (prevents recursion)
                    if vocab:
                        t_rel = iri.relativize(t, vocab)
                        if t_rel: t = t_rel
                    valobj, referents = obj_pool.setdefault(t, ({}, []))
                    if t in used_objects:
                        val = t
                    else:
                        val = valobj
                        if not t.startswith('__VERSABLANKNODE__') and '@id' not in val: val['@id'] = t
                        used_objects.add(t)

                    non_top_ids.add(t) #If something has an object as a value it does not appear at the top
                    referents.append(o)
                else:
                    val = t
                if vocab:
                    r_rel = iri.relativize(r, vocab)
                    if r_rel: r = r_rel
                #Accumulate repeated properties into a list
                if r in obj and isinstance(obj[r], list):
                    obj[r].append(val)
                elif r in obj:
                    obj[r] = [obj[r], val]
                else:
                    obj[r] = val

    #Eliminate objects of types to be ignored
    to_remove = []
    for (oid, (obj, referents)) in obj_pool.items():
        typ = obj.get('@type')
        if vocab and typ: typ = iri.absolutize(typ, vocab)
        if typ in ignore_oftypes:
            to_remove.append(oid)
            for ref in referents:
                #FIX: pool values are (object, referents) pairs; the original
                #bound the whole pair and then called .keys() on a tuple
                refobj, _ = obj_pool[ref]
                for k in list(refobj.keys()):
                    v = refobj[k]
                    if isinstance(v, list) and obj in v:
                        v.remove(obj)
                        if len(v) == 1:
                            refobj[k] = v[0]
                    elif v == obj:
                        del refobj[k]

    for k in to_remove:
        del obj_pool[k]

    #Collapse embedded objects that carry nothing but an @id down to the id itself
    for (oid, (obj, referents)) in obj_pool.items():
        for k, v in obj.items():
            #FIX: guard with isinstance so unsized values (ints, bools) don't raise
            if isinstance(v, dict) and len(v) == 1 and '@id' in v:
                obj[k] = v['@id']

    top_objs = [ obj for (k, (obj, refs)) in obj_pool.items() if k not in non_top_ids ]
    #Eliminate stranded top-level objects with no more than a type
    to_remove = [ obj for obj in top_objs if len(obj) == 1 and '@type' in obj ]
    for obj in to_remove:
        top_objs.remove(obj)
    top = {'@context': context, '@graph': top_objs}
    return top

Loading

0 comments on commit 9d8420b

Please sign in to comment.