Skip to content

Commit

Permalink
Fixes to RDFa parsing. Implement versa.util.uniquify(). Implement JSONLD writer.
Browse files Browse the repository at this point in the history
  • Loading branch information
uogbuji committed Apr 18, 2018
1 parent 749a7e4 commit 9d8420b
Show file tree
Hide file tree
Showing 7 changed files with 276 additions and 31 deletions.
3 changes: 3 additions & 0 deletions tools/py/driver/memory.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,10 +79,13 @@ def match(self, origin=None, rel=None, target=None, attrs=None, include_ids=Fals
matches = True
if origin and origin != curr_rel[ORIGIN]:
matches = False
continue
if rel and rel != curr_rel[RELATIONSHIP]:
matches = False
continue
if target and target != curr_rel[TARGET]:
matches = False
continue
if attrs:
for k, v in attrs.items():
if k not in curr_rel[ATTRIBUTES] or curr_rel[ATTRIBUTES].get(k) != v:
Expand Down
62 changes: 35 additions & 27 deletions tools/py/reader/rdfalite.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,22 +9,14 @@
#from versa.writer.rdfs import prep as statement_prep
from versa import I, VERSA_BASEIRI, ORIGIN, RELATIONSHIP, TARGET, ATTRIBUTES

from versa.reader import statement_prep, dumb_triples, rdfize, versalinks

try:
from rdflib import BNode as bnode
RDFLIB_AVAILABLE = True
except:
def bnode(object):
pass
RDFLIB_AVAILABLE = False
from . import statement_prep, dumb_triples, rdfize, versalinks
from versa.writer.rdf import mock_bnode, prep, RDF_TYPE, RDF_NS

from amara3 import iri
from amara3.uxml import tree
from amara3.uxml.treeutil import *
from amara3.uxml import html5

RDF_NS = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#'
#SCHEMAORG_NS is proper name. Deprecate the other
SCHEMAORG_NS = SCHEMA_NS = 'http://schema.org/'
FOAF_NS = 'http://xmlns.com/foaf/0.1/'
Expand All @@ -44,8 +36,6 @@ def bnode(object):
if verbose:
logger.setLevel(logging.DEBUG)

BNODE_ROOT = 'urn:amara-bnode:_'

def toversa(htmlsource, model, source_uri):
'''
>>> import urllib
Expand Down Expand Up @@ -85,9 +75,7 @@ def parse(htmlsource, statement_sink, source_uri):
'''
root = html5.parse(htmlsource)

g_bnode_counter = 1
def do_parse(elem, resource, vocab=None, prop=None, prefixes=None):
nonlocal g_bnode_counter
prefixes = prefixes or DEFAULT_PREFIXES.copy()
vocab = elem.xml_attributes.get('vocab', vocab)
#element_satisfied = False
Expand All @@ -114,24 +102,23 @@ def do_parse(elem, resource, vocab=None, prop=None, prefixes=None):
try:
resource = new_resource = I(iri.absolutize(new_resource, source_uri))
except ValueError:
warnings.warn('Invalid URL or anchor {} found in {}'.format(new_resource, source_uri))
warnings.warn('Invalid URL or anchor {} found in {}. Ignored.'.format(new_resource, source_uri))
new_resource = None

typeof_list = elem.xml_attributes.get('typeof')
if typeof_list:
if not new_resource: new_resource = mock_bnode('')
for typeof in typeof_list.split():
typeof = I(iri.absolutize(typeof, vocab))
try:
typeof = I(iri.absolutize(typeof, vocab))
except ValueError:
warnings.warn('Invalid URL or anchor {} found in {}. Ignored'.format(typeof, source_uri))
statement_sink.send((new_resource or resource, RDF_NS + 'type', typeof))

new_prop_list = elem.xml_attributes.get('property')
new_value = None
if new_prop_list:
#FIXME: Should this only be when about is used?
if typeof_list and not new_resource:
new_value = bnode()
#new_value = I(BNODE_ROOT + str(g_bnode_counter))
#g_bnode_counter += 1
elif new_resource:
if new_resource:
new_value = new_resource
for new_prop in new_prop_list.split():
if new_prop == 'about':
Expand All @@ -141,13 +128,34 @@ def do_parse(elem, resource, vocab=None, prop=None, prefixes=None):
if not p in prefixes:
#FIXME: Silent error for now
continue
prop = I(iri.absolutize(local, prefixes[p]))
try:
prop = I(iri.absolutize(local, prefixes[p]))
except ValueError:
warnings.warn('Invalid URL or anchor {} found in {}. Ignored'.format(local, source_uri))
continue
else:
prop = I(iri.absolutize(new_prop, vocab))
value = new_value or elem.xml_attributes.get('content') or elem.xml_attributes.get('href') or elem.xml_attributes.get('src') or elem.xml_value
try:
prop = I(iri.absolutize(new_prop, vocab))
except ValueError:
warnings.warn('Invalid URL or anchor {} found in {}. Ignored'.format(new_prop, source_uri))
continue
href_res = elem.xml_attributes.get('href')
if href_res:
try:
href_res = I(href_res)
except ValueError:
warnings.warn('Invalid URL or anchor {} found in {}. Ignored'.format(href_res, source_uri))
continue
href_src = elem.xml_attributes.get('src')
if href_src:
try:
href_src = I(href_src)
except ValueError:
warnings.warn('Invalid URL or anchor {} found in {}. Ignored'.format(href_src, source_uri))
continue
value = new_value or elem.xml_attributes.get('content') or href_res or href_src or elem.xml_value
statement_sink.send((resource, prop, value))
#print((resource, prop, value))
logging.debug('{}'.format((resource, prop, value)))
#logging.debug('{}'.format((resource, prop, value)))
#element_satisfied = True
if new_value: resource = new_value
for child in elem.xml_children:
Expand Down
4 changes: 2 additions & 2 deletions tools/py/terms.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,6 @@
RDF_NS = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#'
RDFS_NS = 'http://www.w3.org/2000/01/rdf-schema#'

VERSA_TYPE_REL = I(VERSA_BASEIRI + 'type')
RDF_TYPE_REL = I(RDF_NS + 'type')
VERSA_TYPE = VERSA_TYPE_REL = I(VERSA_BASEIRI + 'type')
RDF_TYPE = RDF_TYPE_REL = I(RDF_NS + 'type')

20 changes: 18 additions & 2 deletions tools/py/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -121,13 +121,29 @@ def duplicate_statements(model, oldorigin, neworigin, rfilter=None):
:param newres: origin resource IRI for duplication
:return: None
'''
for link in model.match(oldorigin):
o, r, t, a = link
for o, r, t, a in model.match(oldorigin):
if rfilter is None or rfilter(o, r, t, a):
model.add(I(neworigin), r, t, a)
return


def uniquify(model):
    '''
    Remove all duplicate relationships from a model, in place

    model - Versa model to deduplicate; iterating it yields
        (index, (origin, rel, target, attrs)) pairs, and duplicates are
        dropped via model.remove()
    '''
    encountered = set()
    duplicate_ixs = set()
    for ix, (orig, rel, targ, attrs) in model:
        # Attributes are a mapping, so normalize to a sorted tuple to make
        # the whole link hashable for set membership
        key = (orig, rel, targ) + tuple(sorted(attrs.items()))
        if key in encountered:
            duplicate_ixs.add(ix)
        else:
            encountered.add(key)
    model.remove(duplicate_ixs)
    return


def jsonload(model, fp):
'''
Load Versa model dumped into JSON form, either raw or canonical
Expand Down
29 changes: 29 additions & 0 deletions tools/py/writer/csv.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
"""

import logging
import operator

from amara3 import iri

Expand All @@ -21,6 +22,17 @@ def fromlist(l):
return '|'.join(l)


def omap(m):
    '''
    Create a nested mapping from origin to property to values/attributes
    covering an entire Versa model

    m - Versa model; m.match() yields (origin, property, value, attrs) links

    Returns a dict of the form {origin: {property: [(value, attrs), ...]}}
    '''
    mapping = {}
    for origin, prop, value, attrs in m.match():
        by_prop = mapping.setdefault(origin, {})
        by_prop.setdefault(prop, []).append((value, attrs))
    return mapping


def write(models, csvout, rulelist, write_header, base=None, logger=logging):
'''
models - one or more input Versa models from which output is generated.
Expand All @@ -35,6 +47,23 @@ def write(models, csvout, rulelist, write_header, base=None, logger=logging):

if not isinstance(models, list): models = [models]
for m in models:
mapped = omap(m)
for o, props in mapped.items():
rtypes = list(map(operator.itemgetter(0), props.get(RDF_TYPE_REL, [])))
if not rtypes: continue
#print('RES TYPES:', rtypes)
row = [o, fromlist(rtypes)] + [None] * numprops
for ix, p in enumerate(properties):
v = list(map(operator.itemgetter(0), props.get(p, [])))
if v:
row[ix + 2] = fromlist(v)
csvout.writerow(row)

return


def IGNORE():
if False:
for rid in all_origins(m):
#print(rid, list(m.match(rid, RDF_TYPE_REL)))
rtypes = list(lookup(m, rid, RDF_TYPE_REL))
Expand Down
102 changes: 102 additions & 0 deletions tools/py/writer/jsonld.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
#versa.writer.ntriples
"""
Render a Versa vocab model as JSON-LD
"""

import logging

from amara3 import iri

from versa import I, VERSA_BASEIRI, ORIGIN, RELATIONSHIP, TARGET
from versa.terms import VERSA_BASEIRI, RDF_NS, RDFS_NS, VERSA_TYPE, RDF_TYPE
from versa.driver import memory
from versa import VERSA_BASEIRI
from versa.util import all_origins, lookup

def bind(models, context=None, ignore_oftypes=None, logger=logging):
    '''
    Bind one or more Versa models into a single JSON-LD object structure

    models - Versa model, or list of models, to be rendered
    context - optional JSON-LD context mapping; its '@vocab' value, if present,
        is used to relativize IRIs in the rendered output
    ignore_oftypes - optional collection of type IRIs; resources of any of
        these types are dropped from the output and unlinked from referrers
    logger - logging facility

    Returns a dict of the form {'@context': context, '@graph': [top-level objects]}
    '''
    if not isinstance(models, list): models = [models]
    # FIX: context and ignore_oftypes default to None; guard before using them
    # as a mapping / container so the defaults don't raise
    vocab = context.get('@vocab') if context else None
    ignore_oftypes = ignore_oftypes or []
    non_top_ids = set()
    obj_pool = {} #Mapping from resource id to (object, list of referring ids)
    used_objects = set() #Track multiple instance of docs to prevent data structure recursion
    for m in models:
        #Everything with a type
        for origin in all_origins(m):
            typ = next(lookup(m, origin, RDF_TYPE), None)
            obj, referents = obj_pool.setdefault(origin, ({}, []))
            if vocab and typ:
                typ_rel = iri.relativize(typ, vocab)
                if typ_rel: typ = typ_rel
            if typ: obj['@type'] = typ
            if not origin.startswith('__VERSABLANKNODE__'): obj['@id'] = origin
            for o, r, t, a in m.match(origin):
                if r == RDF_TYPE: continue
                if isinstance(t, I) and o != t:
                    #Target is itself a resource: embed its object on first use,
                    #refer to it by id on subsequent uses (prevents recursion)
                    if vocab:
                        t_rel = iri.relativize(t, vocab)
                        if t_rel: t = t_rel
                    valobj, referents = obj_pool.setdefault(t, ({}, []))
                    if t in used_objects:
                        val = t
                    else:
                        val = valobj
                        if not t.startswith('__VERSABLANKNODE__') and '@id' not in val: val['@id'] = t
                        used_objects.add(t)

                    non_top_ids.add(t) #If something has an object as a value it does not appear at the top
                    referents.append(o)
                else:
                    val = t
                if vocab:
                    r_rel = iri.relativize(r, vocab)
                    if r_rel: r = r_rel
                #Accumulate repeated properties into a list
                if r in obj and isinstance(obj[r], list):
                    obj[r].append(val)
                elif r in obj:
                    obj[r] = [obj[r], val]
                else:
                    obj[r] = val

    #Eliminate objects of types to be ignored
    to_remove = []
    for (oid, (obj, referents)) in obj_pool.items():
        typ = obj.get('@type')
        if vocab and typ: typ = iri.absolutize(typ, vocab)
        if typ in ignore_oftypes:
            to_remove.append(oid)
            for ref in referents:
                #FIX: pool values are (object, referents) pairs; the original
                #bound the whole pair and then called .keys() on a tuple
                refobj, _ = obj_pool[ref]
                for k in list(refobj.keys()):
                    v = refobj[k]
                    if isinstance(v, list) and obj in v:
                        v.remove(obj)
                        if len(v) == 1:
                            refobj[k] = v[0]
                    elif v == obj:
                        del refobj[k]

    for k in to_remove:
        del obj_pool[k]

    #Collapse embedded objects that carry nothing but an @id down to the id itself
    for (oid, (obj, referents)) in obj_pool.items():
        for k, v in obj.items():
            #FIX: guard with isinstance so unsized values (ints, bools) don't raise
            if isinstance(v, dict) and len(v) == 1 and '@id' in v:
                obj[k] = v['@id']

    top_objs = [ obj for (k, (obj, refs)) in obj_pool.items() if k not in non_top_ids ]
    #Eliminate stranded top-level objects with no more than a type
    to_remove = [ obj for obj in top_objs if len(obj) == 1 and '@type' in obj ]
    for obj in to_remove:
        top_objs.remove(obj)
    top = {'@context': context, '@graph': top_objs}
    return top

Loading

0 comments on commit 9d8420b

Please sign in to comment.