From 3fb4f21063578611653b6da713e94f51e5313eb9 Mon Sep 17 00:00:00 2001 From: alexis mcclimans Date: Sun, 1 Dec 2019 10:37:39 -0800 Subject: [PATCH] add schema module --- README.rst | 68 +++++++++-- setup.py | 2 +- shaclgen/__init__.py | 7 ++ shaclgen/__main__.py | 75 ++++++++---- shaclgen/schema.py | 40 ++++--- shaclgen/shaclgen.py | 279 ++++++++++++++++++++----------------------- 6 files changed, 268 insertions(+), 203 deletions(-) create mode 100644 shaclgen/__init__.py diff --git a/README.rst b/README.rst index 46172c0..133586b 100644 --- a/README.rst +++ b/README.rst @@ -1,25 +1,69 @@ -shaclgen -======== +SHACLGEN +=============== -shaclgen generates shacl templates based on the properties and classes -present in a graph. This module uses the rdflib library for working with -rdf. +Shaclgen takes one or more data graphs or schemas as input and generates a basic shape file based on the classes and properties present. -From the command line: -~~~~~~~~~~~~~~~~~~~~~~ +**Shape files from data graphs:** +By default, the input graph is processed as a data graph (instance triples). Three formats are possible for data graphs: simple, nested, and extended. +- Simple: Each class and each property gets its own NodeShape or PropertyShape. + +- Nested: Property shapes will be nested in NodeShapes if and only if they occur with exactly one class. + +- Extended: Expands nested shapes to create individual property shapes for each property, in addition to nesting them when appropriate. + +**Shape files from ontologies:** +If the input is a schema or ontology, shaclgen generates a nested shape file: properties with rdfs:domain defined in the ontology will be nested within the appropriate NodeShape. rdfs:range definitions for XML and rdfs datatypes are included. + +Support for OWL constructs is planned. 
+ +*************** + + + + +Installation +*************** +Using pip: +:: + + pip install shaclgen + +From source: + +https://github.com/alexiskeely/shaclgen + + +Command line use: +***************** :: - $ shaclgen [uri to data] [serialization] + $ shaclgen graph [optional arguments] + +Example usage: +:: -Supported serializations include: - ``ttl`` for turtle - ``xml`` for -rdf/xml - ``nt`` for ntriples + $ shaclgen https://www.lib.washington.edu/static/public/cams/data/datasets/uwSemWebParts/webResource-1-0-0.nt -example: + +Command line arguments: :: - $ shaclgen https://www.lib.washington.edu/static/public/cams/data/datasets/uwSemWebParts/aggregation-1-0-0.ttl ttl + positional arguments: + graph The data graph(s). + +:: + + optional arguments: + -h, --help show this help message and exit + -nf, --nested generates a nested shape file + -ef, --extended generates an expanded shape file + -o, --ontology input file(s) or URL(s) is a schema or ontology + -s SERIALIZATION, --serialization SERIALIZATION + result graph serialization, default is turtle + +*************** This project is still in development. Comments, questions, and issues are welcome! 
diff --git a/setup.py b/setup.py index 9ea8172..bb7ef00 100644 --- a/setup.py +++ b/setup.py @@ -10,7 +10,7 @@ setup( name = 'shaclgen', - version = '0.1.3', + version = '0.1.4', packages = ['shaclgen'], description='Shacl graph generator', long_description=l_description, diff --git a/shaclgen/__init__.py b/shaclgen/__init__.py new file mode 100644 index 0000000..738b8d6 --- /dev/null +++ b/shaclgen/__init__.py @@ -0,0 +1,7 @@ +# -*- coding: utf-8 -*- +""" +Created on Sat Nov 30 13:30:18 2019 + +@author: alexi +""" + diff --git a/shaclgen/__main__.py b/shaclgen/__main__.py index 4fd8e78..54ac0ba 100644 --- a/shaclgen/__main__.py +++ b/shaclgen/__main__.py @@ -2,38 +2,69 @@ #!/usr/bin/env python #%% -from .shaclgen import generate_groups, generate_triples, generate_shacl,generate_merged +from .shaclgen import data_graph +from .schema import schema + import argparse +from argparse import RawDescriptionHelpFormatter +from argparse import RawTextHelpFormatter -parser = argparse.ArgumentParser(description=""" - Shacl file generator. - Shaclgen will create a simple shape file by default: - every class and property will get their own shape. - Nested and extended shape files are possible.""") +parser = argparse.ArgumentParser( + formatter_class=RawDescriptionHelpFormatter, + description=(""" + ---------------------------Shaclgen------------------------------------------ + + Shaclgen takes either a data graph(s) or schema(s) as input and generates + a basic shape file based on the classes and properties present. + + Shape files from data graphs: + By default, the input graph is processed as a data graph (instance triples). + Three formats are possible for data grapghs: simple, nested, and extended. + + Simple: Each class and property generate individual Node- and PropertyShapes + Nested: Property shapes will be nested in nodeshapes iif + they occur with one class. 
+ Extended: Expands nested shapes to create individual property shapes for each + property, in addition to nesting them when appropriate. + + Shape files from ontologies: + If the input is a schema or ontology (-o), shaclgen will generate + a nested shape file: properties with rdfs:domain defined in the ontology + will be nested within the appropriate NodeShape. rdfs:range definitions + for XML and rdfs datatypes are included.""")) -parser.add_argument("graph", nargs='+',type=str, help="the data graph(s)") -parser.add_argument("serialization", type=str,help="the data graph rdf serialization") +parser.add_argument("graph", nargs='+',type=str, help="The data graph(s).") + group = parser.add_mutually_exclusive_group() -group.add_argument("-nf", "--nested", action="store_true", help='Property shapes will be nested in nodeshapes iif they occur with one class.') -group.add_argument("-ef", "--extended", action="store_true", help='Expands nested shapes to create individual property shapes for each property, in addition to nesting them when appropriate.') +group.add_argument("-nf", "--nested", action="store_true", help='generates a nested shape file') +group.add_argument("-ef", "--extended", action="store_true", help='generates an expanded shape file') +parser.add_argument("-o", "--ontology", action="store_true", help='input file(s) or URL(s) is a schema or ontology') +parser.add_argument("-s", "--serialization", help='result graph serialization, default is turtle') args = parser.parse_args() -# -#print(args.graph[0]) -#print(args.serialization) + def main(): - output = generate_groups(args.graph, args.serialization) - if args.nested: - triples = generate_triples(output, 'nf') - elif args.extended: - triples = generate_triples(output, 'ef') + if args.ontology: + g = schema(args.graph) + if args.serialization: + print('...generating schema shape file...\n') + g.gen_graph(args.serialization) + else: + print('...generating schema shape file...\n') + g.gen_graph('turtle') else: 
- triples = generate_triples(output, 'sf') + kwargs = {'serial': 'turtle'} + g = data_graph(args.graph) + if args.nested: + kwargs['graph_format'] = 'nf' + elif args.extended: + kwargs['graph_format'] = 'ef' + if args.serialization: + kwargs['serial'] = args.serialization + print('...generating data shape file...\n') + g.gen_graph(**kwargs) - graph = generate_shacl(triples) - print(graph) -# if __name__ == '__main__': main() diff --git a/shaclgen/schema.py b/shaclgen/schema.py index d90b0d7..40ff239 100644 --- a/shaclgen/schema.py +++ b/shaclgen/schema.py @@ -7,9 +7,10 @@ import collections class schema(): - def __init__(self, graph=None,): + def __init__(self, args:list): self.G = Graph() - self.G.load(graph,format=guess_format(graph)) + for graph in args: + self.G.parse(graph,format=guess_format(graph)) self.CLASSES = collections.OrderedDict() self.PROPS = collections.OrderedDict() self.REST = collections.OrderedDict() @@ -39,7 +40,9 @@ def extract_props(self): #gather property values + count = 0 for prop in self.PROPS.keys(): + count = count +1 s = URIRef(prop) self.PROPS[prop]['domain']= None self.PROPS[prop]['range']= None @@ -57,7 +60,7 @@ def extract_props(self): self.PROPS[prop]['e_prop'] = o for o in self.G.objects(subject=s, predicate=RDFS.label): - self.PROPS[prop]['label'] = o + self.PROPS[prop]['label'] = self.gen_shape_labels(prop)+str(count) @@ -75,11 +78,12 @@ def extract_classes(self): classes.append(s) else: pass - + count = 0 for c in sorted(classes): self.CLASSES[c] = {} - for c in self.CLASSES.keys(): - self.CLASSES[c]['label'] = self.gen_shape_labels(c) + for c in self.CLASSES.keys(): + count = count +1 + self.CLASSES[c]['label'] = self.gen_shape_labels(c)+str(count) def extract_restrictions(self): # does not handle nested restrictions within other class descriptions @@ -132,29 +136,30 @@ def gen_shape_labels(self, URI): label = URI.split("#")[-1] else: label = URI.split("/")[-1] - return label + return label+'_' - def gen_graph(self): + def 
gen_graph(self, serial='turtle'): + self.extract_props() self.extract_classes() self.extract_restrictions() ng = Graph() SH = Namespace('http://www.w3.org/ns/shacl#') - ng.bind('SH', SH) + ng.bind('sh', SH) EX = Namespace('http://www.example.org/') - ng.bind('EX', EX) + ng.bind('ex', EX) # add class Node Shapes for c in self.CLASSES.keys(): - label = self.gen_shape_labels(c)+'_ClassShape' + label = self.CLASSES[c]['label'] ng.add((EX[label], RDF.type, SH.NodeShape)) ng.add((EX[label], SH.targetClass, c)) for p in self.PROPS.keys(): if self.PROPS[p]['domain'] is not None: blank = BNode() if self.PROPS[p]['domain'] in self.CLASSES.keys(): - label = self.gen_shape_labels(self.PROPS[p]['domain'])+'_ClassShape' + label = self.CLASSES[self.PROPS[p]['domain']]['label'] ng.add((EX[label], SH.property, blank)) ng.add((blank, SH.path, p)) if self.PROPS[p]['range'] is not None: @@ -177,7 +182,7 @@ def gen_graph(self): else: pass else: - label = self.gen_shape_labels(self.PROPS[p])+'_PropShape' + label = self.PROPS[p]['label'] ng.add((EX[label], RDF.type, SH.NodeShape)) ng.add((EX[label], SH.targetSubjectsOf, p)) ng.add((EX[label], SH.nodeKind, SH.BlankNodeOrIRI)) @@ -191,7 +196,7 @@ def gen_graph(self): ng.add((blank, SH['class'], rang )) else: blank = BNode() - label = self.gen_shape_labels(p)+'_PropShape' + label = self.PROPS[p]['label'] ng.add((EX[label], RDF.type, SH.NodeShape)) ng.add((EX[label], SH.targetSubjectsOf, p)) ng.add((EX[label], SH.nodeKind, SH.BlankNodeOrIRI)) @@ -204,11 +209,8 @@ def gen_graph(self): else: ng.add((blank, SH['class'], rang )) - print(ng.serialize(format='turtle').decode()) - return ng + print(ng.serialize(format=serial).decode()) - def save_graph(self, path): - ng = self.gen_graph() - ng.serialize(path, format='turtle') \ No newline at end of file + \ No newline at end of file diff --git a/shaclgen/shaclgen.py b/shaclgen/shaclgen.py index cba3e2e..4524f22 100644 --- a/shaclgen/shaclgen.py +++ b/shaclgen/shaclgen.py @@ -1,168 +1,149 @@ 
#!/usr/bin/env python -from rdflib import Graph, Namespace, XSD, RDF, URIRef + +from rdflib import Graph, Namespace, URIRef, BNode import rdflib from collections import Counter +from rdflib.util import guess_format +import collections +from rdflib.namespace import XSD, RDF, OWL, RDFS -def generate_merged(input_URIS, serialization): - g = rdflib.Graph() - for x in input_URIS: - g.parse(x, format=serialization) - - class_list = [] - for s,p,o in g.triples( (None, RDF.type, None) ): - class_list.append(o) - class_list = sorted(list(set(class_list))) +class data_graph(): + def __init__(self, args:list): + self.G = rdflib.Graph() + + for graph in args: + self.G.parse(graph,format=guess_format(graph)) + + self.CLASSES = collections.OrderedDict() + self.PROPS = collections.OrderedDict() + self.OUT = [] + + + def extract_pairs(self): + + classes = [] + for s,p,o in self.G.triples( (None, RDF.type, None) ): + classes.append(o) + + classes = sorted(list(set(classes))) - tupes = [] - for k in class_list: - for s,p,o in g.triples((None, RDF.type, k)): - for s,p1,o1 in g.triples((s, None, None)): - tupes.append((k,p1)) - tupes = list(set(tupes)) + tupes = [] + count = 0 + for clas in classes: + count = count +1 + for s,p,o in self.G.triples((None, RDF.type, clas)): + for s,p1,o1 in self.G.triples((s, None, None)): + tupes.append((count,clas,p1)) + + tupes = list(set(tupes)) - tupes = [x for x in tupes if x[1] != rdflib.term.URIRef('http://www.w3.org/1999/02/22-rdf-syntax-ns#type')] + tupes = [x for x in tupes if x[2] != rdflib.term.URIRef('http://www.w3.org/1999/02/22-rdf-syntax-ns#type')] - c = Counter(x[1] for x in tupes) - output = [x + ('unique',) if c[x[1]] == 1 else x + ('repeat',) for x in tupes] - - return output + c = Counter(x[2] for x in tupes) + self.OUT = [x + ('unique',) if c[x[2]] == 1 else x + ('repeat',) for x in tupes] + print(self.OUT) + def gen_shape_labels(self, URI): + if '#' in URI: + label = URI.split("#")[-1] + else: + label = URI.split("/")[-1] + 
return label+'_' + def extract_classes(self): + classes = [] + for s,p,o in self.G.triples((None, RDF.type, None)): + classes.append(o) + for c in sorted(classes): + self.CLASSES[c] = {} + count = 0 + for clas in self.CLASSES.keys(): + count = count +1 + self.CLASSES[clas]['label'] = self.gen_shape_labels(clas)+str(count) -def generate_groups(input_URI, serialization): - if len(input_URI) > 1: - g = rdflib.Graph() - for x in input_URI: - g.parse(x, format=serialization) - else: - g = rdflib.Graph() - g.load(input_URI[0], format=serialization) - - class_list = [] - for s,p,o in g.triples( (None, RDF.type, None) ): - class_list.append(o) - class_list = sorted(list(set(class_list))) - - tupes = [] - for k in class_list: - for s,p,o in g.triples((None, RDF.type, k)): - for s,p1,o1 in g.triples((s, None, None)): - tupes.append((k,p1)) - tupes = list(set(tupes)) - - tupes = [x for x in tupes if x[1] != rdflib.term.URIRef('http://www.w3.org/1999/02/22-rdf-syntax-ns#type')] - c = Counter(x[1] for x in tupes) - output = [x + ('unique',) if c[x[1]] == 1 else x + ('repeat',) for x in tupes] - - return output + + def extract_props(self): + self.extract_classes() + props = [] + for predicate in self.G.predicates(object=None, subject=None): + props.append(predicate) + props = [x for x in props if x != rdflib.term.URIRef('http://www.w3.org/1999/02/22-rdf-syntax-ns#type')] + + for p in sorted(props): + self.PROPS[p] = {} + + count = 0 + for p in self.PROPS.keys(): + count = count +1 + self.PROPS[p]['classes']=[] + + self.PROPS[p]['label'] = self.gen_shape_labels(p)+str(count) + prop_classes = [] + for sub,pred,obj in self.G.triples((None, p, None)): + for sub, pred1, obj1 in self.G.triples( (sub, RDF.type, None) ): + prop_classes.append(obj1) -#generate the triples -def generate_triples(output, graph_format): - triples = '' - counter = 0 - if graph_format == 'nf': - for statement in output: - counter = counter + 1 - if '#' in statement[0]: - name = statement[0].split("#")[-1] - 
else: - name = statement[0].split("/")[-1] - #gen node shape - node_triples = (f""" - . - <{statement[0]}> .""") - if statement[2] == 'unique': - if '#' in statement[1]: - prop_name = statement[1].split("#")[-1] - else: - prop_name = statement[1].split("/")[-1] - prop_triples = (f""" - _:{counter} . - _:{counter} <{statement[1]}> . - _:{counter} "{prop_name}" . """) - else: - if '#' in statement[1]: - prop_name = statement[1].split("#")[-1] - else: - prop_name = statement[1].split("/")[-1] - prop_triples = (f""" - . - . - <{statement[1]}> .""") - triples = triples + node_triples + prop_triples - elif graph_format =='ef': - for statement in output: - counter = counter + 1 - if '#' in statement[0]: - name = statement[0].split("#")[-1] + uris = [] + + [uris.append(x) for x in prop_classes if x not in uris] + + for x in uris: + self.PROPS[p]['classes'].append(self.CLASSES[x]['label']) + + if len(self.PROPS[p]['classes']) == 1: + self.PROPS[p]['type'] = 'unique' + else: - name = statement[0].split("/")[-1] - #gen node shape - node_triples = (f""" - . - <{statement[0]}> .""") - if statement[2] == 'unique': - if '#' in statement[1]: - prop_name = statement[1].split("#")[-1] - else: - prop_name = statement[1].split("/")[-1] - prop_triples = (f""" - _:{counter} . - _:{counter} <{statement[1]}> . - _:{counter} "{prop_name}" . + self.PROPS[p]['type'] = 'repeat' - . - <{statement[1]}> .""") - else: - if '#' in statement[1]: - prop_name = statement[1].split("#")[-1] - else: - prop_name = statement[1].split("/")[-1] - prop_triples = (f""" - . - . - <{statement[1]}> .""") - triples = triples + node_triples + prop_triples - else: - for statement in output: - counter = counter + 1 - if '#' in statement[0]: - name = statement[0].split("#")[-1] - else: - name = statement[0].split("/")[-1] - #gen node shape - node_triples = (f""" - . 
- <{statement[0]}> .""") - if '#' in statement[1]: - prop_name = statement[1].split("#")[-1] - else: - prop_name = statement[1].split("/")[-1] - prop_triples = (f""" - . - . - <{statement[1]}> .""") - triples = triples + node_triples + prop_triples - return triples - - -#generate the graph -def generate_shacl(triples): - g = rdflib.Graph() - - ### bind namespaces to graph, adding in example for default shape namespaces and shacl for shacl props/classes. - #for key, uri in args: - # g.bind(key, URIRef(uri)) - g.bind('ex', URIRef('http://example.org/')) - g.bind('sh', URIRef('http://www.w3.org/ns/shacl#')) - - g.parse(data= triples, format='nt') - shapes = g.serialize(format='turtle') - shapes = shapes.decode("utf-8") - return shapes + def gen_graph(self, serial='turtle', graph_format=None): + self.extract_props() + + ng = rdflib.Graph() + + SH = Namespace('http://www.w3.org/ns/shacl#') + ng.bind('sh', SH) + EX = Namespace('http://www.example.org/') + ng.bind('ex', EX) + + + for c in self.CLASSES.keys(): + label = self.CLASSES[c]['label'] + ng.add((EX[label],RDF.type, SH.NodeShape )) + ng.add( (EX[label], SH.targetClass, c) ) + ng.add( (EX[label], SH.nodeKind, SH.BlankNodeOrIRI) ) + + for p in self.PROPS.keys(): + if graph_format == 'nf' or graph_format == 'ef': + + if self.PROPS[p]['type'] == 'unique': + blank = BNode() + ng.add( (EX[self.PROPS[p]['classes'][0]], SH.property, blank) ) + ng.add( (blank, SH.path, p)) + + if graph_format =='ef': + ng.add( (EX[self.PROPS[p]['label']], RDF.type, SH.PropertyShape) ) + ng.add( (EX[self.PROPS[p]['label']], SH.path, p) ) + else: + pass + + else: + ng.add( (EX[self.PROPS[p]['classes'][0]], SH.property, EX[self.PROPS[p]['label']]) ) + ng.add( (EX[self.PROPS[p]['label']], RDF.type, SH.PropertyShape) ) + ng.add( (EX[self.PROPS[p]['label']], SH.path, p) ) + + else: + ng.add( (EX[self.PROPS[p]['label']], RDF.type, SH.PropertyShape) ) + ng.add( (EX[self.PROPS[p]['label']], SH.path, p) ) + ng.add( (EX[self.PROPS[p]['classes'][0]], 
SH.property, EX[self.PROPS[p]['label']]) ) + ng.add( (EX[self.PROPS[p]['label']], RDF.type, SH.PropertyShape) ) + ng.add( (EX[self.PROPS[p]['label']], SH.path, p) ) + + print(ng.serialize(format=serial).decode()) +