Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add limited XML create-template support #215

Open
wants to merge 3 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 10 additions & 1 deletion examples/help/create-template/expected.txt
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
usage: flatten-tool create-template [-h] -s SCHEMA [-f {csv,xlsx,all}]
usage: flatten-tool create-template [-h] [-s SCHEMA] [-f {csv,xlsx,all}]
[-m MAIN_SHEET_NAME] [-o OUTPUT_NAME]
[--rollup] [-r ROOT_ID] [--use-titles]
[--xml]
[--xml-schema [XML_SCHEMA [XML_SCHEMA ...]]]
[--root-list-path ROOT_LIST_PATH]

optional arguments:
-h, --help show this help message and exit
Expand All @@ -22,3 +25,9 @@ optional arguments:
-r ROOT_ID, --root-id ROOT_ID
Root ID of the data format, e.g. ocid for OCDS
--use-titles Convert titles. Requires a schema to be specified.
--xml Use XML as the input format
--xml-schema [XML_SCHEMA [XML_SCHEMA ...]]
Path to one or more XML schemas
--root-list-path ROOT_LIST_PATH
Path of the root list, defaults to main. Needed for
XML template creation only.
11 changes: 8 additions & 3 deletions flattentool/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,23 +5,28 @@
from flattentool.input import FORMATS as INPUT_FORMATS
from flattentool.xml_output import toxml
from flattentool.lib import parse_sheet_configuration
from flattentool.xml_create_template import XMLSchemaParser
import sys
import json
import codecs
from decimal import Decimal
from collections import OrderedDict


def create_template(schema, output_name='template', output_format='all', main_sheet_name='main',
rollup=False, root_id=None, use_titles=False, **_):
def create_template(schema=None, output_name='template', output_format='all', main_sheet_name='main',
rollup=False, root_id=None, use_titles=False,
xml=False, xml_schemas=None, root_list_path=None, **_):
"""
Creates template file(s) from given inputs
This function is built to deal with commandline input and arguments
but to also be called from elsewhere in future

"""

parser = SchemaParser(schema_filename=schema, rollup=rollup, root_id=root_id, use_titles=use_titles)
if xml:
parser = XMLSchemaParser(xml_schemas=xml_schemas, root_list_path=root_list_path)
else:
parser = SchemaParser(schema_filename=schema, rollup=rollup, root_id=root_id, use_titles=use_titles)
parser.parse()

def spreadsheet_output(spreadsheet_output_class, name):
Expand Down
19 changes: 16 additions & 3 deletions flattentool/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,10 +36,10 @@ def create_parser():
parser_create_template = subparsers.add_parser(
'create-template',
help='Create a template from the given schema')
parser_create_template.add_argument(
schema_group = parser_create_template.add_mutually_exclusive_group(required=True)
schema_group.add_argument(
"-s", "--schema",
help="Path to the schema file you want to use to create the template",
required=True)
help="Path to the schema file you want to use to create the template")
parser_create_template.add_argument(
"-f", "--output-format",
help="Type of template you want to create. Defaults to all available options",
Expand All @@ -61,6 +61,19 @@ def create_parser():
"--use-titles",
action='store_true',
help="Convert titles. Requires a schema to be specified.")
parser_create_template.add_argument(
"--xml",
action='store_true',
help="Use XML as the input format")
schema_group.add_argument(
"--xml-schema",
dest='xml_schemas',
metavar='XML_SCHEMA',
nargs='*',
help="Path to one or more XML schemas")
parser_create_template.add_argument(
"--root-list-path",
help="Path of the root list, defaults to main. Needed for XML template creation only.")

parser_flatten = subparsers.add_parser(
'flatten',
Expand Down
10 changes: 4 additions & 6 deletions flattentool/sort_xml.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,7 @@ def get_schema_element(self, tag_name, name_attribute):
return schema_element
return schema_element

def element_loop(self, element, path):
def element_loop(self, element):
"""
Return information about the children of the supplied element.
"""
Expand All @@ -95,14 +95,12 @@ def element_loop(self, element, path):
'xsd:complexType/xsd:all/xsd:element',
namespaces=namespaces)
+ type_elements)
child_tuples = []
for child in children:
a = child.attrib
if 'name' in a:
child_tuples.append((a['name'], child, None, a.get('minOccurs'), a.get('maxOccurs')))
yield a['name'], child, None, a.get('minOccurs'), a.get('maxOccurs')
else:
child_tuples.append((a['ref'], None, child, a.get('minOccurs'), a.get('maxOccurs')))
return child_tuples
yield a['ref'], None, child, a.get('minOccurs'), a.get('maxOccurs')

def create_schema_dict(self, parent_name, parent_element=None):
"""
Expand All @@ -114,7 +112,7 @@ def create_schema_dict(self, parent_name, parent_element=None):

return OrderedDict([
(name, self.create_schema_dict(name, element))
for name, element, _, _, _ in self.element_loop(parent_element, '')])
for name, element, _, _, _ in self.element_loop(parent_element)])


def sort_element(element, schema_subdict):
Expand Down
109 changes: 109 additions & 0 deletions flattentool/xml_create_template.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,109 @@
import sys

from .sort_xml import XMLSchemaWalker, namespaces
from .sheet import Sheet


class XMLSchemaWalkerForTemplate(XMLSchemaWalker):
    """
    Schema walker that produces the flattened column paths for a template.

    Relies on ``get_schema_element`` and ``element_loop`` from the parent
    ``XMLSchemaWalker`` to look up named schema components and to iterate
    over an element's children.
    """

    def attribute_loop(self, element):
        """
        Yield a tuple for each attribute the given element can have.

        The format of the tuple is (name, is_required).

        Attributes are collected from the element's inline complexType, from
        the named complexType referenced by its ``type`` attribute, and from
        any attributeGroup referenced by either of those.
        """
        type_attributes = []
        type_attribute_groups = []
        type_name = element.attrib.get('type')
        if type_name is not None:
            complex_type = self.get_schema_element('complexType', type_name)
            if complex_type is not None:
                type_attributes = (
                    complex_type.findall('xsd:attribute', namespaces=namespaces) +
                    complex_type.findall('xsd:simpleContent/xsd:extension/xsd:attribute', namespaces=namespaces)
                )
                type_attribute_groups = (
                    complex_type.findall('xsd:attributeGroup', namespaces=namespaces) +
                    complex_type.findall('xsd:simpleContent/xsd:extension/xsd:attributeGroup', namespaces=namespaces)
                )

        group_attributes = []
        for attribute_group in (
            element.findall('xsd:complexType/xsd:attributeGroup', namespaces=namespaces) +
            element.findall('xsd:complexType/xsd:simpleContent/xsd:extension/xsd:attributeGroup', namespaces=namespaces) +
            type_attribute_groups
        ):
            group_definition = self.get_schema_element('attributeGroup', attribute_group.attrib['ref'])
            if group_definition is None:
                # Unresolvable group reference: skip it, in the same way an
                # unresolvable attribute reference is tolerated below.
                continue
            group_attributes += group_definition.findall('xsd:attribute', namespaces=namespaces)

        for attribute in (
            element.findall('xsd:complexType/xsd:attribute', namespaces=namespaces) +
            element.findall('xsd:complexType/xsd:simpleContent/xsd:extension/xsd:attribute', namespaces=namespaces) +
            type_attributes + group_attributes
        ):
            # In XSD, ``use`` is set where the attribute is used (i.e. on the
            # reference), so read it before swapping in the definition.
            use = attribute.get('use')
            if 'ref' in attribute.attrib:
                referenced_attribute = self.get_schema_element('attribute', attribute.get('ref'))
                if referenced_attribute is not None:
                    attribute = referenced_attribute
                    if use is None:
                        # Fall back to whatever the definition itself says.
                        use = attribute.get('use')
            yield attribute.get('name') or attribute.get('ref'), use == 'required'

    def has_simple_content(self, element):
        """
        Return True if the element can directly hold a text value.

        This is the case when the named complexType it references has a
        simpleContent child, when its inline complexType has a simpleContent
        child, or when its only child is an annotation element.
        """
        type_name = element.attrib.get('type')
        # we look up the type, and that has a simpleContent child
        if type_name is not None:
            complex_type = self.get_schema_element('complexType', type_name)
            if complex_type is not None and complex_type.findall('xsd:simpleContent', namespaces=namespaces):
                return True
        # or the complexType element here has a simpleContent child
        if element.findall('xsd:complexType/xsd:simpleContent', namespaces=namespaces):
            return True
        # or there is only an annotation element
        return [child.tag for child in element] == ['{http://www.w3.org/2001/XMLSchema}annotation']

    def generate_paths(self, parent_name, parent_element=None, parent_path=''):
        """
        Yield the flattened path of every attribute and simple-content
        element found beneath the given element, recursively.

        parent_name -- name of the schema element to start from
        parent_element -- the schema element itself; looked up by name via
                          get_schema_element if not supplied
        parent_path -- prefix prepended to every path that is yielded

        Attributes are yielded as ``<prefix>@<name>``. Child elements that
        can occur more than once get a ``/0/`` path separator, others ``/``.
        """
        if parent_element is None:
            parent_element = self.get_schema_element('element', parent_name)

        for name, required, in self.attribute_loop(parent_element):
            if name == 'xml:lang':
                # Namespaces not supported yet https://github.com/OpenDataServices/flatten-tool/issues/148
                # And no way to specify two narrative elements anyway https://github.com/OpenDataServices/cove/issues/777
                continue
            yield parent_path + '@' + name

        for name, element, _, minOccurs, maxOccurs in self.element_loop(parent_element):
            if element is None:
                element = self.get_schema_element('element', name)
            path = parent_path + name
            if self.has_simple_content(element):
                yield path
            # maxOccurs defaults to 1 in XML Schema when it is not specified;
            # calling int() on the missing value (None) would raise TypeError.
            if maxOccurs is None:
                maxOccurs = '1'
            if maxOccurs == 'unbounded' or int(maxOccurs) > 1:
                path += '/0/'
            else:
                path += '/'
            for child_path in self.generate_paths(name, element, path):
                yield child_path


class XMLSchemaParser(object):
    """Parse the fields of an XML schema into a flattened structure."""

    def __init__(self, xml_schemas=None, root_list_path=None):
        """
        xml_schemas -- the XML schemas handed to XMLSchemaWalkerForTemplate;
                       defaults to an empty list
        root_list_path -- name of the schema element to start generating
                          paths from; required

        Raises ValueError if root_list_path is not given.
        """
        # Validate with an explicit exception rather than ``assert``, which
        # is stripped when Python runs with -O.
        if root_list_path is None:
            raise ValueError('root_list_path must be specified to create a template from an XML schema')
        self.sub_sheets = {}
        self.main_sheet = Sheet()
        self.sub_sheet_mapping = {}
        # Avoid a mutable default argument shared between instances.
        self.xml_schemas = [] if xml_schemas is None else xml_schemas
        self.root_list_path = root_list_path

    def parse(self):
        """Append every path generated from the schemas to the main sheet."""
        walker = XMLSchemaWalkerForTemplate(self.xml_schemas)
        for path in walker.generate_paths(self.root_list_path):
            self.main_sheet.append(path)