From c14740a5ecf48e73cf0aadaee7aaae11498f17d5 Mon Sep 17 00:00:00 2001 From: Uche Ogbuji Date: Tue, 6 Dec 2022 07:47:25 -0700 Subject: [PATCH 1/3] Tweak parser to support element names which coincide with XPath operators (notably div) --- pylib/uxml/uxpath/lexrules.py | 12 +++++++----- pylib/uxml/uxpath/parserules.py | 19 ++++++++++++------- pylib/uxml/version.py | 2 +- 3 files changed, 20 insertions(+), 13 deletions(-) diff --git a/pylib/uxml/uxpath/lexrules.py b/pylib/uxml/uxpath/lexrules.py index c53b654..199a655 100644 --- a/pylib/uxml/uxpath/lexrules.py +++ b/pylib/uxml/uxpath/lexrules.py @@ -8,11 +8,12 @@ from ply.lex import TOKEN +# Note: element names can coincide with XPath operators (notably div) OPERATOR_NAMES = { - 'or': 'OR_OP', - 'and': 'AND_OP', - 'div': 'DIV_OP', - 'mod': 'MOD_OP', + 'or': 'LOGICAL_OP', + 'and': 'LOGICAL_OP', + 'div': 'DIVMOD_OP', + 'mod': 'DIVMOD_OP', } tokens = [ @@ -40,7 +41,7 @@ 'NODETEXTTEST', 'NAME', 'DOLLAR', - ] + list(OPERATOR_NAMES.values()) + ] + list(set(OPERATOR_NAMES.values())) t_PATH_SEP = r'/' t_ABBREV_PATH_SEP = r'//' @@ -80,6 +81,7 @@ def t_NAME(t): # Check for operators t.type = OPERATOR_NAMES.get(t.value, 'NAME') + # t.type = 'NAME' return t def t_LITERAL(t): diff --git a/pylib/uxml/uxpath/parserules.py b/pylib/uxml/uxpath/parserules.py index 89f449c..bec919d 100644 --- a/pylib/uxml/uxpath/parserules.py +++ b/pylib/uxml/uxpath/parserules.py @@ -9,12 +9,11 @@ from .lexrules import tokens precedence = ( - ('left', 'OR_OP'), - ('left', 'AND_OP'), + ('left', 'LOGICAL_OP'), ('left', 'EQUAL_OP'), ('left', 'REL_OP'), ('left', 'PLUS_OP', 'MINUS_OP'), - ('left', 'MULT_OP', 'DIV_OP', 'MOD_OP'), + ('left', 'MULT_OP', 'DIVMOD_OP'), ('right', 'UMINUS_OP'), ('left', 'UNION_OP'), ) @@ -25,15 +24,13 @@ def p_expr_boolean(p): """ - Expr : Expr OR_OP Expr - | Expr AND_OP Expr + Expr : Expr LOGICAL_OP Expr | Expr EQUAL_OP Expr | Expr REL_OP Expr | Expr PLUS_OP Expr | Expr MINUS_OP Expr | Expr MULT_OP Expr - | Expr DIV_OP Expr - | 
Expr MOD_OP Expr + | Expr DIVMOD_OP Expr """ p[0] = ast.BinaryExpression(p[1], p[2], p[3]) @@ -215,9 +212,13 @@ def p_name_test_star(p): """ p[0] = ast.NameTest('*') +# Note: element names which coincide with XPath operators (notably div) +# will come through as operator tokens, hence the non-straightforward production def p_name_test_name(p): """ NameTest : NAME + | LOGICAL_OP + | DIVMOD_OP """ p[0] = ast.NameTest(p[1]) @@ -261,6 +262,8 @@ def p_predicate(p): def p_variable_reference(p): """ VariableReference : DOLLAR NAME + | DOLLAR LOGICAL_OP + | DOLLAR DIVMOD_OP """ p[0] = ast.VariableReference(p[2]) @@ -282,6 +285,8 @@ def p_number(p): def p_function_call(p): """ FunctionCall : NAME FormalArguments + | LOGICAL_OP FormalArguments + | DIVMOD_OP FormalArguments """ #Hacking around the ambiguity between node type test & function call if p[1] in ('node', 'text'): diff --git a/pylib/uxml/version.py b/pylib/uxml/version.py index 7c23e44..cbf1486 100644 --- a/pylib/uxml/version.py +++ b/pylib/uxml/version.py @@ -1,2 +1,2 @@ # http://legacy.python.org/dev/peps/pep-0440/ -version_info = ('3', '2', '1') +version_info = ('3', '2', '2') From 0eb3db847fd2ffe70d6a6f3b91bbee2e435b6abf Mon Sep 17 00:00:00 2001 From: Uche Ogbuji Date: Thu, 8 Dec 2022 11:17:45 -0700 Subject: [PATCH 2/3] Implement amara3.uxml.html5.parse_lax_xml and add --lax command line option to microx --- README.md | 9 ++++----- exec/microx | 18 +++++++++++++----- pylib/uxml/html5.py | 38 ++++++++++++++++++++++++++++++++----- pylib/uxml/uxpath/README.md | 2 +- 4 files changed, 51 insertions(+), 16 deletions(-) diff --git a/README.md b/README.md index 8c6294c..5ecc50d 100644 --- a/README.md +++ b/README.md @@ -1,11 +1,8 @@ # amara3-xml -[MicroXML](http://www.w3.org/community/microxml/) component of Amara3 project, which contains a variety of data processing tools. - -Data processing library built on Python 3 and . This module adds the MicroXML support, and adaptation to classic XML. 
+[MicroXML](http://www.w3.org/community/microxml/) component of Amara3 project, which contains a variety of data processing tools. This module adds XML support based on the [MicroXML spec](https://dvcs.w3.org/hg/microxml/raw-file/tip/spec/microxml.html). [Uche Ogbuji](http://uche.ogbuji.net) < uche@ogbuji.net > -More discussion, etc: https://groups.google.com/forum/#!forum/akara ## Install @@ -17,7 +14,9 @@ pip install amara3-xml ## Use -Main focus is MicroXML, rather than full XML. However because most of the XML-like data you'll be dealing with is XML 1.0, Amara provides capabilities to parse legacy XML and reduce it to MicroXML. In many cases the biggest implication of this is that namespace information is stripped. As long as you know what you're doing you can get pretty far by ignoring this, but make sure you know what you're doing. +A good way to experiment with amara3-xml is the `microx` command line. + +Main focus is MicroXML, rather than full XML. However because most of the XML-like data you'll be dealing with is XML 1.0, Amara provides capabilities to parse legacy XML (and even HTML) and reduce it to MicroXML. In many cases the biggest implication of this is that namespace information is stripped. As long as you know what you're doing you can get pretty far by ignoring this, but make sure you know what you're doing. 
from amara3.uxml import xml diff --git a/exec/microx b/exec/microx index 5a45195..7b3e711 100755 --- a/exec/microx +++ b/exec/microx @@ -86,6 +86,8 @@ You can load XML or MicroXML from the Web rather than your file system $ microx --match=a --foreach="@href" http://www.w3.org/2000/07/8378/xhtml/media-types/test4.xhtml +The --html flag allows parsing HTML, and the --lax flag is lenient, working with even non-well-formed XML + """ import re @@ -125,7 +127,7 @@ def xpath_to(node, show_attrs): def run(command_name, command_detail, sources=None, foreach=None, partition=None, - limit=None, out=None, parse_html=False, show_attrs=None, verbose=False): + limit=None, out=None, parse_html=False, parse_lax=False, show_attrs=None, verbose=False): ''' See the command line help ''' @@ -194,14 +196,16 @@ def run(command_name, command_detail, sources=None, foreach=None, partition=None for source in sources: if partition: ts = xmliter.sender(('**', partition), sink()) - sequencer = ts + # sequencer = ts ts.parse_file(source) else: if parse_html: root = html5.parse(source.read()) + elif parse_lax: + root = html5.parse_lax_xml(source.read()) else: - #FIXME: Implement incremental parsing, e.g. by requiring conversion to MicroXML first then using that parser + # FIXME: Implement incremental parsing, e.g. by requiring conversion to MicroXML first then using that parser root = P(source.read()) process_partition(root) @@ -233,6 +237,8 @@ if __name__ == '__main__': help='Show verbose error messages') parser.add_argument('--html', action='store_true', help='Parse input sources as HTML') + parser.add_argument('--lax', action='store_true', + help='Parse input sources in lax mode (e.g. 
lenient to non-well-formed XML)') parser.add_argument('--find-text', metavar="TEXT", help='List the various XPaths that lead to a node containing the specified text') parser.add_argument('--show-attrs', metavar="ATTRIB_NAMELIST", @@ -261,7 +267,9 @@ if __name__ == '__main__': show_attrs = args.show_attrs.split(',') command_name, command_detail = commands[0] - run(command_name, command_detail, sources=sources, foreach=args.foreach, partition=args.partition, - limit=args.limit, out=args.out, parse_html=args.html, show_attrs=show_attrs, verbose=args.verbose) + run(command_name, command_detail, sources=sources, foreach=args.foreach, + partition=args.partition, limit=args.limit, out=args.out, + parse_html=args.html, parse_lax=args.lax, show_attrs=show_attrs, + verbose=args.verbose) for f in sources: f.close() args.out.close() diff --git a/pylib/uxml/html5.py b/pylib/uxml/html5.py index eeef74a..41bf638 100644 --- a/pylib/uxml/html5.py +++ b/pylib/uxml/html5.py @@ -249,20 +249,48 @@ def parse(source, prefixes=None, model=None, encoding=None, use_xhtml_ns=False): >>> with urllib.request.urlopen('http://uche.ogbuji.net/') as response: ... html5.parse(response) - - #Warning: if you pass a string, you must make sure it's a byte string, not a Unicode object. You might also want to wrap it with amara.lib.inputsource.text if it's not obviously XML or HTML (for example it could be confused with a file name) + Warning: if you pass a string, make sure it's a byte string, not a Unicode object. + You might also want to wrap it with amara.lib.inputsource.text + if it's not obviously XML or HTML (to avoid e.g. 
its getting confused + for a file name) ''' def get_tree_instance(namespaceHTMLElements, use_xhtml_ns=use_xhtml_ns): #use_xhtml_ns is a boolean, whether or not to use http://www.w3.org/1999/xhtml return treebuilder(use_xhtml_ns) parser = html5lib.HTMLParser(tree=get_tree_instance) - #doc = parser.parse(inputsource(source, None).stream, encoding=encoding) - #doc = parser.parse(source, encoding=encoding) + # doc = parser.parse(inputsource(source, None).stream, encoding=encoding) + # doc = parser.parse(source, encoding=encoding) doc = parser.parse(source) first_element = next((e for e in doc.root_nodes if isinstance(e, element)), None) return first_element +def parse_lax_xml(source, prefixes=None, model=None, encoding=None): + ''' + Parse an input XML-like source with HTML text into an Amara 3 tree + + >>> from amara3.uxml import html5 + >>> a_elem = html5.parse_lax_xml('Spam') + >>> a_elem.xml_encode() + 'Spam' + + Warning: if you pass a string, make sure it's a byte string, not a Unicode object. + You might also want to wrap it with amara.lib.inputsource.text + if it's not obviously XML or HTML (to avoid e.g. its getting confused + for a file name) + Do not use this method to parse HTML, even tagsoup HTML. Use html5.parse instead + ''' + false_top = parse(source, prefixes=prefixes, model=model, encoding=encoding, + use_xhtml_ns=False) + top = false_top.xml_children[1].xml_children[0] + # Detach the bit we want from the wrapper + del false_top.xml_children[1].xml_children[0] + top._xml_parent = None + try: + return top + except IndexError: + raise ValueError('Unable to process input even as tag soup XML') + def markup_fragment(source, encoding=None): ''' Parse a fragment of markup in HTML mode, and return a tree node @@ -273,7 +301,7 @@ def markup_fragment(source, encoding=None): from amara.bindery import html doc = html.markup_fragment(inputsource.text('XXX

Spam!

Eggs!YYY')) - See also: http://wiki.xml3k.org/Amara2/Tagsoup + See also: http://wiki.xml3k.org/Amara2/Tagsoup [TODO: Page defunct - restore it] ''' doc = parse(source, encoding=encoding) frag = doc.html.body diff --git a/pylib/uxml/uxpath/README.md b/pylib/uxml/uxpath/README.md index e34f7a7..1b0f253 100644 --- a/pylib/uxml/uxpath/README.md +++ b/pylib/uxml/uxpath/README.md @@ -6,7 +6,7 @@ (For non-Python folks) -* Install [Python 3.4+ or later](https://www.python.org/downloads/) +* Install [Python 3.5+ or later](https://www.python.org/downloads/) * Download the [amara3-xml](https://pypi.python.org/pypi/amara3-xml) package and install (`python setup.py install`) (For Python folks, you should be using pip: `pip install amara3-xml`) From e992a095f44b911ef0c3fcb1f1a7e4ce97bd2b2a Mon Sep 17 00:00:00 2001 From: Uche Ogbuji Date: Wed, 14 Dec 2022 15:24:21 -0700 Subject: [PATCH 3/3] Implement amara3.uxml.tree.xml_remove(). Update test cases. Bump version. --- pylib/uxml/html5.py | 7 ++++--- pylib/uxml/tree.py | 14 ++++++++++++++ pylib/uxml/version.py | 2 +- setup.py | 2 +- test/uxml/test_html5.py | 9 +++++++++ test/uxml/test_tree.py | 15 +++++++++++++++ 6 files changed, 44 insertions(+), 5 deletions(-) diff --git a/pylib/uxml/html5.py b/pylib/uxml/html5.py index 41bf638..7fe4263 100644 --- a/pylib/uxml/html5.py +++ b/pylib/uxml/html5.py @@ -281,16 +281,17 @@ def parse_lax_xml(source, prefixes=None, model=None, encoding=None): Do not use this method to parse HTML, even tagsoup HTML. 
Use html5.parse instead ''' false_top = parse(source, prefixes=prefixes, model=model, encoding=encoding, - use_xhtml_ns=False) + use_xhtml_ns=False) top = false_top.xml_children[1].xml_children[0] # Detach the bit we want from the wrapper - del false_top.xml_children[1].xml_children[0] - top._xml_parent = None + false_top.xml_children[1].xml_remove(top) + del false_top # Ensure cleanup try: return top except IndexError: raise ValueError('Unable to process input even as tag soup XML') + def markup_fragment(source, encoding=None): ''' Parse a fragment of markup in HTML mode, and return a tree node diff --git a/pylib/uxml/tree.py b/pylib/uxml/tree.py index 88cb080..f2815f4 100644 --- a/pylib/uxml/tree.py +++ b/pylib/uxml/tree.py @@ -110,6 +110,20 @@ def xml_insert(self, child, index=-1): self.xml_children.insert(index, child) return + def xml_remove(self, child: node): + ''' + Remove a child element. Does not destroy the child element, which becomes + the new root of its own tree + + child - the child to remove + ''' + if child in self.xml_children: + child._xml_parent = None + self.xml_children.remove(child) + else: + raise ValueError(f'Element {self} has no child {child}') + return + def __repr__(self): return u'{{uxml.element ({0}) "{1}" with {2} children}}'.format(hash(self), self.xml_name, len(self.xml_children)) diff --git a/pylib/uxml/version.py b/pylib/uxml/version.py index cbf1486..1dd3b9c 100644 --- a/pylib/uxml/version.py +++ b/pylib/uxml/version.py @@ -1,2 +1,2 @@ # http://legacy.python.org/dev/peps/pep-0440/ -version_info = ('3', '2', '2') +version_info = ('3', '3', '0') diff --git a/setup.py b/setup.py index 8304352..372bdc1 100755 --- a/setup.py +++ b/setup.py @@ -12,7 +12,7 @@ Reluctantly use setuptools for now to get install_requires & long_description_content_type $ python -c "import amara3; import amara3.iri; import amara3.uxml; import amara3.uxml.version; print(amara3.uxml.version.version_info)" -('3', '0', '1') +('3', '3', '0') ''' import sys diff 
--git a/test/uxml/test_html5.py b/test/uxml/test_html5.py index 7d108c9..a181b86 100644 --- a/test/uxml/test_html5.py +++ b/test/uxml/test_html5.py @@ -43,5 +43,14 @@ def test_xml_encode_with_comment(): assert root.xml_encode() == DOC2_NORMALIZED +DOC_NON_WF_XML = 'Spam' +DOC_NON_WF_XML_NORM = 'Spam' + +def test_non_wf_xml_parse(): + root = html5.parse_lax_xml(io.StringIO(DOC_NON_WF_XML)) + # Round trip + assert root.xml_encode() == DOC_NON_WF_XML_NORM + + if __name__ == '__main__': raise SystemExit("Run with py.test") diff --git a/test/uxml/test_tree.py b/test/uxml/test_tree.py index 610b276..15eee75 100644 --- a/test/uxml/test_tree.py +++ b/test/uxml/test_tree.py @@ -46,5 +46,20 @@ def test_basic_mutate(doc): #FIXME: More testing +def test_add_remove(): + # FIXME: Get the tree from fixture + tb = tree.treebuilder() + root = tb.parse(DOC1) + assert len(root.xml_children) == 3 + new_elem_1 = element('dee', {'a': '1'}) + root.xml_append(new_elem_1) + assert len(root.xml_children) == 4 + root.xml_remove(new_elem_1) + assert len(root.xml_children) == 3 + # Removal should be non-destructive of the removed child element + assert new_elem_1.xml_encode() == '' + #FIXME: More testing + + if __name__ == '__main__': raise SystemExit("Run with py.test")