Merge pull request #24 from uogbuji/develop

3.3.0 Release
uogbuji · Dec 19, 2022 · 73f03f0 · 73f03f0
2 parents 1093635 + e992a09
commit 73f03f0
Show file tree

Hide file tree

Showing 11 changed files with 111 additions and 30 deletions.
diff --git a/README.md b/README.md
@@ -1,11 +1,8 @@
 # amara3-xml
 
-[MicroXML](http://www.w3.org/community/microxml/) component of Amara3 project, which contains a variety of data processing tools.
-
-Data processing library built on Python 3 and . This module adds the MicroXML support, and adaptation to classic XML.
+[MicroXML](http://www.w3.org/community/microxml/) component of Amara3 project, which contains a variety of data processing tools. This module adds XML support based on the [MicroXML spec](https://dvcs.w3.org/hg/microxml/raw-file/tip/spec/microxml.html).
 
 [Uche Ogbuji](http://uche.ogbuji.net) < [email protected] >
-More discussion, etc: https://groups.google.com/forum/#!forum/akara
 
 ## Install
 
@@ -17,7 +14,9 @@ pip install amara3-xml
 
 ## Use
 
-Main focus is MicroXML, rather than full XML. However because most of the XML-like data you'll be dealing with is XML 1.0, Amara provides capabilities to parse legacy XML and reduce it to MicroXML. In many cases the biggest implication of this is that namespace information is stripped. As long as you know what you're doing you can get pretty far by ignoring this, but make sure you know what you're doing.
+A good way to experiment with amara3-xml is the `microx` command line.
+
+Main focus is MicroXML, rather than full XML. However because most of the XML-like data you'll be dealing with is XML 1.0, Amara provides capabilities to parse legacy XML (and even HTML) and reduce it to MicroXML. In many cases the biggest implication of this is that namespace information is stripped. As long as you know what you're doing you can get pretty far by ignoring this, but make sure you know what you're doing.
 
     from amara3.uxml import xml
 

diff --git a/exec/microx b/exec/microx
@@ -86,6 +86,8 @@ You can load XML or MicroXML from the Web rather than your file system
 
 $ microx --match=a --foreach="@href" http://www.w3.org/2000/07/8378/xhtml/media-types/test4.xhtml
 
+The --html flag allows parsing HTML, and the --lax flag is lenient, working with even non-well-formed XML
+
 """
 
 import re
@@ -125,7 +127,7 @@ def xpath_to(node, show_attrs):
 
 
 def run(command_name, command_detail, sources=None, foreach=None, partition=None,
-        limit=None, out=None, parse_html=False, show_attrs=None, verbose=False):
+        limit=None, out=None, parse_html=False, parse_lax=False, show_attrs=None, verbose=False):
     '''
     See the command line help
     '''
@@ -194,14 +196,16 @@ def run(command_name, command_detail, sources=None, foreach=None, partition=None
     for source in sources:
         if partition:
             ts = xmliter.sender(('**', partition), sink())
-            sequencer = ts
+            # sequencer = ts
             ts.parse_file(source)
 
         else:
             if parse_html:
                 root = html5.parse(source.read())
+            elif parse_lax:
+                root = html5.parse_lax_xml(source.read())
             else:
-                #FIXME: Implement incremental parsing, e.g. by requiring conversion to MicroXML first then using that parser
+                # FIXME: Implement incremental parsing, e.g. by requiring conversion to MicroXML first then using that parser
                 root = P(source.read())
             process_partition(root)
 
@@ -233,6 +237,8 @@ if __name__ == '__main__':
         help='Show verbose error messages')
     parser.add_argument('--html', action='store_true',
         help='Parse input sources as HTML')
+    parser.add_argument('--lax', action='store_true',
+        help='Parse input sources in lax mode (e.g. lenient to non-well-formed XML)')
     parser.add_argument('--find-text', metavar="TEXT",
         help='List the various XPaths that lead to a node containing the specified text')
     parser.add_argument('--show-attrs', metavar="ATTRIB_NAMELIST",
@@ -261,7 +267,9 @@ if __name__ == '__main__':
         show_attrs = args.show_attrs.split(',')
 
     command_name, command_detail = commands[0]
-    run(command_name, command_detail, sources=sources, foreach=args.foreach, partition=args.partition,
-        limit=args.limit, out=args.out, parse_html=args.html, show_attrs=show_attrs, verbose=args.verbose)
+    run(command_name, command_detail, sources=sources, foreach=args.foreach,
+        partition=args.partition, limit=args.limit, out=args.out,
+        parse_html=args.html, parse_lax=args.lax, show_attrs=show_attrs,
+        verbose=args.verbose)
     for f in sources: f.close()
     args.out.close()
diff --git a/pylib/uxml/html5.py b/pylib/uxml/html5.py
@@ -249,20 +249,49 @@ def parse(source, prefixes=None, model=None, encoding=None, use_xhtml_ns=False):
     >>> with urllib.request.urlopen('http://uche.ogbuji.net/') as response:
     ...     html5.parse(response)
 
-
-    #Warning: if you pass a string, you must make sure it's a byte string, not a Unicode object.  You might also want to wrap it with amara.lib.inputsource.text if it's not obviously XML or HTML (for example it could be confused with a file name)
+    Warning: if you pass a string, make sure it's a byte string, not a Unicode object.
+    You might also want to wrap it with amara.lib.inputsource.text
+    if it's not obviously XML or HTML (to avoid e.g. its getting confused
+    for a file name)
     '''
     def get_tree_instance(namespaceHTMLElements, use_xhtml_ns=use_xhtml_ns):
         #use_xhtml_ns is a boolean, whether or not to use http://www.w3.org/1999/xhtml
         return treebuilder(use_xhtml_ns)
     parser = html5lib.HTMLParser(tree=get_tree_instance)
-    #doc = parser.parse(inputsource(source, None).stream, encoding=encoding)
-    #doc = parser.parse(source, encoding=encoding)
+    # doc = parser.parse(inputsource(source, None).stream, encoding=encoding)
+    # doc = parser.parse(source, encoding=encoding)
     doc = parser.parse(source)
     first_element = next((e for e in doc.root_nodes if isinstance(e, element)), None)
     return first_element
 
 
+def parse_lax_xml(source, prefixes=None, model=None, encoding=None):
+    '''
+    Parse an XML-like (possibly non-well-formed) source into an Amara 3 tree
+
+    >>> from amara3.uxml import html5
+    >>> a_elem = html5.parse_lax_xml('<a><b>Spam</b>')
+    >>> a_elem.xml_encode()
+    '<a><b>Spam</b></a>'
+
+    source, prefixes, model, encoding - passed through unchanged to parse()
+
+    Returns the top node of the input, detached from the wrapper tree that
+    parse() builds around it. Raises ValueError if no such node can be located.
+
+    Warning: if you pass a string, make sure it's a byte string, not a Unicode object.
+    You might also want to wrap it with amara.lib.inputsource.text
+    if it's not obviously XML or HTML (to avoid e.g. its getting confused
+    for a file name)
+    Do not use this method to parse HTML, even tagsoup HTML. Use html5.parse instead
+    '''
+    false_top = parse(source, prefixes=prefixes, model=model, encoding=encoding,
+                      use_xhtml_ns=False)
+    try:
+        # The lookups that can fail must sit inside the try: parse() returns None
+        # when it finds no root element (AttributeError), and either child list
+        # may be too short (IndexError).
+        # NOTE(review): presumably the wrapper is html > body - confirm
+        wrapper = false_top.xml_children[1]
+        top = wrapper.xml_children[0]
+    except (IndexError, AttributeError) as err:
+        raise ValueError('Unable to process input even as tag soup XML') from err
+    # Detach the bit we want from the wrapper
+    wrapper.xml_remove(top)
+    return top
+
+
 def markup_fragment(source, encoding=None):
     '''
     Parse a fragment of markup in HTML mode, and return a tree node
@@ -273,7 +302,7 @@ def markup_fragment(source, encoding=None):
     from amara.bindery import html
     doc = html.markup_fragment(inputsource.text('XXX<html><body onload="" color="white"><p>Spam!<p>Eggs!</body></html>YYY'))
 
-    See also: http://wiki.xml3k.org/Amara2/Tagsoup
+    See also: http://wiki.xml3k.org/Amara2/Tagsoup [TODO: Page defunct - restore it]
     '''
     doc = parse(source, encoding=encoding)
     frag = doc.html.body

diff --git a/pylib/uxml/tree.py b/pylib/uxml/tree.py
@@ -110,6 +110,20 @@ def xml_insert(self, child, index=-1):
             self.xml_children.insert(index, child)
         return
 
+    def xml_remove(self, child: node):
+        '''
+        Detach a child from this element. The child itself is not destroyed;
+        with its parent link cleared it becomes the root of its own tree
+
+        child - the child to detach
+
+        Raises ValueError if child is not among this element's children
+        '''
+        if child not in self.xml_children:
+            raise ValueError(f'Element {self} has no child {child}')
+        child._xml_parent = None
+        self.xml_children.remove(child)
+
     def __repr__(self):
         return u'{{uxml.element ({0}) "{1}" with {2} children}}'.format(hash(self), self.xml_name, len(self.xml_children))
 

diff --git a/pylib/uxml/uxpath/README.md b/pylib/uxml/uxpath/README.md
@@ -6,7 +6,7 @@
 
 (For non-Python folks)
 
-* Install [Python 3.4+ or later](https://www.python.org/downloads/)
+* Install [Python 3.5 or later](https://www.python.org/downloads/)
 * Download the [amara3-xml](https://pypi.python.org/pypi/amara3-xml) package and install (`python setup.py install`)
 
 (For Python folks, you should be using pip: `pip install amara3-xml`)

diff --git a/pylib/uxml/uxpath/lexrules.py b/pylib/uxml/uxpath/lexrules.py
@@ -8,11 +8,12 @@
 from ply.lex import TOKEN
 
 
+# Note: element names can coincide with XPath operators (notably div)
 OPERATOR_NAMES = {
-    'or': 'OR_OP',
-    'and': 'AND_OP',
-    'div': 'DIV_OP',
-    'mod': 'MOD_OP',
+    'or': 'LOGICAL_OP',
+    'and': 'LOGICAL_OP',
+    'div': 'DIVMOD_OP',
+    'mod': 'DIVMOD_OP',
 }
 
 tokens = [
@@ -40,7 +41,7 @@
         'NODETEXTTEST',
         'NAME',
         'DOLLAR',
-    ] + list(OPERATOR_NAMES.values())
+    ]   + list(set(OPERATOR_NAMES.values()))
 
 t_PATH_SEP = r'/'
 t_ABBREV_PATH_SEP = r'//'
@@ -80,6 +81,7 @@
 def t_NAME(t):
     # Check for operators
     t.type = OPERATOR_NAMES.get(t.value, 'NAME')
+    # t.type = 'NAME'
     return t
 
 def t_LITERAL(t):

diff --git a/pylib/uxml/uxpath/parserules.py b/pylib/uxml/uxpath/parserules.py
@@ -9,12 +9,11 @@
 from .lexrules import tokens
 
 precedence = (
-    ('left', 'OR_OP'),
-    ('left', 'AND_OP'),
+    ('left', 'LOGICAL_OP'),
     ('left', 'EQUAL_OP'),
     ('left', 'REL_OP'),
     ('left', 'PLUS_OP', 'MINUS_OP'),
-    ('left', 'MULT_OP', 'DIV_OP', 'MOD_OP'),
+    ('left', 'MULT_OP', 'DIVMOD_OP'),
     ('right', 'UMINUS_OP'),
     ('left', 'UNION_OP'),
 )
@@ -25,15 +24,13 @@
 
 def p_expr_boolean(p):
     """
-    Expr : Expr OR_OP Expr
-         | Expr AND_OP Expr
+    Expr : Expr LOGICAL_OP Expr
          | Expr EQUAL_OP Expr
          | Expr REL_OP Expr
          | Expr PLUS_OP Expr
          | Expr MINUS_OP Expr
          | Expr MULT_OP Expr
-         | Expr DIV_OP Expr
-         | Expr MOD_OP Expr
+         | Expr DIVMOD_OP Expr
     """
     p[0] = ast.BinaryExpression(p[1], p[2], p[3])
 
@@ -215,9 +212,13 @@ def p_name_test_star(p):
     """
     p[0] = ast.NameTest('*')
 
+# Note: element names which coincide with XPath operators (notably div)
+# will come through as operator tokens, hence the non-straightforward production
 def p_name_test_name(p):
     """
     NameTest : NAME
+             | LOGICAL_OP
+             | DIVMOD_OP
     """
     p[0] = ast.NameTest(p[1])
 
@@ -261,6 +262,8 @@ def p_predicate(p):
 def p_variable_reference(p):
     """
     VariableReference : DOLLAR NAME
+                      | DOLLAR LOGICAL_OP
+                      | DOLLAR DIVMOD_OP
     """
     p[0] = ast.VariableReference(p[2])
 
@@ -282,6 +285,8 @@ def p_number(p):
 def p_function_call(p):
     """
     FunctionCall : NAME FormalArguments
+                 | LOGICAL_OP FormalArguments
+                 | DIVMOD_OP FormalArguments
     """
     #Hacking around the ambiguity between node type test & function call
     if p[1] in ('node', 'text'):

diff --git a/pylib/uxml/version.py b/pylib/uxml/version.py
@@ -1,2 +1,2 @@
 # http://legacy.python.org/dev/peps/pep-0440/
-version_info = ('3', '2', '1')
+version_info = ('3', '3', '0')
diff --git a/setup.py b/setup.py
@@ -12,7 +12,7 @@
 Reluctantly use setuptools for now to get install_requires & long_description_content_type
 
 $ python -c "import amara3; import amara3.iri; import amara3.uxml; import amara3.uxml.version; print(amara3.uxml.version.version_info)"
-('3', '0', '1')
+('3', '3', '0')
 '''
 
 import sys

diff --git a/test/uxml/test_html5.py b/test/uxml/test_html5.py
@@ -43,5 +43,14 @@ def test_xml_encode_with_comment():
     assert root.xml_encode() == DOC2_NORMALIZED
 
 
+DOC_NON_WF_XML = '<a x=1><b>Spam</b>'
+DOC_NON_WF_XML_NORM = '<a x="1"><b>Spam</b></a>'
+
+def test_non_wf_xml_parse():
+    '''Lax parse of non-well-formed input should serialize to the normalized form'''
+    lax_root = html5.parse_lax_xml(io.StringIO(DOC_NON_WF_XML))
+    assert lax_root.xml_encode() == DOC_NON_WF_XML_NORM
+
+
 if __name__ == '__main__':
     raise SystemExit("Run with py.test")
diff --git a/test/uxml/test_tree.py b/test/uxml/test_tree.py
@@ -46,5 +46,20 @@ def test_basic_mutate(doc):
     #FIXME: More testing
 
 
+def test_add_remove():
+    '''Appending then removing a child restores the count and keeps the child intact'''
+    # FIXME: Get the tree from fixture
+    doc_root = tree.treebuilder().parse(DOC1)
+    assert len(doc_root.xml_children) == 3
+    extra = element('dee', {'a': '1'})
+    doc_root.xml_append(extra)
+    assert len(doc_root.xml_children) == 4
+    doc_root.xml_remove(extra)
+    assert len(doc_root.xml_children) == 3
+    # The detached child must still serialize on its own
+    assert extra.xml_encode() == '<dee a="1"></dee>'
+    #FIXME: More testing
+
+
 if __name__ == '__main__':
     raise SystemExit("Run with py.test")