Skip to content

Commit

Permalink
Merge pull request #24 from uogbuji/develop
Browse files Browse the repository at this point in the history
3.3.0 Release
  • Loading branch information
uogbuji authored Dec 19, 2022
2 parents 1093635 + e992a09 commit 73f03f0
Show file tree
Hide file tree
Showing 11 changed files with 111 additions and 30 deletions.
9 changes: 4 additions & 5 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,11 +1,8 @@
# amara3-xml

[MicroXML](http://www.w3.org/community/microxml/) component of Amara3 project, which contains a variety of data processing tools.

Data processing library built on Python 3 and . This module adds the MicroXML support, and adaptation to classic XML.
[MicroXML](http://www.w3.org/community/microxml/) component of Amara3 project, which contains a variety of data processing tools. This module adds XML support based on the [MicroXML spec](https://dvcs.w3.org/hg/microxml/raw-file/tip/spec/microxml.html).

[Uche Ogbuji](http://uche.ogbuji.net) < [email protected] >
More discussion, etc: https://groups.google.com/forum/#!forum/akara

## Install

Expand All @@ -17,7 +14,9 @@ pip install amara3-xml

## Use

Main focus is MicroXML, rather than full XML. However because most of the XML-like data you'll be dealing with is XML 1.0, Amara provides capabilities to parse legacy XML and reduce it to MicroXML. In many cases the biggest implication of this is that namespace information is stripped. As long as you know what you're doing you can get pretty far by ignoring this, but make sure you know what you're doing.
A good way to experiment with amara3-xml is the `microx` command line.

The main focus is MicroXML rather than full XML. However, because most of the XML-like data you'll be dealing with is XML 1.0, Amara provides capabilities to parse legacy XML (and even HTML) and reduce it to MicroXML. In many cases the biggest implication of this is that namespace information is stripped. You can get pretty far by ignoring this, but make sure you know what you're doing.

from amara3.uxml import xml

Expand Down
18 changes: 13 additions & 5 deletions exec/microx
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,8 @@ You can load XML or MicroXML from the Web rather than your file system
$ microx --match=a --foreach="@href" http://www.w3.org/2000/07/8378/xhtml/media-types/test4.xhtml
The --html flag allows parsing HTML, and the --lax flag is lenient, working with even non-well-formed XML
"""

import re
Expand Down Expand Up @@ -125,7 +127,7 @@ def xpath_to(node, show_attrs):


def run(command_name, command_detail, sources=None, foreach=None, partition=None,
limit=None, out=None, parse_html=False, show_attrs=None, verbose=False):
limit=None, out=None, parse_html=False, parse_lax=False, show_attrs=None, verbose=False):
'''
See the command line help
'''
Expand Down Expand Up @@ -194,14 +196,16 @@ def run(command_name, command_detail, sources=None, foreach=None, partition=None
for source in sources:
if partition:
ts = xmliter.sender(('**', partition), sink())
sequencer = ts
# sequencer = ts
ts.parse_file(source)

else:
if parse_html:
root = html5.parse(source.read())
elif parse_lax:
root = html5.parse_lax_xml(source.read())
else:
#FIXME: Implement incremental parsing, e.g. by requiring conversion to MicroXML first then using that parser
# FIXME: Implement incremental parsing, e.g. by requiring conversion to MicroXML first then using that parser
root = P(source.read())
process_partition(root)

Expand Down Expand Up @@ -233,6 +237,8 @@ if __name__ == '__main__':
help='Show verbose error messages')
parser.add_argument('--html', action='store_true',
help='Parse input sources as HTML')
parser.add_argument('--lax', action='store_true',
help='Parse input sources in lax mode (e.g. lenient to non-well-formed XML)')
parser.add_argument('--find-text', metavar="TEXT",
help='List the various XPaths that lead to a node containing the specified text')
parser.add_argument('--show-attrs', metavar="ATTRIB_NAMELIST",
Expand Down Expand Up @@ -261,7 +267,9 @@ if __name__ == '__main__':
show_attrs = args.show_attrs.split(',')

command_name, command_detail = commands[0]
run(command_name, command_detail, sources=sources, foreach=args.foreach, partition=args.partition,
limit=args.limit, out=args.out, parse_html=args.html, show_attrs=show_attrs, verbose=args.verbose)
run(command_name, command_detail, sources=sources, foreach=args.foreach,
partition=args.partition, limit=args.limit, out=args.out,
parse_html=args.html, parse_lax=args.lax, show_attrs=show_attrs,
verbose=args.verbose)
for f in sources: f.close()
args.out.close()
39 changes: 34 additions & 5 deletions pylib/uxml/html5.py
Original file line number Diff line number Diff line change
Expand Up @@ -249,20 +249,49 @@ def parse(source, prefixes=None, model=None, encoding=None, use_xhtml_ns=False):
>>> with urllib.request.urlopen('http://uche.ogbuji.net/') as response:
... html5.parse(response)
#Warning: if you pass a string, you must make sure it's a byte string, not a Unicode object. You might also want to wrap it with amara.lib.inputsource.text if it's not obviously XML or HTML (for example it could be confused with a file name)
Warning: if you pass a string, make sure it's a byte string, not a Unicode object.
You might also want to wrap it with amara.lib.inputsource.text
if it's not obviously XML or HTML (to avoid, e.g., its being mistaken
for a file name)
'''
def get_tree_instance(namespaceHTMLElements, use_xhtml_ns=use_xhtml_ns):
#use_xhtml_ns is a boolean, whether or not to use http://www.w3.org/1999/xhtml
return treebuilder(use_xhtml_ns)
parser = html5lib.HTMLParser(tree=get_tree_instance)
#doc = parser.parse(inputsource(source, None).stream, encoding=encoding)
#doc = parser.parse(source, encoding=encoding)
# doc = parser.parse(inputsource(source, None).stream, encoding=encoding)
# doc = parser.parse(source, encoding=encoding)
doc = parser.parse(source)
first_element = next((e for e in doc.root_nodes if isinstance(e, element)), None)
return first_element


def parse_lax_xml(source, prefixes=None, model=None, encoding=None):
    '''
    Parse an XML-like (possibly non-well-formed) source, by way of the HTML
    parser, into an Amara 3 tree

    >>> from amara3.uxml import html5
    >>> a_elem = html5.parse_lax_xml('<a><b>Spam</b>')
    >>> a_elem.xml_encode()
    '<a><b>Spam</b></a>'

    source - the input to parse; handed unchanged to parse()
    prefixes, model, encoding - handed unchanged to parse()

    Returns the first child of the second child of the element produced by
    parse(), detached from that wrapper so it is the root of its own tree.
    (Presumably this is the first node inside the synthesized HTML body;
    confirm against the tree builder.)

    Raises ValueError if the parsed result lacks that expected wrapper
    structure, including when parse() finds no element at all.

    Warning: if you pass a string, make sure it's a byte string, not a Unicode object.
    You might also want to wrap it with amara.lib.inputsource.text
    if it's not obviously XML or HTML (to avoid, e.g., its being mistaken
    for a file name)

    Do not use this method to parse HTML, even tagsoup HTML. Use html5.parse instead
    '''
    false_top = parse(source, prefixes=prefixes, model=model, encoding=encoding,
                      use_xhtml_ns=False)
    # The lookups are what can actually fail, so they are what the handler
    # must guard. AttributeError covers parse() returning None and a wrapper
    # node without xml_children.
    try:
        wrapper = false_top.xml_children[1]
        top = wrapper.xml_children[0]
    except (IndexError, AttributeError) as err:
        raise ValueError('Unable to process input even as tag soup XML') from err
    # Detach the bit we want from the wrapper
    wrapper.xml_remove(top)
    return top


def markup_fragment(source, encoding=None):
'''
Parse a fragment of markup in HTML mode, and return a tree node
Expand All @@ -273,7 +302,7 @@ def markup_fragment(source, encoding=None):
from amara.bindery import html
doc = html.markup_fragment(inputsource.text('XXX<html><body onload="" color="white"><p>Spam!<p>Eggs!</body></html>YYY'))
See also: http://wiki.xml3k.org/Amara2/Tagsoup
See also: http://wiki.xml3k.org/Amara2/Tagsoup [TODO: Page defunct - restore it]
'''
doc = parse(source, encoding=encoding)
frag = doc.html.body
Expand Down
14 changes: 14 additions & 0 deletions pylib/uxml/tree.py
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,20 @@ def xml_insert(self, child, index=-1):
self.xml_children.insert(index, child)
return

def xml_remove(self, child: node):
    '''
    Detach a child from this element. The child itself is not destroyed;
    it is left without a parent, as the root of its own tree

    child - the child to remove

    Raises ValueError if child is not among this element's children
    '''
    if child not in self.xml_children:
        raise ValueError(f'Element {self} has no child {child}')
    child._xml_parent = None
    self.xml_children.remove(child)

def __repr__(self):
    '''
    Debugging representation: the element's hash, its name and how many
    children it has
    '''
    template = u'{{uxml.element ({0}) "{1}" with {2} children}}'
    child_count = len(self.xml_children)
    return template.format(hash(self), self.xml_name, child_count)

Expand Down
2 changes: 1 addition & 1 deletion pylib/uxml/uxpath/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@

(For non-Python folks)

* Install [Python 3.4+ or later](https://www.python.org/downloads/)
* Install [Python 3.5 or later](https://www.python.org/downloads/)
* Download the [amara3-xml](https://pypi.python.org/pypi/amara3-xml) package and install (`python setup.py install`)

(For Python folks, you should be using pip: `pip install amara3-xml`)
Expand Down
12 changes: 7 additions & 5 deletions pylib/uxml/uxpath/lexrules.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,11 +8,12 @@
from ply.lex import TOKEN


# Note: element names can coincide with XPath operators (notably div)
OPERATOR_NAMES = {
'or': 'OR_OP',
'and': 'AND_OP',
'div': 'DIV_OP',
'mod': 'MOD_OP',
'or': 'LOGICAL_OP',
'and': 'LOGICAL_OP',
'div': 'DIVMOD_OP',
'mod': 'DIVMOD_OP',
}

tokens = [
Expand Down Expand Up @@ -40,7 +41,7 @@
'NODETEXTTEST',
'NAME',
'DOLLAR',
] + list(OPERATOR_NAMES.values())
] + list(set(OPERATOR_NAMES.values()))

t_PATH_SEP = r'/'
t_ABBREV_PATH_SEP = r'//'
Expand Down Expand Up @@ -80,6 +81,7 @@
def t_NAME(t):
    # No docstring on purpose: PLY can read a rule function's docstring as
    # its token pattern (NOTE(review): confirm how this rule's pattern is set)
    # A name that coincides with an operator keyword is retyped as that
    # operator's token; anything else stays a plain NAME
    token_type = OPERATOR_NAMES.get(t.value, 'NAME')
    t.type = token_type
    return t

def t_LITERAL(t):
Expand Down
19 changes: 12 additions & 7 deletions pylib/uxml/uxpath/parserules.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,12 +9,11 @@
from .lexrules import tokens

precedence = (
('left', 'OR_OP'),
('left', 'AND_OP'),
('left', 'LOGICAL_OP'),
('left', 'EQUAL_OP'),
('left', 'REL_OP'),
('left', 'PLUS_OP', 'MINUS_OP'),
('left', 'MULT_OP', 'DIV_OP', 'MOD_OP'),
('left', 'MULT_OP', 'DIVMOD_OP'),
('right', 'UMINUS_OP'),
('left', 'UNION_OP'),
)
Expand All @@ -25,15 +24,13 @@

def p_expr_boolean(p):
"""
Expr : Expr OR_OP Expr
| Expr AND_OP Expr
Expr : Expr LOGICAL_OP Expr
| Expr EQUAL_OP Expr
| Expr REL_OP Expr
| Expr PLUS_OP Expr
| Expr MINUS_OP Expr
| Expr MULT_OP Expr
| Expr DIV_OP Expr
| Expr MOD_OP Expr
| Expr DIVMOD_OP Expr
"""
p[0] = ast.BinaryExpression(p[1], p[2], p[3])

Expand Down Expand Up @@ -215,9 +212,13 @@ def p_name_test_star(p):
"""
p[0] = ast.NameTest('*')

# Note: element names which coincide with XPath operators (notably div)
# will come through as operator tokens, hence the non-straightforward production
# The docstring is the grammar production for this rule, not prose documentation
def p_name_test_name(p):
    """
    NameTest : NAME
             | LOGICAL_OP
             | DIVMOD_OP
    """
    matched_name = p[1]
    p[0] = ast.NameTest(matched_name)

Expand Down Expand Up @@ -261,6 +262,8 @@ def p_predicate(p):
def p_variable_reference(p):
    """
    VariableReference : DOLLAR NAME
                      | DOLLAR LOGICAL_OP
                      | DOLLAR DIVMOD_OP
    """
    # p[1] is the DOLLAR token; p[2] is the variable name, which may have
    # been lexed as an operator token rather than a NAME
    variable_name = p[2]
    p[0] = ast.VariableReference(variable_name)

Expand All @@ -282,6 +285,8 @@ def p_number(p):
def p_function_call(p):
"""
FunctionCall : NAME FormalArguments
| LOGICAL_OP FormalArguments
| DIVMOD_OP FormalArguments
"""
#Hacking around the ambiguity between node type test & function call
if p[1] in ('node', 'text'):
Expand Down
2 changes: 1 addition & 1 deletion pylib/uxml/version.py
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
# http://legacy.python.org/dev/peps/pep-0440/
version_info = ('3', '2', '1')
version_info = ('3', '3', '0')
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
Reluctantly use setuptools for now to get install_requires & long_description_content_type
$ python -c "import amara3; import amara3.iri; import amara3.uxml; import amara3.uxml.version; print(amara3.uxml.version.version_info)"
('3', '0', '1')
('3', '3', '0')
'''

import sys
Expand Down
9 changes: 9 additions & 0 deletions test/uxml/test_html5.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,5 +43,14 @@ def test_xml_encode_with_comment():
assert root.xml_encode() == DOC2_NORMALIZED


DOC_NON_WF_XML = '<a x=1><b>Spam</b>'
DOC_NON_WF_XML_NORM = '<a x="1"><b>Spam</b></a>'

def test_non_wf_xml_parse():
    '''
    Lax parsing of non-well-formed XML should serialize back out in
    normalized form
    '''
    source = io.StringIO(DOC_NON_WF_XML)
    parsed = html5.parse_lax_xml(source)
    # Round trip
    serialized = parsed.xml_encode()
    assert serialized == DOC_NON_WF_XML_NORM


# Direct execution is not supported; exit with a pointer to the test runner
if __name__ == '__main__':
    raise SystemExit("Run with py.test")
15 changes: 15 additions & 0 deletions test/uxml/test_tree.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,5 +46,20 @@ def test_basic_mutate(doc):
#FIXME: More testing


def test_add_remove():
    '''
    Appending a child and then removing it should restore the original
    child count and leave the removed child intact
    '''
    # FIXME: Get the tree from fixture
    builder = tree.treebuilder()
    root = builder.parse(DOC1)
    assert len(root.xml_children) == 3

    added = element('dee', {'a': '1'})
    root.xml_append(added)
    assert len(root.xml_children) == 4

    root.xml_remove(added)
    assert len(root.xml_children) == 3
    # Removal should be non-destructive of the removed child element
    assert added.xml_encode() == '<dee a="1"></dee>'
    #FIXME: More testing


# Direct execution is not supported; exit with a pointer to the test runner
if __name__ == '__main__':
    raise SystemExit("Run with py.test")

0 comments on commit 73f03f0

Please sign in to comment.