From 01a889f4a5c0bf81cd5b001887e41eb2b6967e7c Mon Sep 17 00:00:00 2001 From: Uche Ogbuji Date: Thu, 4 Jul 2019 21:24:56 -0600 Subject: [PATCH 1/6] Tiny tweaks --- setup.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/setup.py b/setup.py index 521559f..f1e5b21 100755 --- a/setup.py +++ b/setup.py @@ -1,7 +1,7 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- ''' -Highly recommend installing using `pip install .` not `python setup.py install` +Highly recommend installing using `pip install -U .` not `python setup.py install` Uses pkgutil-style namespace package (Working on figuring out PEP 420) @@ -19,7 +19,6 @@ from setuptools import setup, Extension #from distutils.core import setup, Extension #from distutils.core import Extension -import sys PROJECT_NAME = 'amara3.xml' PROJECT_DESCRIPTION = 'Amara3 project, which offers a variety of data processing tools. This module adds the MicroXML support, and adaptation to classic XML.' From d8b76cb62128b74c6a9eee989afb691ed63a5683 Mon Sep 17 00:00:00 2001 From: Uche Ogbuji Date: Tue, 9 Jul 2019 12:27:39 -0600 Subject: [PATCH 2/6] Break out iterator XML tree parsers and improve naming, among other tweaks. --- .gitignore | 1 + pylib/uxml/tree.py | 137 +------------------------------------- pylib/uxml/treeiter.py | 147 +++++++++++++++++++++++++++++++++++++++++ pylib/uxml/version.py | 4 +- pylib/uxml/xml.py | 102 ++++------------------------ pylib/uxml/xmliter.py | 89 +++++++++++++++++++++++++ 6 files changed, 255 insertions(+), 225 deletions(-) create mode 100644 pylib/uxml/treeiter.py create mode 100644 pylib/uxml/xmliter.py diff --git a/.gitignore b/.gitignore index 92ef33c..8bec08f 100644 --- a/.gitignore +++ b/.gitignore @@ -3,6 +3,7 @@ scratch MANIFEST #PyCharm .idea +.vscode #---- diff --git a/pylib/uxml/tree.py b/pylib/uxml/tree.py index b4f65e2..aa88c02 100644 --- a/pylib/uxml/tree.py +++ b/pylib/uxml/tree.py @@ -5,14 +5,14 @@ # # ----------------------------------------------------------------------------- -#See also: http://www.w3.org/community/microxml/wiki/MicroLarkApi +# See also: http://www.w3.org/community/microxml/wiki/MicroLarkApi import weakref from asyncio import coroutine from amara3.uxml.parser import parse, parser, parsefrags, event -#NO_PARENT = object() +# NO_PARENT = object() class node(object): @@ -196,138 +196,5 @@ def _elem_test(ev): return _elem_test -MATCHED_STATE = object() - - -class treesequence(object): - ''' - >>> from amara3.uxml import tree - >>> from asyncio import coroutine - >>> @coroutine - ... def sink(accumulator): - ... while True: - ... e = yield - ... accumulator.append(e.xml_value) - ... - >>> values = [] - >>> ts = tree.treesequence(('a', 'b'), sink(values)) - >>> ts.parse('123') - >>> values - ['1', '2', '3'] - ''' - def __init__(self, pattern, sink): - self._root = None - self._parent = None - self._pattern = pattern - self._states = None - self._evstack = [] - self._building_depth = 0 - self._sink = sink - next(sink) #Prime the coroutine - self._current = None - self._prep_pattern() - - def _only_name(self, next, name): - def _only_name_func(ev): - if ev[0] == event.start_element and ev[1] == name: - return next - return _only_name_func - - def _any_name(self, next): - def _any_name_func(ev): - if ev[0] == event.start_element: - return next - return _any_name_func - - def _any_until(self, next): - def _any_until_func(ev): - if ev[0] == event.start_element: - next_next = next(ev) - if next_next is not None: - return next_next - return _any_until_func - return _any_until_func - - def _any(self, next, funcs): - def _any_func(ev): - if any( (func(ev) for func in funcs) ): - return next - return _any_func - - def _prep_pattern(self): - next_state = MATCHED_STATE - for i in range(len(self._pattern)): - stage = self._pattern[-i-1] - if isinstance(stage, str): - if stage == '*': - next_state = self._any_name(next_state) - elif stage == '**': - next_state = self._any_until(next_state) - else: - next_state = self._only_name(next_state, stage) - elif isinstance(stage, tuple): - new_tuple = tuple(( name_test(substage) if isinstance(substage, str) else substage for substage in stage )) - next_state = self._any(next_state, new_tuple) - else: - raise ValueError('Cannot interpret pattern component {0}'.format(repr(stage))) - self._states = next_state - return - - def _match_state(self): - new_state = self._states - for depth, ev in enumerate(self._evstack): - new_state = new_state(ev) - if new_state == MATCHED_STATE: - return True - elif new_state is None: - return False - return False - - @coroutine - def _handler(self): - while True: - ev = yield - if ev[0] == event.start_element: - self._evstack.append(ev) - #Keep track of the depth while we're building elements. When we ge back to 0 depth, we're done for this subtree - if self._building_depth: - self._building_depth += 1 - elif self._match_state(): - self._building_depth = 1 - if self._building_depth: - new_element = element(ev[1], ev[2], self._parent) - #if self._parent: self._parent().xml_children.append(weakref.ref(new_element)) - #Note: not using weakrefs here because these refs are not circular - if self._parent: self._parent.xml_children.append(new_element) - self._parent = new_element - #Hold a reference to the top element of the subtree being built, - #or it will be garbage collected as the builder moves down the tree - if self._building_depth == 1: self._root = new_element - elif ev[0] == event.characters: - if self._building_depth: - new_text = text(ev[1], self._parent) - if self._parent: self._parent.xml_children.append(new_text) - elif ev[0] == event.end_element: - self._evstack.pop() - if self._building_depth: - self._building_depth -= 1 - #Done with this subtree - if not self._building_depth: - self._sink.send(self._parent) - #Pop back up in element ancestry - if self._parent: - self._parent = self._parent.xml_parent - - #print(ev, self._building_depth, self._evstack) - return - - def parse(self, doc): - h = self._handler() - p = parser(h) - p.send((doc, False)) - p.send(('', True)) #Wrap it up - return - - def parse(doc): return treebuilder().parse(doc) diff --git a/pylib/uxml/treeiter.py b/pylib/uxml/treeiter.py new file mode 100644 index 0000000..fe44341 --- /dev/null +++ b/pylib/uxml/treeiter.py @@ -0,0 +1,147 @@ +# ----------------------------------------------------------------------------- +# amara3.uxml.treeiter +# +# Iterator (generator and coroutine) facilities for MicroXML tree objects +# +# ----------------------------------------------------------------------------- + +import asyncio + +from .parser import parser, parsefrags, event +from .tree import element, text + + +MATCHED_STATE = object() + + +class sender: + ''' + Parser object that feeds a coroutine with tree fragments based on an element pattern + + >>> from amara3.uxml import treeiter + ... def sink(accumulator): + ... while True: + ... e = yield + ... accumulator.append(e.xml_value) + ... + >>> values = [] + >>> ts = treeiter.sender(('a', 'b'), sink(values)) + >>> ts.parse('123') + >>> values + ['1', '2', '3'] + ''' + def __init__(self, pattern, sink, prime_sink=True): + self._root = None + self._parent = None + self._pattern = pattern + self._states = None + self._evstack = [] + self._building_depth = 0 + self._sink = sink + #if asyncio.iscoroutine(sink): + if prime_sink: + next(sink) # Prime coroutine + self._current = None + self._prep_pattern() + + def _only_name(self, next, name): + def _only_name_func(ev): + if ev[0] == event.start_element and ev[1] == name: + return next + return _only_name_func + + def _any_name(self, next): + def _any_name_func(ev): + if ev[0] == event.start_element: + return next + return _any_name_func + + def _any_until(self, next): + def _any_until_func(ev): + if ev[0] == event.start_element: + next_next = next(ev) + if next_next is not None: + return next_next + return _any_until_func + return _any_until_func + + def _any(self, next, funcs): + def _any_func(ev): + if any( (func(ev) for func in funcs) ): + return next + return _any_func + + def _prep_pattern(self): + next_state = MATCHED_STATE + for i in range(len(self._pattern)): + stage = self._pattern[-i-1] + if isinstance(stage, str): + if stage == '*': + next_state = self._any_name(next_state) + elif stage == '**': + next_state = self._any_until(next_state) + else: + next_state = self._only_name(next_state, stage) + elif isinstance(stage, tuple): + new_tuple = tuple(( name_test(substage) if isinstance(substage, str) else substage for substage in stage )) + next_state = self._any(next_state, new_tuple) + else: + raise ValueError('Cannot interpret pattern component {0}'.format(repr(stage))) + self._states = next_state + return + + def _match_state(self): + new_state = self._states + for depth, ev in enumerate(self._evstack): + new_state = new_state(ev) + if new_state == MATCHED_STATE: + return True + elif new_state is None: + return False + return False + + @asyncio.coroutine + def _handler(self): + while True: + ev = yield + if ev[0] == event.start_element: + self._evstack.append(ev) + #Keep track of the depth while we're building elements. When we ge back to 0 depth, we're done for this subtree + if self._building_depth: + self._building_depth += 1 + elif self._match_state(): + self._building_depth = 1 + if self._building_depth: + new_element = element(ev[1], ev[2], self._parent) + #if self._parent: self._parent().xml_children.append(weakref.ref(new_element)) + #Note: not using weakrefs here because these refs are not circular + if self._parent: self._parent.xml_children.append(new_element) + self._parent = new_element + #Hold a reference to the top element of the subtree being built, + #or it will be garbage collected as the builder moves down the tree + if self._building_depth == 1: self._root = new_element + elif ev[0] == event.characters: + if self._building_depth: + new_text = text(ev[1], self._parent) + if self._parent: self._parent.xml_children.append(new_text) + elif ev[0] == event.end_element: + self._evstack.pop() + if self._building_depth: + self._building_depth -= 1 + #Done with this subtree + if not self._building_depth: + self._sink.send(self._parent) + #Pop back up in element ancestry + if self._parent: + self._parent = self._parent.xml_parent + + #print(ev, self._building_depth, self._evstack) + return + + def parse(self, doc): + h = self._handler() + p = parser(h) + p.send((doc, False)) + p.send(('', True)) #Wrap it up + return + diff --git a/pylib/uxml/version.py b/pylib/uxml/version.py index b47814d..795fd3a 100644 --- a/pylib/uxml/version.py +++ b/pylib/uxml/version.py @@ -1,2 +1,2 @@ -#http://legacy.python.org/dev/peps/pep-0440/ -version_info = ('3', '0', '2') +# http://legacy.python.org/dev/peps/pep-0440/ +version_info = ('3', '1', '0') diff --git a/pylib/uxml/xml.py b/pylib/uxml/xml.py index d198f22..9d1b8ca 100644 --- a/pylib/uxml/xml.py +++ b/pylib/uxml/xml.py @@ -1,20 +1,25 @@ +# ----------------------------------------------------------------------------- # amara3.uxml.xml +# +# MicroXML tree objects parsed from XML sources +# +# ----------------------------------------------------------------------------- -#import asyncio -from asyncio import coroutine +import asyncio import xml.parsers.expat -from xml.sax.saxutils import escape #also quoteattr? +from xml.sax.saxutils import escape # also quoteattr? -from amara3.uxml import tree -from amara3.uxml.parser import parse, parser, parsefrags, event +from . import tree +from .parser import parser, parsefrags, event class expat_callbacks(object): - def __init__(self, handler, asyncio_based_handler=True): + def __init__(self, handler, prime_handler=True): self._handler = handler self._elem_stack = [] - if asyncio_based_handler: - next(self._handler) #Start the coroutine running + #if asyncio.iscoroutine(handler): + if prime_handler: + next(handler) # Prime coroutine return def start_element(self, name, attrs): @@ -102,15 +107,7 @@ def parse(source, handler): return p -@coroutine -def buffer_handler(accumulator): - while True: - event = yield - accumulator.append(event) - return - - -#Tree-based tools +# Tree-based tools class treebuilder(tree.treebuilder): ''' @@ -132,74 +129,3 @@ def parse(self, source): self.expat_parser.Parse(source) return self._root - -''' -from asyncio import coroutine -from amara3.uxml import xml -@coroutine -def sink(accumulator): - while True: - e = yield - accumulator.append(e.xml_value) - -values = [] -ts = xml.treesequence(('a', 'b'), sink(values)) -ts.parse('123') -values - ----- - -from asyncio import coroutine -from amara3.uxml import tree -from amara3.uxml.treeutil import * - -def ppath(start, path): - print((start, path)) - if not path: return None - if len(path) == 1: - yield from select_name(start, path[0]) - else: - for e in select_name(start, path[0]): - yield from ppath(e, path[1:]) - -ts = tree.treebuilder() -root = ts.parse('123') -pathresults = ppath(root, ('b', 'c')) -print(list(pathresults)) -''' - -class treesequence(tree.treesequence): - ''' - >>> from asyncio import coroutine - >>> from amara3.uxml import xml - >>> @coroutine - ... def sink(accumulator): - ... while True: - ... e = yield - ... accumulator.append(e.xml_value) - ... - >>> values = [] - >>> ts = xml.treesequence(('a', 'b'), sink(values)) - >>> ts.parse('123') - >>> values - ['1', '2', '3'] - ''' - def __init__(self, pattern, sink, callbacks=expat_callbacks): - super(treesequence, self).__init__(pattern, sink) - self.handler = callbacks(self._handler()) - self.expat_parser = xml.parsers.expat.ParserCreate(namespace_separator=' ') - - self.expat_parser.StartElementHandler = self.handler.start_element - self.expat_parser.EndElementHandler = self.handler.end_element - self.expat_parser.CharacterDataHandler = self.handler.char_data - self.expat_parser.StartNamespaceDeclHandler = self.handler.start_namespace - self.expat_parser.EndNamespaceDeclHandler = self.handler.end_namespace - return - - def parse(self, source): - self.expat_parser.Parse(source) - return - - def parse_file(self, fp): - self.expat_parser.ParseFile(fp) - return diff --git a/pylib/uxml/xmliter.py b/pylib/uxml/xmliter.py new file mode 100644 index 0000000..b7bf947 --- /dev/null +++ b/pylib/uxml/xmliter.py @@ -0,0 +1,89 @@ +# ----------------------------------------------------------------------------- +# amara3.uxml.xmliter +# +# Iterator (generator and coroutine) facilities for MicroXML tree objects parsed from XML sources +# +# ----------------------------------------------------------------------------- + +import asyncio +import xml.parsers.expat + +from . import treeiter +from .xml import expat_callbacks, ns_expat_callbacks + + +def buffer_handler(accumulator): + while True: + event = yield + accumulator.append(event) + return + + +''' +from asyncio import coroutine +from amara3.uxml import xml +@coroutine +def sink(accumulator): + while True: + e = yield + accumulator.append(e.xml_value) + +values = [] +ts = xml.treesequence(('a', 'b'), sink(values)) +ts.parse('123') +values + +---- + +from asyncio import coroutine +from amara3.uxml import tree +from amara3.uxml.treeutil import * + +def ppath(start, path): + print((start, path)) + if not path: return None + if len(path) == 1: + yield from select_name(start, path[0]) + else: + for e in select_name(start, path[0]): + yield from ppath(e, path[1:]) + +ts = tree.treebuilder() +root = ts.parse('123') +pathresults = ppath(root, ('b', 'c')) +print(list(pathresults)) +''' + +class sender(treeiter.sender): + ''' + >>> from amara3.uxml import xml + ... def sink(accumulator): + ... while True: + ... e = yield + ... accumulator.append(e.xml_value) + ... + >>> values = [] + >>> ts = xmliter.sender(('a', 'b'), sink(values)) + >>> ts.parse('123') + >>> values + ['1', '2', '3'] + ''' + def __init__(self, pattern, sink, callbacks=expat_callbacks): + super(sender, self).__init__(pattern, sink) + self.handler = callbacks(self._handler()) + self.expat_parser = xml.parsers.expat.ParserCreate(namespace_separator=' ') + + self.expat_parser.StartElementHandler = self.handler.start_element + self.expat_parser.EndElementHandler = self.handler.end_element + self.expat_parser.CharacterDataHandler = self.handler.char_data + self.expat_parser.StartNamespaceDeclHandler = self.handler.start_namespace + self.expat_parser.EndNamespaceDeclHandler = self.handler.end_namespace + return + + def parse(self, source): + self.expat_parser.Parse(source) + return + + def parse_file(self, fp): + self.expat_parser.ParseFile(fp) + return From 4c927629d0162165b676a59cd61c6822bad60981 Mon Sep 17 00:00:00 2001 From: Uche Ogbuji Date: Tue, 9 Jul 2019 17:31:19 -0600 Subject: [PATCH 3/6] Fixes to break-out of iterating libs. --- pylib/uxml/html5.py | 38 +++++--------------------------------- pylib/uxml/html5iter.py | 33 +++++++++++++++++++++++++++++++++ pylib/uxml/tree.py | 24 ++++++++++++++++++++++-- pylib/uxml/treeiter.py | 2 +- pylib/uxml/xmliter.py | 35 ----------------------------------- setup.py | 1 + 6 files changed, 62 insertions(+), 71 deletions(-) create mode 100644 pylib/uxml/html5iter.py diff --git a/pylib/uxml/html5.py b/pylib/uxml/html5.py index 131cafc..69ebe57 100644 --- a/pylib/uxml/html5.py +++ b/pylib/uxml/html5.py @@ -11,11 +11,11 @@ import copy import itertools import weakref -#from functools import * -#from itertools import * -from amara3.uxml import tree -from amara3.uxml.treeutil import * +from . import tree +from . import treeiter +from .treeutil import * +#from . import xmliter try: import html5lib @@ -252,7 +252,7 @@ def get_tree_instance(namespaceHTMLElements, use_xhtml_ns=use_xhtml_ns): def markup_fragment(source, encoding=None): ''' - Parse a fragment if markup in HTML mode, and return a bindery node + Parse a fragment of markup in HTML mode, and return a tree node Warning: if you pass a string, you must make sure it's a byte string, not a Unicode object. You might also want to wrap it with amara.lib.inputsource.text if it's not obviously XML or HTML (for example it could be confused with a file name) @@ -266,31 +266,3 @@ def markup_fragment(source, encoding=None): frag = doc.html.body return frag - -class treesequence(tree.treesequence): - ''' - >>> from asyncio import coroutine - >>> from amara3.uxml import xml - >>> @coroutine - ... def sink(accumulator): - ... while True: - ... e = yield - ... accumulator.append(e.xml_value) - ... - >>> values = [] - >>> ts = xml.treesequence(('html', 'body', 'ul', 'li'), sink(values)) - >>> ts.parse('</head><body><ul><li>1</li><li>2</li><li>3</li></ul></body>') - >>> values - ['1', '2', '3'] - ''' - #def __init__(self, pattern, sink): - # super(treesequence, self).__init__(pattern, sink) - # return - - def parse(self, source): - self.expat_parser.Parse(source) - return - - def parse_file(self, fp): - self.expat_parser.ParseFile(fp) - return diff --git a/pylib/uxml/html5iter.py b/pylib/uxml/html5iter.py new file mode 100644 index 0000000..b1b9ea1 --- /dev/null +++ b/pylib/uxml/html5iter.py @@ -0,0 +1,33 @@ +######################################################################## +# amara3.uxml.htmliter +""" + +""" + +from . import tree +from . import treeiter +from .treeutil import * +from . import html5 + + +class sender(treeiter.sender): + ''' + >>> from amara3.uxml import html5iter + ... def sink(accumulator): + ... while True: + ... e = yield + ... accumulator.append(e.xml_value) + ... + >>> values = [] + >>> ts = html5iter.sender(('html', 'body', 'ul', 'li'), sink(values)) + >>> ts.parse('<html><head><title></head><body><ul><li>1</li><li>2</li><li>3</li></ul></body>') + >>> values + ['1', '2', '3'] + ''' + def parse(self, doc): + h = self._handler() + p = html5.parser(h) + p.send((doc, False)) + p.send(('', True)) # Wrap it up + return + diff --git a/pylib/uxml/tree.py b/pylib/uxml/tree.py index aa88c02..06e5509 100644 --- a/pylib/uxml/tree.py +++ b/pylib/uxml/tree.py @@ -8,7 +8,7 @@ # See also: http://www.w3.org/community/microxml/wiki/MicroLarkApi import weakref -from asyncio import coroutine +import asyncio from amara3.uxml.parser import parse, parser, parsefrags, event @@ -153,7 +153,7 @@ def __init__(self): self._root = None self._parent = None - @coroutine + @asyncio.coroutine def _handler(self): while True: ev = yield @@ -198,3 +198,23 @@ def _elem_test(ev): def parse(doc): return treebuilder().parse(doc) + + +''' +from amara3.uxml import tree +from amara3.uxml.treeutil import * + +def ppath(start, path): + print((start, path)) + if not path: return None + if len(path) == 1: + yield from select_name(start, path[0]) + else: + for e in select_name(start, path[0]): + yield from ppath(e, path[1:]) + +root = tree.parse('<a xmlns="urn:namespaces:suck"><b><c>1</c></b><b>2</b><b>3</b></a>') +pathresults = ppath(root, ('b', 'c')) +print(list(pathresults)) +''' + diff --git a/pylib/uxml/treeiter.py b/pylib/uxml/treeiter.py index fe44341..f7d2d23 100644 --- a/pylib/uxml/treeiter.py +++ b/pylib/uxml/treeiter.py @@ -142,6 +142,6 @@ def parse(self, doc): h = self._handler() p = parser(h) p.send((doc, False)) - p.send(('', True)) #Wrap it up + p.send(('', True)) # Wrap it up return diff --git a/pylib/uxml/xmliter.py b/pylib/uxml/xmliter.py index b7bf947..4277731 100644 --- a/pylib/uxml/xmliter.py +++ b/pylib/uxml/xmliter.py @@ -19,41 +19,6 @@ def buffer_handler(accumulator): return -''' -from asyncio import coroutine -from amara3.uxml import xml -@coroutine -def sink(accumulator): - while True: - e = yield - accumulator.append(e.xml_value) - -values = [] -ts = xml.treesequence(('a', 'b'), sink(values)) -ts.parse('<a xmlns="urn:namespaces:suck"><b>1</b><b>2</b><b>3</b></a>') -values - ----- - -from asyncio import coroutine -from amara3.uxml import tree -from amara3.uxml.treeutil import * - -def ppath(start, path): - print((start, path)) - if not path: return None - if len(path) == 1: - yield from select_name(start, path[0]) - else: - for e in select_name(start, path[0]): - yield from ppath(e, path[1:]) - -ts = tree.treebuilder() -root = ts.parse('<a xmlns="urn:namespaces:suck"><b><c>1</c></b><b>2</b><b>3</b></a>') -pathresults = ppath(root, ('b', 'c')) -print(list(pathresults)) -''' - class sender(treeiter.sender): ''' >>> from amara3.uxml import xml diff --git a/setup.py b/setup.py index f1e5b21..a4d941b 100755 --- a/setup.py +++ b/setup.py @@ -40,6 +40,7 @@ 'nameparser', 'pytest', 'ply', + 'html5lib', ] # From http://pypi.python.org/pypi?%3Aaction=list_classifiers From 0d4f51072e9ae4f730de2b11906ae9f3e2237397 Mon Sep 17 00:00:00 2001 From: Uche Ogbuji <uche@ogbuji.net> Date: Mon, 9 Sep 2019 11:33:57 -0600 Subject: [PATCH 4/6] Implement operators or/and --- pylib/uxml/uxpath/__init__.py | 4 ++-- pylib/uxml/uxpath/ast.py | 22 ++++++++++++++++++---- pylib/uxml/uxpath/lexrules.py | 12 +++++++++--- 3 files changed, 29 insertions(+), 9 deletions(-) diff --git a/pylib/uxml/uxpath/__init__.py b/pylib/uxml/uxpath/__init__.py index d2be422..0144247 100644 --- a/pylib/uxml/uxpath/__init__.py +++ b/pylib/uxml/uxpath/__init__.py @@ -100,7 +100,7 @@ def copy(self, item=None, pos=None, variables=None, functions=None, lookuptables return context(item, pos=pos, variables=variables, functions=functions, lookuptables=lookuptables, extras=extras, parent=parent, force_root=False) -def qquery(xml_thing, xpath_thing, vars=None, funcs=None): +def qquery(xml_thing, xpath_thing, vars=None, funcs=None, force_root=True): ''' Quick query. Convenience for using the MicroXPath engine. Give it some XML and an expression and it will yield the results. No fuss. @@ -130,7 +130,7 @@ def qquery(xml_thing, xpath_thing, vars=None, funcs=None): if not root: return if isinstance(xpath_thing, str): parsed_expr = parse(xpath_thing) - ctx = context(root, variables=vars, functions=funcs) + ctx = context(root, variables=vars, functions=funcs, force_root=force_root) result = parsed_expr.compute(ctx) yield from result diff --git a/pylib/uxml/uxpath/ast.py b/pylib/uxml/uxpath/ast.py index caea119..a7ce3f0 100644 --- a/pylib/uxml/uxpath/ast.py +++ b/pylib/uxml/uxpath/ast.py @@ -272,17 +272,19 @@ def compute(self, ctx): selected.sort(key=operator.attrgetter('_docorder')) yield from selected - #FIXME: A lot of work to do on comparisons + # FIXME: A lot of work to do on comparisons elif self.op == '=': lhs = self.left.compute(ctx) rhs = self.right.compute(ctx) - #print(ctx.item, list(self.left.compute(ctx)), list(self.right.compute(ctx))) + # print(ctx.item, list(self.left.compute(ctx)), list(self.right.compute(ctx))) + # If LHS is a node sequence, check comparison on each item for i in lhs: for j in rhs: i = i.xml_value if isinstance(i, node) else i j = i.xml_value if isinstance(j, node) else j - if i == j: yield True - return + if i == j: + yield True + return yield False elif self.op == '!=': lhs = self.left.compute(ctx) @@ -304,6 +306,18 @@ def compute(self, ctx): lhs = self.left.compute(ctx) rhs = self.right.compute(ctx) yield next(lhs) <= next(rhs) + elif self.op == 'or': + lhs = self.left.compute(ctx) + rhs = self.right.compute(ctx) + yield next(lhs) or next(rhs) + elif self.op == 'and': + lhs = self.left.compute(ctx) + rhs = self.right.compute(ctx) + #lhs_val = next(lhs, None) + #rhs_val = next(rhs, None) + #print((self.left, lhs_val, self.right, rhs_val)) + #yield lhs_val and rhs_val + yield next(lhs) and next(rhs) else: raise NotImplementedErr('Oops! Operator "{}" not yet implemented'.format(self.op)) return diff --git a/pylib/uxml/uxpath/lexrules.py b/pylib/uxml/uxpath/lexrules.py index 358589c..c53b654 100644 --- a/pylib/uxml/uxpath/lexrules.py +++ b/pylib/uxml/uxpath/lexrules.py @@ -7,7 +7,8 @@ import re from ply.lex import TOKEN -operator_names = { + +OPERATOR_NAMES = { 'or': 'OR_OP', 'and': 'AND_OP', 'div': 'DIV_OP', @@ -39,7 +40,7 @@ 'NODETEXTTEST', 'NAME', 'DOLLAR', - ] + list(operator_names.values()) + ] + list(OPERATOR_NAMES.values()) t_PATH_SEP = r'/' t_ABBREV_PATH_SEP = r'//' @@ -60,6 +61,7 @@ t_DOLLAR = r'\$' t_STAR_OP = r'\*' + t_ignore = ' \t\r\n' NameStartChar = r'(' + r'[A-Z]|_|[a-z]|\xc0-\xd6]|[\xd8-\xf6]|[\xf8-\u02ff]|' + \ @@ -74,7 +76,11 @@ NODE_TYPES = set(['text', 'node']) -t_NAME = NAME_REGEX +@TOKEN(NAME_REGEX) +def t_NAME(t): + # Check for operators + t.type = OPERATOR_NAMES.get(t.value, 'NAME') + return t def t_LITERAL(t): r""""[^"]*"|'[^']*'""" From 1fad4bde7bc94b78f3359ff900a01414dec6bf0a Mon Sep 17 00:00:00 2001 From: Uche Ogbuji <uche@ogbuji.net> Date: Tue, 17 Sep 2019 22:57:13 -0600 Subject: [PATCH 5/6] Fix element.xml_write() --- exec/microx | 4 +- pylib/uxml/tree.py | 13 +- pylib/uxml/treeiter.py | 148 ++++++++++-------- ...{test_treesequence.py => test_treeiter.py} | 21 ++- 4 files changed, 108 insertions(+), 78 deletions(-) rename test/uxml/{test_treesequence.py => test_treeiter.py} (67%) diff --git a/exec/microx b/exec/microx index 2944051..cf7ef7d 100755 --- a/exec/microx +++ b/exec/microx @@ -111,7 +111,7 @@ P = TB.parse def xpath_to(node, show_attrs): ancestors = [] - parent = node.parent + parent = node.xml_parent while parent: step = parent.xml_name for sattr in show_attrs: @@ -119,7 +119,7 @@ def xpath_to(node, show_attrs): step += f'[@{sattr}="{parent.xml_attributes[sattr]}"]' ancestors.append(step) node = parent - parent = node.parent + parent = node.xml_parent ancestors.reverse() xp = '/'.join(ancestors) return xp diff --git a/pylib/uxml/tree.py b/pylib/uxml/tree.py index 06e5509..0146bfe 100644 --- a/pylib/uxml/tree.py +++ b/pylib/uxml/tree.py @@ -9,6 +9,7 @@ import weakref import asyncio +from xml.sax.saxutils import escape, quoteattr from amara3.uxml.parser import parse, parser, parsefrags, event @@ -45,9 +46,17 @@ def __init__(self, name, attrs=None, parent=None):#, ancestors=None): return def xml_encode(self, indent=None, depth=0): + ''' + Unparse an object back to XML text, returning the string object + + >>> from amara3.uxml.tree import parse + >>> e = parse('<a>bc&de</a>') + >>> e.xml_encode() + '<a>bc&de</a>' + ''' strbits = ['<', self.xml_name] for aname, aval in self.xml_attributes.items(): - strbits.extend([' ', aname, '="', aval, '"']) + strbits.extend([' ', aname, '=', quoteattr(aval)]) strbits.append('>') if indent: strbits.append('\n') @@ -59,7 +68,7 @@ def xml_encode(self, indent=None, depth=0): strbits.append('\n') strbits.append(indent*depth) else: - strbits.append(child) + strbits.append(escape(child)) strbits.extend(['</', self.xml_name, '>']) return ''.join(strbits) diff --git a/pylib/uxml/treeiter.py b/pylib/uxml/treeiter.py index f7d2d23..d723b74 100644 --- a/pylib/uxml/treeiter.py +++ b/pylib/uxml/treeiter.py @@ -30,19 +30,37 @@ class sender: >>> values ['1', '2', '3'] ''' - def __init__(self, pattern, sink, prime_sink=True): - self._root = None - self._parent = None - self._pattern = pattern - self._states = None - self._evstack = [] - self._building_depth = 0 - self._sink = sink + def __init__(self, patterns, sinks, prime_sinks=True): + ''' + Initializer + + Params: + patterns - pattern or list of patterns for subtrees to be generated, + each a tuple of element names, or the special wildcards '*' or '**' + '*' matches any single element. '**' matches any nestign of elements to arbitrary depth. + Resulting subtrees are sent to the corrersponding sink coroutine, + so number of patterns must match number of sinks + sinks - coroutine to be sent element subtrees as generated from parse. + Each coroutine receives subtrees based on the corrersponding pattern, + so number of patterns must match number of sinks + prime_sinks - if True call next() on each coroutine to get it started + ''' + self._patterns = [patterns] if isinstance(patterns, tuple) and isinstance(patterns[0], str) else patterns + self._pattern_count = len(self._patterns) + self._sinks = sinks if isinstance(sinks, list) or isinstance(sinks, tuple) else [sinks] + if len(self._sinks) != self._pattern_count: + raise ValueError('Number of patterns must match number of sinks') + self._roots = [None] * self._pattern_count + self._parents = [None] * self._pattern_count + self._stateses = [None] * self._pattern_count + self._evstacks = [[]] * self._pattern_count + self._building_depths = [0] * self._pattern_count #if asyncio.iscoroutine(sink): - if prime_sink: - next(sink) # Prime coroutine - self._current = None - self._prep_pattern() + if prime_sinks: + for sink in self._sinks: + next(sink) # Prime coroutine + self._currents = [None] * self._pattern_count + self._prep_patterns() def _only_name(self, next, name): def _only_name_func(ev): @@ -71,28 +89,29 @@ def _any_func(ev): return next return _any_func - def _prep_pattern(self): + def _prep_patterns(self): next_state = MATCHED_STATE - for i in range(len(self._pattern)): - stage = self._pattern[-i-1] - if isinstance(stage, str): - if stage == '*': - next_state = self._any_name(next_state) - elif stage == '**': - next_state = self._any_until(next_state) + for ix, pattern in enumerate(self._patterns): + for i in range(len(pattern)): + stage = pattern[-i-1] + if isinstance(stage, str): + if stage == '*': + next_state = self._any_name(next_state) + elif stage == '**': + next_state = self._any_until(next_state) + else: + next_state = self._only_name(next_state, stage) + elif isinstance(stage, tuple): + new_tuple = tuple(( name_test(substage) if isinstance(substage, str) else substage for substage in stage )) + next_state = self._any(next_state, new_tuple) else: - next_state = self._only_name(next_state, stage) - elif isinstance(stage, tuple): - new_tuple = tuple(( name_test(substage) if isinstance(substage, str) else substage for substage in stage )) - next_state = self._any(next_state, new_tuple) - else: - raise ValueError('Cannot interpret pattern component {0}'.format(repr(stage))) - self._states = next_state + raise ValueError('Cannot interpret pattern component {0}'.format(repr(stage))) + self._stateses[ix] = next_state return - def _match_state(self): - new_state = self._states - for depth, ev in enumerate(self._evstack): + def _match_state(self, ix): + new_state = self._stateses[ix] + for depth, ev in enumerate(self._evstacks[ix]): new_state = new_state(ev) if new_state == MATCHED_STATE: return True @@ -104,38 +123,43 @@ def _match_state(self): def _handler(self): while True: ev = yield - if ev[0] == event.start_element: - self._evstack.append(ev) - #Keep track of the depth while we're building elements. When we ge back to 0 depth, we're done for this subtree - if self._building_depth: - self._building_depth += 1 - elif self._match_state(): - self._building_depth = 1 - if self._building_depth: - new_element = element(ev[1], ev[2], self._parent) - #if self._parent: self._parent().xml_children.append(weakref.ref(new_element)) - #Note: not using weakrefs here because these refs are not circular - if self._parent: self._parent.xml_children.append(new_element) - self._parent = new_element - #Hold a reference to the top element of the subtree being built, - #or it will be garbage collected as the builder moves down the tree - if self._building_depth == 1: self._root = new_element - elif ev[0] == event.characters: - if self._building_depth: - new_text = text(ev[1], self._parent) - if self._parent: self._parent.xml_children.append(new_text) - elif ev[0] == event.end_element: - self._evstack.pop() - if self._building_depth: - self._building_depth -= 1 - #Done with this subtree - if not self._building_depth: - self._sink.send(self._parent) - #Pop back up in element ancestry - if self._parent: - self._parent = self._parent.xml_parent - - #print(ev, self._building_depth, self._evstack) + for ix, evstack in enumerate(self._evstacks): + building_depth = self._building_depths[ix] + parent = self._parents[ix] + if ev[0] == event.start_element: + evstack.append(ev) + #Keep track of the depth while we're building elements. When we ge back to 0 depth, we're done for this subtree + if building_depth: + building_depth += 1 + self._building_depths[ix] = building_depth + elif self._match_state(ix): + building_depth = self._building_depths[ix] = 1 + if building_depth: + new_element = element(ev[1], ev[2], parent) + #if parent: parent().xml_children.append(weakref.ref(new_element)) + #Note: not using weakrefs here because these refs are not circular + if parent: parent.xml_children.append(new_element) + parent = self._parents[ix] = new_element + #Hold a reference to the top element of the subtree being built, + #or it will be garbage collected as the builder moves down the tree + if building_depth == 1: self._roots[ix] = new_element + elif ev[0] == event.characters: + if building_depth: + new_text = text(ev[1], parent) + if parent: parent.xml_children.append(new_text) + elif ev[0] == event.end_element: + evstack.pop() + if building_depth: + building_depth -= 1 + self._building_depths[ix] = building_depth + #Done with this subtree + if not building_depth: + self._sinks[ix].send(parent) + #Pop back up in element ancestry + if parent: + parent = self._parents[ix] = parent.xml_parent + + #print(ev, building_depth, evstack) return def parse(self, doc): diff --git a/test/uxml/test_treesequence.py b/test/uxml/test_treeiter.py similarity index 67% rename from test/uxml/test_treesequence.py rename to test/uxml/test_treeiter.py index 3a84c15..4789955 100644 --- a/test/uxml/test_treesequence.py +++ b/test/uxml/test_treeiter.py @@ -1,7 +1,5 @@ -from asyncio import coroutine - import pytest #Consider also installing pytest_capturelog -from amara3.uxml import tree +from amara3.uxml import treeiter DOC1 = '<a><b>1</b><b>2</b><b>3</b></a>' @@ -10,49 +8,48 @@ DOC4 = '<a><b><x>1</x></b><c><x>2</x><d><x>3</x></d></c><x>4</x><y>5</y></a>' def test_ts_basics(): - @coroutine def sink(accumulator): while True: e = yield accumulator.append(e.xml_value) values = [] - ts = tree.treesequence(('a', 'b'), sink(values)) + ts = treeiter.sender(('a', 'b'), sink(values)) ts.parse(DOC1) assert values == ['1', '2', '3'] values = [] - ts = tree.treesequence(('a', '*'), sink(values)) + ts = treeiter.sender(('a', '*'), sink(values)) ts.parse(DOC1) assert values == ['1', '2', '3'] values = [] - ts = tree.treesequence(('a', ('b', 'c')), sink(values)) + ts = treeiter.sender(('a', ('b', 'c')), sink(values)) ts.parse(DOC2) assert values == ['1', '2'] values = [] - ts = tree.treesequence(('a', '**', 'x'), sink(values)) + ts = treeiter.sender(('a', '**', 'x'), sink(values)) ts.parse(DOC3) assert values == ['1', '2', '3', '4'] values = [] - ts = tree.treesequence(('*', '**', 'x'), sink(values)) + ts = treeiter.sender(('*', '**', 'x'), sink(values)) ts.parse(DOC3) assert values == ['1', '2', '3', '4'] values = [] - ts = tree.treesequence(('a', '**', 'x'), sink(values)) + ts = treeiter.sender(('a', '**', 'x'), sink(values)) ts.parse(DOC4) assert values == ['1', '2', '3', '4'] values = [] - ts = tree.treesequence(('*', '**', 'x'), sink(values)) + ts = treeiter.sender(('*', '**', 'x'), sink(values)) ts.parse(DOC4) assert values == ['1', '2', '3', '4'] values = [] - ts = tree.treesequence(('*', '*'), sink(values)) + ts = treeiter.sender(('*', '*'), sink(values)) ts.parse(DOC3) assert values == ['1', '23', '4'] From d9914f10fd4116df25e671dfd787a45ca62e5e21 Mon Sep 17 00:00:00 2001 From: Uche Ogbuji <uche@ogbuji.net> Date: Sat, 9 Nov 2019 13:22:00 -0700 Subject: [PATCH 6/6] Fixes to treeiter. Fixes to uXPath node tests. Test case fixes. --- pylib/uxml/treeiter.py | 7 ++++--- pylib/uxml/uxpath/ast.py | 7 ++++--- pylib/uxml/uxpath/parserules.py | 12 +++++++----- pylib/uxml/xmliter.py | 6 +++--- test/uxml/test_treegc.py | 8 +++++--- 5 files changed, 23 insertions(+), 17 deletions(-) diff --git a/pylib/uxml/treeiter.py b/pylib/uxml/treeiter.py index d723b74..8ef79b3 100644 --- a/pylib/uxml/treeiter.py +++ b/pylib/uxml/treeiter.py @@ -6,9 +6,10 @@ # ----------------------------------------------------------------------------- import asyncio +import collections from .parser import parser, parsefrags, event -from .tree import element, text +from .tree import element, text, name_test MATCHED_STATE = object() @@ -58,7 +59,8 @@ def __init__(self, patterns, sinks, prime_sinks=True): #if asyncio.iscoroutine(sink): if prime_sinks: for sink in self._sinks: - next(sink) # Prime coroutine + if isinstance(sink, collections.Iterable): + next(sink) # Prime coroutine self._currents = [None] * self._pattern_count self._prep_patterns() @@ -168,4 +170,3 @@ def parse(self, doc): p.send((doc, False)) p.send(('', True)) # Wrap it up return - diff --git a/pylib/uxml/uxpath/ast.py b/pylib/uxml/uxpath/ast.py index a7ce3f0..d025865 100644 --- a/pylib/uxml/uxpath/ast.py +++ b/pylib/uxml/uxpath/ast.py @@ -15,7 +15,7 @@ 'AbsolutePath', 'Step', 'NameTest', - 'NodeType', + 'NodeTypeTest', 'AbbreviatedStep', 'VariableReference', 'FunctionCall', @@ -24,7 +24,7 @@ import operator import functools -from collections import Iterable +from collections.abc import Iterable from amara3.uxml.tree import node, element, strval from amara3.uxml.treeutil import descendants @@ -582,7 +582,8 @@ def __call__(self, ctx): yield from self.compute(ctx) def compute(self, ctx): - if self.name == 'node' or isinstance(ctx.item, str): + if (self.name == 'node' and isinstance(ctx.item, node)) \ + or isinstance(ctx.item, str): yield ctx.item diff --git a/pylib/uxml/uxpath/parserules.py b/pylib/uxml/uxpath/parserules.py index a14a43a..89f449c 100644 --- a/pylib/uxml/uxpath/parserules.py +++ b/pylib/uxml/uxpath/parserules.py @@ -196,12 +196,14 @@ def p_node_test_name_test(p): p[0] = p[1] def p_node_test_type(p): + #NodeTest : NODETEXTTEST """ - NodeTest : NODETEXTTEST + NodeTest : FunctionCall """ - #assert p[1] in ('node', 'text') - #raise RuntimeError("Invalid node type '{0}'".format(p[1])) - p[0] = ast.NodeTypeTest(p[1]) + # FIXME: Also check no args + if p[1].name not in ('node', 'text'): + raise RuntimeError("Invalid node test '{0}'".format(p[1])) + p[0] = p[1] # # name test @@ -283,7 +285,7 @@ def p_function_call(p): """ #Hacking around the ambiguity between node type test & function call if p[1] in ('node', 'text'): - p[0] = ast.NodeType(p[1]) + p[0] = ast.NodeTypeTest(p[1]) else: p[0] = ast.FunctionCall(p[1], p[2]) diff --git a/pylib/uxml/xmliter.py b/pylib/uxml/xmliter.py index 4277731..6dea8eb 100644 --- a/pylib/uxml/xmliter.py +++ b/pylib/uxml/xmliter.py @@ -21,7 +21,7 @@ def buffer_handler(accumulator): class sender(treeiter.sender): ''' - >>> from amara3.uxml import xml + >>> from amara3.uxml import xmliter ... def sink(accumulator): ... while True: ... e = yield @@ -33,8 +33,8 @@ class sender(treeiter.sender): >>> values ['1', '2', '3'] ''' - def __init__(self, pattern, sink, callbacks=expat_callbacks): - super(sender, self).__init__(pattern, sink) + def __init__(self, pattern, sink, prime_sinks=True, callbacks=expat_callbacks): + super(sender, self).__init__(pattern, sink, prime_sinks=prime_sinks) self.handler = callbacks(self._handler()) self.expat_parser = xml.parsers.expat.ParserCreate(namespace_separator=' ') diff --git a/test/uxml/test_treegc.py b/test/uxml/test_treegc.py index f214013..71d3d8a 100644 --- a/test/uxml/test_treegc.py +++ b/test/uxml/test_treegc.py @@ -8,6 +8,7 @@ import pytest from amara3.uxml import tree +from amara3.uxml import treeiter DOC1 = '<a><b>1</b><b>2</b><b>3</b></a>' @@ -28,8 +29,9 @@ def sink(accumulator): old_e = None while True: e = yield - #No refcnt yet from accumulator, but 1 from parent & others from the treesequence code - assert sys.getrefcount(e) == 5 + #import pprint; pprint.pprint(gc.get_referrers(e)) + #Refs from parent & from frame objects + assert sys.getrefcount(e) == 6 #old_e is down to 2 refcounts, 1 from the old_e container & 1 from accumulator if old_e is not None: assert sys.getrefcount(old_e) == 2 accumulator.append(e.xml_value) @@ -37,7 +39,7 @@ def sink(accumulator): gc.collect() #Make sure circrefs have been GCed values = [] - ts = tree.treesequence(pat, sink(values)) + ts = treeiter.sender(pat, sink(values)) ts.parse(doc) assert values == expected