Skip to content
This repository has been archived by the owner on Feb 15, 2023. It is now read-only.

Commit

Permalink
parser: Implement fragment parsing
Browse files Browse the repository at this point in the history
The HTML5 fragment parsing algorithm has been implemented behind a new
API, `gumbo_parse_fragment`. The old APIs are maintained for backwards
compatibility. Passing `GUMBO_TAG_LAST` as the fragment context (the
`inner_html` argument in the Python bindings) to `gumbo_parse_fragment`
causes it to parse the buffer as a full document (the same behavior as
`gumbo_parse_with_options`).

The html5lib adapter code has been modified to support the fragment
parsing tests, all of which now pass.
  • Loading branch information
vmg committed Feb 16, 2015
1 parent 900e904 commit a74d295
Show file tree
Hide file tree
Showing 5 changed files with 172 additions and 43 deletions.
23 changes: 16 additions & 7 deletions python/gumbo/gumboc.py
Original file line number Diff line number Diff line change
Expand Up @@ -246,6 +246,11 @@ def to_url(self):


class Tag(Enum):
@staticmethod
def from_str(tagname):
    """Look up the Tag value for an HTML tag name.

    The name is encoded as UTF-8 and handed to the `_tag_enum` C binding;
    whatever that binding returns is passed back unchanged.
    """
    encoded_name = tagname.encode('utf-8')
    return _tag_enum(ctypes.c_char_p(encoded_name))

_values_ = [
'HTML',
'HEAD',
Expand Down Expand Up @@ -398,6 +403,7 @@ class Tag(Enum):
'SPACER',
'TT',
'UNKNOWN',
'LAST'
]


Expand Down Expand Up @@ -498,11 +504,6 @@ def __repr__(self):

class Options(ctypes.Structure):
_fields_ = [
# TODO(jdtang): Allow the Python API to set the allocator/deallocator
# function. Right now these are treated as opaque void pointers.
('allocator', ctypes.c_void_p),
('deallocator', ctypes.c_void_p),
('userdata', ctypes.c_void_p),
('tab_stop', ctypes.c_int),
('stop_on_first_error', ctypes.c_bool),
('max_errors', ctypes.c_int),
Expand All @@ -517,10 +518,10 @@ class Output(ctypes.Structure):
('errors', Vector),
]


@contextlib.contextmanager
def parse(text, **kwargs):
options = Options()
container = kwargs.get("inner_html", Tag.LAST)
for field_name, _ in Options._fields_:
try:
setattr(options, field_name, kwargs[field_name])
Expand All @@ -531,7 +532,7 @@ def parse(text, **kwargs):
# call, it creates a temporary buffer which is destroyed when the call
# completes, and then the original_text pointers point into invalid memory.
text_ptr = ctypes.c_char_p(text.encode('utf-8'))
output = _parse_with_options(ctypes.byref(options), text_ptr, len(text))
output = _parse_fragment(ctypes.byref(options), text_ptr, len(text), container)
try:
yield output
finally:
Expand All @@ -543,6 +544,10 @@ def parse(text, **kwargs):
_parse_with_options.argtypes = [_Ptr(Options), ctypes.c_char_p, ctypes.c_size_t]
_parse_with_options.restype = _Ptr(Output)

# Fragment-aware parse entry point: binds the C function
# gumbo_parse_fragment(options, buffer, length, fragment_ctx) and returns a
# pointer to an Output struct. The trailing Tag argument is the fragment
# context; parse() passes Tag.LAST unless the caller supplies `inner_html`.
_parse_fragment = _dll.gumbo_parse_fragment
_parse_fragment.argtypes = [_Ptr(Options), ctypes.c_char_p, ctypes.c_size_t, Tag]
_parse_fragment.restype = _Ptr(Output)

# Binds gumbo_tag_from_original_text, which takes a pointer to a StringPiece
# and has no return value (restype None is how ctypes spells a void return).
# NOTE(review): presumably the StringPiece is adjusted in place by the C
# side -- confirm against gumbo.h.
_tag_from_original_text = _dll.gumbo_tag_from_original_text
_tag_from_original_text.argtypes = [_Ptr(StringPiece)]
_tag_from_original_text.restype = None
Expand All @@ -559,6 +564,10 @@ def parse(text, **kwargs):
_tagname.argtypes = [Tag]
_tagname.restype = ctypes.c_char_p

# Binds gumbo_tag_enum: takes a C string and returns a Tag value.
# Tag.from_str uses this to turn a UTF-8 encoded tag name into its enum value.
_tag_enum = _dll.gumbo_tag_enum
_tag_enum.argtypes = [ctypes.c_char_p]
_tag_enum.restype = Tag

__all__ = ['StringPiece', 'SourcePosition', 'AttributeNamespace', 'Attribute',
'Vector', 'AttributeVector', 'NodeVector', 'QuirksMode', 'Document',
'Namespace', 'Tag', 'Element', 'Text', 'NodeType', 'Node',
Expand Down
22 changes: 19 additions & 3 deletions python/gumbo/html5lib_adapter.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,12 +70,12 @@ def _convert_element(source_node):
}


def _insert_root(treebuilder, source_node):
def _insert_root(treebuilder, source_node, pop_element = True):
treebuilder.insertRoot(_convert_element(source_node))
for child_node in source_node.children:
_insert_node(treebuilder, child_node)
treebuilder.openElements.pop()

if pop_element:
treebuilder.openElements.pop()

def _insert_node(treebuilder, source_node):
assert source_node.type != gumboc.NodeType.DOCUMENT
Expand Down Expand Up @@ -115,3 +115,19 @@ def parse(self, text_or_file, **kwargs):
else:
assert 'Only comments and <html> nodes allowed at the root'
return self.tree.getDocument()

def parseFragment(self, text_or_file, inner_html, **kwargs):
# Parse `text_or_file` as an HTML fragment and return the tree builder's
# fragment (self.tree.getFragment()).
#
# text_or_file: a file-like object (anything with read()) or the text itself.
# inner_html:   tag name of the fragment context; converted to a gumboc.Tag
#               via gumboc.Tag.from_str before parsing.
# **kwargs:     forwarded unchanged to gumboc.parse().
try:
text = text_or_file.read()
except AttributeError:
# Assume a string.
text = text_or_file
# Rebinds the parameter: from here on `inner_html` is a gumboc.Tag value.
inner_html = gumboc.Tag.from_str(inner_html)

with gumboc.parse(text, inner_html=inner_html, **kwargs) as output:
for node in output.contents.document.contents.children:
if node.type in (gumboc.NodeType.ELEMENT, gumboc.NodeType.TEMPLATE):
# Passing False for pop_element leaves the root on
# treebuilder.openElements after insertion.
# NOTE(review): this inserts output.contents.root.contents rather than
# `node`, so every matching child re-inserts the same root -- confirm
# that the document is expected to have exactly one element child.
_insert_root(self.tree, output.contents.root.contents, False)
else:
# NOTE(review): asserting a non-empty string literal always passes, so
# this branch can never fail; presumably `assert False, '...'` (or a
# raise) was intended -- confirm before changing.
assert 'Malformed fragment parse (??)'
return self.tree.getFragment()
12 changes: 3 additions & 9 deletions python/gumbo/html5lib_adapter_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -123,11 +123,10 @@ def impl(self, inner_html, input, expected, errors):
p = html5lib_adapter.HTMLParser(
tree=TREEBUILDER(namespaceHTMLElements=True))

if not inner_html:
# TODO(jdtang): Need to implement fragment parsing.
document = p.parse(StringIO.StringIO(input))
if inner_html:
document = p.parseFragment(StringIO.StringIO(input), inner_html)
else:
return
document = p.parse(StringIO.StringIO(input))

with warnings.catch_warnings():
# Etree serializer in html5lib uses a deprecated getchildren() API.
Expand All @@ -137,11 +136,6 @@ def impl(self, inner_html, input, expected, errors):
expected = re.compile(r'^(\s*)<(\S+)>', re.M).sub(
r'\1<html \2>', convertExpected(expected, 2))

# html5lib doesn't yet support the template tag, but it appears in the
# tests with the expectation that the template contents will be under the
# word 'contents', so we need to reformat that string a bit.
expected = reformatTemplateContents(expected)

error_msg = '\n'.join(['\n\nInput:', input, '\nExpected:', expected,
'\nReceived:', output])
self.assertEquals(expected, output,
Expand Down
8 changes: 8 additions & 0 deletions src/gumbo.h
Original file line number Diff line number Diff line change
Expand Up @@ -791,6 +791,14 @@ GumboOutput* gumbo_parse(const char* buffer);
GumboOutput* gumbo_parse_with_options(
const GumboOptions* options, const char* buffer, size_t buffer_length);

/**
 * Parse a chunk of HTML with the given fragment context. If `fragment_ctx`
 * is `GUMBO_TAG_LAST`, the fragment will be parsed as a full document.
 *
 * `options`, `buffer` and `length` presumably carry the same meaning as the
 * corresponding arguments of gumbo_parse_with_options() (parser options and
 * the input text with its length) -- confirm against the implementation.
 *
 * The returned GumboOutput can be released with gumbo_destroy_output().
 */
GumboOutput* gumbo_parse_fragment(
const GumboOptions* options, const char* buffer, size_t length,
const GumboTag fragment_ctx);

/** Release the memory used for the parse tree & parse errors. */
void gumbo_destroy_output(GumboOutput* output);

Expand Down
Loading

0 comments on commit a74d295

Please sign in to comment.