parser: Implement fragment parsing

The HTML5 fragment parsing algorithm has been implemented using a new API, `gumbo_parse_fragment`. The old APIs are maintained for backwards compatibility. Passing `GUMBO_TAG_LAST` as the fragment context (`inner_html`) to `gumbo_parse_fragment` causes it to parse the buffer as a full document (the same functionality as `gumbo_parse_with_options`). The HTML5lib adapter code has been modified to support the fragment parsing tests (the tests are passing 100%).
google · Feb 16, 2015 · a74d295 · a74d295
1 parent 900e904
commit a74d295
Show file tree

Hide file tree

Showing 5 changed files with 172 additions and 43 deletions.
diff --git a/python/gumbo/gumboc.py b/python/gumbo/gumboc.py
@@ -246,6 +246,11 @@ def to_url(self):
 
 
 class Tag(Enum):
+  @staticmethod
+  def from_str(tagname):
+    """Return the Tag that gumbo_tag_enum reports for the UTF-8 encoded tagname."""
+    return _tag_enum(ctypes.c_char_p(tagname.encode('utf-8')))
+
   _values_ = [
       'HTML',
       'HEAD',
@@ -398,6 +403,7 @@ class Tag(Enum):
       'SPACER',
       'TT',
       'UNKNOWN',
+      'LAST'
       ]
 
 
@@ -498,11 +504,6 @@ def __repr__(self):
 
 class Options(ctypes.Structure):
   _fields_ = [
-      # TODO(jdtang): Allow the Python API to set the allocator/deallocator
-      # function.  Right now these are treated as opaque void pointers.
-      ('allocator', ctypes.c_void_p),
-      ('deallocator', ctypes.c_void_p),
-      ('userdata', ctypes.c_void_p),
       ('tab_stop', ctypes.c_int),
       ('stop_on_first_error', ctypes.c_bool),
       ('max_errors', ctypes.c_int),
@@ -517,10 +518,10 @@ class Output(ctypes.Structure):
       ('errors', Vector),
       ]
 
-
 @contextlib.contextmanager
 def parse(text, **kwargs):
   options = Options()
+  container = kwargs.get("inner_html", Tag.LAST)
   for field_name, _ in Options._fields_:
     try:
       setattr(options, field_name, kwargs[field_name])
@@ -531,7 +532,7 @@ def parse(text, **kwargs):
   # call, it creates a temporary buffer which is destroyed when the call
   # completes, and then the original_text pointers point into invalid memory.
   text_ptr = ctypes.c_char_p(text.encode('utf-8'))
-  output = _parse_with_options(ctypes.byref(options), text_ptr, len(text))
+  output = _parse_fragment(ctypes.byref(options), text_ptr, len(text), container)
   try:
     yield output
   finally:
@@ -543,6 +544,10 @@ def parse(text, **kwargs):
 _parse_with_options.argtypes = [_Ptr(Options), ctypes.c_char_p, ctypes.c_size_t]
 _parse_with_options.restype = _Ptr(Output)
 
+_parse_fragment = _dll.gumbo_parse_fragment
+_parse_fragment.argtypes = [_Ptr(Options), ctypes.c_char_p, ctypes.c_size_t, Tag]
+_parse_fragment.restype = _Ptr(Output)
+
 _tag_from_original_text = _dll.gumbo_tag_from_original_text
 _tag_from_original_text.argtypes = [_Ptr(StringPiece)]
 _tag_from_original_text.restype = None
@@ -559,6 +564,10 @@ def parse(text, **kwargs):
 _tagname.argtypes = [Tag]
 _tagname.restype = ctypes.c_char_p
 
+_tag_enum = _dll.gumbo_tag_enum
+_tag_enum.argtypes = [ctypes.c_char_p]
+_tag_enum.restype = Tag
+
 __all__ = ['StringPiece', 'SourcePosition', 'AttributeNamespace', 'Attribute',
            'Vector', 'AttributeVector', 'NodeVector', 'QuirksMode', 'Document',
            'Namespace', 'Tag', 'Element', 'Text', 'NodeType', 'Node',

diff --git a/python/gumbo/html5lib_adapter.py b/python/gumbo/html5lib_adapter.py
@@ -70,12 +70,12 @@ def _convert_element(source_node):
       }
 
 
-def _insert_root(treebuilder, source_node):
+def _insert_root(treebuilder, source_node, pop_element = True):
   treebuilder.insertRoot(_convert_element(source_node))
   for child_node in source_node.children:
     _insert_node(treebuilder, child_node)
-  treebuilder.openElements.pop()
-
+  if pop_element:
+    treebuilder.openElements.pop()
 
 def _insert_node(treebuilder, source_node):
   assert source_node.type != gumboc.NodeType.DOCUMENT
@@ -115,3 +115,19 @@ def parse(self, text_or_file, **kwargs):
         else:
           assert 'Only comments and <html> nodes allowed at the root'
       return self.tree.getDocument()
+
+  def parseFragment(self, text_or_file, inner_html, **kwargs):
+    """Parse a string or file-like object as a fragment in the inner_html tag context."""
+    try:
+      text = text_or_file.read()
+    except AttributeError:
+      text = text_or_file  # No read() method: assume a string.
+    inner_html = gumboc.Tag.from_str(inner_html)
+
+    with gumboc.parse(text, inner_html=inner_html, **kwargs) as output:
+      for node in output.contents.document.contents.children:
+        if node.type in (gumboc.NodeType.ELEMENT, gumboc.NodeType.TEMPLATE):
+          _insert_root(self.tree, output.contents.root.contents, False)
+        else:
+          raise AssertionError('Malformed fragment parse (??)')
+      return self.tree.getFragment()
diff --git a/python/gumbo/html5lib_adapter_test.py b/python/gumbo/html5lib_adapter_test.py
@@ -123,11 +123,10 @@ def impl(self, inner_html, input, expected, errors):
     p = html5lib_adapter.HTMLParser(
             tree=TREEBUILDER(namespaceHTMLElements=True))
 
-    if not inner_html:
-      # TODO(jdtang): Need to implement fragment parsing.
-      document = p.parse(StringIO.StringIO(input))
+    if inner_html:
+      document = p.parseFragment(StringIO.StringIO(input), inner_html)
     else:
-      return
+      document = p.parse(StringIO.StringIO(input))
 
     with warnings.catch_warnings():
       # Etree serializer in html5lib uses a deprecated getchildren() API.
@@ -137,11 +136,6 @@ def impl(self, inner_html, input, expected, errors):
     expected = re.compile(r'^(\s*)<(\S+)>', re.M).sub(
         r'\1<html \2>', convertExpected(expected, 2))
 
-    # html5lib doesn't yet support the template tag, but it appears in the
-    # tests with the expectation that the template contents will be under the
-    # word 'contents', so we need to reformat that string a bit.
-    expected = reformatTemplateContents(expected)
-
     error_msg = '\n'.join(['\n\nInput:', input, '\nExpected:', expected,
                            '\nReceived:', output])
     self.assertEquals(expected, output,

diff --git a/src/gumbo.h b/src/gumbo.h
@@ -791,6 +791,14 @@ GumboOutput* gumbo_parse(const char* buffer);
 GumboOutput* gumbo_parse_with_options(
     const GumboOptions* options, const char* buffer, size_t buffer_length);
 
+/**
+ * Parse a chunk of HTML with the given fragment context. If `fragment_ctx`
+ * is `GUMBO_TAG_LAST`, the fragment will be parsed as a full document.
+ */
+GumboOutput* gumbo_parse_fragment(
+    const GumboOptions* options, const char* buffer, size_t length,
+    const GumboTag fragment_ctx);
+
 /** Release the memory used for the parse tree & parse errors. */
 void gumbo_destroy_output(GumboOutput* output);